From: Kaike Wan Date: Tue, 10 Nov 2015 12:10:27 +0000 (-0500) Subject: ibacm: Add support for pathrecord query through netlink X-Git-Url: https://openfabrics.org/gitweb/?a=commitdiff_plain;h=6569929ba93f6d6ecfa3aca87ddfc93ef2fcdf3c;p=~shefty%2Fibacm.git ibacm: Add support for pathrecord query through netlink This patch enables ibacm to process pathrecord queries through netlink. Since ibacm can cache pathrecords, this implementation provides an easy pathrecord cache for kernel components and therefore offers great performance advantage on large fabric systems. Signed-off-by: Kaike Wan Signed-off-by: John Fleck Signed-off-by: Ira Weiny --- diff --git a/Makefile.am b/Makefile.am index 7690cb9..a910585 100644 --- a/Makefile.am +++ b/Makefile.am @@ -40,7 +40,7 @@ man_MANS = \ EXTRA_DIST = src/acm_util.h prov/acmp/src/libibacmp.map \ include/acm_mad.h src/libacm.h ibacm.init.in \ linux/osd.h linux/dlist.h ibacm.spec.in \ - $(man_MANS) ibacm_hosts.data + $(man_MANS) ibacm_hosts.data src/acm_netlink.h install-exec-hook: install -D -m 755 ibacm.init $(DESTDIR)$(sysconfdir)/init.d/ibacm; diff --git a/src/acm.c b/src/acm.c index 7649725..ada0bfb 100644 --- a/src/acm.c +++ b/src/acm.c @@ -46,6 +46,8 @@ #include #include #include +#include +#include #include #include #include @@ -55,9 +57,14 @@ #include #include #include +#include +#include #include #include "acm_mad.h" #include "acm_util.h" +#if !defined(RDMA_NL_LS_F_ERR) + #include "acm_netlink.h" +#endif #define src_out data[0] #define src_index data[1] @@ -66,6 +73,7 @@ #define MAX_EP_ADDR 4 #define NL_MSG_BUF_SIZE 4096 #define ACM_PROV_NAME_SIZE 64 +#define NL_CLIENT_INDEX 0 struct acmc_subnet { DLIST_ENTRY entry; @@ -151,6 +159,21 @@ struct acmc_sa_req { struct acm_sa_mad mad; }; +struct acm_nl_path { + struct nlattr attr_hdr; + struct ib_path_rec_data rec; +}; + +struct acm_nl_msg { + struct nlmsghdr nlmsg_header; + union { + uint8_t data[ACM_MSG_DATA_LENGTH]; + struct rdma_ls_resolve_header resolve_header; + struct nlattr attr[0]; + struct acm_nl_path path[0]; + }; +}; + static char def_prov_name[ACM_PROV_NAME_SIZE] = "ibacmp"; static DLIST_ENTRY provider_list; static struct acmc_prov *def_provider = NULL; @@ -172,6 +195,7 @@ static struct acmc_ep *acm_find_ep(struct acmc_port *port, uint16_t pkey); static int acm_ep_insert_addr(struct acmc_ep *ep, const char *name, uint8_t *addr, size_t addr_len, uint8_t addr_type); static void acm_event_handler(struct acmc_device *dev); +static int acm_nl_send(SOCKET sock, struct acm_msg *msg); static struct sa_data { int timeout; @@ -466,7 +490,11 @@ int acm_resolve_response(uint64_t id, struct acm_msg *msg) goto release; } - ret = send(client->sock, (char *) msg, msg->hdr.length, 0); + if (id == NL_CLIENT_INDEX) + ret = acm_nl_send(client->sock, msg); + else + ret = send(client->sock, (char *) msg, msg->hdr.length, 0); + if (ret != msg->hdr.length) acm_log(0, "ERROR - failed to send response\n"); else @@ -597,6 +625,8 @@ static void acm_svr_accept(void) } for (i = 0; i < FD_SETSIZE - 1; i++) { + if (i == NL_CLIENT_INDEX) + continue; if (!atomic_get(&client_array[i].refcnt)) break; } @@ -1346,6 +1376,328 @@ static void acm_ipnl_handler(void) } } +static int acm_nl_send(SOCKET sock, struct acm_msg *msg) +{ + struct sockaddr_nl dst_addr; + struct acm_nl_msg acmnlmsg; + struct acm_nl_msg *orig; + int ret; + int datalen; + + orig = (struct acm_nl_msg *) msg->hdr.tid; + + memset(&dst_addr, 0, sizeof(dst_addr)); + dst_addr.nl_family = AF_NETLINK; + dst_addr.nl_groups = (1 << (RDMA_NL_GROUP_LS - 1)); + + memset(&acmnlmsg, 0, sizeof(acmnlmsg)); + acmnlmsg.nlmsg_header.nlmsg_len = NLMSG_HDRLEN; + acmnlmsg.nlmsg_header.nlmsg_pid = getpid(); + acmnlmsg.nlmsg_header.nlmsg_type = orig->nlmsg_header.nlmsg_type; + acmnlmsg.nlmsg_header.nlmsg_seq = orig->nlmsg_header.nlmsg_seq; + + if (msg->hdr.status != ACM_STATUS_SUCCESS) { + acm_log(2, "acm status no success = %d\n", msg->hdr.status); + acmnlmsg.nlmsg_header.nlmsg_flags |= RDMA_NL_LS_F_ERR; + } else { + acm_log(2, "acm status success\n"); + acmnlmsg.nlmsg_header.nlmsg_len += + NLA_ALIGN(sizeof(struct acm_nl_path)); + acmnlmsg.path[0].attr_hdr.nla_type = LS_NLA_TYPE_PATH_RECORD; + acmnlmsg.path[0].attr_hdr.nla_len = sizeof(struct acm_nl_path); + if (orig->resolve_header.path_use == + LS_RESOLVE_PATH_USE_UNIDIRECTIONAL) + acmnlmsg.path[0].rec.flags = IB_PATH_PRIMARY | + IB_PATH_OUTBOUND; + else + acmnlmsg.path[0].rec.flags = IB_PATH_PRIMARY | + IB_PATH_GMP | IB_PATH_BIDIRECTIONAL; + memcpy(acmnlmsg.path[0].rec.path_rec, + &msg->resolve_data[0].info.path, + sizeof(struct ibv_path_record)); + } + + datalen = NLMSG_ALIGN(acmnlmsg.nlmsg_header.nlmsg_len); + ret = sendto(sock, &acmnlmsg, datalen, 0, + (const struct sockaddr *)&dst_addr, + (socklen_t)sizeof(dst_addr)); + if (ret != datalen) { + acm_log(0, "ERROR - sendto = %d errno = %d\n", ret, errno); + ret = -1; + } else { + ret = msg->hdr.length; + } + + free(orig); + + return ret; +} + +#define NLA_LEN(nla) ((nla)->nla_len - NLA_HDRLEN) +#define NLA_DATA(nla) ((char *)(nla) + NLA_HDRLEN) + +static int acm_nl_parse_path_attr(struct nlattr *attr, + struct acm_ep_addr_data *data) +{ + struct ibv_path_record *path; + uint64_t *sid; + struct rdma_nla_ls_gid *gid; + uint8_t *tcl; + uint16_t *pkey; + uint16_t *qos; + uint16_t val; + int ret = 0; + +#define IBV_PATH_RECORD_QOS_MASK 0xfff0 + + path = &data->info.path; + switch (attr->nla_type & RDMA_NLA_TYPE_MASK) { + case LS_NLA_TYPE_SERVICE_ID: + sid = (uint64_t *) NLA_DATA(attr); + if (NLA_LEN(attr) == sizeof(*sid)) { + acm_log(2, "service_id 0x%llx\n", *sid); + path->service_id = htonll(*sid); + } else { + ret = -1; + } + break; + + case LS_NLA_TYPE_DGID: + gid = (struct rdma_nla_ls_gid *) NLA_DATA(attr); + if (NLA_LEN(attr) == sizeof(gid->gid)) { + acm_format_name(2, log_data, sizeof(log_data), + ACM_ADDRESS_GID, gid->gid, + sizeof(union ibv_gid)); + acm_log(2, "path dgid %s\n", log_data); + memcpy(path->dgid.raw, gid->gid, sizeof(path->dgid)); + data->flags |= ACM_EP_FLAG_DEST; + } else { + ret = -1; + } + break; + + case LS_NLA_TYPE_SGID: + gid = (struct rdma_nla_ls_gid *) NLA_DATA(attr); + if (NLA_LEN(attr) == sizeof(gid->gid)) { + acm_format_name(2, log_data, sizeof(log_data), + ACM_ADDRESS_GID, gid->gid, + sizeof(union ibv_gid)); + acm_log(2, "path sgid %s\n", log_data); + memcpy(path->sgid.raw, gid->gid, sizeof(path->sgid)); + data->flags |= ACM_EP_FLAG_SOURCE; + } else { + ret = -1; + } + break; + + case LS_NLA_TYPE_TCLASS: + tcl = (uint8_t *) NLA_DATA(attr); + if (NLA_LEN(attr) == sizeof(*tcl)) { + acm_log(2, "tclass 0x%x\n", *tcl); + path->tclass = *tcl; + } else { + ret = -1; + } + break; + + case LS_NLA_TYPE_PKEY: + pkey = (uint16_t *) NLA_DATA(attr); + if (NLA_LEN(attr) == sizeof(*pkey)) { + acm_log(2, "pkey 0x%x\n", *pkey); + path->pkey = htons(*pkey); + } else { + ret = -1; + } + break; + + case LS_NLA_TYPE_QOS_CLASS: + qos = (uint16_t *) NLA_DATA(attr); + if (NLA_LEN(attr) == sizeof(*qos)) { + acm_log(2, "qos_class 0x%x\n", *qos); + val = ntohs(path->qosclass_sl); + val &= ~IBV_PATH_RECORD_QOS_MASK; + val |= (*qos & IBV_PATH_RECORD_QOS_MASK); + path->qosclass_sl = htons(val); + } else { + ret = -1; + } + break; + + default: + acm_log(1, "WARN: unknown attr %x\n", attr->nla_type); + /* We can not ignore a mandatory attribute */ + if (attr->nla_type & RDMA_NLA_F_MANDATORY) + ret = -1; + break; + } + + return ret; +} + +static void acm_nl_process_invalid_request(struct acmc_client *client, + struct acm_nl_msg *acmnlmsg) +{ + struct acm_msg msg; + + memset(&msg, 0, sizeof(msg)); + msg.hdr.opcode = ACM_OP_RESOLVE; + msg.hdr.version = ACM_VERSION; + msg.hdr.length = ACM_MSG_HDR_LENGTH; + msg.hdr.status = ACM_STATUS_EINVAL; + msg.hdr.tid = (uint64_t) acmnlmsg; + + acm_nl_send(client->sock, &msg); +} + +static void acm_nl_process_resolve(struct acmc_client *client, + struct acm_nl_msg *acmnlmsg) +{ + struct acm_msg msg; + struct nlattr *attr; + int payload_len; + int resolve_hdr_len; + int rem; + int total_attr_len; + int status; + unsigned char *data; + + memset(&msg, 0, sizeof(msg)); + msg.hdr.opcode = ACM_OP_RESOLVE; + msg.hdr.version = ACM_VERSION; + msg.hdr.length = ACM_MSG_HDR_LENGTH + ACM_MSG_EP_LENGTH; + msg.hdr.status = ACM_STATUS_SUCCESS; + msg.hdr.tid = (uint64_t) acmnlmsg; + msg.resolve_data[0].type = ACM_EP_INFO_PATH; + + /* We support only one pathrecord */ + acm_log(2, "path use 0x%x\n", acmnlmsg->resolve_header.path_use); + if (acmnlmsg->resolve_header.path_use == + LS_RESOLVE_PATH_USE_UNIDIRECTIONAL) + msg.resolve_data[0].info.path.reversible_numpath = 1; + else + msg.resolve_data[0].info.path.reversible_numpath = + IBV_PATH_RECORD_REVERSIBLE | 1; + + data = (unsigned char *) &acmnlmsg->nlmsg_header + NLMSG_HDRLEN; + resolve_hdr_len = NLMSG_ALIGN(sizeof(struct rdma_ls_resolve_header)); + attr = (struct nlattr *) (data + resolve_hdr_len); + payload_len = acmnlmsg->nlmsg_header.nlmsg_len - NLMSG_HDRLEN - + resolve_hdr_len; + rem = payload_len; + while (1) { + if (rem < (int) sizeof(*attr) || + attr->nla_len < sizeof(*attr) || + attr->nla_len > rem) + break; + + status = acm_nl_parse_path_attr(attr, &msg.resolve_data[0]); + if (status) { + acm_nl_process_invalid_request(client, acmnlmsg); + return; + } + + /* Next attribute */ + total_attr_len = NLA_ALIGN(attr->nla_len); + rem -= total_attr_len; + attr = (struct nlattr *) ((char *) attr + total_attr_len); + } + + atomic_inc(&counter[ACM_CNTR_RESOLVE]); + acm_svr_resolve(client, &msg); +} + +static int acm_nl_is_valid_resolve_request(struct acm_nl_msg *acmnlmsg) +{ + int payload_len; + + payload_len = acmnlmsg->nlmsg_header.nlmsg_len - NLMSG_HDRLEN; + if (payload_len < (sizeof(struct rdma_ls_resolve_header) + + sizeof(struct nlattr))) + return 0; + + return 1; +} + +static void acm_nl_receive(struct acmc_client *client) +{ + struct acm_nl_msg *acmnlmsg; + int datalen = sizeof(*acmnlmsg); + int ret; + uint16_t client_inx, op; + + acmnlmsg = calloc(1, sizeof(*acmnlmsg)); + if (!acmnlmsg) { + acm_log(0, "Out of memory for recving nl msg.\n"); + return; + } + ret = recv(client->sock, acmnlmsg, datalen, 0); + if (!NLMSG_OK(&acmnlmsg->nlmsg_header, ret)) { + acm_log(0, "Netlink receive error: %d.\n", ret); + goto rcv_cleanup; + } + + acm_log(2, "nlmsg: len %d type 0x%x flags 0x%x seq %d pid %d\n", + acmnlmsg->nlmsg_header.nlmsg_len, + acmnlmsg->nlmsg_header.nlmsg_type, + acmnlmsg->nlmsg_header.nlmsg_flags, + acmnlmsg->nlmsg_header.nlmsg_seq, + acmnlmsg->nlmsg_header.nlmsg_pid); + + /* Currently we handle only request from the local service client */ + client_inx = RDMA_NL_GET_CLIENT(acmnlmsg->nlmsg_header.nlmsg_type); + op = RDMA_NL_GET_OP(acmnlmsg->nlmsg_header.nlmsg_type); + if (client_inx != RDMA_NL_LS) + goto rcv_cleanup; + + switch (op) { + case RDMA_NL_LS_OP_RESOLVE: + if (acm_nl_is_valid_resolve_request(acmnlmsg)) + acm_nl_process_resolve(client, acmnlmsg); + else + acm_nl_process_invalid_request(client, acmnlmsg); + break; + default: + /* Not supported*/ + acm_log(1, "WARN - invalid opcode %x\n", op); + acm_nl_process_invalid_request(client, acmnlmsg); + break; + } + + return; +rcv_cleanup: + free(acmnlmsg); +} + +static int acm_init_nl(void) +{ + struct sockaddr_nl src_addr; + int ret; + SOCKET nl_rcv_socket; + + nl_rcv_socket = socket(PF_NETLINK, SOCK_RAW, NETLINK_RDMA); + if (nl_rcv_socket == INVALID_SOCKET) { + acm_log(0, "ERROR - unable to allocate netlink recv socket\n"); + return socket_errno(); + } + + memset(&src_addr, 0, sizeof(src_addr)); + src_addr.nl_family = AF_NETLINK; + src_addr.nl_pid = getpid(); + src_addr.nl_groups = (1 << (RDMA_NL_GROUP_LS - 1)); + + ret = bind(nl_rcv_socket, (struct sockaddr *)&src_addr, + sizeof(src_addr)); + if (ret == SOCKET_ERROR) { + acm_log(0, "ERROR - unable to bind netlink socket\n"); + close(nl_rcv_socket); + return socket_errno(); + } + + /* init nl client structure */ + client_array[NL_CLIENT_INDEX].sock = nl_rcv_socket; + return 0; +} + static void acm_server(void) { fd_set readfds; @@ -1360,12 +1712,14 @@ static void acm_server(void) acm_log(0, "ERROR - server listen failed\n"); return; } + ret = acm_init_nl(); + if (ret) + acm_log(1, "Warn - Netlink init failed\n"); while (1) { n = (int) listen_socket; FD_ZERO(&readfds); FD_SET(listen_socket, &readfds); - n = max(n, (int) ip_mon_socket); FD_SET(ip_mon_socket, &readfds); @@ -1399,7 +1753,10 @@ static void acm_server(void) if (client_array[i].sock != INVALID_SOCKET && FD_ISSET(client_array[i].sock, &readfds)) { acm_log(2, "receiving from client %d\n", i); - acm_svr_receive(&client_array[i]); + if (i == NL_CLIENT_INDEX) + acm_nl_receive(&client_array[i]); + else + acm_svr_receive(&client_array[i]); } } @@ -2756,6 +3113,8 @@ int CDECL_FUNC main(int argc, char **argv) acm_server(); acm_log(0, "shutting down\n"); + if (client_array[NL_CLIENT_INDEX].sock != INVALID_SOCKET) + close(client_array[NL_CLIENT_INDEX].sock); acm_close_providers(); acm_stop_sa_handler(); umad_done(); diff --git a/src/acm_netlink.h b/src/acm_netlink.h new file mode 100644 index 0000000..867ae8c --- /dev/null +++ b/src/acm_netlink.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2015 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenFabrics.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(ACM_NETLINK_H) +#define ACM_NETLINK_H + +/* + * This header file basically copies the local service related defines and + * structures from the latest kernel include/uapi/rdma/rdma_netlink.h file + * so that ibacm can be built without the latest kernel patches. + */ + +enum { + RDMA_NL_LS = 4, /* RDMA Local Services */ +}; + +enum { + RDMA_NL_GROUP_LS = 3, +}; + +/* + * Local service operations: + * RESOLVE - The client requests the local service to resolve a path. + * SET_TIMEOUT - The local service requests the client to set the timeout. + */ +enum { + RDMA_NL_LS_OP_RESOLVE = 0, + RDMA_NL_LS_OP_SET_TIMEOUT, + RDMA_NL_LS_NUM_OPS +}; + +/* Local service netlink message flags */ +#define RDMA_NL_LS_F_ERR 0x0100 /* Failed response */ + +/* + * Local service resolve operation family header. + * The layout for the resolve operation: + * nlmsg header + * family header + * attributes + */ + +/* + * Local service path use: + * Specify how the path(s) will be used. + * ALL - For connected CM operation (6 pathrecords) + * UNIDIRECTIONAL - For unidirectional UD (1 pathrecord) + * GMP - For miscellaneous GMP like operation (at least 1 reversible + * pathrecord) + */ +enum { + LS_RESOLVE_PATH_USE_ALL = 0, + LS_RESOLVE_PATH_USE_UNIDIRECTIONAL, + LS_RESOLVE_PATH_USE_GMP, + LS_RESOLVE_PATH_USE_MAX +}; + +#define LS_DEVICE_NAME_MAX 64 + +struct rdma_ls_resolve_header { + __u8 device_name[LS_DEVICE_NAME_MAX]; + __u8 port_num; + __u8 path_use; +}; + +/* Local service attribute type */ +#define RDMA_NLA_F_MANDATORY (1 << 13) +#define RDMA_NLA_TYPE_MASK (~(NLA_F_NESTED | NLA_F_NET_BYTEORDER | \ + RDMA_NLA_F_MANDATORY)) + +/* + * Local service attributes: + * Attr Name Size Byte order + * ----------------------------------------------------- + * PATH_RECORD struct ib_path_rec_data + * TIMEOUT u32 cpu + * SERVICE_ID u64 cpu + * DGID u8[16] BE + * SGID u8[16] BE + * TCLASS u8 + * PKEY u16 cpu + * QOS_CLASS u16 cpu + */ +enum { + LS_NLA_TYPE_UNSPEC = 0, + LS_NLA_TYPE_PATH_RECORD, + LS_NLA_TYPE_TIMEOUT, + LS_NLA_TYPE_SERVICE_ID, + LS_NLA_TYPE_DGID, + LS_NLA_TYPE_SGID, + LS_NLA_TYPE_TCLASS, + LS_NLA_TYPE_PKEY, + LS_NLA_TYPE_QOS_CLASS, + LS_NLA_TYPE_MAX +}; + +/* Local service DGID/SGID attribute: big endian */ +struct rdma_nla_ls_gid { + __u8 gid[16]; +}; + +#endif /* ACM_NETLINK_H */