Diffstat (limited to 'drivers/infiniband/hw/mlx4')
 drivers/infiniband/hw/mlx4/Kconfig      |    2
 drivers/infiniband/hw/mlx4/Makefile     |    2
 drivers/infiniband/hw/mlx4/ah.c         |   42
 drivers/infiniband/hw/mlx4/alias_GUID.c |  688
 drivers/infiniband/hw/mlx4/cm.c         |  478
 drivers/infiniband/hw/mlx4/cq.c         |  132
 drivers/infiniband/hw/mlx4/doorbell.c   |    4
 drivers/infiniband/hw/mlx4/mad.c        | 1812
 drivers/infiniband/hw/mlx4/main.c       | 1529
 drivers/infiniband/hw/mlx4/mcg.c        | 1257
 drivers/infiniband/hw/mlx4/mlx4_ib.h    |  432
 drivers/infiniband/hw/mlx4/mr.c         |  126
 drivers/infiniband/hw/mlx4/qp.c         | 1215
 drivers/infiniband/hw/mlx4/srq.c        |    7
 drivers/infiniband/hw/mlx4/sysfs.c      |  906
 drivers/infiniband/hw/mlx4/user.h       |   12
16 files changed, 8158 insertions(+), 486 deletions(-)
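
Illustrative sketch, not part of the commit: the new alias_GUID.c below indexes SA GUIDInfo records so that component-mask bits 4..11 select entries 0..7 of a record block, and entry i of block block_num belongs to slave block_num * NUM_ALIAS_GUID_IN_REC + i. The guid_bit_to_slave() helper, main() driver, and hard-coded values are hypothetical and exist only to show the arithmetic.

/* Standalone sketch of the alias-GUID index/slave mapping (illustrative). */
#include <stdio.h>
#include <stdint.h>

#define NUM_ALIAS_GUID_IN_REC 8

/* Hypothetical helper: returns the slave owning entry i of a record block,
 * or -1 if bit (i + 4) of the component mask does not flag that entry. */
static int guid_bit_to_slave(int block_num, uint64_t guid_indexes, int i)
{
	if (!(guid_indexes & (1ULL << (i + 4))))
		return -1;
	return block_num * NUM_ALIAS_GUID_IN_REC + i;
}

int main(void)
{
	uint64_t guid_indexes = (1ULL << 4) | (1ULL << 7); /* entries 0 and 3 */
	int block_num = 2, i;

	for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
		int slave = guid_bit_to_slave(block_num, guid_indexes, i);
		if (slave >= 0)
			printf("record entry %d -> slave %d\n", i, slave);
	}
	return 0;
}

Built as plain C this prints entries 0 and 3 of block 2 mapping to slaves 16 and 19, mirroring the test_bit(i + 4, ...) loop in mlx4_ib_update_cache_on_guid_change().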
diff --git a/drivers/infiniband/hw/mlx4/Kconfig b/drivers/infiniband/hw/mlx4/Kconfig index 24ab11a9ad1..fc01deac1d3 100644 --- a/drivers/infiniband/hw/mlx4/Kconfig +++ b/drivers/infiniband/hw/mlx4/Kconfig @@ -1,6 +1,6 @@ config MLX4_INFINIBAND tristate "Mellanox ConnectX HCA support" - depends on NETDEVICES && ETHERNET && PCI + depends on NETDEVICES && ETHERNET && PCI && INET select NET_VENDOR_MELLANOX select MLX4_CORE ---help--- diff --git a/drivers/infiniband/hw/mlx4/Makefile b/drivers/infiniband/hw/mlx4/Makefile index 70f09c7826d..f4213b3a8fe 100644 --- a/drivers/infiniband/hw/mlx4/Makefile +++ b/drivers/infiniband/hw/mlx4/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_MLX4_INFINIBAND) += mlx4_ib.o -mlx4_ib-y := ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o +mlx4_ib-y := ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o cm.o alias_GUID.o sysfs.o diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index a251becdaa9..2d8c3397774 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -39,25 +39,6 @@ #include "mlx4_ib.h" -int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, - u8 *mac, int *is_mcast, u8 port) -{ - struct in6_addr in6; - - *is_mcast = 0; - - memcpy(&in6, ah_attr->grh.dgid.raw, sizeof in6); - if (rdma_link_local_addr(&in6)) - rdma_get_ll_mac(&in6, mac); - else if (rdma_is_multicast_addr(&in6)) { - rdma_get_mcast_mac(&in6, mac); - *is_mcast = 1; - } else - return -EINVAL; - - return 0; -} - static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, struct mlx4_ib_ah *ah) { @@ -92,21 +73,18 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr { struct mlx4_ib_dev *ibdev = to_mdev(pd->device); struct mlx4_dev *dev = ibdev->dev; - union ib_gid sgid; - u8 mac[6]; - int err; - int is_mcast; + int is_mcast = 0; + struct in6_addr in6; u16 vlan_tag; - err = mlx4_ib_resolve_grh(ibdev, ah_attr, mac, &is_mcast, ah_attr->port_num); - if (err) - return ERR_PTR(err); - - memcpy(ah->av.eth.mac, mac, 6); - err = ib_get_cached_gid(pd->device, ah_attr->port_num, ah_attr->grh.sgid_index, &sgid); - if (err) - return ERR_PTR(err); - vlan_tag = rdma_get_vlan_id(&sgid); + memcpy(&in6, ah_attr->grh.dgid.raw, sizeof(in6)); + if (rdma_is_multicast_addr(&in6)) { + is_mcast = 1; + rdma_get_mcast_mac(&in6, ah->av.eth.mac); + } else { + memcpy(ah->av.eth.mac, ah_attr->dmac, ETH_ALEN); + } + vlan_tag = ah_attr->vlan_id; if (vlan_tag < 0x1000) vlan_tag |= (ah_attr->sl & 7) << 13; ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24)); diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c new file mode 100644 index 00000000000..0eb141c4141 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -0,0 +1,688 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + /***********************************************************/ +/*This file support the handling of the Alias GUID feature. */ +/***********************************************************/ +#include <rdma/ib_mad.h> +#include <rdma/ib_smi.h> +#include <rdma/ib_cache.h> +#include <rdma/ib_sa.h> +#include <rdma/ib_pack.h> +#include <linux/mlx4/cmd.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <rdma/ib_user_verbs.h> +#include <linux/delay.h> +#include "mlx4_ib.h" + +/* +The driver keeps the current state of all guids, as they are in the HW. +Whenever we receive an smp mad GUIDInfo record, the data will be cached. +*/ + +struct mlx4_alias_guid_work_context { + u8 port; + struct mlx4_ib_dev *dev ; + struct ib_sa_query *sa_query; + struct completion done; + int query_id; + struct list_head list; + int block_num; +}; + +struct mlx4_next_alias_guid_work { + u8 port; + u8 block_num; + struct mlx4_sriov_alias_guid_info_rec_det rec_det; +}; + + +void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num, + u8 port_num, u8 *p_data) +{ + int i; + u64 guid_indexes; + int slave_id; + int port_index = port_num - 1; + + if (!mlx4_is_master(dev->dev)) + return; + + guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid. + ports_guid[port_num - 1]. + all_rec_per_port[block_num].guid_indexes); + pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, guid_indexes); + + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + /* The location of the specific index starts from bit number 4 + * until bit num 11 */ + if (test_bit(i + 4, (unsigned long *)&guid_indexes)) { + slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ; + if (slave_id >= dev->dev->num_slaves) { + pr_debug("The last slave: %d\n", slave_id); + return; + } + + /* cache the guid: */ + memcpy(&dev->sriov.demux[port_index].guid_cache[slave_id], + &p_data[i * GUID_REC_SIZE], + GUID_REC_SIZE); + } else + pr_debug("Guid number: %d in block: %d" + " was not updated\n", i, block_num); + } +} + +static __be64 get_cached_alias_guid(struct mlx4_ib_dev *dev, int port, int index) +{ + if (index >= NUM_ALIAS_GUID_PER_PORT) { + pr_err("%s: ERROR: asked for index:%d\n", __func__, index); + return (__force __be64) -1; + } + return *(__be64 *)&dev->sriov.demux[port - 1].guid_cache[index]; +} + + +ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index) +{ + return IB_SA_COMP_MASK(4 + index); +} + +/* + * Whenever new GUID is set/unset (guid table change) create event and + * notify the relevant slave (master also should be notified). + * If the GUID value is not as we have in the cache the slave will not be + * updated; in this case it waits for the smp_snoop or the port management + * event to call the function and to update the slave. 
+ * block_number - the index of the block (16 blocks available) + * port_number - 1 or 2 + */ +void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, + int block_num, u8 port_num, + u8 *p_data) +{ + int i; + u64 guid_indexes; + int slave_id; + enum slave_port_state new_state; + enum slave_port_state prev_state; + __be64 tmp_cur_ag, form_cache_ag; + enum slave_port_gen_event gen_event; + + if (!mlx4_is_master(dev->dev)) + return; + + guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid. + ports_guid[port_num - 1]. + all_rec_per_port[block_num].guid_indexes); + pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, guid_indexes); + + /*calculate the slaves and notify them*/ + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + /* the location of the specific index runs from bits 4..11 */ + if (!(test_bit(i + 4, (unsigned long *)&guid_indexes))) + continue; + + slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ; + if (slave_id >= dev->dev->num_vfs + 1) + return; + tmp_cur_ag = *(__be64 *)&p_data[i * GUID_REC_SIZE]; + form_cache_ag = get_cached_alias_guid(dev, port_num, + (NUM_ALIAS_GUID_IN_REC * block_num) + i); + /* + * Check if guid is not the same as in the cache, + * If it is different, wait for the snoop_smp or the port mgmt + * change event to update the slave on its port state change + */ + if (tmp_cur_ag != form_cache_ag) + continue; + mlx4_gen_guid_change_eqe(dev->dev, slave_id, port_num); + + /*2 cases: Valid GUID, and Invalid Guid*/ + + if (tmp_cur_ag != MLX4_NOT_SET_GUID) { /*valid GUID*/ + prev_state = mlx4_get_slave_port_state(dev->dev, slave_id, port_num); + new_state = set_and_calc_slave_port_state(dev->dev, slave_id, port_num, + MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID, + &gen_event); + pr_debug("slave: %d, port: %d prev_port_state: %d," + " new_port_state: %d, gen_event: %d\n", + slave_id, port_num, prev_state, new_state, gen_event); + if (gen_event == SLAVE_PORT_GEN_EVENT_UP) { + pr_debug("sending PORT_UP event to slave: %d, port: %d\n", + slave_id, port_num); + mlx4_gen_port_state_change_eqe(dev->dev, slave_id, + port_num, MLX4_PORT_CHANGE_SUBTYPE_ACTIVE); + } + } else { /* request to invalidate GUID */ + set_and_calc_slave_port_state(dev->dev, slave_id, port_num, + MLX4_PORT_STATE_IB_EVENT_GID_INVALID, + &gen_event); + pr_debug("sending PORT DOWN event to slave: %d, port: %d\n", + slave_id, port_num); + mlx4_gen_port_state_change_eqe(dev->dev, slave_id, port_num, + MLX4_PORT_CHANGE_SUBTYPE_DOWN); + } + } +} + +static void aliasguid_query_handler(int status, + struct ib_sa_guidinfo_rec *guid_rec, + void *context) +{ + struct mlx4_ib_dev *dev; + struct mlx4_alias_guid_work_context *cb_ctx = context; + u8 port_index ; + int i; + struct mlx4_sriov_alias_guid_info_rec_det *rec; + unsigned long flags, flags1; + + if (!context) + return; + + dev = cb_ctx->dev; + port_index = cb_ctx->port - 1; + rec = &dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[cb_ctx->block_num]; + + if (status) { + rec->status = MLX4_GUID_INFO_STATUS_IDLE; + pr_debug("(port: %d) failed: status = %d\n", + cb_ctx->port, status); + goto out; + } + + if (guid_rec->block_num != cb_ctx->block_num) { + pr_err("block num mismatch: %d != %d\n", + cb_ctx->block_num, guid_rec->block_num); + goto out; + } + + pr_debug("lid/port: %d/%d, block_num: %d\n", + be16_to_cpu(guid_rec->lid), cb_ctx->port, + guid_rec->block_num); + + rec = &dev->sriov.alias_guid.ports_guid[port_index]. 
+ all_rec_per_port[guid_rec->block_num]; + + rec->status = MLX4_GUID_INFO_STATUS_SET; + rec->method = MLX4_GUID_INFO_RECORD_SET; + + for (i = 0 ; i < NUM_ALIAS_GUID_IN_REC; i++) { + __be64 tmp_cur_ag; + tmp_cur_ag = *(__be64 *)&guid_rec->guid_info_list[i * GUID_REC_SIZE]; + /* check if the SM didn't assign one of the records. + * if it didn't, if it was not sysadmin request: + * ask the SM to give a new GUID, (instead of the driver request). + */ + if (tmp_cur_ag == MLX4_NOT_SET_GUID) { + mlx4_ib_warn(&dev->ib_dev, "%s:Record num %d in " + "block_num: %d was declined by SM, " + "ownership by %d (0 = driver, 1=sysAdmin," + " 2=None)\n", __func__, i, + guid_rec->block_num, rec->ownership); + if (rec->ownership == MLX4_GUID_DRIVER_ASSIGN) { + /* if it is driver assign, asks for new GUID from SM*/ + *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] = + MLX4_NOT_SET_GUID; + + /* Mark the record as not assigned, and let it + * be sent again in the next work sched.*/ + rec->status = MLX4_GUID_INFO_STATUS_IDLE; + rec->guid_indexes |= mlx4_ib_get_aguid_comp_mask_from_ix(i); + } + } else { + /* properly assigned record. */ + /* We save the GUID we just got from the SM in the + * admin_guid in order to be persistent, and in the + * request from the sm the process will ask for the same GUID */ + if (rec->ownership == MLX4_GUID_SYSADMIN_ASSIGN && + tmp_cur_ag != *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE]) { + /* the sysadmin assignment failed.*/ + mlx4_ib_warn(&dev->ib_dev, "%s: Failed to set" + " admin guid after SysAdmin " + "configuration. " + "Record num %d in block_num:%d " + "was declined by SM, " + "new val(0x%llx) was kept\n", + __func__, i, + guid_rec->block_num, + be64_to_cpu(*(__be64 *) & + rec->all_recs[i * GUID_REC_SIZE])); + } else { + memcpy(&rec->all_recs[i * GUID_REC_SIZE], + &guid_rec->guid_info_list[i * GUID_REC_SIZE], + GUID_REC_SIZE); + } + } + } + /* + The func is call here to close the cases when the + sm doesn't send smp, so in the sa response the driver + notifies the slave. + */ + mlx4_ib_notify_slaves_on_guid_change(dev, guid_rec->block_num, + cb_ctx->port, + guid_rec->guid_info_list); +out: + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + if (!dev->sriov.is_going_down) + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port_index].wq, + &dev->sriov.alias_guid.ports_guid[port_index]. + alias_guid_work, 0); + if (cb_ctx->sa_query) { + list_del(&cb_ctx->list); + kfree(cb_ctx); + } else + complete(&cb_ctx->done); + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +static void invalidate_guid_record(struct mlx4_ib_dev *dev, u8 port, int index) +{ + int i; + u64 cur_admin_val; + ib_sa_comp_mask comp_mask = 0; + + dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].status + = MLX4_GUID_INFO_STATUS_IDLE; + dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].method + = MLX4_GUID_INFO_RECORD_SET; + + /* calculate the comp_mask for that record.*/ + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + cur_admin_val = + *(u64 *)&dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[index].all_recs[GUID_REC_SIZE * i]; + /* + check the admin value: if it's for delete (~00LL) or + it is the first guid of the first record (hw guid) or + the records is not in ownership of the sysadmin and the sm doesn't + need to assign GUIDs, then don't put it up for assignment. 
+ */ + if (MLX4_GUID_FOR_DELETE_VAL == cur_admin_val || + (!index && !i) || + MLX4_GUID_NONE_ASSIGN == dev->sriov.alias_guid. + ports_guid[port - 1].all_rec_per_port[index].ownership) + continue; + comp_mask |= mlx4_ib_get_aguid_comp_mask_from_ix(i); + } + dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[index].guid_indexes = comp_mask; +} + +static int set_guid_rec(struct ib_device *ibdev, + u8 port, int index, + struct mlx4_sriov_alias_guid_info_rec_det *rec_det) +{ + int err; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_sa_guidinfo_rec guid_info_rec; + ib_sa_comp_mask comp_mask; + struct ib_port_attr attr; + struct mlx4_alias_guid_work_context *callback_context; + unsigned long resched_delay, flags, flags1; + struct list_head *head = + &dev->sriov.alias_guid.ports_guid[port - 1].cb_list; + + err = __mlx4_ib_query_port(ibdev, port, &attr, 1); + if (err) { + pr_debug("mlx4_ib_query_port failed (err: %d), port: %d\n", + err, port); + return err; + } + /*check the port was configured by the sm, otherwise no need to send */ + if (attr.state != IB_PORT_ACTIVE) { + pr_debug("port %d not active...rescheduling\n", port); + resched_delay = 5 * HZ; + err = -EAGAIN; + goto new_schedule; + } + + callback_context = kmalloc(sizeof *callback_context, GFP_KERNEL); + if (!callback_context) { + err = -ENOMEM; + resched_delay = HZ * 5; + goto new_schedule; + } + callback_context->port = port; + callback_context->dev = dev; + callback_context->block_num = index; + + memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec)); + + guid_info_rec.lid = cpu_to_be16(attr.lid); + guid_info_rec.block_num = index; + + memcpy(guid_info_rec.guid_info_list, rec_det->all_recs, + GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC); + comp_mask = IB_SA_GUIDINFO_REC_LID | IB_SA_GUIDINFO_REC_BLOCK_NUM | + rec_det->guid_indexes; + + init_completion(&callback_context->done); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + list_add_tail(&callback_context->list, head); + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + + callback_context->query_id = + ib_sa_guid_info_rec_query(dev->sriov.alias_guid.sa_client, + ibdev, port, &guid_info_rec, + comp_mask, rec_det->method, 1000, + GFP_KERNEL, aliasguid_query_handler, + callback_context, + &callback_context->sa_query); + if (callback_context->query_id < 0) { + pr_debug("ib_sa_guid_info_rec_query failed, query_id: " + "%d. 
will reschedule to the next 1 sec.\n", + callback_context->query_id); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + list_del(&callback_context->list); + kfree(callback_context); + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + resched_delay = 1 * HZ; + err = -EAGAIN; + goto new_schedule; + } + err = 0; + goto out; + +new_schedule: + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + invalidate_guid_record(dev, port, index); + if (!dev->sriov.is_going_down) { + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq, + &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work, + resched_delay); + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); + +out: + return err; +} + +void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port) +{ + int i; + unsigned long flags, flags1; + + pr_debug("port %d\n", port); + + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + for (i = 0; i < NUM_ALIAS_GUID_REC_IN_PORT; i++) + invalidate_guid_record(dev, port, i); + + if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) { + /* + make sure no work waits in the queue, if the work is already + queued(not on the timer) the cancel will fail. That is not a problem + because we just want the work started. + */ + cancel_delayed_work(&dev->sriov.alias_guid. + ports_guid[port - 1].alias_guid_work); + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq, + &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work, + 0); + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +/* The function returns the next record that was + * not configured (or failed to be configured) */ +static int get_next_record_to_update(struct mlx4_ib_dev *dev, u8 port, + struct mlx4_next_alias_guid_work *rec) +{ + int j; + unsigned long flags; + + for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); + if (dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j].status == + MLX4_GUID_INFO_STATUS_IDLE) { + memcpy(&rec->rec_det, + &dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j], + sizeof (struct mlx4_sriov_alias_guid_info_rec_det)); + rec->port = port; + rec->block_num = j; + dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j].status = + MLX4_GUID_INFO_STATUS_PENDING; + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); + return 0; + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); + } + return -ENOENT; +} + +static void set_administratively_guid_record(struct mlx4_ib_dev *dev, int port, + int rec_index, + struct mlx4_sriov_alias_guid_info_rec_det *rec_det) +{ + dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].guid_indexes = + rec_det->guid_indexes; + memcpy(dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].all_recs, + rec_det->all_recs, NUM_ALIAS_GUID_IN_REC * GUID_REC_SIZE); + dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].status = + rec_det->status; +} + +static void set_all_slaves_guids(struct mlx4_ib_dev *dev, int port) +{ + int j; + struct mlx4_sriov_alias_guid_info_rec_det rec_det ; + + for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT ; j++) { + 
memset(rec_det.all_recs, 0, NUM_ALIAS_GUID_IN_REC * GUID_REC_SIZE); + rec_det.guid_indexes = (!j ? 0 : IB_SA_GUIDINFO_REC_GID0) | + IB_SA_GUIDINFO_REC_GID1 | IB_SA_GUIDINFO_REC_GID2 | + IB_SA_GUIDINFO_REC_GID3 | IB_SA_GUIDINFO_REC_GID4 | + IB_SA_GUIDINFO_REC_GID5 | IB_SA_GUIDINFO_REC_GID6 | + IB_SA_GUIDINFO_REC_GID7; + rec_det.status = MLX4_GUID_INFO_STATUS_IDLE; + set_administratively_guid_record(dev, port, j, &rec_det); + } +} + +static void alias_guid_work(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + int ret = 0; + struct mlx4_next_alias_guid_work *rec; + struct mlx4_sriov_alias_guid_port_rec_det *sriov_alias_port = + container_of(delay, struct mlx4_sriov_alias_guid_port_rec_det, + alias_guid_work); + struct mlx4_sriov_alias_guid *sriov_alias_guid = sriov_alias_port->parent; + struct mlx4_ib_sriov *ib_sriov = container_of(sriov_alias_guid, + struct mlx4_ib_sriov, + alias_guid); + struct mlx4_ib_dev *dev = container_of(ib_sriov, struct mlx4_ib_dev, sriov); + + rec = kzalloc(sizeof *rec, GFP_KERNEL); + if (!rec) { + pr_err("alias_guid_work: No Memory\n"); + return; + } + + pr_debug("starting [port: %d]...\n", sriov_alias_port->port + 1); + ret = get_next_record_to_update(dev, sriov_alias_port->port, rec); + if (ret) { + pr_debug("No more records to update.\n"); + goto out; + } + + set_guid_rec(&dev->ib_dev, rec->port + 1, rec->block_num, + &rec->rec_det); + +out: + kfree(rec); +} + + +void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port) +{ + unsigned long flags, flags1; + + if (!mlx4_is_master(dev->dev)) + return; + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + if (!dev->sriov.is_going_down) { + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port].wq, + &dev->sriov.alias_guid.ports_guid[port].alias_guid_work, 0); + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev) +{ + int i; + struct mlx4_ib_sriov *sriov = &dev->sriov; + struct mlx4_alias_guid_work_context *cb_ctx; + struct mlx4_sriov_alias_guid_port_rec_det *det; + struct ib_sa_query *sa_query; + unsigned long flags; + + for (i = 0 ; i < dev->num_ports; i++) { + cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work); + det = &sriov->alias_guid.ports_guid[i]; + spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags); + while (!list_empty(&det->cb_list)) { + cb_ctx = list_entry(det->cb_list.next, + struct mlx4_alias_guid_work_context, + list); + sa_query = cb_ctx->sa_query; + cb_ctx->sa_query = NULL; + list_del(&cb_ctx->list); + spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags); + ib_sa_cancel_query(cb_ctx->query_id, sa_query); + wait_for_completion(&cb_ctx->done); + kfree(cb_ctx); + spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags); + } + spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags); + } + for (i = 0 ; i < dev->num_ports; i++) { + flush_workqueue(dev->sriov.alias_guid.ports_guid[i].wq); + destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq); + } + ib_sa_unregister_client(dev->sriov.alias_guid.sa_client); + kfree(dev->sriov.alias_guid.sa_client); +} + +int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev) +{ + char alias_wq_name[15]; + int ret = 0; + int i, j, k; + union ib_gid gid; + + if (!mlx4_is_master(dev->dev)) + return 0; + dev->sriov.alias_guid.sa_client = + 
kzalloc(sizeof *dev->sriov.alias_guid.sa_client, GFP_KERNEL); + if (!dev->sriov.alias_guid.sa_client) + return -ENOMEM; + + ib_sa_register_client(dev->sriov.alias_guid.sa_client); + + spin_lock_init(&dev->sriov.alias_guid.ag_work_lock); + + for (i = 1; i <= dev->num_ports; ++i) { + if (dev->ib_dev.query_gid(&dev->ib_dev , i, 0, &gid)) { + ret = -EFAULT; + goto err_unregister; + } + } + + for (i = 0 ; i < dev->num_ports; i++) { + memset(&dev->sriov.alias_guid.ports_guid[i], 0, + sizeof (struct mlx4_sriov_alias_guid_port_rec_det)); + /*Check if the SM doesn't need to assign the GUIDs*/ + for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { + if (mlx4_ib_sm_guid_assign) { + dev->sriov.alias_guid.ports_guid[i]. + all_rec_per_port[j]. + ownership = MLX4_GUID_DRIVER_ASSIGN; + continue; + } + dev->sriov.alias_guid.ports_guid[i].all_rec_per_port[j]. + ownership = MLX4_GUID_NONE_ASSIGN; + /*mark each val as it was deleted, + till the sysAdmin will give it valid val*/ + for (k = 0; k < NUM_ALIAS_GUID_IN_REC; k++) { + *(__be64 *)&dev->sriov.alias_guid.ports_guid[i]. + all_rec_per_port[j].all_recs[GUID_REC_SIZE * k] = + cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL); + } + } + INIT_LIST_HEAD(&dev->sriov.alias_guid.ports_guid[i].cb_list); + /*prepare the records, set them to be allocated by sm*/ + for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) + invalidate_guid_record(dev, i + 1, j); + + dev->sriov.alias_guid.ports_guid[i].parent = &dev->sriov.alias_guid; + dev->sriov.alias_guid.ports_guid[i].port = i; + if (mlx4_ib_sm_guid_assign) + set_all_slaves_guids(dev, i); + + snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i); + dev->sriov.alias_guid.ports_guid[i].wq = + create_singlethread_workqueue(alias_wq_name); + if (!dev->sriov.alias_guid.ports_guid[i].wq) { + ret = -ENOMEM; + goto err_thread; + } + INIT_DELAYED_WORK(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work, + alias_guid_work); + } + return 0; + +err_thread: + for (--i; i >= 0; i--) { + destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq); + dev->sriov.alias_guid.ports_guid[i].wq = NULL; + } + +err_unregister: + ib_sa_unregister_client(dev->sriov.alias_guid.sa_client); + kfree(dev->sriov.alias_guid.sa_client); + dev->sriov.alias_guid.sa_client = NULL; + pr_err("init_alias_guid_service: Failed. (ret:%d)\n", ret); + return ret; +} diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c new file mode 100644 index 00000000000..56a593e0ae5 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/cm.c @@ -0,0 +1,478 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/ib_mad.h> + +#include <linux/mlx4/cmd.h> +#include <linux/rbtree.h> +#include <linux/idr.h> +#include <rdma/ib_cm.h> + +#include "mlx4_ib.h" + +#define CM_CLEANUP_CACHE_TIMEOUT (5 * HZ) + +struct id_map_entry { + struct rb_node node; + + u32 sl_cm_id; + u32 pv_cm_id; + int slave_id; + int scheduled_delete; + struct mlx4_ib_dev *dev; + + struct list_head list; + struct delayed_work timeout; +}; + +struct cm_generic_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; +}; + +struct cm_sidr_generic_msg { + struct ib_mad_hdr hdr; + __be32 request_id; +}; + +struct cm_req_msg { + unsigned char unused[0x60]; + union ib_gid primary_path_sgid; +}; + + +static void set_local_comm_id(struct ib_mad *mad, u32 cm_id) +{ + if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + msg->request_id = cpu_to_be32(cm_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + pr_err("trying to set local_comm_id in SIDR_REP\n"); + return; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + msg->local_comm_id = cpu_to_be32(cm_id); + } +} + +static u32 get_local_comm_id(struct ib_mad *mad) +{ + if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + return be32_to_cpu(msg->request_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + pr_err("trying to set local_comm_id in SIDR_REP\n"); + return -1; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + return be32_to_cpu(msg->local_comm_id); + } +} + +static void set_remote_comm_id(struct ib_mad *mad, u32 cm_id) +{ + if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + msg->request_id = cpu_to_be32(cm_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + pr_err("trying to set remote_comm_id in SIDR_REQ\n"); + return; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + msg->remote_comm_id = cpu_to_be32(cm_id); + } +} + +static u32 get_remote_comm_id(struct ib_mad *mad) +{ + if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + return be32_to_cpu(msg->request_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + pr_err("trying to set remote_comm_id in SIDR_REQ\n"); + return -1; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + return be32_to_cpu(msg->remote_comm_id); + } +} + +static union ib_gid gid_from_req_msg(struct ib_device *ibdev, struct ib_mad *mad) +{ + struct cm_req_msg *msg = (struct cm_req_msg *)mad; + + return msg->primary_path_sgid; +} + +/* Lock should be taken before called */ +static struct id_map_entry * +id_map_find_by_sl_id(struct ib_device *ibdev, u32 slave_id, u32 sl_cm_id) +{ + struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map; + struct rb_node *node = sl_id_map->rb_node; + + while 
(node) { + struct id_map_entry *id_map_entry = + rb_entry(node, struct id_map_entry, node); + + if (id_map_entry->sl_cm_id > sl_cm_id) + node = node->rb_left; + else if (id_map_entry->sl_cm_id < sl_cm_id) + node = node->rb_right; + else if (id_map_entry->slave_id > slave_id) + node = node->rb_left; + else if (id_map_entry->slave_id < slave_id) + node = node->rb_right; + else + return id_map_entry; + } + return NULL; +} + +static void id_map_ent_timeout(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + struct id_map_entry *ent = container_of(delay, struct id_map_entry, timeout); + struct id_map_entry *db_ent, *found_ent; + struct mlx4_ib_dev *dev = ent->dev; + struct mlx4_ib_sriov *sriov = &dev->sriov; + struct rb_root *sl_id_map = &sriov->sl_id_map; + int pv_id = (int) ent->pv_cm_id; + + spin_lock(&sriov->id_map_lock); + db_ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_id); + if (!db_ent) + goto out; + found_ent = id_map_find_by_sl_id(&dev->ib_dev, ent->slave_id, ent->sl_cm_id); + if (found_ent && found_ent == ent) + rb_erase(&found_ent->node, sl_id_map); + idr_remove(&sriov->pv_id_table, pv_id); + +out: + list_del(&ent->list); + spin_unlock(&sriov->id_map_lock); + kfree(ent); +} + +static void id_map_find_del(struct ib_device *ibdev, int pv_cm_id) +{ + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + struct rb_root *sl_id_map = &sriov->sl_id_map; + struct id_map_entry *ent, *found_ent; + + spin_lock(&sriov->id_map_lock); + ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_cm_id); + if (!ent) + goto out; + found_ent = id_map_find_by_sl_id(ibdev, ent->slave_id, ent->sl_cm_id); + if (found_ent && found_ent == ent) + rb_erase(&found_ent->node, sl_id_map); + idr_remove(&sriov->pv_id_table, pv_cm_id); +out: + spin_unlock(&sriov->id_map_lock); +} + +static void sl_id_map_add(struct ib_device *ibdev, struct id_map_entry *new) +{ + struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map; + struct rb_node **link = &sl_id_map->rb_node, *parent = NULL; + struct id_map_entry *ent; + int slave_id = new->slave_id; + int sl_cm_id = new->sl_cm_id; + + ent = id_map_find_by_sl_id(ibdev, slave_id, sl_cm_id); + if (ent) { + pr_debug("overriding existing sl_id_map entry (cm_id = %x)\n", + sl_cm_id); + + rb_replace_node(&ent->node, &new->node, sl_id_map); + return; + } + + /* Go to the bottom of the tree */ + while (*link) { + parent = *link; + ent = rb_entry(parent, struct id_map_entry, node); + + if (ent->sl_cm_id > sl_cm_id || (ent->sl_cm_id == sl_cm_id && ent->slave_id > slave_id)) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, sl_id_map); +} + +static struct id_map_entry * +id_map_alloc(struct ib_device *ibdev, int slave_id, u32 sl_cm_id) +{ + int ret; + struct id_map_entry *ent; + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + + ent = kmalloc(sizeof (struct id_map_entry), GFP_KERNEL); + if (!ent) { + mlx4_ib_warn(ibdev, "Couldn't allocate id cache entry - out of memory\n"); + return ERR_PTR(-ENOMEM); + } + + ent->sl_cm_id = sl_cm_id; + ent->slave_id = slave_id; + ent->scheduled_delete = 0; + ent->dev = to_mdev(ibdev); + INIT_DELAYED_WORK(&ent->timeout, id_map_ent_timeout); + + idr_preload(GFP_KERNEL); + spin_lock(&to_mdev(ibdev)->sriov.id_map_lock); + + ret = idr_alloc_cyclic(&sriov->pv_id_table, ent, 0, 0, GFP_NOWAIT); + if (ret >= 0) { + ent->pv_cm_id = (u32)ret; + sl_id_map_add(ibdev, ent); + list_add_tail(&ent->list, 
&sriov->cm_list); + } + + spin_unlock(&sriov->id_map_lock); + idr_preload_end(); + + if (ret >= 0) + return ent; + + /*error flow*/ + kfree(ent); + mlx4_ib_warn(ibdev, "No more space in the idr (err:0x%x)\n", ret); + return ERR_PTR(-ENOMEM); +} + +static struct id_map_entry * +id_map_get(struct ib_device *ibdev, int *pv_cm_id, int sl_cm_id, int slave_id) +{ + struct id_map_entry *ent; + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + + spin_lock(&sriov->id_map_lock); + if (*pv_cm_id == -1) { + ent = id_map_find_by_sl_id(ibdev, sl_cm_id, slave_id); + if (ent) + *pv_cm_id = (int) ent->pv_cm_id; + } else + ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, *pv_cm_id); + spin_unlock(&sriov->id_map_lock); + + return ent; +} + +static void schedule_delayed(struct ib_device *ibdev, struct id_map_entry *id) +{ + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + unsigned long flags; + + spin_lock(&sriov->id_map_lock); + spin_lock_irqsave(&sriov->going_down_lock, flags); + /*make sure that there is no schedule inside the scheduled work.*/ + if (!sriov->is_going_down) { + id->scheduled_delete = 1; + schedule_delayed_work(&id->timeout, CM_CLEANUP_CACHE_TIMEOUT); + } + spin_unlock_irqrestore(&sriov->going_down_lock, flags); + spin_unlock(&sriov->id_map_lock); +} + +int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id, + struct ib_mad *mad) +{ + struct id_map_entry *id; + u32 sl_cm_id; + int pv_cm_id = -1; + + if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || + mad->mad_hdr.attr_id == CM_REP_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + sl_cm_id = get_local_comm_id(mad); + id = id_map_alloc(ibdev, slave_id, sl_cm_id); + if (IS_ERR(id)) { + mlx4_ib_warn(ibdev, "%s: id{slave: %d, sl_cm_id: 0x%x} Failed to id_map_alloc\n", + __func__, slave_id, sl_cm_id); + return PTR_ERR(id); + } + } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + return 0; + } else { + sl_cm_id = get_local_comm_id(mad); + id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id); + } + + if (!id) { + pr_debug("id{slave: %d, sl_cm_id: 0x%x} is NULL!\n", + slave_id, sl_cm_id); + return -EINVAL; + } + + set_local_comm_id(mad, id->pv_cm_id); + + if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID) + schedule_delayed(ibdev, id); + else if (mad->mad_hdr.attr_id == CM_DREP_ATTR_ID) + id_map_find_del(ibdev, pv_cm_id); + + return 0; +} + +int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, + struct ib_mad *mad) +{ + u32 pv_cm_id; + struct id_map_entry *id; + + if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + union ib_gid gid; + + if (!slave) + return 0; + + gid = gid_from_req_msg(ibdev, mad); + *slave = mlx4_ib_find_real_gid(ibdev, port, gid.global.interface_id); + if (*slave < 0) { + mlx4_ib_warn(ibdev, "failed matching slave_id by gid (0x%llx)\n", + gid.global.interface_id); + return -ENOENT; + } + return 0; + } + + pv_cm_id = get_remote_comm_id(mad); + id = id_map_get(ibdev, (int *)&pv_cm_id, -1, -1); + + if (!id) { + pr_debug("Couldn't find an entry for pv_cm_id 0x%x\n", pv_cm_id); + return -ENOENT; + } + + if (slave) + *slave = id->slave_id; + set_remote_comm_id(mad, id->sl_cm_id); + + if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID) + schedule_delayed(ibdev, id); + else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID || + mad->mad_hdr.attr_id == CM_DREP_ATTR_ID) { + id_map_find_del(ibdev, (int) pv_cm_id); + } + + return 0; +} + +void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev 
*dev) +{ + spin_lock_init(&dev->sriov.id_map_lock); + INIT_LIST_HEAD(&dev->sriov.cm_list); + dev->sriov.sl_id_map = RB_ROOT; + idr_init(&dev->sriov.pv_id_table); +} + +/* slave = -1 ==> all slaves */ +/* TBD -- call paravirt clean for single slave. Need for slave RESET event */ +void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave) +{ + struct mlx4_ib_sriov *sriov = &dev->sriov; + struct rb_root *sl_id_map = &sriov->sl_id_map; + struct list_head lh; + struct rb_node *nd; + int need_flush = 1; + struct id_map_entry *map, *tmp_map; + /* cancel all delayed work queue entries */ + INIT_LIST_HEAD(&lh); + spin_lock(&sriov->id_map_lock); + list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) { + if (slave < 0 || slave == map->slave_id) { + if (map->scheduled_delete) + need_flush &= !!cancel_delayed_work(&map->timeout); + } + } + + spin_unlock(&sriov->id_map_lock); + + if (!need_flush) + flush_scheduled_work(); /* make sure all timers were flushed */ + + /* now, remove all leftover entries from databases*/ + spin_lock(&sriov->id_map_lock); + if (slave < 0) { + while (rb_first(sl_id_map)) { + struct id_map_entry *ent = + rb_entry(rb_first(sl_id_map), + struct id_map_entry, node); + + rb_erase(&ent->node, sl_id_map); + idr_remove(&sriov->pv_id_table, (int) ent->pv_cm_id); + } + list_splice_init(&dev->sriov.cm_list, &lh); + } else { + /* first, move nodes belonging to slave to db remove list */ + nd = rb_first(sl_id_map); + while (nd) { + struct id_map_entry *ent = + rb_entry(nd, struct id_map_entry, node); + nd = rb_next(nd); + if (ent->slave_id == slave) + list_move_tail(&ent->list, &lh); + } + /* remove those nodes from databases */ + list_for_each_entry_safe(map, tmp_map, &lh, list) { + rb_erase(&map->node, sl_id_map); + idr_remove(&sriov->pv_id_table, (int) map->pv_cm_id); + } + + /* add remaining nodes from cm_list */ + list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) { + if (slave == map->slave_id) + list_move_tail(&map->list, &lh); + } + } + + spin_unlock(&sriov->id_map_lock); + + /* free any map entries left behind due to cancel_delayed_work above */ + list_for_each_entry_safe(map, tmp_map, &lh, list) { + list_del(&map->list); + kfree(map); + } +} diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 6d4ef71cbcd..1066eec854a 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -33,6 +33,7 @@ #include <linux/mlx4/cq.h> #include <linux/mlx4/qp.h> +#include <linux/mlx4/srq.h> #include <linux/slab.h> #include "mlx4_ib.h" @@ -66,7 +67,7 @@ static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type) static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n) { - return mlx4_buf_offset(&buf->buf, n * sizeof (struct mlx4_cqe)); + return mlx4_buf_offset(&buf->buf, n * buf->entry_size); } static void *get_cqe(struct mlx4_ib_cq *cq, int n) @@ -77,8 +78,9 @@ static void *get_cqe(struct mlx4_ib_cq *cq, int n) static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n) { struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe); + struct mlx4_cqe *tcqe = ((cq->buf.entry_size == 64) ? (cqe + 1) : cqe); - return (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ + return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1))) ? 
NULL : cqe; } @@ -99,18 +101,19 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf * { int err; - err = mlx4_buf_alloc(dev->dev, nent * sizeof(struct mlx4_cqe), - PAGE_SIZE * 2, &buf->buf); + err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size, + PAGE_SIZE * 2, &buf->buf, GFP_KERNEL); if (err) goto out; + buf->entry_size = dev->dev->caps.cqe_size; err = mlx4_mtt_init(dev->dev, buf->buf.npages, buf->buf.page_shift, &buf->mtt); if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf); + err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf, GFP_KERNEL); if (err) goto err_mtt; @@ -120,8 +123,7 @@ err_mtt: mlx4_mtt_cleanup(dev->dev, &buf->mtt); err_buf: - mlx4_buf_free(dev->dev, nent * sizeof(struct mlx4_cqe), - &buf->buf); + mlx4_buf_free(dev->dev, nent * buf->entry_size, &buf->buf); out: return err; @@ -129,7 +131,7 @@ out: static void mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int cqe) { - mlx4_buf_free(dev->dev, (cqe + 1) * sizeof(struct mlx4_cqe), &buf->buf); + mlx4_buf_free(dev->dev, (cqe + 1) * buf->entry_size, &buf->buf); } static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *context, @@ -137,8 +139,9 @@ static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *cont u64 buf_addr, int cqe) { int err; + int cqe_size = dev->dev->caps.cqe_size; - *umem = ib_umem_get(context, buf_addr, cqe * sizeof (struct mlx4_cqe), + *umem = ib_umem_get(context, buf_addr, cqe * cqe_size, IB_ACCESS_LOCAL_WRITE, 1); if (IS_ERR(*umem)) return PTR_ERR(*umem); @@ -206,7 +209,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector uar = &to_mucontext(context)->uar; } else { - err = mlx4_db_alloc(dev->dev, &cq->db, 1); + err = mlx4_db_alloc(dev->dev, &cq->db, 1, GFP_KERNEL); if (err) goto err_cq; @@ -226,7 +229,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector vector = dev->eq_table[vector % ibdev->num_comp_vectors]; err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar, - cq->db.dma, &cq->mcq, vector, 0); + cq->db.dma, &cq->mcq, vector, 0, 0); if (err) goto err_dbmap; @@ -321,7 +324,7 @@ static int mlx4_ib_get_outstanding_cqes(struct mlx4_ib_cq *cq) u32 i; i = cq->mcq.cons_index; - while (get_sw_cqe(cq, i & cq->ibcq.cqe)) + while (get_sw_cqe(cq, i)) ++i; return i - cq->mcq.cons_index; @@ -331,16 +334,23 @@ static void mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq) { struct mlx4_cqe *cqe, *new_cqe; int i; + int cqe_size = cq->buf.entry_size; + int cqe_inc = cqe_size == 64 ? 1 : 0; i = cq->mcq.cons_index; cqe = get_cqe(cq, i & cq->ibcq.cqe); + cqe += cqe_inc; + while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) { new_cqe = get_cqe_from_buf(&cq->resize_buf->buf, (i + 1) & cq->resize_buf->cqe); - memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), sizeof(struct mlx4_cqe)); + memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), cqe_size); + new_cqe += cqe_inc; + new_cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) | (((i + 1) & (cq->resize_buf->cqe + 1)) ? 
MLX4_CQE_OWNER_MASK : 0); cqe = get_cqe(cq, ++i & cq->ibcq.cqe); + cqe += cqe_inc; } ++cq->mcq.cons_index; } @@ -355,7 +365,7 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) mutex_lock(&cq->resize_mutex); - if (entries < 1 || entries > dev->dev->caps.max_cqes) { + if (entries < 1) { err = -EINVAL; goto out; } @@ -366,6 +376,11 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) goto out; } + if (entries > dev->dev->caps.max_cqes) { + err = -EINVAL; + goto out; + } + if (ibcq->uobject) { err = mlx4_alloc_resize_umem(dev, cq, entries, udata); if (err) @@ -438,6 +453,7 @@ err_buf: out: mutex_unlock(&cq->resize_mutex); + return err; } @@ -547,6 +563,34 @@ static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 checksum) checksum == cpu_to_be16(0xffff); } +static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc, + unsigned tail, struct mlx4_cqe *cqe, int is_eth) +{ + struct mlx4_ib_proxy_sqp_hdr *hdr; + + ib_dma_sync_single_for_cpu(qp->ibqp.device, + qp->sqp_proxy_rcv[tail].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + hdr = (struct mlx4_ib_proxy_sqp_hdr *) (qp->sqp_proxy_rcv[tail].addr); + wc->pkey_index = be16_to_cpu(hdr->tun.pkey_index); + wc->src_qp = be32_to_cpu(hdr->tun.flags_src_qp) & 0xFFFFFF; + wc->wc_flags |= (hdr->tun.g_ml_path & 0x80) ? (IB_WC_GRH) : 0; + wc->dlid_path_bits = 0; + + if (is_eth) { + wc->vlan_id = be16_to_cpu(hdr->tun.sl_vid); + memcpy(&(wc->smac[0]), (char *)&hdr->tun.mac_31_0, 4); + memcpy(&(wc->smac[4]), (char *)&hdr->tun.slid_mac_47_32, 2); + wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); + } else { + wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32); + wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12); + } + + return 0; +} + static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, struct mlx4_ib_qp **cur_qp, struct ib_wc *wc) @@ -555,16 +599,22 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, struct mlx4_qp *mqp; struct mlx4_ib_wq *wq; struct mlx4_ib_srq *srq; + struct mlx4_srq *msrq = NULL; int is_send; int is_error; + int is_eth; u32 g_mlpath_rqpn; u16 wqe_ctr; + unsigned tail = 0; repoll: cqe = next_cqe_sw(cq); if (!cqe) return -EAGAIN; + if (cq->buf.entry_size == 64) + cqe++; + ++cq->mcq.cons_index; /* @@ -619,6 +669,20 @@ repoll: wc->qp = &(*cur_qp)->ibqp; + if (wc->qp->qp_type == IB_QPT_XRC_TGT) { + u32 srq_num; + g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn); + srq_num = g_mlpath_rqpn & 0xffffff; + /* SRQ is also in the radix tree */ + msrq = mlx4_srq_lookup(to_mdev(cq->ibcq.device)->dev, + srq_num); + if (unlikely(!msrq)) { + pr_warn("CQ %06x with entry for unknown SRQN %06x\n", + cq->mcq.cqn, srq_num); + return -EINVAL; + } + } + if (is_send) { wq = &(*cur_qp)->sq; if (!(*cur_qp)->sq_signal_bits) { @@ -632,9 +696,15 @@ repoll: wqe_ctr = be16_to_cpu(cqe->wqe_index); wc->wr_id = srq->wrid[wqe_ctr]; mlx4_ib_free_srq_wqe(srq, wqe_ctr); + } else if (msrq) { + srq = to_mibsrq(msrq); + wqe_ctr = be16_to_cpu(cqe->wqe_index); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx4_ib_free_srq_wqe(srq, wqe_ctr); } else { wq = &(*cur_qp)->rq; - wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + tail = wq->tail & (wq->wqe_cnt - 1); + wc->wr_id = wq->wrid[tail]; ++wq->tail; } @@ -717,6 +787,17 @@ repoll: break; } + is_eth = (rdma_port_get_link_layer(wc->qp->device, + (*cur_qp)->port) == + IB_LINK_LAYER_ETHERNET); + if (mlx4_is_mfunc(to_mdev(cq->ibcq.device)->dev)) { + if ((*cur_qp)->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI 
| MLX4_IB_QPT_PROXY_GSI)) + return use_tunnel_data(*cur_qp, cq, wc, tail, + cqe, is_eth); + } + wc->slid = be16_to_cpu(cqe->rlid); g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn); wc->src_qp = g_mlpath_rqpn & 0xffffff; @@ -725,11 +806,21 @@ repoll: wc->pkey_index = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f; wc->wc_flags |= mlx4_ib_ipoib_csum_ok(cqe->status, cqe->checksum) ? IB_WC_IP_CSUM_OK : 0; - if (rdma_port_get_link_layer(wc->qp->device, - (*cur_qp)->port) == IB_LINK_LAYER_ETHERNET) + if (is_eth) { wc->sl = be16_to_cpu(cqe->sl_vid) >> 13; - else + if (be32_to_cpu(cqe->vlan_my_qpn) & + MLX4_CQE_VLAN_PRESENT_MASK) { + wc->vlan_id = be16_to_cpu(cqe->sl_vid) & + MLX4_CQE_VID_MASK; + } else { + wc->vlan_id = 0xffff; + } + memcpy(wc->smac, cqe->smac, ETH_ALEN); + wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); + } else { wc->sl = be16_to_cpu(cqe->sl_vid) >> 12; + wc->vlan_id = 0xffff; + } } return 0; @@ -778,6 +869,7 @@ void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) int nfreed = 0; struct mlx4_cqe *cqe, *dest; u8 owner_bit; + int cqe_inc = cq->buf.entry_size == 64 ? 1 : 0; /* * First we need to find the current producer index, so we @@ -796,12 +888,16 @@ void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) */ while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); + cqe += cqe_inc; + if ((be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) { if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index)); ++nfreed; } else if (nfreed) { dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe); + dest += cqe_inc; + owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK; memcpy(dest, cqe, sizeof *cqe); dest->owner_sr_opcode = owner_bit | diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c index 8aee4233b38..c5174098636 100644 --- a/drivers/infiniband/hw/mlx4/doorbell.c +++ b/drivers/infiniband/hw/mlx4/doorbell.c @@ -45,7 +45,6 @@ int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, struct mlx4_db *db) { struct mlx4_ib_user_db_page *page; - struct ib_umem_chunk *chunk; int err = 0; mutex_lock(&context->db_page_mutex); @@ -73,8 +72,7 @@ int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, list_add(&page->list, &context->db_page_list); found: - chunk = list_entry(page->umem->chunk_list.next, struct ib_umem_chunk, list); - db->dma = sg_dma_address(chunk->page_list) + (virt & ~PAGE_MASK); + db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK); db->u.user_page = page; ++page->refcnt; diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 259b0670b51..287ad0564ac 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -32,7 +32,10 @@ #include <rdma/ib_mad.h> #include <rdma/ib_smi.h> +#include <rdma/ib_sa.h> +#include <rdma/ib_cache.h> +#include <linux/random.h> #include <linux/mlx4/cmd.h> #include <linux/gfp.h> #include <rdma/ib_pma.h> @@ -44,7 +47,62 @@ enum { MLX4_IB_VENDOR_CLASS2 = 0xa }; -int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey, +#define MLX4_TUN_SEND_WRID_SHIFT 34 +#define MLX4_TUN_QPN_SHIFT 32 +#define MLX4_TUN_WRID_RECV (((u64) 1) << MLX4_TUN_SEND_WRID_SHIFT) +#define MLX4_TUN_SET_WRID_QPN(a) (((u64) ((a) & 0x3)) << MLX4_TUN_QPN_SHIFT) + +#define MLX4_TUN_IS_RECV(a) (((a) >> MLX4_TUN_SEND_WRID_SHIFT) & 0x1) +#define 
MLX4_TUN_WRID_QPN(a) (((a) >> MLX4_TUN_QPN_SHIFT) & 0x3) + + /* Port mgmt change event handling */ + +#define GET_BLK_PTR_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.block_ptr) +#define GET_MASK_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.tbl_entries_mask) +#define NUM_IDX_IN_PKEY_TBL_BLK 32 +#define GUID_TBL_ENTRY_SIZE 8 /* size in bytes */ +#define GUID_TBL_BLK_NUM_ENTRIES 8 +#define GUID_TBL_BLK_SIZE (GUID_TBL_ENTRY_SIZE * GUID_TBL_BLK_NUM_ENTRIES) + +struct mlx4_mad_rcv_buf { + struct ib_grh grh; + u8 payload[256]; +} __packed; + +struct mlx4_mad_snd_buf { + u8 payload[256]; +} __packed; + +struct mlx4_tunnel_mad { + struct ib_grh grh; + struct mlx4_ib_tunnel_header hdr; + struct ib_mad mad; +} __packed; + +struct mlx4_rcv_tunnel_mad { + struct mlx4_rcv_tunnel_hdr hdr; + struct ib_grh grh; + struct ib_mad mad; +} __packed; + +static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num); +static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num); +static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, + int block, u32 change_bitmap); + +__be64 mlx4_ib_gen_node_guid(void) +{ +#define NODE_GUID_HI ((u64) (((u64)IB_OPENIB_OUI) << 40)) + return cpu_to_be64(NODE_GUID_HI | prandom_u32()); +} + +__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx) +{ + return cpu_to_be64(atomic_inc_return(&ctx->tid)) | + cpu_to_be64(0xff00000000000000LL); +} + +int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, int port, struct ib_wc *in_wc, struct ib_grh *in_grh, void *in_mad, void *response_mad) { @@ -71,10 +129,13 @@ int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey, * Key check traps can't be generated unless we have in_wc to * tell us where to send the trap. */ - if (ignore_mkey || !in_wc) + if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_MKEY) || !in_wc) op_modifier |= 0x1; - if (ignore_bkey || !in_wc) + if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_BKEY) || !in_wc) op_modifier |= 0x2; + if (mlx4_is_mfunc(dev->dev) && + (mad_ifc_flags & MLX4_MAD_IFC_NET_VIEW || in_wc)) + op_modifier |= 0x8; if (in_wc) { struct { @@ -107,10 +168,10 @@ int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey, in_modifier |= in_wc->slid << 16; } - err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, - in_modifier, op_modifier, + err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, in_modifier, + mlx4_is_master(dev->dev) ? (op_modifier & ~0x8) : op_modifier, MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, - MLX4_CMD_NATIVE); + (op_modifier & 0x8) ? MLX4_CMD_NATIVE : MLX4_CMD_WRAPPED); if (!err) memcpy(response_mad, outmailbox->buf, 256); @@ -125,6 +186,7 @@ static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl) { struct ib_ah *new_ah; struct ib_ah_attr ah_attr; + unsigned long flags; if (!dev->send_agent[port_num - 1][0]) return; @@ -139,53 +201,134 @@ static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl) if (IS_ERR(new_ah)) return; - spin_lock(&dev->sm_lock); + spin_lock_irqsave(&dev->sm_lock, flags); if (dev->sm_ah[port_num - 1]) ib_destroy_ah(dev->sm_ah[port_num - 1]); dev->sm_ah[port_num - 1] = new_ah; - spin_unlock(&dev->sm_lock); + spin_unlock_irqrestore(&dev->sm_lock, flags); } /* - * Snoop SM MADs for port info and P_Key table sets, so we can - * synthesize LID change and P_Key change events. 
+ * Snoop SM MADs for port info, GUID info, and P_Key table sets, so we can + * synthesize LID change, Client-Rereg, GID change, and P_Key change events. */ static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad, - u16 prev_lid) + u16 prev_lid) { - struct ib_event event; + struct ib_port_info *pinfo; + u16 lid; + __be16 *base; + u32 bn, pkey_change_bitmap; + int i; + + struct mlx4_ib_dev *dev = to_mdev(ibdev); if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && - mad->mad_hdr.method == IB_MGMT_METHOD_SET) { - if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) { - struct ib_port_info *pinfo = - (struct ib_port_info *) ((struct ib_smp *) mad)->data; - u16 lid = be16_to_cpu(pinfo->lid); + mad->mad_hdr.method == IB_MGMT_METHOD_SET) + switch (mad->mad_hdr.attr_id) { + case IB_SMP_ATTR_PORT_INFO: + pinfo = (struct ib_port_info *) ((struct ib_smp *) mad)->data; + lid = be16_to_cpu(pinfo->lid); - update_sm_ah(to_mdev(ibdev), port_num, + update_sm_ah(dev, port_num, be16_to_cpu(pinfo->sm_lid), pinfo->neighbormtu_mastersmsl & 0xf); - event.device = ibdev; - event.element.port_num = port_num; + if (pinfo->clientrereg_resv_subnetto & 0x80) + handle_client_rereg_event(dev, port_num); + + if (prev_lid != lid) + handle_lid_change_event(dev, port_num); + break; + + case IB_SMP_ATTR_PKEY_TABLE: + if (!mlx4_is_mfunc(dev->dev)) { + mlx4_ib_dispatch_event(dev, port_num, + IB_EVENT_PKEY_CHANGE); + break; + } + + /* at this point, we are running in the master. + * Slaves do not receive SMPs. + */ + bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod) & 0xFFFF; + base = (__be16 *) &(((struct ib_smp *)mad)->data[0]); + pkey_change_bitmap = 0; + for (i = 0; i < 32; i++) { + pr_debug("PKEY[%d] = x%x\n", + i + bn*32, be16_to_cpu(base[i])); + if (be16_to_cpu(base[i]) != + dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32]) { + pkey_change_bitmap |= (1 << i); + dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32] = + be16_to_cpu(base[i]); + } + } + pr_debug("PKEY Change event: port=%d, " + "block=0x%x, change_bitmap=0x%x\n", + port_num, bn, pkey_change_bitmap); - if (pinfo->clientrereg_resv_subnetto & 0x80) { - event.event = IB_EVENT_CLIENT_REREGISTER; - ib_dispatch_event(&event); + if (pkey_change_bitmap) { + mlx4_ib_dispatch_event(dev, port_num, + IB_EVENT_PKEY_CHANGE); + if (!dev->sriov.is_going_down) + __propagate_pkey_ev(dev, port_num, bn, + pkey_change_bitmap); } + break; - if (prev_lid != lid) { - event.event = IB_EVENT_LID_CHANGE; - ib_dispatch_event(&event); + case IB_SMP_ATTR_GUID_INFO: + /* paravirtualized master's guid is guid 0 -- does not change */ + if (!mlx4_is_master(dev->dev)) + mlx4_ib_dispatch_event(dev, port_num, + IB_EVENT_GID_CHANGE); + /*if master, notify relevant slaves*/ + if (mlx4_is_master(dev->dev) && + !dev->sriov.is_going_down) { + bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod); + mlx4_ib_update_cache_on_guid_change(dev, bn, port_num, + (u8 *)(&((struct ib_smp *)mad)->data)); + mlx4_ib_notify_slaves_on_guid_change(dev, bn, port_num, + (u8 *)(&((struct ib_smp *)mad)->data)); } + break; + + default: + break; } +} + +static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, + int block, u32 change_bitmap) +{ + int i, ix, slave, err; + int have_event = 0; - if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PKEY_TABLE) { - event.device = ibdev; - event.event = IB_EVENT_PKEY_CHANGE; - event.element.port_num = port_num; - ib_dispatch_event(&event); + for (slave = 0; slave < 
dev->dev->caps.sqp_demux; slave++) { + if (slave == mlx4_master_func_num(dev->dev)) + continue; + if (!mlx4_is_slave_active(dev->dev, slave)) + continue; + + have_event = 0; + for (i = 0; i < 32; i++) { + if (!(change_bitmap & (1 << i))) + continue; + for (ix = 0; + ix < dev->dev->caps.pkey_table_len[port_num]; ix++) { + if (dev->pkeys.virt2phys_pkey[slave][port_num - 1] + [ix] == i + 32 * block) { + err = mlx4_gen_pkey_eqe(dev->dev, slave, port_num); + pr_debug("propagate_pkey_ev: slave %d," + " port %d, ix %d (%d)\n", + slave, port_num, ix, err); + have_event = 1; + break; + } + } + if (have_event) + break; } } } @@ -193,13 +336,15 @@ static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad, static void node_desc_override(struct ib_device *dev, struct ib_mad *mad) { + unsigned long flags; + if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP && mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) { - spin_lock(&to_mdev(dev)->sm_lock); + spin_lock_irqsave(&to_mdev(dev)->sm_lock, flags); memcpy(((struct ib_smp *) mad)->data, dev->node_desc, 64); - spin_unlock(&to_mdev(dev)->sm_lock); + spin_unlock_irqrestore(&to_mdev(dev)->sm_lock, flags); } } @@ -209,6 +354,7 @@ static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, struct ib_mad *ma struct ib_mad_send_buf *send_buf; struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn]; int ret; + unsigned long flags; if (agent) { send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR, @@ -221,19 +367,352 @@ static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, struct ib_mad *ma * wrong following the IB spec strictly, but we know * it's OK for our devices). 
*/ - spin_lock(&dev->sm_lock); + spin_lock_irqsave(&dev->sm_lock, flags); memcpy(send_buf->mad, mad, sizeof *mad); if ((send_buf->ah = dev->sm_ah[port_num - 1])) ret = ib_post_send_mad(send_buf, NULL); else ret = -EINVAL; - spin_unlock(&dev->sm_lock); + spin_unlock_irqrestore(&dev->sm_lock, flags); if (ret) ib_free_send_mad(send_buf); } } +static int mlx4_ib_demux_sa_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *sa_mad) +{ + int ret = 0; + + /* dispatch to different sa handlers */ + switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) { + case IB_SA_ATTR_MC_MEMBER_REC: + ret = mlx4_ib_mcg_demux_handler(ibdev, port, slave, sa_mad); + break; + default: + break; + } + return ret; +} + +int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int i; + + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + if (dev->sriov.demux[port - 1].guid_cache[i] == guid) + return i; + } + return -1; +} + + +static int find_slave_port_pkey_ix(struct mlx4_ib_dev *dev, int slave, + u8 port, u16 pkey, u16 *ix) +{ + int i, ret; + u8 unassigned_pkey_ix, pkey_ix, partial_ix = 0xFF; + u16 slot_pkey; + + if (slave == mlx4_master_func_num(dev->dev)) + return ib_find_cached_pkey(&dev->ib_dev, port, pkey, ix); + + unassigned_pkey_ix = dev->dev->phys_caps.pkey_phys_table_len[port] - 1; + + for (i = 0; i < dev->dev->caps.pkey_table_len[port]; i++) { + if (dev->pkeys.virt2phys_pkey[slave][port - 1][i] == unassigned_pkey_ix) + continue; + + pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][i]; + + ret = ib_get_cached_pkey(&dev->ib_dev, port, pkey_ix, &slot_pkey); + if (ret) + continue; + if ((slot_pkey & 0x7FFF) == (pkey & 0x7FFF)) { + if (slot_pkey & 0x8000) { + *ix = (u16) pkey_ix; + return 0; + } else { + /* take first partial pkey index found */ + if (partial_ix == 0xFF) + partial_ix = pkey_ix; + } + } + } + + if (partial_ix < 0xFF) { + *ix = (u16) partial_ix; + return 0; + } + + return -EINVAL; +} + +int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type dest_qpt, struct ib_wc *wc, + struct ib_grh *grh, struct ib_mad *mad) +{ + struct ib_sge list; + struct ib_send_wr wr, *bad_wr; + struct mlx4_ib_demux_pv_ctx *tun_ctx; + struct mlx4_ib_demux_pv_qp *tun_qp; + struct mlx4_rcv_tunnel_mad *tun_mad; + struct ib_ah_attr attr; + struct ib_ah *ah; + struct ib_qp *src_qp = NULL; + unsigned tun_tx_ix = 0; + int dqpn; + int ret = 0; + u16 tun_pkey_ix; + u16 cached_pkey; + u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; + + if (dest_qpt > IB_QPT_GSI) + return -EINVAL; + + tun_ctx = dev->sriov.demux[port-1].tun[slave]; + + /* check if proxy qp created */ + if (!tun_ctx || tun_ctx->state != DEMUX_PV_STATE_ACTIVE) + return -EAGAIN; + + if (!dest_qpt) + tun_qp = &tun_ctx->qp[0]; + else + tun_qp = &tun_ctx->qp[1]; + + /* compute P_Key index to put in tunnel header for slave */ + if (dest_qpt) { + u16 pkey_ix; + ret = ib_get_cached_pkey(&dev->ib_dev, port, wc->pkey_index, &cached_pkey); + if (ret) + return -EINVAL; + + ret = find_slave_port_pkey_ix(dev, slave, port, cached_pkey, &pkey_ix); + if (ret) + return -EINVAL; + tun_pkey_ix = pkey_ix; + } else + tun_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0]; + + dqpn = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave + port + (dest_qpt * 2) - 1; + + /* get tunnel tx data buf for slave */ + src_qp = tun_qp->qp; + + /* create ah. Just need an empty one with the port num for the post send. 
+ * The driver will set the force loopback bit in post_send */ + memset(&attr, 0, sizeof attr); + attr.port_num = port; + if (is_eth) { + memcpy(&attr.grh.dgid.raw[0], &grh->dgid.raw[0], 16); + attr.ah_flags = IB_AH_GRH; + } + ah = ib_create_ah(tun_ctx->pd, &attr); + if (IS_ERR(ah)) + return -ENOMEM; + + /* allocate tunnel tx buf after pass failure returns */ + spin_lock(&tun_qp->tx_lock); + if (tun_qp->tx_ix_head - tun_qp->tx_ix_tail >= + (MLX4_NUM_TUNNEL_BUFS - 1)) + ret = -EAGAIN; + else + tun_tx_ix = (++tun_qp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1); + spin_unlock(&tun_qp->tx_lock); + if (ret) + goto out; + + tun_mad = (struct mlx4_rcv_tunnel_mad *) (tun_qp->tx_ring[tun_tx_ix].buf.addr); + if (tun_qp->tx_ring[tun_tx_ix].ah) + ib_destroy_ah(tun_qp->tx_ring[tun_tx_ix].ah); + tun_qp->tx_ring[tun_tx_ix].ah = ah; + ib_dma_sync_single_for_cpu(&dev->ib_dev, + tun_qp->tx_ring[tun_tx_ix].buf.map, + sizeof (struct mlx4_rcv_tunnel_mad), + DMA_TO_DEVICE); + + /* copy over to tunnel buffer */ + if (grh) + memcpy(&tun_mad->grh, grh, sizeof *grh); + memcpy(&tun_mad->mad, mad, sizeof *mad); + + /* adjust tunnel data */ + tun_mad->hdr.pkey_index = cpu_to_be16(tun_pkey_ix); + tun_mad->hdr.flags_src_qp = cpu_to_be32(wc->src_qp & 0xFFFFFF); + tun_mad->hdr.g_ml_path = (grh && (wc->wc_flags & IB_WC_GRH)) ? 0x80 : 0; + + if (is_eth) { + u16 vlan = 0; + if (mlx4_get_slave_default_vlan(dev->dev, port, slave, &vlan, + NULL)) { + /* VST mode */ + if (vlan != wc->vlan_id) + /* Packet vlan is not the VST-assigned vlan. + * Drop the packet. + */ + goto out; + else + /* Remove the vlan tag before forwarding + * the packet to the VF. + */ + vlan = 0xffff; + } else { + vlan = wc->vlan_id; + } + + tun_mad->hdr.sl_vid = cpu_to_be16(vlan); + memcpy((char *)&tun_mad->hdr.mac_31_0, &(wc->smac[0]), 4); + memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2); + } else { + tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); + tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid); + } + + ib_dma_sync_single_for_device(&dev->ib_dev, + tun_qp->tx_ring[tun_tx_ix].buf.map, + sizeof (struct mlx4_rcv_tunnel_mad), + DMA_TO_DEVICE); + + list.addr = tun_qp->tx_ring[tun_tx_ix].buf.map; + list.length = sizeof (struct mlx4_rcv_tunnel_mad); + list.lkey = tun_ctx->mr->lkey; + + wr.wr.ud.ah = ah; + wr.wr.ud.port_num = port; + wr.wr.ud.remote_qkey = IB_QP_SET_QKEY; + wr.wr.ud.remote_qpn = dqpn; + wr.next = NULL; + wr.wr_id = ((u64) tun_tx_ix) | MLX4_TUN_SET_WRID_QPN(dest_qpt); + wr.sg_list = &list; + wr.num_sge = 1; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + + ret = ib_post_send(src_qp, &wr, &bad_wr); +out: + if (ret) + ib_destroy_ah(ah); + return ret; +} + +static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port, + struct ib_wc *wc, struct ib_grh *grh, + struct ib_mad *mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int err; + int slave; + u8 *slave_id; + int is_eth = 0; + + if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) + is_eth = 0; + else + is_eth = 1; + + if (is_eth) { + if (!(wc->wc_flags & IB_WC_GRH)) { + mlx4_ib_warn(ibdev, "RoCE grh not present.\n"); + return -EINVAL; + } + if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_CM) { + mlx4_ib_warn(ibdev, "RoCE mgmt class is not CM\n"); + return -EINVAL; + } + if (mlx4_get_slave_from_roce_gid(dev->dev, port, grh->dgid.raw, &slave)) { + mlx4_ib_warn(ibdev, "failed matching grh\n"); + return -ENOENT; + } + if (slave >= dev->dev->caps.sqp_demux) { + mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n", + slave, 
dev->dev->caps.sqp_demux); + return -ENOENT; + } + + if (mlx4_ib_demux_cm_handler(ibdev, port, NULL, mad)) + return 0; + + err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad); + if (err) + pr_debug("failed sending to slave %d via tunnel qp (%d)\n", + slave, err); + return 0; + } + + /* Initially assume that this mad is for us */ + slave = mlx4_master_func_num(dev->dev); + + /* See if the slave id is encoded in a response mad */ + if (mad->mad_hdr.method & 0x80) { + slave_id = (u8 *) &mad->mad_hdr.tid; + slave = *slave_id; + if (slave != 255) /*255 indicates the dom0*/ + *slave_id = 0; /* remap tid */ + } + + /* If a grh is present, we demux according to it */ + if (wc->wc_flags & IB_WC_GRH) { + slave = mlx4_ib_find_real_gid(ibdev, port, grh->dgid.global.interface_id); + if (slave < 0) { + mlx4_ib_warn(ibdev, "failed matching grh\n"); + return -ENOENT; + } + } + /* Class-specific handling */ + switch (mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + /* 255 indicates the dom0 */ + if (slave != 255 && slave != mlx4_master_func_num(dev->dev)) { + if (!mlx4_vf_smi_enabled(dev->dev, slave, port)) + return -EPERM; + /* for a VF. drop unsolicited MADs */ + if (!(mad->mad_hdr.method & IB_MGMT_METHOD_RESP)) { + mlx4_ib_warn(ibdev, "demux QP0. rejecting unsolicited mad for slave %d class 0x%x, method 0x%x\n", + slave, mad->mad_hdr.mgmt_class, + mad->mad_hdr.method); + return -EINVAL; + } + } + break; + case IB_MGMT_CLASS_SUBN_ADM: + if (mlx4_ib_demux_sa_handler(ibdev, port, slave, + (struct ib_sa_mad *) mad)) + return 0; + break; + case IB_MGMT_CLASS_CM: + if (mlx4_ib_demux_cm_handler(ibdev, port, &slave, mad)) + return 0; + break; + case IB_MGMT_CLASS_DEVICE_MGMT: + if (mad->mad_hdr.method != IB_MGMT_METHOD_GET_RESP) + return 0; + break; + default: + /* Drop unsupported classes for slaves in tunnel mode */ + if (slave != mlx4_master_func_num(dev->dev)) { + pr_debug("dropping unsupported ingress mad from class:%d " + "for slave:%d\n", mad->mad_hdr.mgmt_class, slave); + return 0; + } + } + /*make sure that no slave==255 was not handled yet.*/ + if (slave >= dev->dev->caps.sqp_demux) { + mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n", + slave, dev->dev->caps.sqp_demux); + return -ENOENT; + } + + err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad); + if (err) + pr_debug("failed sending to slave %d via tunnel qp (%d)\n", + slave, err); + return 0; +} + static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh, struct ib_mad *in_mad, struct ib_mad *out_mad) @@ -242,6 +721,25 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, int err; struct ib_port_attr pattr; + if (in_wc && in_wc->qp->qp_num) { + pr_debug("received MAD: slid:%d sqpn:%d " + "dlid_bits:%d dqpn:%d wc_flags:0x%x, cls %x, mtd %x, atr %x\n", + in_wc->slid, in_wc->src_qp, + in_wc->dlid_path_bits, + in_wc->qp->qp_num, + in_wc->wc_flags, + in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method, + be16_to_cpu(in_mad->mad_hdr.attr_id)); + if (in_wc->wc_flags & IB_WC_GRH) { + pr_debug("sgid_hi:0x%016llx sgid_lo:0x%016llx\n", + be64_to_cpu(in_grh->sgid.global.subnet_prefix), + be64_to_cpu(in_grh->sgid.global.interface_id)); + pr_debug("dgid_hi:0x%016llx dgid_lo:0x%016llx\n", + be64_to_cpu(in_grh->dgid.global.subnet_prefix), + be64_to_cpu(in_grh->dgid.global.interface_id)); + } + } + slid = in_wc ? 
in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) { @@ -279,15 +777,19 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, prev_lid = pattr.lid; err = mlx4_MAD_IFC(to_mdev(ibdev), - mad_flags & IB_MAD_IGNORE_MKEY, - mad_flags & IB_MAD_IGNORE_BKEY, + (mad_flags & IB_MAD_IGNORE_MKEY ? MLX4_MAD_IFC_IGNORE_MKEY : 0) | + (mad_flags & IB_MAD_IGNORE_BKEY ? MLX4_MAD_IFC_IGNORE_BKEY : 0) | + MLX4_MAD_IFC_NET_VIEW, port_num, in_wc, in_grh, in_mad, out_mad); if (err) return IB_MAD_RESULT_FAILURE; if (!out_mad->mad_hdr.status) { - smp_snoop(ibdev, port_num, in_mad, prev_lid); - node_desc_override(ibdev, out_mad); + if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV)) + smp_snoop(ibdev, port_num, in_mad, prev_lid); + /* slaves get node desc from FW */ + if (!mlx4_is_slave(to_mdev(ibdev)->dev)) + node_desc_override(ibdev, out_mad); } /* set return bit in status of directed route responses */ @@ -370,6 +872,8 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *mad_send_wc) { + if (mad_send_wc->send_buf->context[0]) + ib_destroy_ah(mad_send_wc->send_buf->context[0]); ib_free_send_mad(mad_send_wc->send_buf); } @@ -427,3 +931,1233 @@ void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev) ib_destroy_ah(dev->sm_ah[p]); } } + +static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num) +{ + mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_LID_CHANGE); + + if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) + mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num, + MLX4_EQ_PORT_INFO_LID_CHANGE_MASK); +} + +static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num) +{ + /* re-configure the alias-guid and mcg's */ + if (mlx4_is_master(dev->dev)) { + mlx4_ib_invalidate_all_guid_record(dev, port_num); + + if (!dev->sriov.is_going_down) { + mlx4_ib_mcg_port_cleanup(&dev->sriov.demux[port_num - 1], 0); + mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num, + MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK); + } + } + mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_CLIENT_REREGISTER); +} + +static void propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, + struct mlx4_eqe *eqe) +{ + __propagate_pkey_ev(dev, port_num, GET_BLK_PTR_FROM_EQE(eqe), + GET_MASK_FROM_EQE(eqe)); +} + +static void handle_slaves_guid_change(struct mlx4_ib_dev *dev, u8 port_num, + u32 guid_tbl_blk_num, u32 change_bitmap) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + u16 i; + + if (!mlx4_is_mfunc(dev->dev) || !mlx4_is_master(dev->dev)) + return; + + in_mad = kmalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) { + mlx4_ib_warn(&dev->ib_dev, "failed to allocate memory for guid info mads\n"); + goto out; + } + + guid_tbl_blk_num *= 4; + + for (i = 0; i < 4; i++) { + if (change_bitmap && (!((change_bitmap >> (8 * i)) & 0xff))) + continue; + memset(in_mad, 0, sizeof *in_mad); + memset(out_mad, 0, sizeof *out_mad); + + in_mad->base_version = 1; + in_mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + in_mad->class_version = 1; + in_mad->method = IB_MGMT_METHOD_GET; + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(guid_tbl_blk_num + i); + + if (mlx4_MAD_IFC(dev, + MLX4_MAD_IFC_IGNORE_KEYS | MLX4_MAD_IFC_NET_VIEW, + port_num, NULL, NULL, in_mad, out_mad)) { + mlx4_ib_warn(&dev->ib_dev, "Failed in get GUID INFO MAD_IFC\n"); 
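The mlx4_MAD_IFC() and ib_process_mad() hunks above replace the old ignore_mkey/ignore_bkey boolean pair with a single mad_ifc_flags bitmask and add a network-view bit that also selects a native rather than wrapped firmware command. A minimal userspace sketch of how the op_modifier is derived from those flags is shown below; the MLX4_MAD_IFC_* values used here are assumptions for illustration only (the real definitions live in mlx4_ib.h and are not part of this patch excerpt).

#include <stdio.h>

/* Assumed illustrative values; the real MLX4_MAD_IFC_* flags are defined
 * in mlx4_ib.h and may differ. */
#define MLX4_MAD_IFC_IGNORE_MKEY 1
#define MLX4_MAD_IFC_IGNORE_BKEY 2
#define MLX4_MAD_IFC_NET_VIEW    4

/* Mirrors the op_modifier derivation in mlx4_MAD_IFC(): bit 0 skips the
 * M_Key check, bit 1 skips the B_Key check, and bit 3 requests the
 * network (unparavirtualized) view on a multi-function device. */
static int build_op_modifier(int flags, int have_wc, int is_mfunc)
{
	int op_modifier = 0;

	if ((flags & MLX4_MAD_IFC_IGNORE_MKEY) || !have_wc)
		op_modifier |= 0x1;
	if ((flags & MLX4_MAD_IFC_IGNORE_BKEY) || !have_wc)
		op_modifier |= 0x2;
	if (is_mfunc && ((flags & MLX4_MAD_IFC_NET_VIEW) || have_wc))
		op_modifier |= 0x8;
	return op_modifier;
}

int main(void)
{
	/* e.g. a locally generated query on a multi-function device */
	printf("op_modifier = 0x%x\n",
	       build_op_modifier(MLX4_MAD_IFC_IGNORE_MKEY |
				 MLX4_MAD_IFC_IGNORE_BKEY |
				 MLX4_MAD_IFC_NET_VIEW, 0, 1));
	return 0;
}

In the driver, the 0x8 bit is also what selects MLX4_CMD_NATIVE over MLX4_CMD_WRAPPED in the mlx4_cmd_box() call, and it is masked out of the op_modifier passed down when running on the master.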
+ goto out; + } + + mlx4_ib_update_cache_on_guid_change(dev, guid_tbl_blk_num + i, + port_num, + (u8 *)(&((struct ib_smp *)out_mad)->data)); + mlx4_ib_notify_slaves_on_guid_change(dev, guid_tbl_blk_num + i, + port_num, + (u8 *)(&((struct ib_smp *)out_mad)->data)); + } + +out: + kfree(in_mad); + kfree(out_mad); + return; +} + +void handle_port_mgmt_change_event(struct work_struct *work) +{ + struct ib_event_work *ew = container_of(work, struct ib_event_work, work); + struct mlx4_ib_dev *dev = ew->ib_dev; + struct mlx4_eqe *eqe = &(ew->ib_eqe); + u8 port = eqe->event.port_mgmt_change.port; + u32 changed_attr; + u32 tbl_block; + u32 change_bitmap; + + switch (eqe->subtype) { + case MLX4_DEV_PMC_SUBTYPE_PORT_INFO: + changed_attr = be32_to_cpu(eqe->event.port_mgmt_change.params.port_info.changed_attr); + + /* Update the SM ah - This should be done before handling + the other changed attributes so that MADs can be sent to the SM */ + if (changed_attr & MSTR_SM_CHANGE_MASK) { + u16 lid = be16_to_cpu(eqe->event.port_mgmt_change.params.port_info.mstr_sm_lid); + u8 sl = eqe->event.port_mgmt_change.params.port_info.mstr_sm_sl & 0xf; + update_sm_ah(dev, port, lid, sl); + } + + /* Check if it is a lid change event */ + if (changed_attr & MLX4_EQ_PORT_INFO_LID_CHANGE_MASK) + handle_lid_change_event(dev, port); + + /* Generate GUID changed event */ + if (changed_attr & MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK) { + mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE); + /*if master, notify all slaves*/ + if (mlx4_is_master(dev->dev)) + mlx4_gen_slaves_port_mgt_ev(dev->dev, port, + MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK); + } + + if (changed_attr & MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK) + handle_client_rereg_event(dev, port); + break; + + case MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE: + mlx4_ib_dispatch_event(dev, port, IB_EVENT_PKEY_CHANGE); + if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) + propagate_pkey_ev(dev, port, eqe); + break; + case MLX4_DEV_PMC_SUBTYPE_GUID_INFO: + /* paravirtualized master's guid is guid 0 -- does not change */ + if (!mlx4_is_master(dev->dev)) + mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE); + /*if master, notify relevant slaves*/ + else if (!dev->sriov.is_going_down) { + tbl_block = GET_BLK_PTR_FROM_EQE(eqe); + change_bitmap = GET_MASK_FROM_EQE(eqe); + handle_slaves_guid_change(dev, port, tbl_block, change_bitmap); + } + break; + default: + pr_warn("Unsupported subtype 0x%x for " + "Port Management Change event\n", eqe->subtype); + } + + kfree(ew); +} + +void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num, + enum ib_event_type type) +{ + struct ib_event event; + + event.device = &dev->ib_dev; + event.element.port_num = port_num; + event.event = type; + + ib_dispatch_event(&event); +} + +static void mlx4_ib_tunnel_comp_handler(struct ib_cq *cq, void *arg) +{ + unsigned long flags; + struct mlx4_ib_demux_pv_ctx *ctx = cq->cq_context; + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + if (!dev->sriov.is_going_down && ctx->state == DEMUX_PV_STATE_ACTIVE) + queue_work(ctx->wq, &ctx->work); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx, + struct mlx4_ib_demux_pv_qp *tun_qp, + int index) +{ + struct ib_sge sg_list; + struct ib_recv_wr recv_wr, *bad_recv_wr; + int size; + + size = (tun_qp->qp->qp_type == IB_QPT_UD) ? 
+ sizeof (struct mlx4_tunnel_mad) : sizeof (struct mlx4_mad_rcv_buf); + + sg_list.addr = tun_qp->ring[index].map; + sg_list.length = size; + sg_list.lkey = ctx->mr->lkey; + + recv_wr.next = NULL; + recv_wr.sg_list = &sg_list; + recv_wr.num_sge = 1; + recv_wr.wr_id = (u64) index | MLX4_TUN_WRID_RECV | + MLX4_TUN_SET_WRID_QPN(tun_qp->proxy_qpt); + ib_dma_sync_single_for_device(ctx->ib_dev, tun_qp->ring[index].map, + size, DMA_FROM_DEVICE); + return ib_post_recv(tun_qp->qp, &recv_wr, &bad_recv_wr); +} + +static int mlx4_ib_multiplex_sa_handler(struct ib_device *ibdev, int port, + int slave, struct ib_sa_mad *sa_mad) +{ + int ret = 0; + + /* dispatch to different sa handlers */ + switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) { + case IB_SA_ATTR_MC_MEMBER_REC: + ret = mlx4_ib_mcg_multiplex_handler(ibdev, port, slave, sa_mad); + break; + default: + break; + } + return ret; +} + +static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave) +{ + int proxy_start = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave; + + return (qpn >= proxy_start && qpn <= proxy_start + 1); +} + + +int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type dest_qpt, u16 pkey_index, + u32 remote_qpn, u32 qkey, struct ib_ah_attr *attr, + u8 *s_mac, struct ib_mad *mad) +{ + struct ib_sge list; + struct ib_send_wr wr, *bad_wr; + struct mlx4_ib_demux_pv_ctx *sqp_ctx; + struct mlx4_ib_demux_pv_qp *sqp; + struct mlx4_mad_snd_buf *sqp_mad; + struct ib_ah *ah; + struct ib_qp *send_qp = NULL; + unsigned wire_tx_ix = 0; + int ret = 0; + u16 wire_pkey_ix; + int src_qpnum; + u8 sgid_index; + + + sqp_ctx = dev->sriov.sqps[port-1]; + + /* check if proxy qp created */ + if (!sqp_ctx || sqp_ctx->state != DEMUX_PV_STATE_ACTIVE) + return -EAGAIN; + + if (dest_qpt == IB_QPT_SMI) { + src_qpnum = 0; + sqp = &sqp_ctx->qp[0]; + wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0]; + } else { + src_qpnum = 1; + sqp = &sqp_ctx->qp[1]; + wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][pkey_index]; + } + + send_qp = sqp->qp; + + /* create ah */ + sgid_index = attr->grh.sgid_index; + attr->grh.sgid_index = 0; + ah = ib_create_ah(sqp_ctx->pd, attr); + if (IS_ERR(ah)) + return -ENOMEM; + attr->grh.sgid_index = sgid_index; + to_mah(ah)->av.ib.gid_index = sgid_index; + /* get rid of force-loopback bit */ + to_mah(ah)->av.ib.port_pd &= cpu_to_be32(0x7FFFFFFF); + spin_lock(&sqp->tx_lock); + if (sqp->tx_ix_head - sqp->tx_ix_tail >= + (MLX4_NUM_TUNNEL_BUFS - 1)) + ret = -EAGAIN; + else + wire_tx_ix = (++sqp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1); + spin_unlock(&sqp->tx_lock); + if (ret) + goto out; + + sqp_mad = (struct mlx4_mad_snd_buf *) (sqp->tx_ring[wire_tx_ix].buf.addr); + if (sqp->tx_ring[wire_tx_ix].ah) + ib_destroy_ah(sqp->tx_ring[wire_tx_ix].ah); + sqp->tx_ring[wire_tx_ix].ah = ah; + ib_dma_sync_single_for_cpu(&dev->ib_dev, + sqp->tx_ring[wire_tx_ix].buf.map, + sizeof (struct mlx4_mad_snd_buf), + DMA_TO_DEVICE); + + memcpy(&sqp_mad->payload, mad, sizeof *mad); + + ib_dma_sync_single_for_device(&dev->ib_dev, + sqp->tx_ring[wire_tx_ix].buf.map, + sizeof (struct mlx4_mad_snd_buf), + DMA_TO_DEVICE); + + list.addr = sqp->tx_ring[wire_tx_ix].buf.map; + list.length = sizeof (struct mlx4_mad_snd_buf); + list.lkey = sqp_ctx->mr->lkey; + + wr.wr.ud.ah = ah; + wr.wr.ud.port_num = port; + wr.wr.ud.pkey_index = wire_pkey_ix; + wr.wr.ud.remote_qkey = qkey; + wr.wr.ud.remote_qpn = remote_qpn; + wr.next = NULL; + wr.wr_id = ((u64) wire_tx_ix) | MLX4_TUN_SET_WRID_QPN(src_qpnum); + wr.sg_list = &list; + 
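Both mlx4_ib_send_to_slave() and mlx4_ib_send_to_wire() above claim a transmit slot with the same head/tail scheme: the head counter is incremented monotonically under tx_lock and masked with MLX4_NUM_TUNNEL_BUFS - 1 (a power of two) to pick the ring index, while the difference head - tail bounds the number of in-flight sends. The standalone sketch below reproduces just that arithmetic; the ring size is an assumed value for illustration.

#include <stdio.h>

#define NUM_TUNNEL_BUFS 16	/* assumed power-of-two ring size */

struct tx_ring {
	unsigned int head;	/* bumped when a send is posted */
	unsigned int tail;	/* bumped when its completion arrives */
};

/* Returns the ring slot to use, or -1 when all but one slot are busy
 * (mirrors the -EAGAIN path in the driver). */
static int claim_tx_slot(struct tx_ring *r)
{
	if (r->head - r->tail >= NUM_TUNNEL_BUFS - 1)
		return -1;
	return (int)(++r->head & (NUM_TUNNEL_BUFS - 1));
}

int main(void)
{
	struct tx_ring r = { 0, 0 };
	int i;

	/* without any completions, posts stop one short of a full ring */
	for (i = 0; i < NUM_TUNNEL_BUFS + 2; i++)
		printf("post %2d -> slot %d\n", i, claim_tx_slot(&r));
	return 0;
}

Because head and tail only ever grow, unsigned wrap-around keeps the occupancy test correct without ever resetting the counters; the completion workers later in this file advance tx_ix_tail under the same lock.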
wr.num_sge = 1; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + if (s_mac) + memcpy(to_mah(ah)->av.eth.s_mac, s_mac, 6); + + + ret = ib_post_send(send_qp, &wr, &bad_wr); +out: + if (ret) + ib_destroy_ah(ah); + return ret; +} + +static int get_slave_base_gid_ix(struct mlx4_ib_dev *dev, int slave, int port) +{ + if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) + return slave; + return mlx4_get_base_gid_ix(dev->dev, slave, port); +} + +static void fill_in_real_sgid_index(struct mlx4_ib_dev *dev, int slave, int port, + struct ib_ah_attr *ah_attr) +{ + if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) + ah_attr->grh.sgid_index = slave; + else + ah_attr->grh.sgid_index += get_slave_base_gid_ix(dev, slave, port); +} + +static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc *wc) +{ + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + struct mlx4_ib_demux_pv_qp *tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc->wr_id)]; + int wr_ix = wc->wr_id & (MLX4_NUM_TUNNEL_BUFS - 1); + struct mlx4_tunnel_mad *tunnel = tun_qp->ring[wr_ix].addr; + struct mlx4_ib_ah ah; + struct ib_ah_attr ah_attr; + u8 *slave_id; + int slave; + int port; + + /* Get slave that sent this packet */ + if (wc->src_qp < dev->dev->phys_caps.base_proxy_sqpn || + wc->src_qp >= dev->dev->phys_caps.base_proxy_sqpn + 8 * MLX4_MFUNC_MAX || + (wc->src_qp & 0x1) != ctx->port - 1 || + wc->src_qp & 0x4) { + mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d\n", wc->src_qp); + return; + } + slave = ((wc->src_qp & ~0x7) - dev->dev->phys_caps.base_proxy_sqpn) / 8; + if (slave != ctx->slave) { + mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: " + "belongs to another slave\n", wc->src_qp); + return; + } + + /* Map transaction ID */ + ib_dma_sync_single_for_cpu(ctx->ib_dev, tun_qp->ring[wr_ix].map, + sizeof (struct mlx4_tunnel_mad), + DMA_FROM_DEVICE); + switch (tunnel->mad.mad_hdr.method) { + case IB_MGMT_METHOD_SET: + case IB_MGMT_METHOD_GET: + case IB_MGMT_METHOD_REPORT: + case IB_SA_METHOD_GET_TABLE: + case IB_SA_METHOD_DELETE: + case IB_SA_METHOD_GET_MULTI: + case IB_SA_METHOD_GET_TRACE_TBL: + slave_id = (u8 *) &tunnel->mad.mad_hdr.tid; + if (*slave_id) { + mlx4_ib_warn(ctx->ib_dev, "egress mad has non-null tid msb:%d " + "class:%d slave:%d\n", *slave_id, + tunnel->mad.mad_hdr.mgmt_class, slave); + return; + } else + *slave_id = slave; + default: + /* nothing */; + } + + /* Class-specific handling */ + switch (tunnel->mad.mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + if (slave != mlx4_master_func_num(dev->dev) && + !mlx4_vf_smi_enabled(dev->dev, slave, ctx->port)) + return; + break; + case IB_MGMT_CLASS_SUBN_ADM: + if (mlx4_ib_multiplex_sa_handler(ctx->ib_dev, ctx->port, slave, + (struct ib_sa_mad *) &tunnel->mad)) + return; + break; + case IB_MGMT_CLASS_CM: + if (mlx4_ib_multiplex_cm_handler(ctx->ib_dev, ctx->port, slave, + (struct ib_mad *) &tunnel->mad)) + return; + break; + case IB_MGMT_CLASS_DEVICE_MGMT: + if (tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_GET && + tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_SET) + return; + break; + default: + /* Drop unsupported classes for slaves in tunnel mode */ + if (slave != mlx4_master_func_num(dev->dev)) { + mlx4_ib_warn(ctx->ib_dev, "dropping unsupported egress mad from class:%d " + "for slave:%d\n", tunnel->mad.mad_hdr.mgmt_class, slave); + return; + } + } + + /* We are using standard ib_core services to send the mad, so generate a + * stadard 
address handle by decoding the tunnelled mlx4_ah fields */ + memcpy(&ah.av, &tunnel->hdr.av, sizeof (struct mlx4_av)); + ah.ibah.device = ctx->ib_dev; + mlx4_ib_query_ah(&ah.ibah, &ah_attr); + if (ah_attr.ah_flags & IB_AH_GRH) + fill_in_real_sgid_index(dev, slave, ctx->port, &ah_attr); + + port = mlx4_slave_convert_port(dev->dev, slave, ah_attr.port_num); + if (port < 0) + return; + ah_attr.port_num = port; + memcpy(ah_attr.dmac, tunnel->hdr.mac, 6); + ah_attr.vlan_id = be16_to_cpu(tunnel->hdr.vlan); + /* if slave have default vlan use it */ + mlx4_get_slave_default_vlan(dev->dev, ctx->port, slave, + &ah_attr.vlan_id, &ah_attr.sl); + + mlx4_ib_send_to_wire(dev, slave, ctx->port, + is_proxy_qp0(dev, wc->src_qp, slave) ? + IB_QPT_SMI : IB_QPT_GSI, + be16_to_cpu(tunnel->hdr.pkey_index), + be32_to_cpu(tunnel->hdr.remote_qpn), + be32_to_cpu(tunnel->hdr.qkey), + &ah_attr, wc->smac, &tunnel->mad); +} + +static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx, + enum ib_qp_type qp_type, int is_tun) +{ + int i; + struct mlx4_ib_demux_pv_qp *tun_qp; + int rx_buf_size, tx_buf_size; + + if (qp_type > IB_QPT_GSI) + return -EINVAL; + + tun_qp = &ctx->qp[qp_type]; + + tun_qp->ring = kzalloc(sizeof (struct mlx4_ib_buf) * MLX4_NUM_TUNNEL_BUFS, + GFP_KERNEL); + if (!tun_qp->ring) + return -ENOMEM; + + tun_qp->tx_ring = kcalloc(MLX4_NUM_TUNNEL_BUFS, + sizeof (struct mlx4_ib_tun_tx_buf), + GFP_KERNEL); + if (!tun_qp->tx_ring) { + kfree(tun_qp->ring); + tun_qp->ring = NULL; + return -ENOMEM; + } + + if (is_tun) { + rx_buf_size = sizeof (struct mlx4_tunnel_mad); + tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad); + } else { + rx_buf_size = sizeof (struct mlx4_mad_rcv_buf); + tx_buf_size = sizeof (struct mlx4_mad_snd_buf); + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + tun_qp->ring[i].addr = kmalloc(rx_buf_size, GFP_KERNEL); + if (!tun_qp->ring[i].addr) + goto err; + tun_qp->ring[i].map = ib_dma_map_single(ctx->ib_dev, + tun_qp->ring[i].addr, + rx_buf_size, + DMA_FROM_DEVICE); + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + tun_qp->tx_ring[i].buf.addr = + kmalloc(tx_buf_size, GFP_KERNEL); + if (!tun_qp->tx_ring[i].buf.addr) + goto tx_err; + tun_qp->tx_ring[i].buf.map = + ib_dma_map_single(ctx->ib_dev, + tun_qp->tx_ring[i].buf.addr, + tx_buf_size, + DMA_TO_DEVICE); + tun_qp->tx_ring[i].ah = NULL; + } + spin_lock_init(&tun_qp->tx_lock); + tun_qp->tx_ix_head = 0; + tun_qp->tx_ix_tail = 0; + tun_qp->proxy_qpt = qp_type; + + return 0; + +tx_err: + while (i > 0) { + --i; + ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map, + tx_buf_size, DMA_TO_DEVICE); + kfree(tun_qp->tx_ring[i].buf.addr); + } + kfree(tun_qp->tx_ring); + tun_qp->tx_ring = NULL; + i = MLX4_NUM_TUNNEL_BUFS; +err: + while (i > 0) { + --i; + ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map, + rx_buf_size, DMA_FROM_DEVICE); + kfree(tun_qp->ring[i].addr); + } + kfree(tun_qp->ring); + tun_qp->ring = NULL; + return -ENOMEM; +} + +static void mlx4_ib_free_pv_qp_bufs(struct mlx4_ib_demux_pv_ctx *ctx, + enum ib_qp_type qp_type, int is_tun) +{ + int i; + struct mlx4_ib_demux_pv_qp *tun_qp; + int rx_buf_size, tx_buf_size; + + if (qp_type > IB_QPT_GSI) + return; + + tun_qp = &ctx->qp[qp_type]; + if (is_tun) { + rx_buf_size = sizeof (struct mlx4_tunnel_mad); + tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad); + } else { + rx_buf_size = sizeof (struct mlx4_mad_rcv_buf); + tx_buf_size = sizeof (struct mlx4_mad_snd_buf); + } + + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + ib_dma_unmap_single(ctx->ib_dev, 
tun_qp->ring[i].map, + rx_buf_size, DMA_FROM_DEVICE); + kfree(tun_qp->ring[i].addr); + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map, + tx_buf_size, DMA_TO_DEVICE); + kfree(tun_qp->tx_ring[i].buf.addr); + if (tun_qp->tx_ring[i].ah) + ib_destroy_ah(tun_qp->tx_ring[i].ah); + } + kfree(tun_qp->tx_ring); + kfree(tun_qp->ring); +} + +static void mlx4_ib_tunnel_comp_worker(struct work_struct *work) +{ + struct mlx4_ib_demux_pv_ctx *ctx; + struct mlx4_ib_demux_pv_qp *tun_qp; + struct ib_wc wc; + int ret; + ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work); + ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); + + while (ib_poll_cq(ctx->cq, 1, &wc) == 1) { + tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)]; + if (wc.status == IB_WC_SUCCESS) { + switch (wc.opcode) { + case IB_WC_RECV: + mlx4_ib_multiplex_mad(ctx, &wc); + ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, + wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)); + if (ret) + pr_err("Failed reposting tunnel " + "buf:%lld\n", wc.wr_id); + break; + case IB_WC_SEND: + pr_debug("received tunnel send completion:" + "wrid=0x%llx, status=0x%x\n", + wc.wr_id, wc.status); + ib_destroy_ah(tun_qp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&tun_qp->tx_lock); + tun_qp->tx_ix_tail++; + spin_unlock(&tun_qp->tx_lock); + + break; + default: + break; + } + } else { + pr_debug("mlx4_ib: completion error in tunnel: %d." + " status = %d, wrid = 0x%llx\n", + ctx->slave, wc.status, wc.wr_id); + if (!MLX4_TUN_IS_RECV(wc.wr_id)) { + ib_destroy_ah(tun_qp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&tun_qp->tx_lock); + tun_qp->tx_ix_tail++; + spin_unlock(&tun_qp->tx_lock); + } + } + } +} + +static void pv_qp_event_handler(struct ib_event *event, void *qp_context) +{ + struct mlx4_ib_demux_pv_ctx *sqp = qp_context; + + /* It's worse than that! He's dead, Jim! 
*/ + pr_err("Fatal error (%d) on a MAD QP on port %d\n", + event->event, sqp->port); +} + +static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx, + enum ib_qp_type qp_type, int create_tun) +{ + int i, ret; + struct mlx4_ib_demux_pv_qp *tun_qp; + struct mlx4_ib_qp_tunnel_init_attr qp_init_attr; + struct ib_qp_attr attr; + int qp_attr_mask_INIT; + + if (qp_type > IB_QPT_GSI) + return -EINVAL; + + tun_qp = &ctx->qp[qp_type]; + + memset(&qp_init_attr, 0, sizeof qp_init_attr); + qp_init_attr.init_attr.send_cq = ctx->cq; + qp_init_attr.init_attr.recv_cq = ctx->cq; + qp_init_attr.init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.init_attr.cap.max_send_wr = MLX4_NUM_TUNNEL_BUFS; + qp_init_attr.init_attr.cap.max_recv_wr = MLX4_NUM_TUNNEL_BUFS; + qp_init_attr.init_attr.cap.max_send_sge = 1; + qp_init_attr.init_attr.cap.max_recv_sge = 1; + if (create_tun) { + qp_init_attr.init_attr.qp_type = IB_QPT_UD; + qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_TUNNEL_QP; + qp_init_attr.port = ctx->port; + qp_init_attr.slave = ctx->slave; + qp_init_attr.proxy_qp_type = qp_type; + qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | + IB_QP_QKEY | IB_QP_PORT; + } else { + qp_init_attr.init_attr.qp_type = qp_type; + qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_SQP; + qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY; + } + qp_init_attr.init_attr.port_num = ctx->port; + qp_init_attr.init_attr.qp_context = ctx; + qp_init_attr.init_attr.event_handler = pv_qp_event_handler; + tun_qp->qp = ib_create_qp(ctx->pd, &qp_init_attr.init_attr); + if (IS_ERR(tun_qp->qp)) { + ret = PTR_ERR(tun_qp->qp); + tun_qp->qp = NULL; + pr_err("Couldn't create %s QP (%d)\n", + create_tun ? "tunnel" : "special", ret); + return ret; + } + + memset(&attr, 0, sizeof attr); + attr.qp_state = IB_QPS_INIT; + ret = 0; + if (create_tun) + ret = find_slave_port_pkey_ix(to_mdev(ctx->ib_dev), ctx->slave, + ctx->port, IB_DEFAULT_PKEY_FULL, + &attr.pkey_index); + if (ret || !create_tun) + attr.pkey_index = + to_mdev(ctx->ib_dev)->pkeys.virt2phys_pkey[ctx->slave][ctx->port - 1][0]; + attr.qkey = IB_QP1_QKEY; + attr.port_num = ctx->port; + ret = ib_modify_qp(tun_qp->qp, &attr, qp_attr_mask_INIT); + if (ret) { + pr_err("Couldn't change %s qp state to INIT (%d)\n", + create_tun ? "tunnel" : "special", ret); + goto err_qp; + } + attr.qp_state = IB_QPS_RTR; + ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE); + if (ret) { + pr_err("Couldn't change %s qp state to RTR (%d)\n", + create_tun ? "tunnel" : "special", ret); + goto err_qp; + } + attr.qp_state = IB_QPS_RTS; + attr.sq_psn = 0; + ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN); + if (ret) { + pr_err("Couldn't change %s qp state to RTS (%d)\n", + create_tun ? 
"tunnel" : "special", ret); + goto err_qp; + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, i); + if (ret) { + pr_err(" mlx4_ib_post_pv_buf error" + " (err = %d, i = %d)\n", ret, i); + goto err_qp; + } + } + return 0; + +err_qp: + ib_destroy_qp(tun_qp->qp); + tun_qp->qp = NULL; + return ret; +} + +/* + * IB MAD completion callback for real SQPs + */ +static void mlx4_ib_sqp_comp_worker(struct work_struct *work) +{ + struct mlx4_ib_demux_pv_ctx *ctx; + struct mlx4_ib_demux_pv_qp *sqp; + struct ib_wc wc; + struct ib_grh *grh; + struct ib_mad *mad; + + ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work); + ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); + + while (mlx4_ib_poll_cq(ctx->cq, 1, &wc) == 1) { + sqp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)]; + if (wc.status == IB_WC_SUCCESS) { + switch (wc.opcode) { + case IB_WC_SEND: + ib_destroy_ah(sqp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&sqp->tx_lock); + sqp->tx_ix_tail++; + spin_unlock(&sqp->tx_lock); + break; + case IB_WC_RECV: + mad = (struct ib_mad *) &(((struct mlx4_mad_rcv_buf *) + (sqp->ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->payload); + grh = &(((struct mlx4_mad_rcv_buf *) + (sqp->ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->grh); + mlx4_ib_demux_mad(ctx->ib_dev, ctx->port, &wc, grh, mad); + if (mlx4_ib_post_pv_qp_buf(ctx, sqp, wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1))) + pr_err("Failed reposting SQP " + "buf:%lld\n", wc.wr_id); + break; + default: + BUG_ON(1); + break; + } + } else { + pr_debug("mlx4_ib: completion error in tunnel: %d." + " status = %d, wrid = 0x%llx\n", + ctx->slave, wc.status, wc.wr_id); + if (!MLX4_TUN_IS_RECV(wc.wr_id)) { + ib_destroy_ah(sqp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&sqp->tx_lock); + sqp->tx_ix_tail++; + spin_unlock(&sqp->tx_lock); + } + } + } +} + +static int alloc_pv_object(struct mlx4_ib_dev *dev, int slave, int port, + struct mlx4_ib_demux_pv_ctx **ret_ctx) +{ + struct mlx4_ib_demux_pv_ctx *ctx; + + *ret_ctx = NULL; + ctx = kzalloc(sizeof (struct mlx4_ib_demux_pv_ctx), GFP_KERNEL); + if (!ctx) { + pr_err("failed allocating pv resource context " + "for port %d, slave %d\n", port, slave); + return -ENOMEM; + } + + ctx->ib_dev = &dev->ib_dev; + ctx->port = port; + ctx->slave = slave; + *ret_ctx = ctx; + return 0; +} + +static void free_pv_object(struct mlx4_ib_dev *dev, int slave, int port) +{ + if (dev->sriov.demux[port - 1].tun[slave]) { + kfree(dev->sriov.demux[port - 1].tun[slave]); + dev->sriov.demux[port - 1].tun[slave] = NULL; + } +} + +static int create_pv_resources(struct ib_device *ibdev, int slave, int port, + int create_tun, struct mlx4_ib_demux_pv_ctx *ctx) +{ + int ret, cq_size; + + if (ctx->state != DEMUX_PV_STATE_DOWN) + return -EEXIST; + + ctx->state = DEMUX_PV_STATE_STARTING; + /* have QP0 only if link layer is IB */ + if (rdma_port_get_link_layer(ibdev, ctx->port) == + IB_LINK_LAYER_INFINIBAND) + ctx->has_smi = 1; + + if (ctx->has_smi) { + ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_SMI, create_tun); + if (ret) { + pr_err("Failed allocating qp0 tunnel bufs (%d)\n", ret); + goto err_out; + } + } + + ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_GSI, create_tun); + if (ret) { + pr_err("Failed allocating qp1 tunnel bufs (%d)\n", ret); + goto err_out_qp0; + } + + cq_size = 2 * MLX4_NUM_TUNNEL_BUFS; + if (ctx->has_smi) + cq_size *= 2; + 
+ ctx->cq = ib_create_cq(ctx->ib_dev, mlx4_ib_tunnel_comp_handler, + NULL, ctx, cq_size, 0); + if (IS_ERR(ctx->cq)) { + ret = PTR_ERR(ctx->cq); + pr_err("Couldn't create tunnel CQ (%d)\n", ret); + goto err_buf; + } + + ctx->pd = ib_alloc_pd(ctx->ib_dev); + if (IS_ERR(ctx->pd)) { + ret = PTR_ERR(ctx->pd); + pr_err("Couldn't create tunnel PD (%d)\n", ret); + goto err_cq; + } + + ctx->mr = ib_get_dma_mr(ctx->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(ctx->mr)) { + ret = PTR_ERR(ctx->mr); + pr_err("Couldn't get tunnel DMA MR (%d)\n", ret); + goto err_pd; + } + + if (ctx->has_smi) { + ret = create_pv_sqp(ctx, IB_QPT_SMI, create_tun); + if (ret) { + pr_err("Couldn't create %s QP0 (%d)\n", + create_tun ? "tunnel for" : "", ret); + goto err_mr; + } + } + + ret = create_pv_sqp(ctx, IB_QPT_GSI, create_tun); + if (ret) { + pr_err("Couldn't create %s QP1 (%d)\n", + create_tun ? "tunnel for" : "", ret); + goto err_qp0; + } + + if (create_tun) + INIT_WORK(&ctx->work, mlx4_ib_tunnel_comp_worker); + else + INIT_WORK(&ctx->work, mlx4_ib_sqp_comp_worker); + + ctx->wq = to_mdev(ibdev)->sriov.demux[port - 1].wq; + + ret = ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); + if (ret) { + pr_err("Couldn't arm tunnel cq (%d)\n", ret); + goto err_wq; + } + ctx->state = DEMUX_PV_STATE_ACTIVE; + return 0; + +err_wq: + ctx->wq = NULL; + ib_destroy_qp(ctx->qp[1].qp); + ctx->qp[1].qp = NULL; + + +err_qp0: + if (ctx->has_smi) + ib_destroy_qp(ctx->qp[0].qp); + ctx->qp[0].qp = NULL; + +err_mr: + ib_dereg_mr(ctx->mr); + ctx->mr = NULL; + +err_pd: + ib_dealloc_pd(ctx->pd); + ctx->pd = NULL; + +err_cq: + ib_destroy_cq(ctx->cq); + ctx->cq = NULL; + +err_buf: + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, create_tun); + +err_out_qp0: + if (ctx->has_smi) + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, create_tun); +err_out: + ctx->state = DEMUX_PV_STATE_DOWN; + return ret; +} + +static void destroy_pv_resources(struct mlx4_ib_dev *dev, int slave, int port, + struct mlx4_ib_demux_pv_ctx *ctx, int flush) +{ + if (!ctx) + return; + if (ctx->state > DEMUX_PV_STATE_DOWN) { + ctx->state = DEMUX_PV_STATE_DOWNING; + if (flush) + flush_workqueue(ctx->wq); + if (ctx->has_smi) { + ib_destroy_qp(ctx->qp[0].qp); + ctx->qp[0].qp = NULL; + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, 1); + } + ib_destroy_qp(ctx->qp[1].qp); + ctx->qp[1].qp = NULL; + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, 1); + ib_dereg_mr(ctx->mr); + ctx->mr = NULL; + ib_dealloc_pd(ctx->pd); + ctx->pd = NULL; + ib_destroy_cq(ctx->cq); + ctx->cq = NULL; + ctx->state = DEMUX_PV_STATE_DOWN; + } +} + +static int mlx4_ib_tunnels_update(struct mlx4_ib_dev *dev, int slave, + int port, int do_init) +{ + int ret = 0; + + if (!do_init) { + clean_vf_mcast(&dev->sriov.demux[port - 1], slave); + /* for master, destroy real sqp resources */ + if (slave == mlx4_master_func_num(dev->dev)) + destroy_pv_resources(dev, slave, port, + dev->sriov.sqps[port - 1], 1); + /* destroy the tunnel qp resources */ + destroy_pv_resources(dev, slave, port, + dev->sriov.demux[port - 1].tun[slave], 1); + return 0; + } + + /* create the tunnel qp resources */ + ret = create_pv_resources(&dev->ib_dev, slave, port, 1, + dev->sriov.demux[port - 1].tun[slave]); + + /* for master, create the real sqp resources */ + if (!ret && slave == mlx4_master_func_num(dev->dev)) + ret = create_pv_resources(&dev->ib_dev, slave, port, 0, + dev->sriov.sqps[port - 1]); + return ret; +} + +void mlx4_ib_tunnels_update_work(struct work_struct *work) +{ + struct mlx4_ib_demux_work *dmxw; + + dmxw = container_of(work, struct mlx4_ib_demux_work, work); + 
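mlx4_ib_tunnels_update_work() below is the consumer side of a mlx4_ib_demux_work item: whoever notices a slave coming up or going down allocates one, records the slave, port and do_init direction, and queues it, and the handler then builds or tears down that slave's proxy and tunnel resources before freeing the item. A hedged sketch of the producer side is shown here; only the struct fields used by the handler appear in this excerpt, so the allocation flags, the chosen workqueue and the call site are assumptions.

/* Sketch of queuing a tunnel update for one slave/port pair, assuming the
 * mlx4_ib_demux_work layout implied by the handler below and a workqueue
 * picked by the caller. */
static int sched_tunnels_update(struct mlx4_ib_dev *dev,
				struct workqueue_struct *wq,
				int slave, u8 port, int do_init)
{
	struct mlx4_ib_demux_work *dmxw;

	dmxw = kmalloc(sizeof(*dmxw), GFP_ATOMIC);
	if (!dmxw)
		return -ENOMEM;

	INIT_WORK(&dmxw->work, mlx4_ib_tunnels_update_work);
	dmxw->dev = dev;
	dmxw->slave = slave;
	dmxw->port = port;
	dmxw->do_init = do_init;

	queue_work(wq, &dmxw->work);	/* freed by the handler */
	return 0;
}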
mlx4_ib_tunnels_update(dmxw->dev, dmxw->slave, (int) dmxw->port, + dmxw->do_init); + kfree(dmxw); + return; +} + +static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, + struct mlx4_ib_demux_ctx *ctx, + int port) +{ + char name[12]; + int ret = 0; + int i; + + ctx->tun = kcalloc(dev->dev->caps.sqp_demux, + sizeof (struct mlx4_ib_demux_pv_ctx *), GFP_KERNEL); + if (!ctx->tun) + return -ENOMEM; + + ctx->dev = dev; + ctx->port = port; + ctx->ib_dev = &dev->ib_dev; + + for (i = 0; + i < min(dev->dev->caps.sqp_demux, (u16)(dev->dev->num_vfs + 1)); + i++) { + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev->dev, i); + + if (!test_bit(port - 1, actv_ports.ports)) + continue; + + ret = alloc_pv_object(dev, i, port, &ctx->tun[i]); + if (ret) { + ret = -ENOMEM; + goto err_mcg; + } + } + + ret = mlx4_ib_mcg_port_init(ctx); + if (ret) { + pr_err("Failed initializing mcg para-virt (%d)\n", ret); + goto err_mcg; + } + + snprintf(name, sizeof name, "mlx4_ibt%d", port); + ctx->wq = create_singlethread_workqueue(name); + if (!ctx->wq) { + pr_err("Failed to create tunnelling WQ for port %d\n", port); + ret = -ENOMEM; + goto err_wq; + } + + snprintf(name, sizeof name, "mlx4_ibud%d", port); + ctx->ud_wq = create_singlethread_workqueue(name); + if (!ctx->ud_wq) { + pr_err("Failed to create up/down WQ for port %d\n", port); + ret = -ENOMEM; + goto err_udwq; + } + + return 0; + +err_udwq: + destroy_workqueue(ctx->wq); + ctx->wq = NULL; + +err_wq: + mlx4_ib_mcg_port_cleanup(ctx, 1); +err_mcg: + for (i = 0; i < dev->dev->caps.sqp_demux; i++) + free_pv_object(dev, i, port); + kfree(ctx->tun); + ctx->tun = NULL; + return ret; +} + +static void mlx4_ib_free_sqp_ctx(struct mlx4_ib_demux_pv_ctx *sqp_ctx) +{ + if (sqp_ctx->state > DEMUX_PV_STATE_DOWN) { + sqp_ctx->state = DEMUX_PV_STATE_DOWNING; + flush_workqueue(sqp_ctx->wq); + if (sqp_ctx->has_smi) { + ib_destroy_qp(sqp_ctx->qp[0].qp); + sqp_ctx->qp[0].qp = NULL; + mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_SMI, 0); + } + ib_destroy_qp(sqp_ctx->qp[1].qp); + sqp_ctx->qp[1].qp = NULL; + mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_GSI, 0); + ib_dereg_mr(sqp_ctx->mr); + sqp_ctx->mr = NULL; + ib_dealloc_pd(sqp_ctx->pd); + sqp_ctx->pd = NULL; + ib_destroy_cq(sqp_ctx->cq); + sqp_ctx->cq = NULL; + sqp_ctx->state = DEMUX_PV_STATE_DOWN; + } +} + +static void mlx4_ib_free_demux_ctx(struct mlx4_ib_demux_ctx *ctx) +{ + int i; + if (ctx) { + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + mlx4_ib_mcg_port_cleanup(ctx, 1); + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + if (!ctx->tun[i]) + continue; + if (ctx->tun[i]->state > DEMUX_PV_STATE_DOWN) + ctx->tun[i]->state = DEMUX_PV_STATE_DOWNING; + } + flush_workqueue(ctx->wq); + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + destroy_pv_resources(dev, i, ctx->port, ctx->tun[i], 0); + free_pv_object(dev, i, ctx->port); + } + kfree(ctx->tun); + destroy_workqueue(ctx->ud_wq); + destroy_workqueue(ctx->wq); + } +} + +static void mlx4_ib_master_tunnels(struct mlx4_ib_dev *dev, int do_init) +{ + int i; + + if (!mlx4_is_master(dev->dev)) + return; + /* initialize or tear down tunnel QPs for the master */ + for (i = 0; i < dev->dev->caps.num_ports; i++) + mlx4_ib_tunnels_update(dev, mlx4_master_func_num(dev->dev), i + 1, do_init); + return; +} + +int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev) +{ + int i = 0; + int err; + + if (!mlx4_is_mfunc(dev->dev)) + return 0; + + dev->sriov.is_going_down = 0; + spin_lock_init(&dev->sriov.going_down_lock); + mlx4_ib_cm_paravirt_init(dev); + + mlx4_ib_warn(&dev->ib_dev, 
"multi-function enabled\n"); + + if (mlx4_is_slave(dev->dev)) { + mlx4_ib_warn(&dev->ib_dev, "operating in qp1 tunnel mode\n"); + return 0; + } + + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + if (i == mlx4_master_func_num(dev->dev)) + mlx4_put_slave_node_guid(dev->dev, i, dev->ib_dev.node_guid); + else + mlx4_put_slave_node_guid(dev->dev, i, mlx4_ib_gen_node_guid()); + } + + err = mlx4_ib_init_alias_guid_service(dev); + if (err) { + mlx4_ib_warn(&dev->ib_dev, "Failed init alias guid process.\n"); + goto paravirt_err; + } + err = mlx4_ib_device_register_sysfs(dev); + if (err) { + mlx4_ib_warn(&dev->ib_dev, "Failed to register sysfs\n"); + goto sysfs_err; + } + + mlx4_ib_warn(&dev->ib_dev, "initializing demux service for %d qp1 clients\n", + dev->dev->caps.sqp_demux); + for (i = 0; i < dev->num_ports; i++) { + union ib_gid gid; + err = __mlx4_ib_query_gid(&dev->ib_dev, i + 1, 0, &gid, 1); + if (err) + goto demux_err; + dev->sriov.demux[i].guid_cache[0] = gid.global.interface_id; + err = alloc_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1, + &dev->sriov.sqps[i]); + if (err) + goto demux_err; + err = mlx4_ib_alloc_demux_ctx(dev, &dev->sriov.demux[i], i + 1); + if (err) + goto free_pv; + } + mlx4_ib_master_tunnels(dev, 1); + return 0; + +free_pv: + free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1); +demux_err: + while (--i >= 0) { + free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1); + mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]); + } + mlx4_ib_device_unregister_sysfs(dev); + +sysfs_err: + mlx4_ib_destroy_alias_guid_service(dev); + +paravirt_err: + mlx4_ib_cm_paravirt_clean(dev, -1); + + return err; +} + +void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev) +{ + int i; + unsigned long flags; + + if (!mlx4_is_mfunc(dev->dev)) + return; + + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + dev->sriov.is_going_down = 1; + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); + if (mlx4_is_master(dev->dev)) { + for (i = 0; i < dev->num_ports; i++) { + flush_workqueue(dev->sriov.demux[i].ud_wq); + mlx4_ib_free_sqp_ctx(dev->sriov.sqps[i]); + kfree(dev->sriov.sqps[i]); + dev->sriov.sqps[i] = NULL; + mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]); + } + + mlx4_ib_cm_paravirt_clean(dev, -1); + mlx4_ib_destroy_alias_guid_service(dev); + mlx4_ib_device_unregister_sysfs(dev); + } +} diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 3530c41fcd1..0f7027e7db1 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -39,6 +39,8 @@ #include <linux/inetdevice.h> #include <linux/rtnetlink.h> #include <linux/if_vlan.h> +#include <net/ipv6.h> +#include <net/addrconf.h> #include <rdma/ib_smi.h> #include <rdma/ib_user_verbs.h> @@ -46,19 +48,27 @@ #include <linux/mlx4/driver.h> #include <linux/mlx4/cmd.h> +#include <linux/mlx4/qp.h> #include "mlx4_ib.h" #include "user.h" -#define DRV_NAME "mlx4_ib" -#define DRV_VERSION "1.0" -#define DRV_RELDATE "April 4, 2008" +#define DRV_NAME MLX4_IB_DRV_NAME +#define DRV_VERSION "2.2-1" +#define DRV_RELDATE "Feb 2014" + +#define MLX4_IB_FLOW_MAX_PRIO 0xFFF +#define MLX4_IB_FLOW_QPN_MASK 0xFFFFFF MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(DRV_VERSION); +int mlx4_ib_sm_guid_assign = 1; +module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444); +MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 1)"); + static const 
char mlx4_ib_version[] = DRV_NAME ": Mellanox ConnectX InfiniBand driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; @@ -70,6 +80,8 @@ struct update_gid_work { int port; }; +static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init); + static struct workqueue_struct *wq; static void init_query_mad(struct ib_smp *mad) @@ -82,6 +94,31 @@ static void init_query_mad(struct ib_smp *mad) static union ib_gid zgid; +static int check_flow_steering_support(struct mlx4_dev *dev) +{ + int eth_num_ports = 0; + int ib_num_ports = 0; + + int dmfs = dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED; + + if (dmfs) { + int i; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) + eth_num_ports++; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_num_ports++; + dmfs &= (!ib_num_ports || + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB)) && + (!eth_num_ports || + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN)); + if (ib_num_ports && mlx4_is_mfunc(dev)) { + pr_warn("Device managed flow steering is unavailable for IB port in multifunction env.\n"); + dmfs = 0; + } + } + return dmfs; +} + static int mlx4_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props) { @@ -98,7 +135,8 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; - err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, out_mad); + err = mlx4_MAD_IFC(to_mdev(ibdev), MLX4_MAD_IFC_IGNORE_KEYS, + 1, NULL, NULL, in_mad, out_mad); if (err) goto out; @@ -130,34 +168,44 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) props->device_cap_flags |= IB_DEVICE_XRC; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW) + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW; + if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { + if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_WIN_TYPE_2B) + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B; + else + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A; + if (dev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) + props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; + } props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & 0xffffff; - props->vendor_part_id = be16_to_cpup((__be16 *) (out_mad->data + 30)); + props->vendor_part_id = dev->dev->pdev->device; props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&props->sys_image_guid, out_mad->data + 4, 8); props->max_mr_size = ~0ull; props->page_size_cap = dev->dev->caps.page_size_cap; - props->max_qp = dev->dev->caps.num_qps - dev->dev->caps.reserved_qps; + props->max_qp = dev->dev->quotas.qp; props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; props->max_sge = min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); - props->max_cq = dev->dev->caps.num_cqs - dev->dev->caps.reserved_cqs; + props->max_cq = dev->dev->quotas.cq; props->max_cqe = dev->dev->caps.max_cqes; - props->max_mr = dev->dev->caps.num_mpts - dev->dev->caps.reserved_mrws; + props->max_mr = dev->dev->quotas.mpt; props->max_pd = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds; props->max_qp_rd_atom = dev->dev->caps.max_qp_dest_rdma; props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma; props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; - props->max_srq = dev->dev->caps.num_srqs - dev->dev->caps.reserved_srqs; + props->max_srq = dev->dev->quotas.srq; props->max_srq_wr = 
dev->dev->caps.max_srq_wqes - 1; props->max_srq_sge = dev->dev->caps.max_srq_sge; props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES; props->local_ca_ack_delay = dev->dev->caps.local_ca_ack_delay; props->atomic_cap = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ? IB_ATOMIC_HCA : IB_ATOMIC_NONE; - props->masked_atomic_cap = IB_ATOMIC_HCA; + props->masked_atomic_cap = props->atomic_cap; props->max_pkeys = dev->dev->caps.pkey_table_len[1]; props->max_mcast_grp = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms; props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm; @@ -182,11 +230,12 @@ mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num) } static int ib_link_query_port(struct ib_device *ibdev, u8 port, - struct ib_port_attr *props) + struct ib_port_attr *props, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int ext_active_speed; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); @@ -198,7 +247,10 @@ static int ib_link_query_port(struct ib_device *ibdev, u8 port, in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); - err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, + if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; @@ -211,7 +263,10 @@ static int ib_link_query_port(struct ib_device *ibdev, u8 port, props->state = out_mad->data[32] & 0xf; props->phys_state = out_mad->data[33] >> 4; props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20)); - props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port]; + if (netw_view) + props->gid_tbl_len = out_mad->data[50]; + else + props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port]; props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz; props->pkey_tbl_len = to_mdev(ibdev)->dev->caps.pkey_table_len[port]; props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46)); @@ -244,7 +299,7 @@ static int ib_link_query_port(struct ib_device *ibdev, u8 port, in_mad->attr_id = MLX4_ATTR_EXTENDED_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); - err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, + err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; @@ -270,7 +325,7 @@ static u8 state_to_phys_state(enum ib_port_state state) } static int eth_link_query_port(struct ib_device *ibdev, u8 port, - struct ib_port_attr *props) + struct ib_port_attr *props, int netw_view) { struct mlx4_ib_dev *mdev = to_mdev(ibdev); @@ -293,7 +348,7 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port, props->active_width = (((u8 *)mailbox->buf)[5] == 0x40) ? IB_WIDTH_4X : IB_WIDTH_1X; props->active_speed = IB_SPEED_QDR; - props->port_cap_flags = IB_PORT_CM_SUP; + props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_IP_BASED_GIDS; props->gid_tbl_len = mdev->dev->caps.gid_table_len[port]; props->max_msg_sz = mdev->dev->caps.max_msg_sz; props->pkey_tbl_len = 1; @@ -320,26 +375,36 @@ out: return err; } -static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, - struct ib_port_attr *props) +int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, int netw_view) { int err; memset(props, 0, sizeof *props); err = mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ? 
- ib_link_query_port(ibdev, port, props) : - eth_link_query_port(ibdev, port, props); + ib_link_query_port(ibdev, port, props, netw_view) : + eth_link_query_port(ibdev, port, props, netw_view); return err; } -static int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, - union ib_gid *gid) +static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + /* returns host view */ + return __mlx4_ib_query_port(ibdev, port, props, 0); +} + +int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int clear = 0; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); @@ -350,23 +415,38 @@ static int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); - err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (mlx4_is_mfunc(dev->dev) && netw_view) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(gid->raw, out_mad->data + 8, 8); + if (mlx4_is_mfunc(dev->dev) && !netw_view) { + if (index) { + /* For any index > 0, return the null guid */ + err = 0; + clear = 1; + goto out; + } + } + init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; in_mad->attr_mod = cpu_to_be32(index / 8); - err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, + NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); out: + if (clear) + memset(gid->raw + 8, 0, 8); kfree(in_mad); kfree(out_mad); return err; @@ -386,16 +466,17 @@ static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) - return __mlx4_ib_query_gid(ibdev, port, index, gid); + return __mlx4_ib_query_gid(ibdev, port, index, gid, 0); else return iboe_query_gid(ibdev, port, index, gid); } -static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, - u16 *pkey) +int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); @@ -407,7 +488,11 @@ static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; in_mad->attr_mod = cpu_to_be32(index / 32); - err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, + in_mad, out_mad); if (err) goto out; @@ -419,10 +504,16 @@ out: return err; } +static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) +{ + return __mlx4_ib_query_pkey(ibdev, port, index, pkey, 0); +} + static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, struct ib_device_modify *props) { struct mlx4_cmd_mailbox *mailbox; + unsigned long flags; if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) return -EOPNOTSUPP; 
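The query helpers in the hunks above gain a netw_view argument: the standard ib_device callbacks pass 0 and return the paravirtualized host view (for example, a slave sees only the null GUID for any GID index above 0), while internal callers can pass 1 to set MLX4_MAD_IFC_NET_VIEW and read the real subnet values. A short sketch of such an internal caller is below; the helper name is hypothetical, only __mlx4_ib_query_pkey() and its signature come from this patch.

/* Hypothetical internal helper: read the real (network-view) P_Key at
 * 'index' on 'port', bypassing the per-slave paravirtualized table. */
static int read_phys_pkey(struct ib_device *ibdev, u8 port, u16 index,
			  u16 *pkey)
{
	/* netw_view = 1 makes __mlx4_ib_query_pkey() add MLX4_MAD_IFC_NET_VIEW,
	 * so the MAD_IFC query runs natively instead of being wrapped. */
	return __mlx4_ib_query_pkey(ibdev, port, index, pkey, 1);
}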
@@ -430,9 +521,12 @@ static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) return 0; - spin_lock(&to_mdev(ibdev)->sm_lock); + if (mlx4_is_slave(to_mdev(ibdev)->dev)) + return -EOPNOTSUPP; + + spin_lock_irqsave(&to_mdev(ibdev)->sm_lock, flags); memcpy(ibdev->node_desc, props->node_desc, 64); - spin_unlock(&to_mdev(ibdev)->sm_lock); + spin_unlock_irqrestore(&to_mdev(ibdev)->sm_lock, flags); /* * If possible, pass node desc to FW, so it can generate @@ -442,29 +536,25 @@ static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, if (IS_ERR(mailbox)) return 0; - memset(mailbox->buf, 0, 256); memcpy(mailbox->buf, props->node_desc, 64); mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0, - MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox); return 0; } -static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, - u32 cap_mask) +static int mlx4_ib_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, + u32 cap_mask) { struct mlx4_cmd_mailbox *mailbox; int err; - u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; mailbox = mlx4_alloc_cmd_mailbox(dev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); - memset(mailbox->buf, 0, 256); - if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { *(u8 *) mailbox->buf = !!reset_qkey_viols << 6; ((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask); @@ -473,8 +563,8 @@ static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, ((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask); } - err = mlx4_cmd(dev->dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + err = mlx4_cmd(dev->dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); mlx4_free_cmd_mailbox(dev->dev, mailbox); return err; @@ -483,11 +573,20 @@ static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, struct ib_port_modify *props) { + struct mlx4_ib_dev *mdev = to_mdev(ibdev); + u8 is_eth = mdev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; struct ib_port_attr attr; u32 cap_mask; int err; - mutex_lock(&to_mdev(ibdev)->cap_mask_mutex); + /* return OK if this is RoCE. CM calls ib_modify_port() regardless + * of whether port link layer is ETH or IB. For ETH ports, qkey + * violations and port capabilities are not meaningful. 
+ */ + if (is_eth) + return 0; + + mutex_lock(&mdev->cap_mask_mutex); err = mlx4_ib_query_port(ibdev, port, &attr); if (err) @@ -496,9 +595,9 @@ static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) & ~props->clr_port_cap_mask; - err = mlx4_SET_PORT(to_mdev(ibdev), port, - !!(mask & IB_PORT_RESET_QKEY_CNTR), - cap_mask); + err = mlx4_ib_SET_PORT(mdev, port, + !!(mask & IB_PORT_RESET_QKEY_CNTR), + cap_mask); out: mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); @@ -510,15 +609,24 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct mlx4_ib_ucontext *context; + struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3; struct mlx4_ib_alloc_ucontext_resp resp; int err; if (!dev->ib_active) return ERR_PTR(-EAGAIN); - resp.qp_tab_size = dev->dev->caps.num_qps; - resp.bf_reg_size = dev->dev->caps.bf_reg_size; - resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) { + resp_v3.qp_tab_size = dev->dev->caps.num_qps; + resp_v3.bf_reg_size = dev->dev->caps.bf_reg_size; + resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + } else { + resp.dev_caps = dev->dev->caps.userspace_caps; + resp.qp_tab_size = dev->dev->caps.num_qps; + resp.bf_reg_size = dev->dev->caps.bf_reg_size; + resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + resp.cqe_size = dev->dev->caps.cqe_size; + } context = kmalloc(sizeof *context, GFP_KERNEL); if (!context) @@ -533,7 +641,11 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); - err = ib_copy_to_udata(udata, &resp, sizeof resp); + if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) + err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3)); + else + err = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (err) { mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar); kfree(context); @@ -693,7 +805,6 @@ static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, union ib_gid *gid) { - u8 mac[6]; struct net_device *ndev; int ret = 0; @@ -707,37 +818,394 @@ int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, spin_unlock(&mdev->iboe.lock); if (ndev) { - rdma_get_mcast_mac((struct in6_addr *)gid, mac); - rtnl_lock(); - dev_mc_add(mdev->iboe.netdevs[mqp->port - 1], mac); ret = 1; - rtnl_unlock(); dev_put(ndev); } return ret; } +struct mlx4_ib_steering { + struct list_head list; + u64 reg_id; + union ib_gid gid; +}; + +static int parse_flow_attr(struct mlx4_dev *dev, + u32 qp_num, + union ib_flow_spec *ib_spec, + struct _rule_hw *mlx4_spec) +{ + enum mlx4_net_trans_rule_id type; + + switch (ib_spec->type) { + case IB_FLOW_SPEC_ETH: + type = MLX4_NET_TRANS_RULE_ID_ETH; + memcpy(mlx4_spec->eth.dst_mac, ib_spec->eth.val.dst_mac, + ETH_ALEN); + memcpy(mlx4_spec->eth.dst_mac_msk, ib_spec->eth.mask.dst_mac, + ETH_ALEN); + mlx4_spec->eth.vlan_tag = ib_spec->eth.val.vlan_tag; + mlx4_spec->eth.vlan_tag_msk = ib_spec->eth.mask.vlan_tag; + break; + case IB_FLOW_SPEC_IB: + type = MLX4_NET_TRANS_RULE_ID_IB; + mlx4_spec->ib.l3_qpn = + cpu_to_be32(qp_num); + mlx4_spec->ib.qpn_mask = + cpu_to_be32(MLX4_IB_FLOW_QPN_MASK); + break; + + + case IB_FLOW_SPEC_IPV4: + type = MLX4_NET_TRANS_RULE_ID_IPV4; + mlx4_spec->ipv4.src_ip = ib_spec->ipv4.val.src_ip; + 
mlx4_spec->ipv4.src_ip_msk = ib_spec->ipv4.mask.src_ip; + mlx4_spec->ipv4.dst_ip = ib_spec->ipv4.val.dst_ip; + mlx4_spec->ipv4.dst_ip_msk = ib_spec->ipv4.mask.dst_ip; + break; + + case IB_FLOW_SPEC_TCP: + case IB_FLOW_SPEC_UDP: + type = ib_spec->type == IB_FLOW_SPEC_TCP ? + MLX4_NET_TRANS_RULE_ID_TCP : + MLX4_NET_TRANS_RULE_ID_UDP; + mlx4_spec->tcp_udp.dst_port = ib_spec->tcp_udp.val.dst_port; + mlx4_spec->tcp_udp.dst_port_msk = ib_spec->tcp_udp.mask.dst_port; + mlx4_spec->tcp_udp.src_port = ib_spec->tcp_udp.val.src_port; + mlx4_spec->tcp_udp.src_port_msk = ib_spec->tcp_udp.mask.src_port; + break; + + default: + return -EINVAL; + } + if (mlx4_map_sw_to_hw_steering_id(dev, type) < 0 || + mlx4_hw_rule_sz(dev, type) < 0) + return -EINVAL; + mlx4_spec->id = cpu_to_be16(mlx4_map_sw_to_hw_steering_id(dev, type)); + mlx4_spec->size = mlx4_hw_rule_sz(dev, type) >> 2; + return mlx4_hw_rule_sz(dev, type); +} + +struct default_rules { + __u32 mandatory_fields[IB_FLOW_SPEC_SUPPORT_LAYERS]; + __u32 mandatory_not_fields[IB_FLOW_SPEC_SUPPORT_LAYERS]; + __u32 rules_create_list[IB_FLOW_SPEC_SUPPORT_LAYERS]; + __u8 link_layer; +}; +static const struct default_rules default_table[] = { + { + .mandatory_fields = {IB_FLOW_SPEC_IPV4}, + .mandatory_not_fields = {IB_FLOW_SPEC_ETH}, + .rules_create_list = {IB_FLOW_SPEC_IB}, + .link_layer = IB_LINK_LAYER_INFINIBAND + } +}; + +static int __mlx4_ib_default_rules_match(struct ib_qp *qp, + struct ib_flow_attr *flow_attr) +{ + int i, j, k; + void *ib_flow; + const struct default_rules *pdefault_rules = default_table; + u8 link_layer = rdma_port_get_link_layer(qp->device, flow_attr->port); + + for (i = 0; i < sizeof(default_table)/sizeof(default_table[0]); i++, + pdefault_rules++) { + __u32 field_types[IB_FLOW_SPEC_SUPPORT_LAYERS]; + memset(&field_types, 0, sizeof(field_types)); + + if (link_layer != pdefault_rules->link_layer) + continue; + + ib_flow = flow_attr + 1; + /* we assume the specs are sorted */ + for (j = 0, k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS && + j < flow_attr->num_of_specs; k++) { + union ib_flow_spec *current_flow = + (union ib_flow_spec *)ib_flow; + + /* same layer but different type */ + if (((current_flow->type & IB_FLOW_SPEC_LAYER_MASK) == + (pdefault_rules->mandatory_fields[k] & + IB_FLOW_SPEC_LAYER_MASK)) && + (current_flow->type != + pdefault_rules->mandatory_fields[k])) + goto out; + + /* same layer, try match next one */ + if (current_flow->type == + pdefault_rules->mandatory_fields[k]) { + j++; + ib_flow += + ((union ib_flow_spec *)ib_flow)->size; + } + } + + ib_flow = flow_attr + 1; + for (j = 0; j < flow_attr->num_of_specs; + j++, ib_flow += ((union ib_flow_spec *)ib_flow)->size) + for (k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS; k++) + /* same layer and same type */ + if (((union ib_flow_spec *)ib_flow)->type == + pdefault_rules->mandatory_not_fields[k]) + goto out; + + return i; + } +out: + return -1; +} + +static int __mlx4_ib_create_default_rules( + struct mlx4_ib_dev *mdev, + struct ib_qp *qp, + const struct default_rules *pdefault_rules, + struct _rule_hw *mlx4_spec) { + int size = 0; + int i; + + for (i = 0; i < sizeof(pdefault_rules->rules_create_list)/ + sizeof(pdefault_rules->rules_create_list[0]); i++) { + int ret; + union ib_flow_spec ib_spec; + switch (pdefault_rules->rules_create_list[i]) { + case 0: + /* no rule */ + continue; + case IB_FLOW_SPEC_IB: + ib_spec.type = IB_FLOW_SPEC_IB; + ib_spec.size = sizeof(struct ib_flow_spec_ib); + + break; + default: + /* invalid rule */ + return -EINVAL; + } + /* We must put empty rule, qpn 
is being ignored */ + ret = parse_flow_attr(mdev->dev, 0, &ib_spec, + mlx4_spec); + if (ret < 0) { + pr_info("invalid parsing\n"); + return -EINVAL; + } + + mlx4_spec = (void *)mlx4_spec + ret; + size += ret; + } + return size; +} + +static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, + int domain, + enum mlx4_net_trans_promisc_mode flow_type, + u64 *reg_id) +{ + int ret, i; + int size = 0; + void *ib_flow; + struct mlx4_ib_dev *mdev = to_mdev(qp->device); + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_net_trans_rule_hw_ctrl *ctrl; + int default_flow; + + static const u16 __mlx4_domain[] = { + [IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS, + [IB_FLOW_DOMAIN_ETHTOOL] = MLX4_DOMAIN_ETHTOOL, + [IB_FLOW_DOMAIN_RFS] = MLX4_DOMAIN_RFS, + [IB_FLOW_DOMAIN_NIC] = MLX4_DOMAIN_NIC, + }; + + if (flow_attr->priority > MLX4_IB_FLOW_MAX_PRIO) { + pr_err("Invalid priority value %d\n", flow_attr->priority); + return -EINVAL; + } + + if (domain >= IB_FLOW_DOMAIN_NUM) { + pr_err("Invalid domain value %d\n", domain); + return -EINVAL; + } + + if (mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0) + return -EINVAL; + + mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + ctrl = mailbox->buf; + + ctrl->prio = cpu_to_be16(__mlx4_domain[domain] | + flow_attr->priority); + ctrl->type = mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type); + ctrl->port = flow_attr->port; + ctrl->qpn = cpu_to_be32(qp->qp_num); + + ib_flow = flow_attr + 1; + size += sizeof(struct mlx4_net_trans_rule_hw_ctrl); + /* Add default flows */ + default_flow = __mlx4_ib_default_rules_match(qp, flow_attr); + if (default_flow >= 0) { + ret = __mlx4_ib_create_default_rules( + mdev, qp, default_table + default_flow, + mailbox->buf + size); + if (ret < 0) { + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return -EINVAL; + } + size += ret; + } + for (i = 0; i < flow_attr->num_of_specs; i++) { + ret = parse_flow_attr(mdev->dev, qp->qp_num, ib_flow, + mailbox->buf + size); + if (ret < 0) { + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return -EINVAL; + } + ib_flow += ((union ib_flow_spec *) ib_flow)->size; + size += ret; + } + + ret = mlx4_cmd_imm(mdev->dev, mailbox->dma, reg_id, size >> 2, 0, + MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (ret == -ENOMEM) + pr_err("mcg table is full. Fail to register network rule.\n"); + else if (ret == -ENXIO) + pr_err("Device managed flow steering is disabled. Fail to register network rule.\n"); + else if (ret) + pr_err("Invalid argumant. Fail to register network rule.\n"); + + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return ret; +} + +static int __mlx4_ib_destroy_flow(struct mlx4_dev *dev, u64 reg_id) +{ + int err; + err = mlx4_cmd(dev, reg_id, 0, 0, + MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + pr_err("Fail to detach network rule. 
registration id = 0x%llx\n", + reg_id); + return err; +} + +static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, + int domain) +{ + int err = 0, i = 0; + struct mlx4_ib_flow *mflow; + enum mlx4_net_trans_promisc_mode type[2]; + + memset(type, 0, sizeof(type)); + + mflow = kzalloc(sizeof(*mflow), GFP_KERNEL); + if (!mflow) { + err = -ENOMEM; + goto err_free; + } + + switch (flow_attr->type) { + case IB_FLOW_ATTR_NORMAL: + type[0] = MLX4_FS_REGULAR; + break; + + case IB_FLOW_ATTR_ALL_DEFAULT: + type[0] = MLX4_FS_ALL_DEFAULT; + break; + + case IB_FLOW_ATTR_MC_DEFAULT: + type[0] = MLX4_FS_MC_DEFAULT; + break; + + case IB_FLOW_ATTR_SNIFFER: + type[0] = MLX4_FS_UC_SNIFFER; + type[1] = MLX4_FS_MC_SNIFFER; + break; + + default: + err = -EINVAL; + goto err_free; + } + + while (i < ARRAY_SIZE(type) && type[i]) { + err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[i], + &mflow->reg_id[i]); + if (err) + goto err_free; + i++; + } + + return &mflow->ibflow; + +err_free: + kfree(mflow); + return ERR_PTR(err); +} + +static int mlx4_ib_destroy_flow(struct ib_flow *flow_id) +{ + int err, ret = 0; + int i = 0; + struct mlx4_ib_dev *mdev = to_mdev(flow_id->qp->device); + struct mlx4_ib_flow *mflow = to_mflow(flow_id); + + while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i]) { + err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i]); + if (err) + ret = err; + i++; + } + + kfree(mflow); + return ret; +} + static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { int err; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); + u64 reg_id; + struct mlx4_ib_steering *ib_steering = NULL; + enum mlx4_protocol prot = (gid->raw[1] == 0x0e) ? + MLX4_PROT_IB_IPV4 : MLX4_PROT_IB_IPV6; + + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL); + if (!ib_steering) + return -ENOMEM; + } - err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, - !!(mqp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), - MLX4_PROT_IB_IPV6); + err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, mqp->port, + !!(mqp->flags & + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), + prot, ®_id); if (err) - return err; + goto err_malloc; err = add_gid_entry(ibqp, gid); if (err) goto err_add; + if (ib_steering) { + memcpy(ib_steering->gid.raw, gid->raw, 16); + ib_steering->reg_id = reg_id; + mutex_lock(&mqp->mutex); + list_add(&ib_steering->list, &mqp->steering_rules); + mutex_unlock(&mqp->mutex); + } return 0; err_add: - mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, MLX4_PROT_IB_IPV6); + mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, + prot, reg_id); +err_malloc: + kfree(ib_steering); + return err; } @@ -762,12 +1230,34 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) int err; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); - u8 mac[6]; struct net_device *ndev; struct mlx4_ib_gid_entry *ge; + u64 reg_id = 0; + enum mlx4_protocol prot = (gid->raw[1] == 0x0e) ? 
+ MLX4_PROT_IB_IPV4 : MLX4_PROT_IB_IPV6; + + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + struct mlx4_ib_steering *ib_steering; + + mutex_lock(&mqp->mutex); + list_for_each_entry(ib_steering, &mqp->steering_rules, list) { + if (!memcmp(ib_steering->gid.raw, gid->raw, 16)) { + list_del(&ib_steering->list); + break; + } + } + mutex_unlock(&mqp->mutex); + if (&ib_steering->list == &mqp->steering_rules) { + pr_err("Couldn't find reg_id for mgid. Steering rule is left attached\n"); + return -EINVAL; + } + reg_id = ib_steering->reg_id; + kfree(ib_steering); + } - err = mlx4_multicast_detach(mdev->dev, - &mqp->mqp, gid->raw, MLX4_PROT_IB_IPV6); + err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, + prot, reg_id); if (err) return err; @@ -779,13 +1269,8 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) if (ndev) dev_hold(ndev); spin_unlock(&mdev->iboe.lock); - rdma_get_mcast_mac((struct in6_addr *)gid, mac); - if (ndev) { - rtnl_lock(); - dev_mc_del(mdev->iboe.netdevs[ge->port - 1], mac); - rtnl_unlock(); + if (ndev) dev_put(ndev); - } list_del(&ge->list); kfree(ge); } else @@ -800,6 +1285,7 @@ static int init_node_data(struct mlx4_ib_dev *dev) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); @@ -809,8 +1295,10 @@ static int init_node_data(struct mlx4_ib_dev *dev) init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; + if (mlx4_is_master(dev->dev)) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; - err = mlx4_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; @@ -818,10 +1306,11 @@ static int init_node_data(struct mlx4_ib_dev *dev) in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; - err = mlx4_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; + dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); out: @@ -877,7 +1366,8 @@ static struct device_attribute *mlx4_class_attributes[] = { &dev_attr_board_id }; -static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev) +static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, + struct net_device *dev) { memcpy(eui, dev->dev_addr, 3); memcpy(eui + 5, dev->dev_addr + 3, 3); @@ -898,7 +1388,6 @@ static void update_gids_task(struct work_struct *work) union ib_gid *gids; int err; struct mlx4_dev *dev = gw->dev->dev; - struct ib_event event; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) { @@ -911,174 +1400,478 @@ static void update_gids_task(struct work_struct *work) err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port, 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, - MLX4_CMD_NATIVE); + MLX4_CMD_WRAPPED); if (err) pr_warn("set port command failed\n"); - else { - memcpy(gw->dev->iboe.gid_table[gw->port - 1], gw->gids, sizeof gw->gids); - event.device = &gw->dev->ib_dev; - event.element.port_num = gw->port; - event.event = IB_EVENT_GID_CHANGE; - ib_dispatch_event(&event); + else + mlx4_ib_dispatch_event(gw->dev, gw->port, IB_EVENT_GID_CHANGE); + + mlx4_free_cmd_mailbox(dev, mailbox); + kfree(gw); +} + +static void reset_gids_task(struct work_struct *work) +{ + struct update_gid_work *gw = + container_of(work, struct update_gid_work, work); + struct 
mlx4_cmd_mailbox *mailbox; + union ib_gid *gids; + int err; + struct mlx4_dev *dev = gw->dev->dev; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + pr_warn("reset gid table failed\n"); + goto free; + } + + gids = mailbox->buf; + memcpy(gids, gw->gids, sizeof(gw->gids)); + + if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, gw->port) == + IB_LINK_LAYER_ETHERNET) { + err = mlx4_cmd(dev, mailbox->dma, + MLX4_SET_PORT_GID_TABLE << 8 | gw->port, + 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + pr_warn(KERN_WARNING + "set port %d command failed\n", gw->port); } mlx4_free_cmd_mailbox(dev, mailbox); +free: kfree(gw); } -static int update_ipv6_gids(struct mlx4_ib_dev *dev, int port, int clear) +static int update_gid_table(struct mlx4_ib_dev *dev, int port, + union ib_gid *gid, int clear, + int default_gid) { - struct net_device *ndev = dev->iboe.netdevs[port - 1]; struct update_gid_work *work; - struct net_device *tmp; int i; - u8 *hits; - int ret; - union ib_gid gid; - int free; - int found; int need_update = 0; - u16 vid; - - work = kzalloc(sizeof *work, GFP_ATOMIC); - if (!work) - return -ENOMEM; + int free = -1; + int found = -1; + int max_gids; - hits = kzalloc(128, GFP_ATOMIC); - if (!hits) { - ret = -ENOMEM; - goto out; - } - - rcu_read_lock(); - for_each_netdev_rcu(&init_net, tmp) { - if (ndev && (tmp == ndev || rdma_vlan_dev_real_dev(tmp) == ndev)) { - gid.global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); - vid = rdma_vlan_dev_vlan_id(tmp); - mlx4_addrconf_ifid_eui48(&gid.raw[8], vid, ndev); - found = 0; - free = -1; - for (i = 0; i < 128; ++i) { - if (free < 0 && - !memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid)) - free = i; - if (!memcmp(&dev->iboe.gid_table[port - 1][i], &gid, sizeof gid)) { - hits[i] = 1; - found = 1; + if (default_gid) { + free = 0; + } else { + max_gids = dev->dev->caps.gid_table_len[port]; + for (i = 1; i < max_gids; ++i) { + if (!memcmp(&dev->iboe.gid_table[port - 1][i], gid, + sizeof(*gid))) + found = i; + + if (clear) { + if (found >= 0) { + need_update = 1; + dev->iboe.gid_table[port - 1][found] = + zgid; break; } - } + } else { + if (found >= 0) + break; - if (!found) { - if (tmp == ndev && - (memcmp(&dev->iboe.gid_table[port - 1][0], - &gid, sizeof gid) || - !memcmp(&dev->iboe.gid_table[port - 1][0], - &zgid, sizeof gid))) { - dev->iboe.gid_table[port - 1][0] = gid; - ++need_update; - hits[0] = 1; - } else if (free >= 0) { - dev->iboe.gid_table[port - 1][free] = gid; - hits[free] = 1; - ++need_update; - } + if (free < 0 && + !memcmp(&dev->iboe.gid_table[port - 1][i], + &zgid, sizeof(*gid))) + free = i; } } } - rcu_read_unlock(); - for (i = 0; i < 128; ++i) - if (!hits[i]) { - if (memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid)) - ++need_update; - dev->iboe.gid_table[port - 1][i] = zgid; - } + if (found == -1 && !clear && free >= 0) { + dev->iboe.gid_table[port - 1][free] = *gid; + need_update = 1; + } - if (need_update) { - memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof work->gids); - INIT_WORK(&work->work, update_gids_task); - work->port = port; - work->dev = dev; - queue_work(wq, &work->work); - } else - kfree(work); + if (!need_update) + return 0; + + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return -ENOMEM; + + memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof(work->gids)); + INIT_WORK(&work->work, update_gids_task); + work->port = port; + work->dev = dev; + queue_work(wq, &work->work); - kfree(hits); return 0; +} -out: - kfree(work); - 
return ret; +static void mlx4_make_default_gid(struct net_device *dev, union ib_gid *gid) +{ + gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); + mlx4_addrconf_ifid_eui48(&gid->raw[8], 0xffff, dev); } -static void handle_en_event(struct mlx4_ib_dev *dev, int port, unsigned long event) + +static int reset_gid_table(struct mlx4_ib_dev *dev, u8 port) { - switch (event) { - case NETDEV_UP: - case NETDEV_CHANGEADDR: - update_ipv6_gids(dev, port, 0); - break; + struct update_gid_work *work; - case NETDEV_DOWN: - update_ipv6_gids(dev, port, 1); - dev->iboe.netdevs[port - 1] = NULL; - } + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return -ENOMEM; + + memset(dev->iboe.gid_table[port - 1], 0, sizeof(work->gids)); + memset(work->gids, 0, sizeof(work->gids)); + INIT_WORK(&work->work, reset_gids_task); + work->dev = dev; + work->port = port; + queue_work(wq, &work->work); + return 0; } -static void netdev_added(struct mlx4_ib_dev *dev, int port) +static int mlx4_ib_addr_event(int event, struct net_device *event_netdev, + struct mlx4_ib_dev *ibdev, union ib_gid *gid) { - update_ipv6_gids(dev, port, 0); + struct mlx4_ib_iboe *iboe; + int port = 0; + struct net_device *real_dev = rdma_vlan_dev_real_dev(event_netdev) ? + rdma_vlan_dev_real_dev(event_netdev) : + event_netdev; + union ib_gid default_gid; + + mlx4_make_default_gid(real_dev, &default_gid); + + if (!memcmp(gid, &default_gid, sizeof(*gid))) + return 0; + + if (event != NETDEV_DOWN && event != NETDEV_UP) + return 0; + + if ((real_dev != event_netdev) && + (event == NETDEV_DOWN) && + rdma_link_local_addr((struct in6_addr *)gid)) + return 0; + + iboe = &ibdev->iboe; + spin_lock(&iboe->lock); + + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) + if ((netif_is_bond_master(real_dev) && + (real_dev == iboe->masters[port - 1])) || + (!netif_is_bond_master(real_dev) && + (real_dev == iboe->netdevs[port - 1]))) + update_gid_table(ibdev, port, gid, + event == NETDEV_DOWN, 0); + + spin_unlock(&iboe->lock); + return 0; + +} + +static u8 mlx4_ib_get_dev_port(struct net_device *dev, + struct mlx4_ib_dev *ibdev) +{ + u8 port = 0; + struct mlx4_ib_iboe *iboe; + struct net_device *real_dev = rdma_vlan_dev_real_dev(dev) ? 
+ rdma_vlan_dev_real_dev(dev) : dev; + + iboe = &ibdev->iboe; + + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) + if ((netif_is_bond_master(real_dev) && + (real_dev == iboe->masters[port - 1])) || + (!netif_is_bond_master(real_dev) && + (real_dev == iboe->netdevs[port - 1]))) + break; + + if ((port == 0) || (port > ibdev->dev->caps.num_ports)) + return 0; + else + return port; } -static void netdev_removed(struct mlx4_ib_dev *dev, int port) +static int mlx4_ib_inet_event(struct notifier_block *this, unsigned long event, + void *ptr) { - update_ipv6_gids(dev, port, 1); + struct mlx4_ib_dev *ibdev; + struct in_ifaddr *ifa = ptr; + union ib_gid gid; + struct net_device *event_netdev = ifa->ifa_dev->dev; + + ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid); + + ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet); + + mlx4_ib_addr_event(event, event_netdev, ibdev, &gid); + return NOTIFY_DONE; } -static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event, +#if IS_ENABLED(CONFIG_IPV6) +static int mlx4_ib_inet6_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; struct mlx4_ib_dev *ibdev; - struct net_device *oldnd; + struct inet6_ifaddr *ifa = ptr; + union ib_gid *gid = (union ib_gid *)&ifa->addr; + struct net_device *event_netdev = ifa->idev->dev; + + ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet6); + + mlx4_ib_addr_event(event, event_netdev, ibdev, gid); + return NOTIFY_DONE; +} +#endif + +#define MLX4_IB_INVALID_MAC ((u64)-1) +static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev, + struct net_device *dev, + int port) +{ + u64 new_smac = 0; + u64 release_mac = MLX4_IB_INVALID_MAC; + struct mlx4_ib_qp *qp; + + read_lock(&dev_base_lock); + new_smac = mlx4_mac_to_u64(dev->dev_addr); + read_unlock(&dev_base_lock); + + mutex_lock(&ibdev->qp1_proxy_lock[port - 1]); + qp = ibdev->qp1_proxy[port - 1]; + if (qp) { + int new_smac_index; + u64 old_smac = qp->pri.smac; + struct mlx4_update_qp_params update_params; + + if (new_smac == old_smac) + goto unlock; + + new_smac_index = mlx4_register_mac(ibdev->dev, port, new_smac); + + if (new_smac_index < 0) + goto unlock; + + update_params.smac_index = new_smac_index; + if (mlx4_update_qp(ibdev->dev, &qp->mqp, MLX4_UPDATE_QP_SMAC, + &update_params)) { + release_mac = new_smac; + goto unlock; + } + + qp->pri.smac = new_smac; + qp->pri.smac_index = new_smac_index; + + release_mac = old_smac; + } + +unlock: + mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]); + if (release_mac != MLX4_IB_INVALID_MAC) + mlx4_unregister_mac(ibdev->dev, port, release_mac); +} + +static void mlx4_ib_get_dev_addr(struct net_device *dev, + struct mlx4_ib_dev *ibdev, u8 port) +{ + struct in_device *in_dev; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *in6_dev; + union ib_gid *pgid; + struct inet6_ifaddr *ifp; +#endif + union ib_gid gid; + + + if ((port == 0) || (port > ibdev->dev->caps.num_ports)) + return; + + /* IPv4 gids */ + in_dev = in_dev_get(dev); + if (in_dev) { + for_ifa(in_dev) { + /*ifa->ifa_address;*/ + ipv6_addr_set_v4mapped(ifa->ifa_address, + (struct in6_addr *)&gid); + update_gid_table(ibdev, port, &gid, 0, 0); + } + endfor_ifa(in_dev); + in_dev_put(in_dev); + } +#if IS_ENABLED(CONFIG_IPV6) + /* IPv6 gids */ + in6_dev = in6_dev_get(dev); + if (in6_dev) { + read_lock_bh(&in6_dev->lock); + list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { + pgid = (union ib_gid *)&ifp->addr; + update_gid_table(ibdev, port, pgid, 0, 0); + } + 
read_unlock_bh(&in6_dev->lock); + in6_dev_put(in6_dev); + } +#endif +} + +static void mlx4_ib_set_default_gid(struct mlx4_ib_dev *ibdev, + struct net_device *dev, u8 port) +{ + union ib_gid gid; + mlx4_make_default_gid(dev, &gid); + update_gid_table(ibdev, port, &gid, 0, 1); +} + +static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev) +{ + struct net_device *dev; + struct mlx4_ib_iboe *iboe = &ibdev->iboe; + int i; + + for (i = 1; i <= ibdev->num_ports; ++i) + if (reset_gid_table(ibdev, i)) + return -1; + + read_lock(&dev_base_lock); + spin_lock(&iboe->lock); + + for_each_netdev(&init_net, dev) { + u8 port = mlx4_ib_get_dev_port(dev, ibdev); + if (port) + mlx4_ib_get_dev_addr(dev, ibdev, port); + } + + spin_unlock(&iboe->lock); + read_unlock(&dev_base_lock); + + return 0; +} + +static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, + struct net_device *dev, + unsigned long event) + +{ struct mlx4_ib_iboe *iboe; + int update_qps_port = -1; int port; - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - - ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb); iboe = &ibdev->iboe; spin_lock(&iboe->lock); mlx4_foreach_ib_transport_port(port, ibdev->dev) { - oldnd = iboe->netdevs[port - 1]; + enum ib_port_state port_state = IB_PORT_NOP; + struct net_device *old_master = iboe->masters[port - 1]; + struct net_device *curr_netdev; + struct net_device *curr_master; + iboe->netdevs[port - 1] = mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port); - if (oldnd != iboe->netdevs[port - 1]) { - if (iboe->netdevs[port - 1]) - netdev_added(ibdev, port); - else - netdev_removed(ibdev, port); + if (iboe->netdevs[port - 1]) + mlx4_ib_set_default_gid(ibdev, + iboe->netdevs[port - 1], port); + curr_netdev = iboe->netdevs[port - 1]; + + if (iboe->netdevs[port - 1] && + netif_is_bond_slave(iboe->netdevs[port - 1])) { + iboe->masters[port - 1] = netdev_master_upper_dev_get( + iboe->netdevs[port - 1]); + } else { + iboe->masters[port - 1] = NULL; + } + curr_master = iboe->masters[port - 1]; + + if (dev == iboe->netdevs[port - 1] && + (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER || + event == NETDEV_UP || event == NETDEV_CHANGE)) + update_qps_port = port; + + if (curr_netdev) { + port_state = (netif_running(curr_netdev) && netif_carrier_ok(curr_netdev)) ? + IB_PORT_ACTIVE : IB_PORT_DOWN; + mlx4_ib_set_default_gid(ibdev, curr_netdev, port); + } else { + reset_gid_table(ibdev, port); + } + /* if using bonding/team and a slave port is down, we don't the bond IP + * based gids in the table since flows that select port by gid may get + * the down port. + */ + if (curr_master && (port_state == IB_PORT_DOWN)) { + reset_gid_table(ibdev, port); + mlx4_ib_set_default_gid(ibdev, curr_netdev, port); + } + /* if bonding is used it is possible that we add it to masters + * only after IP address is assigned to the net bonding + * interface. 
+ */ + if (curr_master && (old_master != curr_master)) { + reset_gid_table(ibdev, port); + mlx4_ib_set_default_gid(ibdev, curr_netdev, port); + mlx4_ib_get_dev_addr(curr_master, ibdev, port); } - } - if (dev == iboe->netdevs[0] || - (iboe->netdevs[0] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[0])) - handle_en_event(ibdev, 1, event); - else if (dev == iboe->netdevs[1] - || (iboe->netdevs[1] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[1])) - handle_en_event(ibdev, 2, event); + if (!curr_master && (old_master != curr_master)) { + reset_gid_table(ibdev, port); + mlx4_ib_set_default_gid(ibdev, curr_netdev, port); + mlx4_ib_get_dev_addr(curr_netdev, ibdev, port); + } + } spin_unlock(&iboe->lock); + if (update_qps_port > 0) + mlx4_ib_update_qps(ibdev, dev, update_qps_port); +} + +static int mlx4_ib_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct mlx4_ib_dev *ibdev; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb); + mlx4_ib_scan_netdevs(ibdev, dev, event); + return NOTIFY_DONE; } +static void init_pkeys(struct mlx4_ib_dev *ibdev) +{ + int port; + int slave; + int i; + + if (mlx4_is_master(ibdev->dev)) { + for (slave = 0; slave <= ibdev->dev->num_vfs; ++slave) { + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { + for (i = 0; + i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; + ++i) { + ibdev->pkeys.virt2phys_pkey[slave][port - 1][i] = + /* master has the identity virt2phys pkey mapping */ + (slave == mlx4_master_func_num(ibdev->dev) || !i) ? i : + ibdev->dev->phys_caps.pkey_phys_table_len[port] - 1; + mlx4_sync_pkey_table(ibdev->dev, slave, port, i, + ibdev->pkeys.virt2phys_pkey[slave][port - 1][i]); + } + } + } + /* initialize pkey cache */ + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { + for (i = 0; + i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; + ++i) + ibdev->pkeys.phys_pkey_cache[port-1][i] = + (i) ? 
0 : 0xFFFF; + } + } +} + static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) { - char name[32]; + char name[80]; int eq_per_port = 0; int added_eqs = 0; int total_eqs = 0; @@ -1108,10 +1901,11 @@ static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) eq = 0; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) { for (j = 0; j < eq_per_port; j++) { - sprintf(name, "mlx4-ib-%d-%d@%s", - i, j, dev->pdev->bus->name); + snprintf(name, sizeof(name), "mlx4-ib-%d-%d@%s", + i, j, dev->pdev->bus->name); /* Set IRQ for specific name (per ring) */ - if (mlx4_assign_eq(dev, name, &ibdev->eq_table[eq])) { + if (mlx4_assign_eq(dev, name, NULL, + &ibdev->eq_table[eq])) { /* Use legacy (same as mlx4_en driver) */ pr_warn("Can't allocate EQ %d; reverting to legacy\n", eq); ibdev->eq_table[eq] = @@ -1158,14 +1952,11 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) int i, j; int err; struct mlx4_ib_iboe *iboe; + int ib_num_ports = 0; pr_info_once("%s", mlx4_ib_version); - if (mlx4_is_mfunc(dev)) { - pr_warn("IB not yet supported in SRIOV\n"); - return NULL; - } - + num_ports = 0; mlx4_foreach_ib_transport_port(i, dev) num_ports++; @@ -1204,7 +1995,11 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; ibdev->ib_dev.dma_device = &dev->pdev->dev; - ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; + if (dev->caps.userspace_caps) + ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; + else + ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION; + ibdev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | @@ -1272,10 +2067,23 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach; ibdev->ib_dev.process_mad = mlx4_ib_process_mad; - ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc; - ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr; - ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr; - ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc; + if (!mlx4_is_slave(ibdev->dev)) { + ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc; + ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr; + ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr; + ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc; + } + + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW || + dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { + ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw; + ibdev->ib_dev.bind_mw = mlx4_ib_bind_mw; + ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw; + + ibdev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); + } if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) { ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd; @@ -1285,6 +2093,16 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); } + if (check_flow_steering_support(dev)) { + ibdev->steering_support = MLX4_STEERING_MODE_DEVICE_MANAGED; + ibdev->ib_dev.create_flow = mlx4_ib_create_flow; + ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow; + + ibdev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); + } + mlx4_ib_alloc_eqs(dev, ibdev); spin_lock_init(&iboe->lock); @@ -1293,29 +2111,93 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) goto err_map; for (i = 0; i < ibdev->num_ports; ++i) { + mutex_init(&ibdev->qp1_proxy_lock[i]); if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) == IB_LINK_LAYER_ETHERNET) { err = 
mlx4_counter_alloc(ibdev->dev, &ibdev->counters[i]); if (err) ibdev->counters[i] = -1; - } else - ibdev->counters[i] = -1; + } else { + ibdev->counters[i] = -1; + } } + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_num_ports++; + spin_lock_init(&ibdev->sm_lock); mutex_init(&ibdev->cap_mask_mutex); + if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED && + ib_num_ports) { + ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS; + err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count, + MLX4_IB_UC_STEER_QPN_ALIGN, + &ibdev->steer_qpn_base); + if (err) + goto err_counter; + + ibdev->ib_uc_qpns_bitmap = + kmalloc(BITS_TO_LONGS(ibdev->steer_qpn_count) * + sizeof(long), + GFP_KERNEL); + if (!ibdev->ib_uc_qpns_bitmap) { + dev_err(&dev->pdev->dev, "bit map alloc failed\n"); + goto err_steer_qp_release; + } + + bitmap_zero(ibdev->ib_uc_qpns_bitmap, ibdev->steer_qpn_count); + + err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE( + dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_base + + ibdev->steer_qpn_count - 1); + if (err) + goto err_steer_free_bitmap; + } + if (ib_register_device(&ibdev->ib_dev, NULL)) - goto err_counter; + goto err_steer_free_bitmap; if (mlx4_ib_mad_init(ibdev)) goto err_reg; - if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE && !iboe->nb.notifier_call) { - iboe->nb.notifier_call = mlx4_ib_netdev_event; - err = register_netdevice_notifier(&iboe->nb); - if (err) - goto err_reg; + if (mlx4_ib_init_sriov(ibdev)) + goto err_mad; + + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) { + if (!iboe->nb.notifier_call) { + iboe->nb.notifier_call = mlx4_ib_netdev_event; + err = register_netdevice_notifier(&iboe->nb); + if (err) { + iboe->nb.notifier_call = NULL; + goto err_notif; + } + } + if (!iboe->nb_inet.notifier_call) { + iboe->nb_inet.notifier_call = mlx4_ib_inet_event; + err = register_inetaddr_notifier(&iboe->nb_inet); + if (err) { + iboe->nb_inet.notifier_call = NULL; + goto err_notif; + } + } +#if IS_ENABLED(CONFIG_IPV6) + if (!iboe->nb_inet6.notifier_call) { + iboe->nb_inet6.notifier_call = mlx4_ib_inet6_event; + err = register_inet6addr_notifier(&iboe->nb_inet6); + if (err) { + iboe->nb_inet6.notifier_call = NULL; + goto err_notif; + } + } +#endif + for (i = 1 ; i <= ibdev->num_ports ; ++i) + reset_gid_table(ibdev, i); + rtnl_lock(); + mlx4_ib_scan_netdevs(ibdev, NULL, 0); + rtnl_unlock(); + mlx4_ib_init_gid_table(ibdev); } for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { @@ -1326,16 +2208,55 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_active = true; + if (mlx4_is_mfunc(ibdev->dev)) + init_pkeys(ibdev); + + /* create paravirt contexts for any VFs which are active */ + if (mlx4_is_master(ibdev->dev)) { + for (j = 0; j < MLX4_MFUNC_MAX; j++) { + if (j == mlx4_master_func_num(ibdev->dev)) + continue; + if (mlx4_is_slave_active(ibdev->dev, j)) + do_slave_init(ibdev, j, 1); + } + } return ibdev; err_notif: - if (unregister_netdevice_notifier(&ibdev->iboe.nb)) - pr_warn("failure unregistering notifier\n"); + if (ibdev->iboe.nb.notifier_call) { + if (unregister_netdevice_notifier(&ibdev->iboe.nb)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb.notifier_call = NULL; + } + if (ibdev->iboe.nb_inet.notifier_call) { + if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb_inet.notifier_call = NULL; + } +#if IS_ENABLED(CONFIG_IPV6) + if (ibdev->iboe.nb_inet6.notifier_call) { + if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6)) + pr_warn("failure unregistering notifier\n"); + 
ibdev->iboe.nb_inet6.notifier_call = NULL; + } +#endif flush_workqueue(wq); + mlx4_ib_close_sriov(ibdev); + +err_mad: + mlx4_ib_mad_cleanup(ibdev); + err_reg: ib_unregister_device(&ibdev->ib_dev); +err_steer_free_bitmap: + kfree(ibdev->ib_uc_qpns_bitmap); + +err_steer_qp_release: + if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) + mlx4_qp_release_range(dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_count); err_counter: for (; i; --i) if (ibdev->counters[i - 1] != -1) @@ -1356,11 +2277,75 @@ err_dealloc: return NULL; } +int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn) +{ + int offset; + + WARN_ON(!dev->ib_uc_qpns_bitmap); + + offset = bitmap_find_free_region(dev->ib_uc_qpns_bitmap, + dev->steer_qpn_count, + get_count_order(count)); + if (offset < 0) + return offset; + + *qpn = dev->steer_qpn_base + offset; + return 0; +} + +void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count) +{ + if (!qpn || + dev->steering_support != MLX4_STEERING_MODE_DEVICE_MANAGED) + return; + + BUG_ON(qpn < dev->steer_qpn_base); + + bitmap_release_region(dev->ib_uc_qpns_bitmap, + qpn - dev->steer_qpn_base, + get_count_order(count)); +} + +int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + int is_attach) +{ + int err; + size_t flow_size; + struct ib_flow_attr *flow = NULL; + struct ib_flow_spec_ib *ib_spec; + + if (is_attach) { + flow_size = sizeof(struct ib_flow_attr) + + sizeof(struct ib_flow_spec_ib); + flow = kzalloc(flow_size, GFP_KERNEL); + if (!flow) + return -ENOMEM; + flow->port = mqp->port; + flow->num_of_specs = 1; + flow->size = flow_size; + ib_spec = (struct ib_flow_spec_ib *)(flow + 1); + ib_spec->type = IB_FLOW_SPEC_IB; + ib_spec->size = sizeof(struct ib_flow_spec_ib); + /* Add an empty rule for IB L2 */ + memset(&ib_spec->mask, 0, sizeof(ib_spec->mask)); + + err = __mlx4_ib_create_flow(&mqp->ibqp, flow, + IB_FLOW_DOMAIN_NIC, + MLX4_FS_REGULAR, + &mqp->reg_id); + } else { + err = __mlx4_ib_destroy_flow(mdev->dev, mqp->reg_id); + } + kfree(flow); + return err; +} + static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) { struct mlx4_ib_dev *ibdev = ibdev_ptr; int p; + mlx4_ib_close_sriov(ibdev); mlx4_ib_mad_cleanup(ibdev); ib_unregister_device(&ibdev->ib_dev); if (ibdev->iboe.nb.notifier_call) { @@ -1368,6 +2353,26 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb.notifier_call = NULL; } + + if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) { + mlx4_qp_release_range(dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_count); + kfree(ibdev->ib_uc_qpns_bitmap); + } + + if (ibdev->iboe.nb_inet.notifier_call) { + if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb_inet.notifier_call = NULL; + } +#if IS_ENABLED(CONFIG_IPV6) + if (ibdev->iboe.nb_inet6.notifier_call) { + if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb_inet6.notifier_call = NULL; + } +#endif + iounmap(ibdev->uar_map); for (p = 0; p < ibdev->num_ports; ++p) if (ibdev->counters[p] != -1) @@ -1382,21 +2387,86 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) ib_dealloc_device(&ibdev->ib_dev); } +static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init) +{ + struct mlx4_ib_demux_work **dm = NULL; + struct mlx4_dev *dev = ibdev->dev; + int i; + unsigned long flags; + struct 
mlx4_active_ports actv_ports; + unsigned int ports; + unsigned int first_port; + + if (!mlx4_is_master(dev)) + return; + + actv_ports = mlx4_get_active_ports(dev, slave); + ports = bitmap_weight(actv_ports.ports, dev->caps.num_ports); + first_port = find_first_bit(actv_ports.ports, dev->caps.num_ports); + + dm = kcalloc(ports, sizeof(*dm), GFP_ATOMIC); + if (!dm) { + pr_err("failed to allocate memory for tunneling qp update\n"); + goto out; + } + + for (i = 0; i < ports; i++) { + dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC); + if (!dm[i]) { + pr_err("failed to allocate memory for tunneling qp update work struct\n"); + for (i = 0; i < dev->caps.num_ports; i++) { + if (dm[i]) + kfree(dm[i]); + } + goto out; + } + } + /* initialize or tear down tunnel QPs for the slave */ + for (i = 0; i < ports; i++) { + INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work); + dm[i]->port = first_port + i + 1; + dm[i]->slave = slave; + dm[i]->do_init = do_init; + dm[i]->dev = ibdev; + spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags); + if (!ibdev->sriov.is_going_down) + queue_work(ibdev->sriov.demux[i].ud_wq, &dm[i]->work); + spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags); + } +out: + kfree(dm); + return; +} + static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, - enum mlx4_dev_event event, int port) + enum mlx4_dev_event event, unsigned long param) { struct ib_event ibev; struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr); + struct mlx4_eqe *eqe = NULL; + struct ib_event_work *ew; + int p = 0; - if (port > ibdev->num_ports) - return; + if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE) + eqe = (struct mlx4_eqe *)param; + else + p = (int) param; switch (event) { case MLX4_DEV_EVENT_PORT_UP: + if (p > ibdev->num_ports) + return; + if (mlx4_is_master(dev) && + rdma_port_get_link_layer(&ibdev->ib_dev, p) == + IB_LINK_LAYER_INFINIBAND) { + mlx4_ib_invalidate_all_guid_record(ibdev, p); + } ibev.event = IB_EVENT_PORT_ACTIVE; break; case MLX4_DEV_EVENT_PORT_DOWN: + if (p > ibdev->num_ports) + return; ibev.event = IB_EVENT_PORT_ERR; break; @@ -1405,12 +2475,39 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, ibev.event = IB_EVENT_DEVICE_FATAL; break; + case MLX4_DEV_EVENT_PORT_MGMT_CHANGE: + ew = kmalloc(sizeof *ew, GFP_ATOMIC); + if (!ew) { + pr_err("failed to allocate memory for events work\n"); + break; + } + + INIT_WORK(&ew->work, handle_port_mgmt_change_event); + memcpy(&ew->ib_eqe, eqe, sizeof *eqe); + ew->ib_dev = ibdev; + /* need to queue only for port owner, which uses GEN_EQE */ + if (mlx4_is_master(dev)) + queue_work(wq, &ew->work); + else + handle_port_mgmt_change_event(&ew->work); + return; + + case MLX4_DEV_EVENT_SLAVE_INIT: + /* here, p is the slave id */ + do_slave_init(ibdev, p, 1); + return; + + case MLX4_DEV_EVENT_SLAVE_SHUTDOWN: + /* here, p is the slave id */ + do_slave_init(ibdev, p, 0); + return; + default: return; } ibev.device = ibdev_ptr; - ibev.element.port_num = port; + ibev.element.port_num = (u8) p; ib_dispatch_event(&ibev); } @@ -1430,18 +2527,28 @@ static int __init mlx4_ib_init(void) if (!wq) return -ENOMEM; + err = mlx4_ib_mcg_init(); + if (err) + goto clean_wq; + err = mlx4_register_interface(&mlx4_ib_interface); - if (err) { - destroy_workqueue(wq); - return err; - } + if (err) + goto clean_mcg; return 0; + +clean_mcg: + mlx4_ib_mcg_destroy(); + +clean_wq: + destroy_workqueue(wq); + return err; } static void __exit mlx4_ib_cleanup(void) { mlx4_unregister_interface(&mlx4_ib_interface); + 
mlx4_ib_mcg_destroy(); destroy_workqueue(wq); } diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c new file mode 100644 index 00000000000..ed327e6c8fd --- /dev/null +++ b/drivers/infiniband/hw/mlx4/mcg.c @@ -0,0 +1,1257 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/ib_mad.h> +#include <rdma/ib_smi.h> +#include <rdma/ib_cache.h> +#include <rdma/ib_sa.h> + +#include <linux/mlx4/cmd.h> +#include <linux/rbtree.h> +#include <linux/delay.h> + +#include "mlx4_ib.h" + +#define MAX_VFS 80 +#define MAX_PEND_REQS_PER_FUNC 4 +#define MAD_TIMEOUT_MS 2000 + +#define mcg_warn(fmt, arg...) pr_warn("MCG WARNING: " fmt, ##arg) +#define mcg_error(fmt, arg...) pr_err(fmt, ##arg) +#define mcg_warn_group(group, format, arg...) \ + pr_warn("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\ + (group)->name, group->demux->port, ## arg) + +#define mcg_error_group(group, format, arg...) 
\ + pr_err(" %16s: " format, (group)->name, ## arg) + + +static union ib_gid mgid0; + +static struct workqueue_struct *clean_wq; + +enum mcast_state { + MCAST_NOT_MEMBER = 0, + MCAST_MEMBER, +}; + +enum mcast_group_state { + MCAST_IDLE, + MCAST_JOIN_SENT, + MCAST_LEAVE_SENT, + MCAST_RESP_READY +}; + +struct mcast_member { + enum mcast_state state; + uint8_t join_state; + int num_pend_reqs; + struct list_head pending; +}; + +struct ib_sa_mcmember_data { + union ib_gid mgid; + union ib_gid port_gid; + __be32 qkey; + __be16 mlid; + u8 mtusel_mtu; + u8 tclass; + __be16 pkey; + u8 ratesel_rate; + u8 lifetmsel_lifetm; + __be32 sl_flowlabel_hoplimit; + u8 scope_join_state; + u8 proxy_join; + u8 reserved[2]; +}; + +struct mcast_group { + struct ib_sa_mcmember_data rec; + struct rb_node node; + struct list_head mgid0_list; + struct mlx4_ib_demux_ctx *demux; + struct mcast_member func[MAX_VFS]; + struct mutex lock; + struct work_struct work; + struct list_head pending_list; + int members[3]; + enum mcast_group_state state; + enum mcast_group_state prev_state; + struct ib_sa_mad response_sa_mad; + __be64 last_req_tid; + + char name[33]; /* MGID string */ + struct device_attribute dentry; + + /* refcount is the reference count for the following: + 1. Each queued request + 2. Each invocation of the worker thread + 3. Membership of the port at the SA + */ + atomic_t refcount; + + /* delayed work to clean pending SM request */ + struct delayed_work timeout_work; + struct list_head cleanup_list; +}; + +struct mcast_req { + int func; + struct ib_sa_mad sa_mad; + struct list_head group_list; + struct list_head func_list; + struct mcast_group *group; + int clean; +}; + + +#define safe_atomic_dec(ref) \ + do {\ + if (atomic_dec_and_test(ref)) \ + mcg_warn_group(group, "did not expect to reach zero\n"); \ + } while (0) + +static const char *get_state_string(enum mcast_group_state state) +{ + switch (state) { + case MCAST_IDLE: + return "MCAST_IDLE"; + case MCAST_JOIN_SENT: + return "MCAST_JOIN_SENT"; + case MCAST_LEAVE_SENT: + return "MCAST_LEAVE_SENT"; + case MCAST_RESP_READY: + return "MCAST_RESP_READY"; + } + return "Invalid State"; +} + +static struct mcast_group *mcast_find(struct mlx4_ib_demux_ctx *ctx, + union ib_gid *mgid) +{ + struct rb_node *node = ctx->mcg_table.rb_node; + struct mcast_group *group; + int ret; + + while (node) { + group = rb_entry(node, struct mcast_group, node); + ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid); + if (!ret) + return group; + + if (ret < 0) + node = node->rb_left; + else + node = node->rb_right; + } + return NULL; +} + +static struct mcast_group *mcast_insert(struct mlx4_ib_demux_ctx *ctx, + struct mcast_group *group) +{ + struct rb_node **link = &ctx->mcg_table.rb_node; + struct rb_node *parent = NULL; + struct mcast_group *cur_group; + int ret; + + while (*link) { + parent = *link; + cur_group = rb_entry(parent, struct mcast_group, node); + + ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw, + sizeof group->rec.mgid); + if (ret < 0) + link = &(*link)->rb_left; + else if (ret > 0) + link = &(*link)->rb_right; + else + return cur_group; + } + rb_link_node(&group->node, parent, link); + rb_insert_color(&group->node, &ctx->mcg_table); + return NULL; +} + +static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad) +{ + struct mlx4_ib_dev *dev = ctx->dev; + struct ib_ah_attr ah_attr; + + spin_lock(&dev->sm_lock); + if (!dev->sm_ah[ctx->port - 1]) { + /* port is not yet Active, sm_ah not ready */ + spin_unlock(&dev->sm_lock); + 
return -EAGAIN; + } + mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); + spin_unlock(&dev->sm_lock); + return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), + ctx->port, IB_QPT_GSI, 0, 1, IB_QP1_QKEY, + &ah_attr, NULL, mad); +} + +static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx, + struct ib_mad *mad) +{ + struct mlx4_ib_dev *dev = ctx->dev; + struct ib_mad_agent *agent = dev->send_agent[ctx->port - 1][1]; + struct ib_wc wc; + struct ib_ah_attr ah_attr; + + /* Our agent might not yet be registered when mads start to arrive */ + if (!agent) + return -EAGAIN; + + ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); + + if (ib_find_cached_pkey(&dev->ib_dev, ctx->port, IB_DEFAULT_PKEY_FULL, &wc.pkey_index)) + return -EINVAL; + wc.sl = 0; + wc.dlid_path_bits = 0; + wc.port_num = ctx->port; + wc.slid = ah_attr.dlid; /* opensm lid */ + wc.src_qp = 1; + return mlx4_ib_send_to_slave(dev, slave, ctx->port, IB_QPT_GSI, &wc, NULL, mad); +} + +static int send_join_to_wire(struct mcast_group *group, struct ib_sa_mad *sa_mad) +{ + struct ib_sa_mad mad; + struct ib_sa_mcmember_data *sa_mad_data = (struct ib_sa_mcmember_data *)&mad.data; + int ret; + + /* we rely on a mad request as arrived from a VF */ + memcpy(&mad, sa_mad, sizeof mad); + + /* fix port GID to be the real one (slave 0) */ + sa_mad_data->port_gid.global.interface_id = group->demux->guid_cache[0]; + + /* assign our own TID */ + mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux); + group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */ + + ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad); + /* set timeout handler */ + if (!ret) { + /* calls mlx4_ib_mcg_timeout_handler */ + queue_delayed_work(group->demux->mcg_wq, &group->timeout_work, + msecs_to_jiffies(MAD_TIMEOUT_MS)); + } + + return ret; +} + +static int send_leave_to_wire(struct mcast_group *group, u8 join_state) +{ + struct ib_sa_mad mad; + struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data; + int ret; + + memset(&mad, 0, sizeof mad); + mad.mad_hdr.base_version = 1; + mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; + mad.mad_hdr.class_version = 2; + mad.mad_hdr.method = IB_SA_METHOD_DELETE; + mad.mad_hdr.status = cpu_to_be16(0); + mad.mad_hdr.class_specific = cpu_to_be16(0); + mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux); + group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */ + mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); + mad.mad_hdr.attr_mod = cpu_to_be32(0); + mad.sa_hdr.sm_key = 0x0; + mad.sa_hdr.attr_offset = cpu_to_be16(7); + mad.sa_hdr.comp_mask = IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_JOIN_STATE; + + *sa_data = group->rec; + sa_data->scope_join_state = join_state; + + ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad); + if (ret) + group->state = MCAST_IDLE; + + /* set timeout handler */ + if (!ret) { + /* calls mlx4_ib_mcg_timeout_handler */ + queue_delayed_work(group->demux->mcg_wq, &group->timeout_work, + msecs_to_jiffies(MAD_TIMEOUT_MS)); + } + + return ret; +} + +static int send_reply_to_slave(int slave, struct mcast_group *group, + struct ib_sa_mad *req_sa_mad, u16 status) +{ + struct ib_sa_mad mad; + struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data; + struct ib_sa_mcmember_data *req_sa_data = (struct ib_sa_mcmember_data *)&req_sa_mad->data; + int ret; + + memset(&mad, 0, sizeof mad); + mad.mad_hdr.base_version = 1; + mad.mad_hdr.mgmt_class = 
IB_MGMT_CLASS_SUBN_ADM; + mad.mad_hdr.class_version = 2; + mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP; + mad.mad_hdr.status = cpu_to_be16(status); + mad.mad_hdr.class_specific = cpu_to_be16(0); + mad.mad_hdr.tid = req_sa_mad->mad_hdr.tid; + *(u8 *)&mad.mad_hdr.tid = 0; /* resetting tid to 0 */ + mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); + mad.mad_hdr.attr_mod = cpu_to_be32(0); + mad.sa_hdr.sm_key = req_sa_mad->sa_hdr.sm_key; + mad.sa_hdr.attr_offset = cpu_to_be16(7); + mad.sa_hdr.comp_mask = 0; /* ignored on responses, see IBTA spec */ + + *sa_data = group->rec; + + /* reconstruct VF's requested join_state and port_gid */ + sa_data->scope_join_state &= 0xf0; + sa_data->scope_join_state |= (group->func[slave].join_state & 0x0f); + memcpy(&sa_data->port_gid, &req_sa_data->port_gid, sizeof req_sa_data->port_gid); + + ret = send_mad_to_slave(slave, group->demux, (struct ib_mad *)&mad); + return ret; +} + +static int check_selector(ib_sa_comp_mask comp_mask, + ib_sa_comp_mask selector_mask, + ib_sa_comp_mask value_mask, + u8 src_value, u8 dst_value) +{ + int err; + u8 selector = dst_value >> 6; + dst_value &= 0x3f; + src_value &= 0x3f; + + if (!(comp_mask & selector_mask) || !(comp_mask & value_mask)) + return 0; + + switch (selector) { + case IB_SA_GT: + err = (src_value <= dst_value); + break; + case IB_SA_LT: + err = (src_value >= dst_value); + break; + case IB_SA_EQ: + err = (src_value != dst_value); + break; + default: + err = 0; + break; + } + + return err; +} + +static u16 cmp_rec(struct ib_sa_mcmember_data *src, + struct ib_sa_mcmember_data *dst, ib_sa_comp_mask comp_mask) +{ + /* src is group record, dst is request record */ + /* MGID must already match */ + /* Port_GID we always replace to our Port_GID, so it is a match */ + +#define MAD_STATUS_REQ_INVALID 0x0200 + if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid) + return MAD_STATUS_REQ_INVALID; + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR, + IB_SA_MCMEMBER_REC_MTU, + src->mtusel_mtu, dst->mtusel_mtu)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS && + src->tclass != dst->tclass) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey) + return MAD_STATUS_REQ_INVALID; + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR, + IB_SA_MCMEMBER_REC_RATE, + src->ratesel_rate, dst->ratesel_rate)) + return MAD_STATUS_REQ_INVALID; + if (check_selector(comp_mask, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, + src->lifetmsel_lifetm, dst->lifetmsel_lifetm)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_SL && + (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0xf0000000) != + (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0xf0000000)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL && + (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x0fffff00) != + (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x0fffff00)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT && + (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x000000ff) != + (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x000000ff)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && + (src->scope_join_state & 0xf0) != + (dst->scope_join_state & 0xf0)) + return MAD_STATUS_REQ_INVALID; + + /* join_state checked 
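check_selector() above unpacks the IBTA encoding in which the MTU, rate and packet-lifetime fields carry a 2-bit selector in the top bits and a 6-bit value below it. A self-contained illustration of that evaluation (the SEL_* numeric values follow the usual greater-than/less-than/exactly encoding and are given here only for the example):

    #include <stdint.h>
    #include <assert.h>

    #define SEL_GT 0        /* "greater than" */
    #define SEL_LT 1        /* "less than"    */
    #define SEL_EQ 2        /* "exactly"      */

    static uint8_t pack(uint8_t selector, uint8_t value)
    {
            return (uint8_t)((selector << 6) | (value & 0x3f));
    }

    /* returns nonzero when the group's value (src) violates the request (dst) */
    static int selector_violated(uint8_t src, uint8_t dst)
    {
            uint8_t selector = dst >> 6;

            src &= 0x3f;
            dst &= 0x3f;

            switch (selector) {
            case SEL_GT: return src <= dst;
            case SEL_LT: return src >= dst;
            case SEL_EQ: return src != dst;
            default:     return 0;
            }
    }

    int main(void)
    {
            /* request: MTU greater than code 3 (1024); group offers code 4 (2048) */
            assert(!selector_violated(4, pack(SEL_GT, 3)));
            /* a group offering code 3 would violate the same request */
            assert(selector_violated(3, pack(SEL_GT, 3)));
            return 0;
    }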
separately, proxy_join ignored */ + + return 0; +} + +/* release group, return 1 if this was last release and group is destroyed + * timout work is canceled sync */ +static int release_group(struct mcast_group *group, int from_timeout_handler) +{ + struct mlx4_ib_demux_ctx *ctx = group->demux; + int nzgroup; + + mutex_lock(&ctx->mcg_table_lock); + mutex_lock(&group->lock); + if (atomic_dec_and_test(&group->refcount)) { + if (!from_timeout_handler) { + if (group->state != MCAST_IDLE && + !cancel_delayed_work(&group->timeout_work)) { + atomic_inc(&group->refcount); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + return 0; + } + } + + nzgroup = memcmp(&group->rec.mgid, &mgid0, sizeof mgid0); + if (nzgroup) + del_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); + if (!list_empty(&group->pending_list)) + mcg_warn_group(group, "releasing a group with non empty pending list\n"); + if (nzgroup) + rb_erase(&group->node, &ctx->mcg_table); + list_del_init(&group->mgid0_list); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + kfree(group); + return 1; + } else { + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + } + return 0; +} + +static void adjust_membership(struct mcast_group *group, u8 join_state, int inc) +{ + int i; + + for (i = 0; i < 3; i++, join_state >>= 1) + if (join_state & 0x1) + group->members[i] += inc; +} + +static u8 get_leave_state(struct mcast_group *group) +{ + u8 leave_state = 0; + int i; + + for (i = 0; i < 3; i++) + if (!group->members[i]) + leave_state |= (1 << i); + + return leave_state & (group->rec.scope_join_state & 7); +} + +static int join_group(struct mcast_group *group, int slave, u8 join_mask) +{ + int ret = 0; + u8 join_state; + + /* remove bits that slave is already member of, and adjust */ + join_state = join_mask & (~group->func[slave].join_state); + adjust_membership(group, join_state, 1); + group->func[slave].join_state |= join_state; + if (group->func[slave].state != MCAST_MEMBER && join_state) { + group->func[slave].state = MCAST_MEMBER; + ret = 1; + } + return ret; +} + +static int leave_group(struct mcast_group *group, int slave, u8 leave_state) +{ + int ret = 0; + + adjust_membership(group, leave_state, -1); + group->func[slave].join_state &= ~leave_state; + if (!group->func[slave].join_state) { + group->func[slave].state = MCAST_NOT_MEMBER; + ret = 1; + } + return ret; +} + +static int check_leave(struct mcast_group *group, int slave, u8 leave_mask) +{ + if (group->func[slave].state != MCAST_MEMBER) + return MAD_STATUS_REQ_INVALID; + + /* make sure we're not deleting unset bits */ + if (~group->func[slave].join_state & leave_mask) + return MAD_STATUS_REQ_INVALID; + + if (!leave_mask) + return MAD_STATUS_REQ_INVALID; + + return 0; +} + +static void mlx4_ib_mcg_timeout_handler(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + struct mcast_group *group; + struct mcast_req *req = NULL; + + group = container_of(delay, typeof(*group), timeout_work); + + mutex_lock(&group->lock); + if (group->state == MCAST_JOIN_SENT) { + if (!list_empty(&group->pending_list)) { + req = list_first_entry(&group->pending_list, struct mcast_req, group_list); + list_del(&req->group_list); + list_del(&req->func_list); + --group->func[req->func].num_pend_reqs; + mutex_unlock(&group->lock); + kfree(req); + if (memcmp(&group->rec.mgid, &mgid0, sizeof mgid0)) { + if (release_group(group, 1)) + return; + } else { + kfree(group); + return; + } + mutex_lock(&group->lock); + } else + 
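adjust_membership() and get_leave_state() above keep one counter per join_state bit (full / non / send-only-non member), so the group is left at the SA only once no VF needs a given bit. A small standalone model of that accounting:

    #include <stdint.h>
    #include <stdio.h>

    struct group_counts {
            int members[3];                 /* one counter per join_state bit */
            uint8_t wire_join_state;        /* bits joined at the SA (low 3)  */
    };

    static void adjust(struct group_counts *g, uint8_t join_state, int inc)
    {
            for (int i = 0; i < 3; i++, join_state >>= 1)
                    if (join_state & 1)
                            g->members[i] += inc;
    }

    /* bits we can now leave at the SA: joined on the wire but no local users */
    static uint8_t leave_state(const struct group_counts *g)
    {
            uint8_t leave = 0;

            for (int i = 0; i < 3; i++)
                    if (!g->members[i])
                            leave |= 1 << i;
            return leave & (g->wire_join_state & 7);
    }

    int main(void)
    {
            struct group_counts g = { .wire_join_state = 0x1 };

            adjust(&g, 0x1, 1);             /* VF 0 joins as full member */
            adjust(&g, 0x1, 1);             /* VF 1 joins as full member */
            adjust(&g, 0x1, -1);            /* VF 0 leaves               */
            printf("leave bits: 0x%x\n", leave_state(&g));  /* 0x0: VF 1 remains */
            adjust(&g, 0x1, -1);            /* VF 1 leaves               */
            printf("leave bits: 0x%x\n", leave_state(&g));  /* 0x1: leave at SA  */
            return 0;
    }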
mcg_warn_group(group, "DRIVER BUG\n"); + } else if (group->state == MCAST_LEAVE_SENT) { + if (group->rec.scope_join_state & 7) + group->rec.scope_join_state &= 0xf8; + group->state = MCAST_IDLE; + mutex_unlock(&group->lock); + if (release_group(group, 1)) + return; + mutex_lock(&group->lock); + } else + mcg_warn_group(group, "invalid state %s\n", get_state_string(group->state)); + group->state = MCAST_IDLE; + atomic_inc(&group->refcount); + if (!queue_work(group->demux->mcg_wq, &group->work)) + safe_atomic_dec(&group->refcount); + + mutex_unlock(&group->lock); +} + +static int handle_leave_req(struct mcast_group *group, u8 leave_mask, + struct mcast_req *req) +{ + u16 status; + + if (req->clean) + leave_mask = group->func[req->func].join_state; + + status = check_leave(group, req->func, leave_mask); + if (!status) + leave_group(group, req->func, leave_mask); + + if (!req->clean) + send_reply_to_slave(req->func, group, &req->sa_mad, status); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + return 1; +} + +static int handle_join_req(struct mcast_group *group, u8 join_mask, + struct mcast_req *req) +{ + u8 group_join_state = group->rec.scope_join_state & 7; + int ref = 0; + u16 status; + struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data; + + if (join_mask == (group_join_state & join_mask)) { + /* port's membership need not change */ + status = cmp_rec(&group->rec, sa_data, req->sa_mad.sa_hdr.comp_mask); + if (!status) + join_group(group, req->func, join_mask); + + --group->func[req->func].num_pend_reqs; + send_reply_to_slave(req->func, group, &req->sa_mad, status); + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + ++ref; + } else { + /* port's membership needs to be updated */ + group->prev_state = group->state; + if (send_join_to_wire(group, &req->sa_mad)) { + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + ref = 1; + group->state = group->prev_state; + } else + group->state = MCAST_JOIN_SENT; + } + + return ref; +} + +static void mlx4_ib_mcg_work_handler(struct work_struct *work) +{ + struct mcast_group *group; + struct mcast_req *req = NULL; + struct ib_sa_mcmember_data *sa_data; + u8 req_join_state; + int rc = 1; /* release_count - this is for the scheduled work */ + u16 status; + u8 method; + + group = container_of(work, typeof(*group), work); + + mutex_lock(&group->lock); + + /* First, let's see if a response from SM is waiting regarding this group. + * If so, we need to update the group's REC. If this is a bad response, we + * may need to send a bad response to a VF waiting for it. If VF is waiting + * and this is a good response, the VF will be answered later in this func. */ + if (group->state == MCAST_RESP_READY) { + /* cancels mlx4_ib_mcg_timeout_handler */ + cancel_delayed_work(&group->timeout_work); + status = be16_to_cpu(group->response_sa_mad.mad_hdr.status); + method = group->response_sa_mad.mad_hdr.method; + if (group->last_req_tid != group->response_sa_mad.mad_hdr.tid) { + mcg_warn_group(group, "Got MAD response to existing MGID but wrong TID, dropping. 
Resp TID=%llx, group TID=%llx\n", + be64_to_cpu(group->response_sa_mad.mad_hdr.tid), + be64_to_cpu(group->last_req_tid)); + group->state = group->prev_state; + goto process_requests; + } + if (status) { + if (!list_empty(&group->pending_list)) + req = list_first_entry(&group->pending_list, + struct mcast_req, group_list); + if ((method == IB_MGMT_METHOD_GET_RESP)) { + if (req) { + send_reply_to_slave(req->func, group, &req->sa_mad, status); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + ++rc; + } else + mcg_warn_group(group, "no request for failed join\n"); + } else if (method == IB_SA_METHOD_DELETE_RESP && group->demux->flushing) + ++rc; + } else { + u8 resp_join_state; + u8 cur_join_state; + + resp_join_state = ((struct ib_sa_mcmember_data *) + group->response_sa_mad.data)->scope_join_state & 7; + cur_join_state = group->rec.scope_join_state & 7; + + if (method == IB_MGMT_METHOD_GET_RESP) { + /* successfull join */ + if (!cur_join_state && resp_join_state) + --rc; + } else if (!resp_join_state) + ++rc; + memcpy(&group->rec, group->response_sa_mad.data, sizeof group->rec); + } + group->state = MCAST_IDLE; + } + +process_requests: + /* We should now go over pending join/leave requests, as long as we are idle. */ + while (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) { + req = list_first_entry(&group->pending_list, struct mcast_req, + group_list); + sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data; + req_join_state = sa_data->scope_join_state & 0x7; + + /* For a leave request, we will immediately answer the VF, and + * update our internal counters. The actual leave will be sent + * to SM later, if at all needed. We dequeue the request now. */ + if (req->sa_mad.mad_hdr.method == IB_SA_METHOD_DELETE) + rc += handle_leave_req(group, req_join_state, req); + else + rc += handle_join_req(group, req_join_state, req); + } + + /* Handle leaves */ + if (group->state == MCAST_IDLE) { + req_join_state = get_leave_state(group); + if (req_join_state) { + group->rec.scope_join_state &= ~req_join_state; + group->prev_state = group->state; + if (send_leave_to_wire(group, req_join_state)) { + group->state = group->prev_state; + ++rc; + } else + group->state = MCAST_LEAVE_SENT; + } + } + + if (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) + goto process_requests; + mutex_unlock(&group->lock); + + while (rc--) + release_group(group, 0); +} + +static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx *ctx, + __be64 tid, + union ib_gid *new_mgid) +{ + struct mcast_group *group = NULL, *cur_group; + struct mcast_req *req; + struct list_head *pos; + struct list_head *n; + + mutex_lock(&ctx->mcg_table_lock); + list_for_each_safe(pos, n, &ctx->mcg_mgid0_list) { + group = list_entry(pos, struct mcast_group, mgid0_list); + mutex_lock(&group->lock); + if (group->last_req_tid == tid) { + if (memcmp(new_mgid, &mgid0, sizeof mgid0)) { + group->rec.mgid = *new_mgid; + sprintf(group->name, "%016llx%016llx", + be64_to_cpu(group->rec.mgid.global.subnet_prefix), + be64_to_cpu(group->rec.mgid.global.interface_id)); + list_del_init(&group->mgid0_list); + cur_group = mcast_insert(ctx, group); + if (cur_group) { + /* A race between our code and SM. 
Silently cleaning the new one */ + req = list_first_entry(&group->pending_list, + struct mcast_req, group_list); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + release_group(group, 0); + return NULL; + } + + atomic_inc(&group->refcount); + add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + return group; + } else { + struct mcast_req *tmp1, *tmp2; + + list_del(&group->mgid0_list); + if (!list_empty(&group->pending_list) && group->state != MCAST_IDLE) + cancel_delayed_work_sync(&group->timeout_work); + + list_for_each_entry_safe(tmp1, tmp2, &group->pending_list, group_list) { + list_del(&tmp1->group_list); + kfree(tmp1); + } + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + kfree(group); + return NULL; + } + } + mutex_unlock(&group->lock); + } + mutex_unlock(&ctx->mcg_table_lock); + + return NULL; +} + +static ssize_t sysfs_show_group(struct device *dev, + struct device_attribute *attr, char *buf); + +static struct mcast_group *acquire_group(struct mlx4_ib_demux_ctx *ctx, + union ib_gid *mgid, int create, + gfp_t gfp_mask) +{ + struct mcast_group *group, *cur_group; + int is_mgid0; + int i; + + is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0); + if (!is_mgid0) { + group = mcast_find(ctx, mgid); + if (group) + goto found; + } + + if (!create) + return ERR_PTR(-ENOENT); + + group = kzalloc(sizeof *group, gfp_mask); + if (!group) + return ERR_PTR(-ENOMEM); + + group->demux = ctx; + group->rec.mgid = *mgid; + INIT_LIST_HEAD(&group->pending_list); + INIT_LIST_HEAD(&group->mgid0_list); + for (i = 0; i < MAX_VFS; ++i) + INIT_LIST_HEAD(&group->func[i].pending); + INIT_WORK(&group->work, mlx4_ib_mcg_work_handler); + INIT_DELAYED_WORK(&group->timeout_work, mlx4_ib_mcg_timeout_handler); + mutex_init(&group->lock); + sprintf(group->name, "%016llx%016llx", + be64_to_cpu(group->rec.mgid.global.subnet_prefix), + be64_to_cpu(group->rec.mgid.global.interface_id)); + sysfs_attr_init(&group->dentry.attr); + group->dentry.show = sysfs_show_group; + group->dentry.store = NULL; + group->dentry.attr.name = group->name; + group->dentry.attr.mode = 0400; + group->state = MCAST_IDLE; + + if (is_mgid0) { + list_add(&group->mgid0_list, &ctx->mcg_mgid0_list); + goto found; + } + + cur_group = mcast_insert(ctx, group); + if (cur_group) { + mcg_warn("group just showed up %s - confused\n", cur_group->name); + kfree(group); + return ERR_PTR(-EINVAL); + } + + add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); + +found: + atomic_inc(&group->refcount); + return group; +} + +static void queue_req(struct mcast_req *req) +{ + struct mcast_group *group = req->group; + + atomic_inc(&group->refcount); /* for the request */ + atomic_inc(&group->refcount); /* for scheduling the work */ + list_add_tail(&req->group_list, &group->pending_list); + list_add_tail(&req->func_list, &group->func[req->func].pending); + /* calls mlx4_ib_mcg_work_handler */ + if (!queue_work(group->demux->mcg_wq, &group->work)) + safe_atomic_dec(&group->refcount); +} + +int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)mad->data; + struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1]; + struct mcast_group *group; + + switch 
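queue_req() above takes one reference for the queued request and one for the scheduled work item, matching the refcount comment in struct mcast_group. A conceptual sketch of that convention only, using userspace atomics and hypothetical names:

    #include <stdatomic.h>
    #include <stdio.h>

    struct group_ref {
            atomic_int refcount;
    };

    static void queue_request(struct group_ref *g, int work_already_pending)
    {
            atomic_fetch_add(&g->refcount, 1);              /* for the request   */
            if (!work_already_pending)
                    atomic_fetch_add(&g->refcount, 1);      /* for the work item */
    }

    static int put_group(struct group_ref *g)
    {
            if (atomic_fetch_sub(&g->refcount, 1) == 1) {
                    puts("last reference dropped - destroy group");
                    return 1;
            }
            return 0;
    }

    int main(void)
    {
            struct group_ref g;

            atomic_init(&g.refcount, 1);    /* ref held while joined at the SA */
            queue_request(&g, 0);           /* request + work item             */
            put_group(&g);                  /* work item finished              */
            put_group(&g);                  /* request answered and freed      */
            put_group(&g);                  /* left the group at the SA        */
            return 0;
    }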
(mad->mad_hdr.method) { + case IB_MGMT_METHOD_GET_RESP: + case IB_SA_METHOD_DELETE_RESP: + mutex_lock(&ctx->mcg_table_lock); + group = acquire_group(ctx, &rec->mgid, 0, GFP_KERNEL); + mutex_unlock(&ctx->mcg_table_lock); + if (IS_ERR(group)) { + if (mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP) { + __be64 tid = mad->mad_hdr.tid; + *(u8 *)(&tid) = (u8)slave; /* in group we kept the modified TID */ + group = search_relocate_mgid0_group(ctx, tid, &rec->mgid); + } else + group = NULL; + } + + if (!group) + return 1; + + mutex_lock(&group->lock); + group->response_sa_mad = *mad; + group->prev_state = group->state; + group->state = MCAST_RESP_READY; + /* calls mlx4_ib_mcg_work_handler */ + atomic_inc(&group->refcount); + if (!queue_work(ctx->mcg_wq, &group->work)) + safe_atomic_dec(&group->refcount); + mutex_unlock(&group->lock); + release_group(group, 0); + return 1; /* consumed */ + case IB_MGMT_METHOD_SET: + case IB_SA_METHOD_GET_TABLE: + case IB_SA_METHOD_GET_TABLE_RESP: + case IB_SA_METHOD_DELETE: + return 0; /* not consumed, pass-through to guest over tunnel */ + default: + mcg_warn("In demux, port %d: unexpected MCMember method: 0x%x, dropping\n", + port, mad->mad_hdr.method); + return 1; /* consumed */ + } +} + +int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, + int slave, struct ib_sa_mad *sa_mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)sa_mad->data; + struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1]; + struct mcast_group *group; + struct mcast_req *req; + int may_create = 0; + + if (ctx->flushing) + return -EAGAIN; + + switch (sa_mad->mad_hdr.method) { + case IB_MGMT_METHOD_SET: + may_create = 1; + case IB_SA_METHOD_DELETE: + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->func = slave; + req->sa_mad = *sa_mad; + + mutex_lock(&ctx->mcg_table_lock); + group = acquire_group(ctx, &rec->mgid, may_create, GFP_KERNEL); + mutex_unlock(&ctx->mcg_table_lock); + if (IS_ERR(group)) { + kfree(req); + return PTR_ERR(group); + } + mutex_lock(&group->lock); + if (group->func[slave].num_pend_reqs > MAX_PEND_REQS_PER_FUNC) { + mutex_unlock(&group->lock); + mcg_warn_group(group, "Port %d, Func %d has too many pending requests (%d), dropping\n", + port, slave, MAX_PEND_REQS_PER_FUNC); + release_group(group, 0); + kfree(req); + return -ENOMEM; + } + ++group->func[slave].num_pend_reqs; + req->group = group; + queue_req(req); + mutex_unlock(&group->lock); + release_group(group, 0); + return 1; /* consumed */ + case IB_SA_METHOD_GET_TABLE: + case IB_MGMT_METHOD_GET_RESP: + case IB_SA_METHOD_GET_TABLE_RESP: + case IB_SA_METHOD_DELETE_RESP: + return 0; /* not consumed, pass-through */ + default: + mcg_warn("In multiplex, port %d, func %d: unexpected MCMember method: 0x%x, dropping\n", + port, slave, sa_mad->mad_hdr.method); + return 1; /* consumed */ + } +} + +static ssize_t sysfs_show_group(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mcast_group *group = + container_of(attr, struct mcast_group, dentry); + struct mcast_req *req = NULL; + char pending_str[40]; + char state_str[40]; + ssize_t len = 0; + int f; + + if (group->state == MCAST_IDLE) + sprintf(state_str, "%s", get_state_string(group->state)); + else + sprintf(state_str, "%s(TID=0x%llx)", + get_state_string(group->state), + be64_to_cpu(group->last_req_tid)); + if (list_empty(&group->pending_list)) { + sprintf(pending_str, "No"); + } else { + req = list_first_entry(&group->pending_list, 
struct mcast_req, group_list); + sprintf(pending_str, "Yes(TID=0x%llx)", + be64_to_cpu(req->sa_mad.mad_hdr.tid)); + } + len += sprintf(buf + len, "%1d [%02d,%02d,%02d] %4d %4s %5s ", + group->rec.scope_join_state & 0xf, + group->members[2], group->members[1], group->members[0], + atomic_read(&group->refcount), + pending_str, + state_str); + for (f = 0; f < MAX_VFS; ++f) + if (group->func[f].state == MCAST_MEMBER) + len += sprintf(buf + len, "%d[%1x] ", + f, group->func[f].join_state); + + len += sprintf(buf + len, "\t\t(%4hx %4x %2x %2x %2x %2x %2x " + "%4x %4x %2x %2x)\n", + be16_to_cpu(group->rec.pkey), + be32_to_cpu(group->rec.qkey), + (group->rec.mtusel_mtu & 0xc0) >> 6, + group->rec.mtusel_mtu & 0x3f, + group->rec.tclass, + (group->rec.ratesel_rate & 0xc0) >> 6, + group->rec.ratesel_rate & 0x3f, + (be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0xf0000000) >> 28, + (be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x0fffff00) >> 8, + be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x000000ff, + group->rec.proxy_join); + + return len; +} + +int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx) +{ + char name[20]; + + atomic_set(&ctx->tid, 0); + sprintf(name, "mlx4_ib_mcg%d", ctx->port); + ctx->mcg_wq = create_singlethread_workqueue(name); + if (!ctx->mcg_wq) + return -ENOMEM; + + mutex_init(&ctx->mcg_table_lock); + ctx->mcg_table = RB_ROOT; + INIT_LIST_HEAD(&ctx->mcg_mgid0_list); + ctx->flushing = 0; + + return 0; +} + +static void force_clean_group(struct mcast_group *group) +{ + struct mcast_req *req, *tmp + ; + list_for_each_entry_safe(req, tmp, &group->pending_list, group_list) { + list_del(&req->group_list); + kfree(req); + } + del_sysfs_port_mcg_attr(group->demux->dev, group->demux->port, &group->dentry.attr); + rb_erase(&group->node, &group->demux->mcg_table); + kfree(group); +} + +static void _mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq) +{ + int i; + struct rb_node *p; + struct mcast_group *group; + unsigned long end; + int count; + + for (i = 0; i < MAX_VFS; ++i) + clean_vf_mcast(ctx, i); + + end = jiffies + msecs_to_jiffies(MAD_TIMEOUT_MS + 3000); + do { + count = 0; + mutex_lock(&ctx->mcg_table_lock); + for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) + ++count; + mutex_unlock(&ctx->mcg_table_lock); + if (!count) + break; + + msleep(1); + } while (time_after(end, jiffies)); + + flush_workqueue(ctx->mcg_wq); + if (destroy_wq) + destroy_workqueue(ctx->mcg_wq); + + mutex_lock(&ctx->mcg_table_lock); + while ((p = rb_first(&ctx->mcg_table)) != NULL) { + group = rb_entry(p, struct mcast_group, node); + if (atomic_read(&group->refcount)) + mcg_warn_group(group, "group refcount %d!!! 
(pointer %p)\n", atomic_read(&group->refcount), group); + + force_clean_group(group); + } + mutex_unlock(&ctx->mcg_table_lock); +} + +struct clean_work { + struct work_struct work; + struct mlx4_ib_demux_ctx *ctx; + int destroy_wq; +}; + +static void mcg_clean_task(struct work_struct *work) +{ + struct clean_work *cw = container_of(work, struct clean_work, work); + + _mlx4_ib_mcg_port_cleanup(cw->ctx, cw->destroy_wq); + cw->ctx->flushing = 0; + kfree(cw); +} + +void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq) +{ + struct clean_work *work; + + if (ctx->flushing) + return; + + ctx->flushing = 1; + + if (destroy_wq) { + _mlx4_ib_mcg_port_cleanup(ctx, destroy_wq); + ctx->flushing = 0; + return; + } + + work = kmalloc(sizeof *work, GFP_KERNEL); + if (!work) { + ctx->flushing = 0; + mcg_warn("failed allocating work for cleanup\n"); + return; + } + + work->ctx = ctx; + work->destroy_wq = destroy_wq; + INIT_WORK(&work->work, mcg_clean_task); + queue_work(clean_wq, &work->work); +} + +static void build_leave_mad(struct mcast_req *req) +{ + struct ib_sa_mad *mad = &req->sa_mad; + + mad->mad_hdr.method = IB_SA_METHOD_DELETE; +} + + +static void clear_pending_reqs(struct mcast_group *group, int vf) +{ + struct mcast_req *req, *tmp, *group_first = NULL; + int clear; + int pend = 0; + + if (!list_empty(&group->pending_list)) + group_first = list_first_entry(&group->pending_list, struct mcast_req, group_list); + + list_for_each_entry_safe(req, tmp, &group->func[vf].pending, func_list) { + clear = 1; + if (group_first == req && + (group->state == MCAST_JOIN_SENT || + group->state == MCAST_LEAVE_SENT)) { + clear = cancel_delayed_work(&group->timeout_work); + pend = !clear; + group->state = MCAST_IDLE; + } + if (clear) { + --group->func[vf].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + atomic_dec(&group->refcount); + } + } + + if (!pend && (!list_empty(&group->func[vf].pending) || group->func[vf].num_pend_reqs)) { + mcg_warn_group(group, "DRIVER BUG: list_empty %d, num_pend_reqs %d\n", + list_empty(&group->func[vf].pending), group->func[vf].num_pend_reqs); + } +} + +static int push_deleteing_req(struct mcast_group *group, int slave) +{ + struct mcast_req *req; + struct mcast_req *pend_req; + + if (!group->func[slave].join_state) + return 0; + + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) { + mcg_warn_group(group, "failed allocation - may leave stall groups\n"); + return -ENOMEM; + } + + if (!list_empty(&group->func[slave].pending)) { + pend_req = list_entry(group->func[slave].pending.prev, struct mcast_req, group_list); + if (pend_req->clean) { + kfree(req); + return 0; + } + } + + req->clean = 1; + req->func = slave; + req->group = group; + ++group->func[slave].num_pend_reqs; + build_leave_mad(req); + queue_req(req); + return 0; +} + +void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave) +{ + struct mcast_group *group; + struct rb_node *p; + + mutex_lock(&ctx->mcg_table_lock); + for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) { + group = rb_entry(p, struct mcast_group, node); + mutex_lock(&group->lock); + if (atomic_read(&group->refcount)) { + /* clear pending requests of this VF */ + clear_pending_reqs(group, slave); + push_deleteing_req(group, slave); + } + mutex_unlock(&group->lock); + } + mutex_unlock(&ctx->mcg_table_lock); +} + + +int mlx4_ib_mcg_init(void) +{ + clean_wq = create_singlethread_workqueue("mlx4_ib_mcg"); + if (!clean_wq) + return -ENOMEM; + + return 0; +} + +void mlx4_ib_mcg_destroy(void) +{ + 
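_mlx4_ib_mcg_port_cleanup() above waits for the group table to drain by polling it for up to MAD_TIMEOUT_MS plus a grace period before force-cleaning whatever is left. The same bounded-poll shape in a standalone sketch; table_empty() is a hypothetical stub standing in for "no groups left in the rb-tree":

    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    static int groups_left = 5;
    static int table_empty(void) { return --groups_left <= 0; }

    /* poll until the table drains or the deadline passes */
    static int wait_for_drain(int timeout_ms)
    {
            struct timespec start, now;

            clock_gettime(CLOCK_MONOTONIC, &start);
            for (;;) {
                    if (table_empty())
                            return 0;
                    clock_gettime(CLOCK_MONOTONIC, &now);
                    if ((now.tv_sec - start.tv_sec) * 1000 +
                        (now.tv_nsec - start.tv_nsec) / 1000000 >= timeout_ms)
                            return -1;      /* still busy: force-clean */
                    usleep(1000);           /* like msleep(1) above */
            }
    }

    int main(void)
    {
            printf("drained: %s\n", wait_for_drain(2000) ? "no" : "yes");
            return 0;
    }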
destroy_workqueue(clean_wq); +} diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index ff36655d23d..369da3ca5d6 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -37,13 +37,26 @@ #include <linux/compiler.h> #include <linux/list.h> #include <linux/mutex.h> +#include <linux/idr.h> #include <rdma/ib_verbs.h> #include <rdma/ib_umem.h> +#include <rdma/ib_mad.h> +#include <rdma/ib_sa.h> #include <linux/mlx4/device.h> #include <linux/mlx4/doorbell.h> +#define MLX4_IB_DRV_NAME "mlx4_ib" + +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) "<" MLX4_IB_DRV_NAME "> %s: " fmt, __func__ + +#define mlx4_ib_warn(ibdev, format, arg...) \ + dev_warn((ibdev)->dma_device, MLX4_IB_DRV_NAME ": " format, ## arg) + enum { MLX4_IB_SQ_MIN_WQE_SHIFT = 6, MLX4_IB_MAX_HEADROOM = 2048 @@ -52,6 +65,11 @@ enum { #define MLX4_IB_SQ_HEADROOM(shift) ((MLX4_IB_MAX_HEADROOM >> (shift)) + 1) #define MLX4_IB_SQ_MAX_SPARE (MLX4_IB_SQ_HEADROOM(MLX4_IB_SQ_MIN_WQE_SHIFT)) +/*module param to indicate if SM assigns the alias_GUID*/ +extern int mlx4_ib_sm_guid_assign; + +#define MLX4_IB_UC_STEER_QPN_ALIGN 1 +#define MLX4_IB_UC_MAX_NUM_QPS 256 struct mlx4_ib_ucontext { struct ib_ucontext ibucontext; struct mlx4_uar uar; @@ -74,6 +92,7 @@ struct mlx4_ib_xrcd { struct mlx4_ib_cq_buf { struct mlx4_buf buf; struct mlx4_mtt mtt; + int entry_size; }; struct mlx4_ib_cq_resize { @@ -99,6 +118,11 @@ struct mlx4_ib_mr { struct ib_umem *umem; }; +struct mlx4_ib_mw { + struct ib_mw ibmw; + struct mlx4_mw mmw; +}; + struct mlx4_ib_fast_reg_page_list { struct ib_fast_reg_page_list ibfrpl; __be64 *mapped_page_list; @@ -110,6 +134,12 @@ struct mlx4_ib_fmr { struct mlx4_fmr mfmr; }; +struct mlx4_ib_flow { + struct ib_flow ibflow; + /* translating DMFS verbs sniffer rule to FW API requires two reg IDs */ + u64 reg_id[2]; +}; + struct mlx4_ib_wq { u64 *wrid; spinlock_t lock; @@ -123,8 +153,12 @@ struct mlx4_ib_wq { }; enum mlx4_ib_qp_flags { - MLX4_IB_QP_LSO = 1 << 0, - MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 1, + MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, + MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP, + MLX4_IB_QP_CREATE_USE_GFP_NOIO = IB_QP_CREATE_USE_GFP_NOIO, + MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30, + MLX4_IB_SRIOV_SQP = 1 << 31, }; struct mlx4_ib_gid_entry { @@ -134,6 +168,96 @@ struct mlx4_ib_gid_entry { u8 port; }; +enum mlx4_ib_qp_type { + /* + * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries + * here (and in that order) since the MAD layer uses them as + * indices into a 2-entry table. 
+ */ + MLX4_IB_QPT_SMI = IB_QPT_SMI, + MLX4_IB_QPT_GSI = IB_QPT_GSI, + + MLX4_IB_QPT_RC = IB_QPT_RC, + MLX4_IB_QPT_UC = IB_QPT_UC, + MLX4_IB_QPT_UD = IB_QPT_UD, + MLX4_IB_QPT_RAW_IPV6 = IB_QPT_RAW_IPV6, + MLX4_IB_QPT_RAW_ETHERTYPE = IB_QPT_RAW_ETHERTYPE, + MLX4_IB_QPT_RAW_PACKET = IB_QPT_RAW_PACKET, + MLX4_IB_QPT_XRC_INI = IB_QPT_XRC_INI, + MLX4_IB_QPT_XRC_TGT = IB_QPT_XRC_TGT, + + MLX4_IB_QPT_PROXY_SMI_OWNER = 1 << 16, + MLX4_IB_QPT_PROXY_SMI = 1 << 17, + MLX4_IB_QPT_PROXY_GSI = 1 << 18, + MLX4_IB_QPT_TUN_SMI_OWNER = 1 << 19, + MLX4_IB_QPT_TUN_SMI = 1 << 20, + MLX4_IB_QPT_TUN_GSI = 1 << 21, +}; + +#define MLX4_IB_QPT_ANY_SRIOV (MLX4_IB_QPT_PROXY_SMI_OWNER | \ + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER | \ + MLX4_IB_QPT_TUN_SMI | MLX4_IB_QPT_TUN_GSI) + +enum mlx4_ib_mad_ifc_flags { + MLX4_MAD_IFC_IGNORE_MKEY = 1, + MLX4_MAD_IFC_IGNORE_BKEY = 2, + MLX4_MAD_IFC_IGNORE_KEYS = (MLX4_MAD_IFC_IGNORE_MKEY | + MLX4_MAD_IFC_IGNORE_BKEY), + MLX4_MAD_IFC_NET_VIEW = 4, +}; + +enum { + MLX4_NUM_TUNNEL_BUFS = 256, +}; + +struct mlx4_ib_tunnel_header { + struct mlx4_av av; + __be32 remote_qpn; + __be32 qkey; + __be16 vlan; + u8 mac[6]; + __be16 pkey_index; + u8 reserved[6]; +}; + +struct mlx4_ib_buf { + void *addr; + dma_addr_t map; +}; + +struct mlx4_rcv_tunnel_hdr { + __be32 flags_src_qp; /* flags[6:5] is defined for VLANs: + * 0x0 - no vlan was in the packet + * 0x01 - C-VLAN was in the packet */ + u8 g_ml_path; /* gid bit stands for ipv6/4 header in RoCE */ + u8 reserved; + __be16 pkey_index; + __be16 sl_vid; + __be16 slid_mac_47_32; + __be32 mac_31_0; +}; + +struct mlx4_ib_proxy_sqp_hdr { + struct ib_grh grh; + struct mlx4_rcv_tunnel_hdr tun; +} __packed; + +struct mlx4_roce_smac_vlan_info { + u64 smac; + int smac_index; + int smac_port; + u64 candidate_smac; + int candidate_smac_index; + int candidate_smac_port; + u16 vid; + int vlan_index; + int vlan_port; + u16 candidate_vid; + int candidate_vlan_index; + int candidate_vlan_port; + int update_vid; +}; + struct mlx4_ib_qp { struct ib_qp ibqp; struct mlx4_qp mqp; @@ -149,6 +273,7 @@ struct mlx4_ib_qp { int sq_spare_wqes; struct mlx4_ib_wq sq; + enum mlx4_ib_qp_type mlx4_ib_qp_type; struct ib_umem *umem; struct mlx4_mtt mtt; int buf_size; @@ -163,6 +288,11 @@ struct mlx4_ib_qp { u8 state; int mlx_type; struct list_head gid_list; + struct list_head steering_rules; + struct mlx4_ib_buf *sqp_proxy_rcv; + struct mlx4_roce_smac_vlan_info pri; + struct mlx4_roce_smac_vlan_info alt; + u64 reg_id; }; struct mlx4_ib_srq { @@ -185,13 +315,184 @@ struct mlx4_ib_ah { union mlx4_ext_av av; }; +/****************************************/ +/* alias guid support */ +/****************************************/ +#define NUM_PORT_ALIAS_GUID 2 +#define NUM_ALIAS_GUID_IN_REC 8 +#define NUM_ALIAS_GUID_REC_IN_PORT 16 +#define GUID_REC_SIZE 8 +#define NUM_ALIAS_GUID_PER_PORT 128 +#define MLX4_NOT_SET_GUID (0x00LL) +#define MLX4_GUID_FOR_DELETE_VAL (~(0x00LL)) + +enum mlx4_guid_alias_rec_status { + MLX4_GUID_INFO_STATUS_IDLE, + MLX4_GUID_INFO_STATUS_SET, + MLX4_GUID_INFO_STATUS_PENDING, +}; + +enum mlx4_guid_alias_rec_ownership { + MLX4_GUID_DRIVER_ASSIGN, + MLX4_GUID_SYSADMIN_ASSIGN, + MLX4_GUID_NONE_ASSIGN, /*init state of each record*/ +}; + +enum mlx4_guid_alias_rec_method { + MLX4_GUID_INFO_RECORD_SET = IB_MGMT_METHOD_SET, + MLX4_GUID_INFO_RECORD_DELETE = IB_SA_METHOD_DELETE, +}; + +struct mlx4_sriov_alias_guid_info_rec_det { + u8 all_recs[GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC]; + ib_sa_comp_mask guid_indexes; /*indicates what from the 8 
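The mlx4_ib_qp_type values above keep the low range identical to the core ib_qp_type numbers and put the SR-IOV proxy/tunnel variants in high bits, so one mask such as MLX4_IB_QPT_ANY_SRIOV covers all of them. A compact illustration with stand-in names and values chosen only for the example:

    #include <stdio.h>

    enum qp_type {
            QPT_SMI                 = 0,
            QPT_GSI                 = 1,
            QPT_UD                  = 4,
            QPT_PROXY_SMI_OWNER     = 1 << 16,
            QPT_PROXY_SMI           = 1 << 17,
            QPT_PROXY_GSI           = 1 << 18,
            QPT_TUN_SMI_OWNER       = 1 << 19,
            QPT_TUN_SMI             = 1 << 20,
            QPT_TUN_GSI             = 1 << 21,
    };

    #define QPT_ANY_SRIOV   (QPT_PROXY_SMI_OWNER | QPT_PROXY_SMI | QPT_PROXY_GSI | \
                             QPT_TUN_SMI_OWNER | QPT_TUN_SMI | QPT_TUN_GSI)

    int main(void)
    {
            enum qp_type t = QPT_PROXY_GSI;

            if (t & QPT_ANY_SRIOV)          /* one test covers all six variants */
                    printf("special handling for SR-IOV QP type 0x%x\n", t);
            return 0;
    }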
records are valid*/ + enum mlx4_guid_alias_rec_status status; /*indicates the administraively status of the record.*/ + u8 method; /*set or delete*/ + enum mlx4_guid_alias_rec_ownership ownership; /*indicates who assign that alias_guid record*/ +}; + +struct mlx4_sriov_alias_guid_port_rec_det { + struct mlx4_sriov_alias_guid_info_rec_det all_rec_per_port[NUM_ALIAS_GUID_REC_IN_PORT]; + struct workqueue_struct *wq; + struct delayed_work alias_guid_work; + u8 port; + struct mlx4_sriov_alias_guid *parent; + struct list_head cb_list; +}; + +struct mlx4_sriov_alias_guid { + struct mlx4_sriov_alias_guid_port_rec_det ports_guid[MLX4_MAX_PORTS]; + spinlock_t ag_work_lock; + struct ib_sa_client *sa_client; +}; + +struct mlx4_ib_demux_work { + struct work_struct work; + struct mlx4_ib_dev *dev; + int slave; + int do_init; + u8 port; + +}; + +struct mlx4_ib_tun_tx_buf { + struct mlx4_ib_buf buf; + struct ib_ah *ah; +}; + +struct mlx4_ib_demux_pv_qp { + struct ib_qp *qp; + enum ib_qp_type proxy_qpt; + struct mlx4_ib_buf *ring; + struct mlx4_ib_tun_tx_buf *tx_ring; + spinlock_t tx_lock; + unsigned tx_ix_head; + unsigned tx_ix_tail; +}; + +enum mlx4_ib_demux_pv_state { + DEMUX_PV_STATE_DOWN, + DEMUX_PV_STATE_STARTING, + DEMUX_PV_STATE_ACTIVE, + DEMUX_PV_STATE_DOWNING, +}; + +struct mlx4_ib_demux_pv_ctx { + int port; + int slave; + enum mlx4_ib_demux_pv_state state; + int has_smi; + struct ib_device *ib_dev; + struct ib_cq *cq; + struct ib_pd *pd; + struct ib_mr *mr; + struct work_struct work; + struct workqueue_struct *wq; + struct mlx4_ib_demux_pv_qp qp[2]; +}; + +struct mlx4_ib_demux_ctx { + struct ib_device *ib_dev; + int port; + struct workqueue_struct *wq; + struct workqueue_struct *ud_wq; + spinlock_t ud_lock; + __be64 subnet_prefix; + __be64 guid_cache[128]; + struct mlx4_ib_dev *dev; + /* the following lock protects both mcg_table and mcg_mgid0_list */ + struct mutex mcg_table_lock; + struct rb_root mcg_table; + struct list_head mcg_mgid0_list; + struct workqueue_struct *mcg_wq; + struct mlx4_ib_demux_pv_ctx **tun; + atomic_t tid; + int flushing; /* flushing the work queue */ +}; + +struct mlx4_ib_sriov { + struct mlx4_ib_demux_ctx demux[MLX4_MAX_PORTS]; + struct mlx4_ib_demux_pv_ctx *sqps[MLX4_MAX_PORTS]; + /* when using this spinlock you should use "irq" because + * it may be called from interrupt context.*/ + spinlock_t going_down_lock; + int is_going_down; + + struct mlx4_sriov_alias_guid alias_guid; + + /* CM paravirtualization fields */ + struct list_head cm_list; + spinlock_t id_map_lock; + struct rb_root sl_id_map; + struct idr pv_id_table; +}; + struct mlx4_ib_iboe { spinlock_t lock; struct net_device *netdevs[MLX4_MAX_PORTS]; + struct net_device *masters[MLX4_MAX_PORTS]; struct notifier_block nb; + struct notifier_block nb_inet; + struct notifier_block nb_inet6; union ib_gid gid_table[MLX4_MAX_PORTS][128]; }; +struct pkey_mgt { + u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + u16 phys_pkey_cache[MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + struct list_head pkey_port_list[MLX4_MFUNC_MAX]; + struct kobject *device_parent[MLX4_MFUNC_MAX]; +}; + +struct mlx4_ib_iov_sysfs_attr { + void *ctx; + struct kobject *kobj; + unsigned long data; + u32 entry_num; + char name[15]; + struct device_attribute dentry; + struct device *dev; +}; + +struct mlx4_ib_iov_sysfs_attr_ar { + struct mlx4_ib_iov_sysfs_attr dentries[3 * NUM_ALIAS_GUID_PER_PORT + 1]; +}; + +struct mlx4_ib_iov_port { + char name[100]; + u8 num; + struct mlx4_ib_dev *dev; + struct list_head list; + struct 
mlx4_ib_iov_sysfs_attr_ar *dentr_ar; + struct ib_port_attr attr; + struct kobject *cur_port; + struct kobject *admin_alias_parent; + struct kobject *gids_parent; + struct kobject *pkeys_parent; + struct kobject *mcgs_parent; + struct mlx4_ib_iov_sysfs_attr mcg_dentry; +}; + struct mlx4_ib_dev { struct ib_device ib_dev; struct mlx4_dev *dev; @@ -205,6 +506,7 @@ struct mlx4_ib_dev { struct ib_mad_agent *send_agent[MLX4_MAX_PORTS][2]; struct ib_ah *sm_ah[MLX4_MAX_PORTS]; spinlock_t sm_lock; + struct mlx4_ib_sriov sriov; struct mutex cap_mask_mutex; bool ib_active; @@ -212,6 +514,31 @@ struct mlx4_ib_dev { int counters[MLX4_MAX_PORTS]; int *eq_table; int eq_added; + struct kobject *iov_parent; + struct kobject *ports_parent; + struct kobject *dev_ports_parent[MLX4_MFUNC_MAX]; + struct mlx4_ib_iov_port iov_ports[MLX4_MAX_PORTS]; + struct pkey_mgt pkeys; + unsigned long *ib_uc_qpns_bitmap; + int steer_qpn_count; + int steer_qpn_base; + int steering_support; + struct mlx4_ib_qp *qp1_proxy[MLX4_MAX_PORTS]; + /* lock when destroying qp1_proxy and getting netdev events */ + struct mutex qp1_proxy_lock[MLX4_MAX_PORTS]; +}; + +struct ib_event_work { + struct work_struct work; + struct mlx4_ib_dev *ib_dev; + struct mlx4_eqe ib_eqe; +}; + +struct mlx4_ib_qp_tunnel_init_attr { + struct ib_qp_init_attr init_attr; + int slave; + enum ib_qp_type proxy_qp_type; + u8 port; }; static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) @@ -249,6 +576,11 @@ static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr) return container_of(ibmr, struct mlx4_ib_mr, ibmr); } +static inline struct mlx4_ib_mw *to_mmw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct mlx4_ib_mw, ibmw); +} + static inline struct mlx4_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl) { return container_of(ibfrpl, struct mlx4_ib_fast_reg_page_list, ibfrpl); @@ -258,6 +590,12 @@ static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr) { return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr); } + +static inline struct mlx4_ib_flow *to_mflow(struct ib_flow *ibflow) +{ + return container_of(ibflow, struct mlx4_ib_flow, ibflow); +} + static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp) { return container_of(ibqp, struct mlx4_ib_qp, ibqp); @@ -283,6 +621,9 @@ static inline struct mlx4_ib_ah *to_mah(struct ib_ah *ibah) return container_of(ibah, struct mlx4_ib_ah, ibah); } +int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev); +void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev); + int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, struct mlx4_db *db); void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db); @@ -294,6 +635,10 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); int mlx4_ib_dereg_mr(struct ib_mr *mr); +struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); +int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, + struct ib_mw_bind *mw_bind); +int mlx4_ib_dealloc_mw(struct ib_mw *mw); struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len); struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, @@ -339,7 +684,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); -int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey, +int mlx4_MAD_IFC(struct mlx4_ib_dev 
*dev, int mad_ifc_flags, int port, struct ib_wc *in_wc, struct ib_grh *in_grh, void *in_mad, void *response_mad); int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, @@ -354,21 +699,94 @@ int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages, u64 iova); int mlx4_ib_unmap_fmr(struct list_head *fmr_list); int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr); +int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, int netw_view); +int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey, int netw_view); -int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, - u8 *mac, int *is_mcast, u8 port); +int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid, int netw_view); -static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah) +static inline bool mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah) { u8 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3; if (rdma_port_get_link_layer(ah->ibah.device, port) == IB_LINK_LAYER_ETHERNET) - return 1; + return true; return !!(ah->av.ib.g_slid & 0x80); } +int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx); +void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq); +void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave); +int mlx4_ib_mcg_init(void); +void mlx4_ib_mcg_destroy(void); + +int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid); + +int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *sa_mad); +int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *mad); + int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, union ib_gid *gid); +void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num, + enum ib_event_type type); + +void mlx4_ib_tunnels_update_work(struct work_struct *work); + +int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type qpt, struct ib_wc *wc, + struct ib_grh *grh, struct ib_mad *mad); + +int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, + u32 qkey, struct ib_ah_attr *attr, u8 *s_mac, + struct ib_mad *mad); + +__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx); + +int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, + struct ib_mad *mad); + +int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id, + struct ib_mad *mad); + +void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev); +void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave_id); + +/* alias guid support */ +void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port); +int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev); +void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev); +void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port); + +void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, + int block_num, + u8 port_num, u8 *p_data); + +void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, + int block_num, u8 port_num, + u8 *p_data); + +int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr); +void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr); +ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index); + +int 
mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *device) ; + +void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device); + +__be64 mlx4_ib_gen_node_guid(void); + +int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn); +void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count); +int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + int is_attach); + #endif /* MLX4_IB_H */ diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index bbaf6176f20..cb2a8727f3f 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -41,9 +41,19 @@ static u32 convert_access(int acc) (acc & IB_ACCESS_REMOTE_WRITE ? MLX4_PERM_REMOTE_WRITE : 0) | (acc & IB_ACCESS_REMOTE_READ ? MLX4_PERM_REMOTE_READ : 0) | (acc & IB_ACCESS_LOCAL_WRITE ? MLX4_PERM_LOCAL_WRITE : 0) | + (acc & IB_ACCESS_MW_BIND ? MLX4_PERM_BIND_MW : 0) | MLX4_PERM_LOCAL_READ; } +static enum mlx4_mw_type to_mlx4_type(enum ib_mw_type type) +{ + switch (type) { + case IB_MW_TYPE_1: return MLX4_MW_TYPE_1; + case IB_MW_TYPE_2: return MLX4_MW_TYPE_2; + default: return -1; + } +} + struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc) { struct mlx4_ib_mr *mr; @@ -68,7 +78,7 @@ struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc) return &mr->ibmr; err_mr: - mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); + (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); err_free: kfree(mr); @@ -80,11 +90,11 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, struct ib_umem *umem) { u64 *pages; - struct ib_umem_chunk *chunk; - int i, j, k; + int i, k, entry; int n; int len; int err = 0; + struct scatterlist *sg; pages = (u64 *) __get_free_page(GFP_KERNEL); if (!pages) @@ -92,26 +102,25 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, i = n = 0; - list_for_each_entry(chunk, &umem->chunk_list, list) - for (j = 0; j < chunk->nmap; ++j) { - len = sg_dma_len(&chunk->page_list[j]) >> mtt->page_shift; - for (k = 0; k < len; ++k) { - pages[i++] = sg_dma_address(&chunk->page_list[j]) + - umem->page_size * k; - /* - * Be friendly to mlx4_write_mtt() and - * pass it chunks of appropriate size. - */ - if (i == PAGE_SIZE / sizeof (u64)) { - err = mlx4_write_mtt(dev->dev, mtt, n, - i, pages); - if (err) - goto out; - n += i; - i = 0; - } + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> mtt->page_shift; + for (k = 0; k < len; ++k) { + pages[i++] = sg_dma_address(sg) + + umem->page_size * k; + /* + * Be friendly to mlx4_write_mtt() and + * pass it chunks of appropriate size. 
+ */ + if (i == PAGE_SIZE / sizeof (u64)) { + err = mlx4_write_mtt(dev->dev, mtt, n, + i, pages); + if (err) + goto out; + n += i; + i = 0; } } + } if (i) err = mlx4_write_mtt(dev->dev, mtt, n, i, pages); @@ -163,7 +172,7 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return &mr->ibmr; err_mr: - mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); + (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); err_umem: ib_umem_release(mr->umem); @@ -177,8 +186,11 @@ err_free: int mlx4_ib_dereg_mr(struct ib_mr *ibmr) { struct mlx4_ib_mr *mr = to_mmr(ibmr); + int ret; - mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr); + ret = mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr); + if (ret) + return ret; if (mr->umem) ib_umem_release(mr->umem); kfree(mr); @@ -186,6 +198,70 @@ int mlx4_ib_dereg_mr(struct ib_mr *ibmr) return 0; } +struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_mw *mw; + int err; + + mw = kmalloc(sizeof(*mw), GFP_KERNEL); + if (!mw) + return ERR_PTR(-ENOMEM); + + err = mlx4_mw_alloc(dev->dev, to_mpd(pd)->pdn, + to_mlx4_type(type), &mw->mmw); + if (err) + goto err_free; + + err = mlx4_mw_enable(dev->dev, &mw->mmw); + if (err) + goto err_mw; + + mw->ibmw.rkey = mw->mmw.key; + + return &mw->ibmw; + +err_mw: + mlx4_mw_free(dev->dev, &mw->mmw); + +err_free: + kfree(mw); + + return ERR_PTR(err); +} + +int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, + struct ib_mw_bind *mw_bind) +{ + struct ib_send_wr wr; + struct ib_send_wr *bad_wr; + int ret; + + memset(&wr, 0, sizeof(wr)); + wr.opcode = IB_WR_BIND_MW; + wr.wr_id = mw_bind->wr_id; + wr.send_flags = mw_bind->send_flags; + wr.wr.bind_mw.mw = mw; + wr.wr.bind_mw.bind_info = mw_bind->bind_info; + wr.wr.bind_mw.rkey = ib_inc_rkey(mw->rkey); + + ret = mlx4_ib_post_send(qp, &wr, &bad_wr); + if (!ret) + mw->rkey = wr.wr.bind_mw.rkey; + + return ret; +} + +int mlx4_ib_dealloc_mw(struct ib_mw *ibmw) +{ + struct mlx4_ib_mw *mw = to_mmw(ibmw); + + mlx4_mw_free(to_mdev(ibmw->device)->dev, &mw->mmw); + kfree(mw); + + return 0; +} + struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len) { @@ -212,7 +288,7 @@ struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, return &mr->ibmr; err_mr: - mlx4_mr_free(dev->dev, &mr->mmr); + (void) mlx4_mr_free(dev->dev, &mr->mmr); err_free: kfree(mr); @@ -291,7 +367,7 @@ struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc, return &fmr->ibfmr; err_mr: - mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr); + (void) mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr); err_free: kfree(fmr); diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 8d4ed24aef9..67780452f0c 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -38,6 +38,7 @@ #include <rdma/ib_cache.h> #include <rdma/ib_pack.h> #include <rdma/ib_addr.h> +#include <rdma/ib_mad.h> #include <linux/mlx4/qp.h> @@ -89,6 +90,21 @@ enum { MLX4_RAW_QP_MSGMAX = 31, }; +#ifndef ETH_ALEN +#define ETH_ALEN 6 +#endif +static inline u64 mlx4_mac_to_u64(u8 *addr) +{ + u64 mac = 0; + int i; + + for (i = 0; i < ETH_ALEN; i++) { + mac <<= 8; + mac |= addr[i]; + } + return mac; +} + static const __be32 mlx4_ib_opcode[] = { [IB_WR_SEND] = cpu_to_be32(MLX4_OPCODE_SEND), [IB_WR_LSO] = cpu_to_be32(MLX4_OPCODE_LSO), @@ -103,6 +119,7 @@ static const __be32 mlx4_ib_opcode[] = { [IB_WR_FAST_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR), 
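mlx4_ib_umem_write_mtt() above gathers per-page DMA addresses and flushes them to mlx4_write_mtt() one page worth of u64 entries at a time. A userspace model of that batching; write_mtt() is a hypothetical stand-in and the page size is fixed at 4 KiB for the example:

    #include <stdint.h>
    #include <stdio.h>

    #define BATCH   (4096 / sizeof(uint64_t))       /* one page of addresses */

    /* stand-in for mlx4_write_mtt(): writes n addresses at MTT offset off */
    static void write_mtt(int off, int n, const uint64_t *pages)
    {
            printf("write %d entries at offset %d\n", n, off);
    }

    /* accumulate per-page DMA addresses and flush in page-sized batches */
    static void write_umem(const uint64_t *dma_addr, const int *len_pages, int nsg)
    {
            uint64_t pages[BATCH];
            int i = 0, n = 0;

            for (int s = 0; s < nsg; s++) {
                    for (int k = 0; k < len_pages[s]; k++) {
                            pages[i++] = dma_addr[s] + 4096ULL * k;
                            if (i == BATCH) {
                                    write_mtt(n, i, pages);
                                    n += i;
                                    i = 0;
                            }
                    }
            }
            if (i)
                    write_mtt(n, i, pages);
    }

    int main(void)
    {
            uint64_t addr[2] = { 0x100000, 0x900000 };
            int len[2] = { 3, 600 };        /* pages per scatterlist entry */

            write_umem(addr, len, 2);
            return 0;
    }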
[IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS), [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA), + [IB_WR_BIND_MW] = cpu_to_be32(MLX4_OPCODE_BIND_MW), }; static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) @@ -110,16 +127,62 @@ static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) return container_of(mqp, struct mlx4_ib_sqp, qp); } +static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + if (!mlx4_is_master(dev->dev)) + return 0; + + return qp->mqp.qpn >= dev->dev->phys_caps.base_tunnel_sqpn && + qp->mqp.qpn < dev->dev->phys_caps.base_tunnel_sqpn + + 8 * MLX4_MFUNC_MAX; +} + static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { - return qp->mqp.qpn >= dev->dev->caps.sqp_start && - qp->mqp.qpn <= dev->dev->caps.sqp_start + 3; + int proxy_sqp = 0; + int real_sqp = 0; + int i; + /* PPF or Native -- real SQP */ + real_sqp = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) && + qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn && + qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 3); + if (real_sqp) + return 1; + /* VF or PF -- proxy SQP */ + if (mlx4_is_mfunc(dev->dev)) { + for (i = 0; i < dev->dev->caps.num_ports; i++) { + if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i] || + qp->mqp.qpn == dev->dev->caps.qp1_proxy[i]) { + proxy_sqp = 1; + break; + } + } + } + return proxy_sqp; } +/* used for INIT/CLOSE port logic */ static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { - return qp->mqp.qpn >= dev->dev->caps.sqp_start && - qp->mqp.qpn <= dev->dev->caps.sqp_start + 1; + int proxy_qp0 = 0; + int real_qp0 = 0; + int i; + /* PPF or Native -- real QP0 */ + real_qp0 = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) && + qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn && + qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 1); + if (real_qp0) + return 1; + /* VF or PF -- proxy QP0 */ + if (mlx4_is_mfunc(dev->dev)) { + for (i = 0; i < dev->dev->caps.num_ports; i++) { + if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i]) { + proxy_qp0 = 1; + break; + } + } + } + return proxy_qp0; } static void *get_wqe(struct mlx4_ib_qp *qp, int offset) @@ -270,7 +333,7 @@ static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) } } -static int send_wqe_overhead(enum ib_qp_type type, u32 flags) +static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags) { /* * UD WQEs must have a datagram segment. @@ -279,19 +342,29 @@ static int send_wqe_overhead(enum ib_qp_type type, u32 flags) * header and space for the ICRC). */ switch (type) { - case IB_QPT_UD: + case MLX4_IB_QPT_UD: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_datagram_seg) + ((flags & MLX4_IB_QP_LSO) ? 
MLX4_IB_LSO_HEADER_SPARE : 0); - case IB_QPT_UC: + case MLX4_IB_QPT_PROXY_SMI_OWNER: + case MLX4_IB_QPT_PROXY_SMI: + case MLX4_IB_QPT_PROXY_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg) + 64; + case MLX4_IB_QPT_TUN_SMI_OWNER: + case MLX4_IB_QPT_TUN_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg); + + case MLX4_IB_QPT_UC: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_raddr_seg); - case IB_QPT_RC: + case MLX4_IB_QPT_RC: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_atomic_seg) + sizeof (struct mlx4_wqe_raddr_seg); - case IB_QPT_SMI: - case IB_QPT_GSI: + case MLX4_IB_QPT_SMI: + case MLX4_IB_QPT_GSI: return sizeof (struct mlx4_wqe_ctrl_seg) + ALIGN(MLX4_IB_UD_HEADER_SIZE + DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE, @@ -345,7 +418,7 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, } static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, - enum ib_qp_type type, struct mlx4_ib_qp *qp) + enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp) { int s; @@ -360,7 +433,8 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, * For MLX transport we need 2 extra S/G entries: * one for the header and one for the checksum at the end */ - if ((type == IB_QPT_SMI || type == IB_QPT_GSI) && + if ((type == MLX4_IB_QPT_SMI || type == MLX4_IB_QPT_GSI || + type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) && cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) return -EINVAL; @@ -404,7 +478,9 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, */ if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC && qp->sq_signal_bits && BITS_PER_LONG == 64 && - type != IB_QPT_SMI && type != IB_QPT_GSI) + type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI && + !(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | + MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) qp->sq.wqe_shift = ilog2(64); else qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s)); @@ -476,6 +552,54 @@ static int set_user_sq_size(struct mlx4_ib_dev *dev, return 0; } +static int alloc_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) +{ + int i; + + qp->sqp_proxy_rcv = + kmalloc(sizeof (struct mlx4_ib_buf) * qp->rq.wqe_cnt, + GFP_KERNEL); + if (!qp->sqp_proxy_rcv) + return -ENOMEM; + for (i = 0; i < qp->rq.wqe_cnt; i++) { + qp->sqp_proxy_rcv[i].addr = + kmalloc(sizeof (struct mlx4_ib_proxy_sqp_hdr), + GFP_KERNEL); + if (!qp->sqp_proxy_rcv[i].addr) + goto err; + qp->sqp_proxy_rcv[i].map = + ib_dma_map_single(dev, qp->sqp_proxy_rcv[i].addr, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + } + return 0; + +err: + while (i > 0) { + --i; + ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + kfree(qp->sqp_proxy_rcv[i].addr); + } + kfree(qp->sqp_proxy_rcv); + qp->sqp_proxy_rcv = NULL; + return -ENOMEM; +} + +static void free_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) +{ + int i; + + for (i = 0; i < qp->rq.wqe_cnt; i++) { + ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + kfree(qp->sqp_proxy_rcv[i].addr); + } + kfree(qp->sqp_proxy_rcv); +} + static int qp_has_rq(struct ib_qp_init_attr *attr) { if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT) @@ -484,17 +608,95 @@ static int qp_has_rq(struct ib_qp_init_attr *attr) return !attr->srq; } +static int 
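alloc_proxy_bufs() above allocates and DMA-maps one small header buffer per receive WQE and unwinds the partially built ring on any failure. The same allocate-then-unwind shape in a userspace sketch (DMA mapping omitted, names hypothetical):

    #include <stdlib.h>

    struct buf { void *addr; };

    /* allocate one header buffer per RQ entry, undoing everything on failure */
    static struct buf *alloc_ring(int wqe_cnt)
    {
            struct buf *ring = calloc(wqe_cnt, sizeof(*ring));
            int i;

            if (!ring)
                    return NULL;
            for (i = 0; i < wqe_cnt; i++) {
                    ring[i].addr = malloc(64);
                    if (!ring[i].addr)
                            goto err;
            }
            return ring;

    err:
            while (i > 0)
                    free(ring[--i].addr);
            free(ring);
            return NULL;
    }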
qp0_enabled_vf(struct mlx4_dev *dev, int qpn) +{ + int i; + for (i = 0; i < dev->caps.num_ports; i++) { + if (qpn == dev->caps.qp0_proxy[i]) + return !!dev->caps.qp0_qkey[i]; + } + return 0; +} + static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, - struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp) + struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp, + gfp_t gfp) { int qpn; int err; + struct mlx4_ib_sqp *sqp; + struct mlx4_ib_qp *qp; + enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type; + + /* When tunneling special qps, we use a plain UD qp */ + if (sqpn) { + if (mlx4_is_mfunc(dev->dev) && + (!mlx4_is_master(dev->dev) || + !(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) { + if (init_attr->qp_type == IB_QPT_GSI) + qp_type = MLX4_IB_QPT_PROXY_GSI; + else { + if (mlx4_is_master(dev->dev) || + qp0_enabled_vf(dev->dev, sqpn)) + qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER; + else + qp_type = MLX4_IB_QPT_PROXY_SMI; + } + } + qpn = sqpn; + /* add extra sg entry for tunneling */ + init_attr->cap.max_recv_sge++; + } else if (init_attr->create_flags & MLX4_IB_SRIOV_TUNNEL_QP) { + struct mlx4_ib_qp_tunnel_init_attr *tnl_init = + container_of(init_attr, + struct mlx4_ib_qp_tunnel_init_attr, init_attr); + if ((tnl_init->proxy_qp_type != IB_QPT_SMI && + tnl_init->proxy_qp_type != IB_QPT_GSI) || + !mlx4_is_master(dev->dev)) + return -EINVAL; + if (tnl_init->proxy_qp_type == IB_QPT_GSI) + qp_type = MLX4_IB_QPT_TUN_GSI; + else if (tnl_init->slave == mlx4_master_func_num(dev->dev) || + mlx4_vf_smi_enabled(dev->dev, tnl_init->slave, + tnl_init->port)) + qp_type = MLX4_IB_QPT_TUN_SMI_OWNER; + else + qp_type = MLX4_IB_QPT_TUN_SMI; + /* we are definitely in the PPF here, since we are creating + * tunnel QPs. base_tunnel_sqpn is therefore valid. 
*/ + qpn = dev->dev->phys_caps.base_tunnel_sqpn + 8 * tnl_init->slave + + tnl_init->proxy_qp_type * 2 + tnl_init->port - 1; + sqpn = qpn; + } + + if (!*caller_qp) { + if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI || + (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) { + sqp = kzalloc(sizeof (struct mlx4_ib_sqp), gfp); + if (!sqp) + return -ENOMEM; + qp = &sqp->qp; + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + } else { + qp = kzalloc(sizeof (struct mlx4_ib_qp), gfp); + if (!qp) + return -ENOMEM; + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + } + } else + qp = *caller_qp; + + qp->mlx4_ib_qp_type = qp_type; mutex_init(&qp->mutex); spin_lock_init(&qp->sq.lock); spin_lock_init(&qp->rq.lock); INIT_LIST_HEAD(&qp->gid_list); + INIT_LIST_HEAD(&qp->steering_rules); qp->state = IB_QPS_RESET; if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) @@ -549,19 +751,27 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) qp->flags |= MLX4_IB_QP_LSO; - err = set_kernel_sq_size(dev, &init_attr->cap, init_attr->qp_type, qp); + if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { + if (dev->steering_support == + MLX4_STEERING_MODE_DEVICE_MANAGED) + qp->flags |= MLX4_IB_QP_NETIF; + else + goto err; + } + + err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp); if (err) goto err; if (qp_has_rq(init_attr)) { - err = mlx4_db_alloc(dev->dev, &qp->db, 0); + err = mlx4_db_alloc(dev->dev, &qp->db, 0, gfp); if (err) goto err; *qp->db.db = 0; } - if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf)) { + if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf, gfp)) { err = -ENOMEM; goto err_db; } @@ -571,13 +781,12 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf); + err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf, gfp); if (err) goto err_mtt; - qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), GFP_KERNEL); - qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), GFP_KERNEL); - + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp); + qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp); if (!qp->sq.wrid || !qp->rq.wrid) { err = -ENOMEM; goto err_wrid; @@ -585,19 +794,29 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, } if (sqpn) { - qpn = sqpn; + if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) { + if (alloc_proxy_bufs(pd->device, qp)) { + err = -ENOMEM; + goto err_wrid; + } + } } else { /* Raw packet QPNs must be aligned to 8 bits. If not, the WQE * BlueFlame setup flow wrongly causes VLAN insertion. 
*/ if (init_attr->qp_type == IB_QPT_RAW_PACKET) err = mlx4_qp_reserve_range(dev->dev, 1, 1 << 8, &qpn); else - err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn); + if (qp->flags & MLX4_IB_QP_NETIF) + err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn); + else + err = mlx4_qp_reserve_range(dev->dev, 1, 1, + &qpn); if (err) - goto err_wrid; + goto err_proxy; } - err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp); + err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp, gfp); if (err) goto err_qpn; @@ -612,13 +831,20 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); qp->mqp.event = mlx4_ib_qp_event; - + if (!*caller_qp) + *caller_qp = qp; return 0; err_qpn: - if (!sqpn) - mlx4_qp_release_range(dev->dev, qpn, 1); - + if (!sqpn) { + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_free(dev, qpn, 1); + else + mlx4_qp_release_range(dev->dev, qpn, 1); + } +err_proxy: + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) + free_proxy_bufs(pd->device, qp); err_wrid: if (pd->uobject) { if (qp_has_rq(init_attr)) @@ -642,6 +868,8 @@ err_db: mlx4_db_free(dev->dev, &qp->db); err: + if (!*caller_qp) + kfree(qp); return err; } @@ -731,11 +959,32 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, { struct mlx4_ib_cq *send_cq, *recv_cq; - if (qp->state != IB_QPS_RESET) + if (qp->state != IB_QPS_RESET) { if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp)) pr_warn("modify QP %06x to RESET failed.\n", qp->mqp.qpn); + if (qp->pri.smac) { + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = 0; + } + if (qp->alt.smac) { + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = 0; + } + if (qp->pri.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); + qp->pri.vid = 0xFFFF; + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + if (qp->alt.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); + qp->alt.vid = 0xFFFF; + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + } get_cqs(qp, &send_cq, &recv_cq); @@ -754,8 +1003,12 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, mlx4_qp_free(dev->dev, &qp->mqp); - if (!is_sqp(dev, qp)) - mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); + if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) { + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_free(dev, qp->mqp.qpn, 1); + else + mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); + } mlx4_mtt_cleanup(dev->dev, &qp->mtt); @@ -767,6 +1020,9 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, } else { kfree(qp->sq.wrid); kfree(qp->rq.wrid); + if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) + free_proxy_bufs(&dev->ib_dev, qp); mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); if (qp->rq.wqe_cnt) mlx4_db_free(dev->dev, &qp->db); @@ -775,25 +1031,57 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, del_gid_entries(qp); } +static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr) +{ + /* Native or PPF */ + if (!mlx4_is_mfunc(dev->dev) || + (mlx4_is_master(dev->dev) && + attr->create_flags & MLX4_IB_SRIOV_SQP)) { + return dev->dev->phys_caps.base_sqpn + + (attr->qp_type == IB_QPT_SMI ? 
0 : 2) + + attr->port_num - 1; + } + /* PF or VF -- creating proxies */ + if (attr->qp_type == IB_QPT_SMI) + return dev->dev->caps.qp0_proxy[attr->port_num - 1]; + else + return dev->dev->caps.qp1_proxy[attr->port_num - 1]; +} + struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata) { - struct mlx4_ib_sqp *sqp; - struct mlx4_ib_qp *qp; + struct mlx4_ib_qp *qp = NULL; int err; u16 xrcdn = 0; + gfp_t gfp; + gfp = (init_attr->create_flags & MLX4_IB_QP_CREATE_USE_GFP_NOIO) ? + GFP_NOIO : GFP_KERNEL; /* - * We only support LSO and multicast loopback blocking, and - * only for kernel UD QPs. + * We only support LSO, vendor flag1, and multicast loopback blocking, + * and only for kernel UD QPs. */ - if (init_attr->create_flags & ~(IB_QP_CREATE_IPOIB_UD_LSO | - IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)) + if (init_attr->create_flags & ~(MLX4_IB_QP_LSO | + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK | + MLX4_IB_SRIOV_TUNNEL_QP | + MLX4_IB_SRIOV_SQP | + MLX4_IB_QP_NETIF | + MLX4_IB_QP_CREATE_USE_GFP_NOIO)) return ERR_PTR(-EINVAL); + if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { + if (init_attr->qp_type != IB_QPT_UD) + return ERR_PTR(-EINVAL); + } + if (init_attr->create_flags && - (udata || init_attr->qp_type != IB_QPT_UD)) + (udata || + ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP | MLX4_IB_QP_CREATE_USE_GFP_NOIO)) && + init_attr->qp_type != IB_QPT_UD) || + ((init_attr->create_flags & MLX4_IB_SRIOV_SQP) && + init_attr->qp_type > IB_QPT_GSI))) return ERR_PTR(-EINVAL); switch (init_attr->qp_type) { @@ -809,18 +1097,19 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, /* fall through */ case IB_QPT_RC: case IB_QPT_UC: - case IB_QPT_UD: case IB_QPT_RAW_PACKET: - { - qp = kzalloc(sizeof *qp, GFP_KERNEL); + qp = kzalloc(sizeof *qp, gfp); if (!qp) return ERR_PTR(-ENOMEM); - - err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, 0, qp); - if (err) { - kfree(qp); + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + /* fall through */ + case IB_QPT_UD: + { + err = create_qp_common(to_mdev(pd->device), pd, init_attr, + udata, 0, &qp, gfp); + if (err) return ERR_PTR(err); - } qp->ibqp.qp_num = qp->mqp.qpn; qp->xrcdn = xrcdn; @@ -834,21 +1123,11 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, if (udata) return ERR_PTR(-EINVAL); - sqp = kzalloc(sizeof *sqp, GFP_KERNEL); - if (!sqp) - return ERR_PTR(-ENOMEM); - - qp = &sqp->qp; - err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, - to_mdev(pd->device)->dev->caps.sqp_start + - (init_attr->qp_type == IB_QPT_SMI ? 0 : 2) + - init_attr->port_num - 1, - qp); - if (err) { - kfree(sqp); + get_sqp_num(to_mdev(pd->device), init_attr), + &qp, gfp); + if (err) return ERR_PTR(err); - } qp->port = init_attr->port_num; qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 
0 : 1; @@ -872,6 +1151,12 @@ int mlx4_ib_destroy_qp(struct ib_qp *qp) if (is_qp0(dev, mqp)) mlx4_CLOSE_PORT(dev->dev, mqp->port); + if (dev->qp1_proxy[mqp->port - 1] == mqp) { + mutex_lock(&dev->qp1_proxy_lock[mqp->port - 1]); + dev->qp1_proxy[mqp->port - 1] = NULL; + mutex_unlock(&dev->qp1_proxy_lock[mqp->port - 1]); + } + pd = get_pd(mqp); destroy_qp_common(dev, mqp, !!pd->ibpd.uobject); @@ -883,18 +1168,27 @@ int mlx4_ib_destroy_qp(struct ib_qp *qp) return 0; } -static int to_mlx4_st(enum ib_qp_type type) +static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type) { switch (type) { - case IB_QPT_RC: return MLX4_QP_ST_RC; - case IB_QPT_UC: return MLX4_QP_ST_UC; - case IB_QPT_UD: return MLX4_QP_ST_UD; - case IB_QPT_XRC_INI: - case IB_QPT_XRC_TGT: return MLX4_QP_ST_XRC; - case IB_QPT_SMI: - case IB_QPT_GSI: - case IB_QPT_RAW_PACKET: return MLX4_QP_ST_MLX; - default: return -1; + case MLX4_IB_QPT_RC: return MLX4_QP_ST_RC; + case MLX4_IB_QPT_UC: return MLX4_QP_ST_UC; + case MLX4_IB_QPT_UD: return MLX4_QP_ST_UD; + case MLX4_IB_QPT_XRC_INI: + case MLX4_IB_QPT_XRC_TGT: return MLX4_QP_ST_XRC; + case MLX4_IB_QPT_SMI: + case MLX4_IB_QPT_GSI: + case MLX4_IB_QPT_RAW_PACKET: return MLX4_QP_ST_MLX; + + case MLX4_IB_QPT_PROXY_SMI_OWNER: + case MLX4_IB_QPT_TUN_SMI_OWNER: return (mlx4_is_mfunc(dev->dev) ? + MLX4_QP_ST_MLX : -1); + case MLX4_IB_QPT_PROXY_SMI: + case MLX4_IB_QPT_TUN_SMI: + case MLX4_IB_QPT_PROXY_GSI: + case MLX4_IB_QPT_TUN_GSI: return (mlx4_is_mfunc(dev->dev) ? + MLX4_QP_ST_UD : -1); + default: return -1; } } @@ -944,16 +1238,16 @@ static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port) path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6); } -static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, - struct mlx4_qp_path *path, u8 port) +static int _mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, + u64 smac, u16 vlan_tag, struct mlx4_qp_path *path, + struct mlx4_roce_smac_vlan_info *smac_info, u8 port) { - int err; int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_ETHERNET; - u8 mac[6]; - int is_mcast; - u16 vlan_tag; int vidx; + int smac_index; + int err; + path->grh_mylmc = ah->src_path_bits & 0x7f; path->rlid = cpu_to_be16(ah->dlid); @@ -982,36 +1276,105 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, } if (is_eth) { - path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | - ((port - 1) << 6) | ((ah->sl & 7) << 3); - if (!(ah->ah_flags & IB_AH_GRH)) return -1; - err = mlx4_ib_resolve_grh(dev, ah, mac, &is_mcast, port); - if (err) - return err; - - memcpy(path->dmac, mac, 6); - path->ackto = MLX4_IB_LINK_TYPE_ETH; - /* use index 0 into MAC table for IBoE */ - path->grh_mylmc &= 0x80; + path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | + ((port - 1) << 6) | ((ah->sl & 7) << 3); - vlan_tag = rdma_get_vlan_id(&dev->iboe.gid_table[port - 1][ah->grh.sgid_index]); + path->feup |= MLX4_FEUP_FORCE_ETH_UP; if (vlan_tag < 0x1000) { - if (mlx4_find_cached_vlan(dev->dev, port, vlan_tag, &vidx)) - return -ENOENT; - - path->vlan_index = vidx; + if (smac_info->vid < 0x1000) { + /* both valid vlan ids */ + if (smac_info->vid != vlan_tag) { + /* different VIDs. 
unreg old and reg new */ + err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); + if (err) + return err; + smac_info->candidate_vid = vlan_tag; + smac_info->candidate_vlan_index = vidx; + smac_info->candidate_vlan_port = port; + smac_info->update_vid = 1; + path->vlan_index = vidx; + } else { + path->vlan_index = smac_info->vlan_index; + } + } else { + /* no current vlan tag in qp */ + err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); + if (err) + return err; + smac_info->candidate_vid = vlan_tag; + smac_info->candidate_vlan_index = vidx; + smac_info->candidate_vlan_port = port; + smac_info->update_vid = 1; + path->vlan_index = vidx; + } + path->feup |= MLX4_FVL_FORCE_ETH_VLAN; path->fl = 1 << 6; + } else { + /* have current vlan tag. unregister it at modify-qp success */ + if (smac_info->vid < 0x1000) { + smac_info->candidate_vid = 0xFFFF; + smac_info->update_vid = 1; + } } - } else + + /* get smac_index for RoCE use. + * If no smac was yet assigned, register one. + * If one was already assigned, but the new mac differs, + * unregister the old one and register the new one. + */ + if (!smac_info->smac || smac_info->smac != smac) { + /* register candidate now, unreg if needed, after success */ + smac_index = mlx4_register_mac(dev->dev, port, smac); + if (smac_index >= 0) { + smac_info->candidate_smac_index = smac_index; + smac_info->candidate_smac = smac; + smac_info->candidate_smac_port = port; + } else { + return -EINVAL; + } + } else { + smac_index = smac_info->smac_index; + } + + memcpy(path->dmac, ah->dmac, 6); + path->ackto = MLX4_IB_LINK_TYPE_ETH; + /* put MAC table smac index for IBoE */ + path->grh_mylmc = (u8) (smac_index) | 0x80; + } else { path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((port - 1) << 6) | ((ah->sl & 0xf) << 2); + } return 0; } +static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_qp_attr *qp, + enum ib_qp_attr_mask qp_attr_mask, + struct mlx4_ib_qp *mqp, + struct mlx4_qp_path *path, u8 port) +{ + return _mlx4_set_path(dev, &qp->ah_attr, + mlx4_mac_to_u64((u8 *)qp->smac), + (qp_attr_mask & IB_QP_VID) ? qp->vlan_id : 0xffff, + path, &mqp->pri, port); +} + +static int mlx4_set_alt_path(struct mlx4_ib_dev *dev, + const struct ib_qp_attr *qp, + enum ib_qp_attr_mask qp_attr_mask, + struct mlx4_ib_qp *mqp, + struct mlx4_qp_path *path, u8 port) +{ + return _mlx4_set_path(dev, &qp->alt_ah_attr, + mlx4_mac_to_u64((u8 *)qp->alt_smac), + (qp_attr_mask & IB_QP_ALT_VID) ? 
+ qp->alt_vlan_id : 0xffff, + path, &mqp->alt, port); +} + static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { struct mlx4_ib_gid_entry *ge, *tmp; @@ -1024,6 +1387,37 @@ static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) } } +static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, u8 *smac, + struct mlx4_qp_context *context) +{ + struct net_device *ndev; + u64 u64_mac; + int smac_index; + + + ndev = dev->iboe.netdevs[qp->port - 1]; + if (ndev) { + smac = ndev->dev_addr; + u64_mac = mlx4_mac_to_u64(smac); + } else { + u64_mac = dev->dev->caps.def_mac[qp->port]; + } + + context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6); + if (!qp->pri.smac) { + smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac); + if (smac_index >= 0) { + qp->pri.candidate_smac_index = smac_index; + qp->pri.candidate_smac = u64_mac; + qp->pri.candidate_smac_port = qp->port; + context->pri_path.grh_mylmc = 0x80 | (u8) smac_index; + } else { + return -ENOENT; + } + } + return 0; +} + static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) @@ -1035,6 +1429,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, struct mlx4_qp_context *context; enum mlx4_qp_optpar optpar = 0; int sqd_event; + int steer_qp = 0; int err = -EINVAL; context = kzalloc(sizeof *context, GFP_KERNEL); @@ -1042,7 +1437,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, return -ENOMEM; context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) | - (to_mlx4_st(ibqp->qp_type) << 16)); + (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16)); if (!(attr_mask & IB_QP_PATH_MIG_STATE)) context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); @@ -1092,6 +1487,8 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { context->sq_size_stride |= !!qp->sq_no_prefetch << 7; context->xrcd = cpu_to_be32((u32) qp->xrcdn); + if (ibqp->qp_type == IB_QPT_RAW_PACKET) + context->param3 |= cpu_to_be32(1 << 30); } if (qp->ibqp.uobject) @@ -1117,16 +1514,24 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX; } else context->pri_path.counter_index = 0xff; + + if (qp->flags & MLX4_IB_QP_NETIF) { + mlx4_ib_steer_qp_reg(dev, qp, 1); + steer_qp = 1; + } } if (attr_mask & IB_QP_PKEY_INDEX) { + if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) + context->pri_path.disable_pkey_check = 0x40; context->pri_path.pkey_index = attr->pkey_index; optpar |= MLX4_QP_OPTPAR_PKEY_INDEX; } if (attr_mask & IB_QP_AV) { - if (mlx4_set_path(dev, &attr->ah_attr, &context->pri_path, - attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) + if (mlx4_set_path(dev, attr, attr_mask, qp, &context->pri_path, + attr_mask & IB_QP_PORT ? 
+ attr->port_num : qp->port)) goto out; optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH | @@ -1147,8 +1552,9 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, dev->dev->caps.pkey_table_len[attr->alt_port_num]) goto out; - if (mlx4_set_path(dev, &attr->alt_ah_attr, &context->alt_path, - attr->alt_port_num)) + if (mlx4_set_alt_path(dev, attr, attr_mask, qp, + &context->alt_path, + attr->alt_port_num)) goto out; context->alt_path.pkey_index = attr->alt_pkey_index; @@ -1209,8 +1615,24 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, if (attr_mask & IB_QP_RQ_PSN) context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); + /* proxy and tunnel qp qkeys will be changed in modify-qp wrappers */ if (attr_mask & IB_QP_QKEY) { - context->qkey = cpu_to_be32(attr->qkey); + if (qp->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) + context->qkey = cpu_to_be32(IB_QP_SET_QKEY); + else { + if (mlx4_is_mfunc(dev->dev) && + !(qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) && + (attr->qkey & MLX4_RESERVED_QKEY_MASK) == + MLX4_RESERVED_QKEY_BASE) { + pr_err("Cannot use reserved QKEY" + " 0x%x (range 0xffff0000..0xffffffff" + " is reserved)\n", attr->qkey); + err = -EINVAL; + goto out; + } + context->qkey = cpu_to_be32(attr->qkey); + } optpar |= MLX4_QP_OPTPAR_Q_KEY; } @@ -1226,12 +1648,50 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, ibqp->qp_type == IB_QPT_UD || ibqp->qp_type == IB_QPT_RAW_PACKET)) { context->pri_path.sched_queue = (qp->port - 1) << 6; - if (is_qp0(dev, qp)) + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI || + qp->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) { context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE; - else + if (qp->mlx4_ib_qp_type != MLX4_IB_QPT_SMI) + context->pri_path.fl = 0x80; + } else { + if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) + context->pri_path.fl = 0x80; context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE; + } + if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET) { + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) + context->pri_path.feup = 1 << 7; /* don't fsm */ + /* handle smac_index */ + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) { + err = handle_eth_ud_smac_index(dev, qp, (u8 *)attr->smac, context); + if (err) + return -EINVAL; + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) + dev->qp1_proxy[qp->port - 1] = qp; + } + } + } + + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) + context->pri_path.ackto = (context->pri_path.ackto & 0xf8) | + MLX4_IB_LINK_TYPE_ETH; + + if (ibqp->qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) { + int is_eth = rdma_port_get_link_layer( + &dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET; + if (is_eth) { + context->pri_path.ackto = MLX4_IB_LINK_TYPE_ETH; + optpar |= MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH; + } } + if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) sqd_event = 1; @@ -1302,23 +1762,113 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, * If we moved a kernel QP to RESET, clean up all old CQ * entries and reinitialize the QP. */ - if (new_state == IB_QPS_RESET && !ibqp->uobject) { - mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, - ibqp->srq ? 
to_msrq(ibqp->srq): NULL); - if (send_cq != recv_cq) - mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + if (new_state == IB_QPS_RESET) { + if (!ibqp->uobject) { + mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, + ibqp->srq ? to_msrq(ibqp->srq) : NULL); + if (send_cq != recv_cq) + mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head = 0; + qp->sq.tail = 0; + qp->sq_next_wqe = 0; + if (qp->rq.wqe_cnt) + *qp->db.db = 0; + + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_reg(dev, qp, 0); + } + if (qp->pri.smac) { + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = 0; + } + if (qp->alt.smac) { + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = 0; + } + if (qp->pri.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); + qp->pri.vid = 0xFFFF; + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } - qp->rq.head = 0; - qp->rq.tail = 0; - qp->sq.head = 0; - qp->sq.tail = 0; - qp->sq_next_wqe = 0; - if (qp->rq.wqe_cnt) - *qp->db.db = 0; + if (qp->alt.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); + qp->alt.vid = 0xFFFF; + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } } - out: + if (err && steer_qp) + mlx4_ib_steer_qp_reg(dev, qp, 0); kfree(context); + if (qp->pri.candidate_smac) { + if (err) { + mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port, qp->pri.candidate_smac); + } else { + if (qp->pri.smac) + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = qp->pri.candidate_smac; + qp->pri.smac_index = qp->pri.candidate_smac_index; + qp->pri.smac_port = qp->pri.candidate_smac_port; + } + qp->pri.candidate_smac = 0; + qp->pri.candidate_smac_index = 0; + qp->pri.candidate_smac_port = 0; + } + if (qp->alt.candidate_smac) { + if (err) { + mlx4_unregister_mac(dev->dev, qp->alt.candidate_smac_port, qp->alt.candidate_smac); + } else { + if (qp->alt.smac) + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = qp->alt.candidate_smac; + qp->alt.smac_index = qp->alt.candidate_smac_index; + qp->alt.smac_port = qp->alt.candidate_smac_port; + } + qp->alt.candidate_smac = 0; + qp->alt.candidate_smac_index = 0; + qp->alt.candidate_smac_port = 0; + } + + if (qp->pri.update_vid) { + if (err) { + if (qp->pri.candidate_vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->pri.candidate_vlan_port, + qp->pri.candidate_vid); + } else { + if (qp->pri.vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, + qp->pri.vid); + qp->pri.vid = qp->pri.candidate_vid; + qp->pri.vlan_port = qp->pri.candidate_vlan_port; + qp->pri.vlan_index = qp->pri.candidate_vlan_index; + } + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + + if (qp->alt.update_vid) { + if (err) { + if (qp->alt.candidate_vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->alt.candidate_vlan_port, + qp->alt.candidate_vid); + } else { + if (qp->alt.vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, + qp->alt.vid); + qp->alt.vid = qp->alt.candidate_vid; + qp->alt.vlan_port = qp->alt.candidate_vlan_port; + qp->alt.vlan_index = qp->alt.candidate_vlan_index; + } + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + return err; } @@ -1329,17 +1879,35 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, struct mlx4_ib_qp *qp = to_mqp(ibqp); enum ib_qp_state cur_state, new_state; int err = -EINVAL; - + int ll; mutex_lock(&qp->mutex); cur_state = attr_mask & 
IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + ll = IB_LINK_LAYER_UNSPECIFIED; + } else { + int port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + ll = rdma_port_get_link_layer(&dev->ib_dev, port); + } + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask, ll)) { + pr_debug("qpn 0x%x: invalid attribute mask specified " + "for transition %d to %d. qp_type %d," + " attr_mask 0x%x\n", + ibqp->qp_num, cur_state, new_state, + ibqp->qp_type, attr_mask); goto out; + } if ((attr_mask & IB_QP_PORT) && - (attr->port_num == 0 || attr->port_num > dev->dev->caps.num_ports)) { + (attr->port_num == 0 || attr->port_num > dev->num_ports)) { + pr_debug("qpn 0x%x: invalid port number (%d) specified " + "for transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->port_num, cur_state, + new_state, ibqp->qp_type); goto out; } @@ -1350,17 +1918,30 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr_mask & IB_QP_PKEY_INDEX) { int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; - if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) + if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) { + pr_debug("qpn 0x%x: invalid pkey index (%d) specified " + "for transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->pkey_index, cur_state, + new_state, ibqp->qp_type); goto out; + } } if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) { + pr_debug("qpn 0x%x: max_rd_atomic (%d) too large. " + "Transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->max_rd_atomic, cur_state, + new_state, ibqp->qp_type); goto out; } if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) { + pr_debug("qpn 0x%x: max_dest_rd_atomic (%d) too large. " + "Transition %d to %d. 
qp_type %d\n", + ibqp->qp_num, attr->max_dest_rd_atomic, cur_state, + new_state, ibqp->qp_type); goto out; } @@ -1376,11 +1957,138 @@ out: return err; } +static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey) +{ + int i; + for (i = 0; i < dev->caps.num_ports; i++) { + if (qpn == dev->caps.qp0_proxy[i] || + qpn == dev->caps.qp0_tunnel[i]) { + *qkey = dev->caps.qp0_qkey[i]; + return 0; + } + } + return -EINVAL; +} + +static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, + struct ib_send_wr *wr, + void *wqe, unsigned *mlx_seg_len) +{ + struct mlx4_ib_dev *mdev = to_mdev(sqp->qp.ibqp.device); + struct ib_device *ib_dev = &mdev->ib_dev; + struct mlx4_wqe_mlx_seg *mlx = wqe; + struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; + struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); + u16 pkey; + u32 qkey; + int send_size; + int header_size; + int spc; + int i; + + if (wr->opcode != IB_WR_SEND) + return -EINVAL; + + send_size = 0; + + for (i = 0; i < wr->num_sge; ++i) + send_size += wr->sg_list[i].length; + + /* for proxy-qp0 sends, need to add in size of tunnel header */ + /* for tunnel-qp0 sends, tunnel header is already in s/g list */ + if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) + send_size += sizeof (struct mlx4_ib_tunnel_header); + + ib_ud_header_init(send_size, 1, 0, 0, 0, 0, &sqp->ud_header); + + if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) { + sqp->ud_header.lrh.service_level = + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; + sqp->ud_header.lrh.destination_lid = + cpu_to_be16(ah->av.ib.g_slid & 0x7f); + sqp->ud_header.lrh.source_lid = + cpu_to_be16(ah->av.ib.g_slid & 0x7f); + } + + mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + /* force loopback */ + mlx->flags |= cpu_to_be32(MLX4_WQE_MLX_VL15 | 0x1 | MLX4_WQE_MLX_SLR); + mlx->rlid = sqp->ud_header.lrh.destination_lid; + + sqp->ud_header.lrh.virtual_lane = 0; + sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED); + ib_get_cached_pkey(ib_dev, sqp->qp.port, 0, &pkey); + sqp->ud_header.bth.pkey = cpu_to_be16(pkey); + if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI_OWNER) + sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); + else + sqp->ud_header.bth.destination_qpn = + cpu_to_be32(mdev->dev->caps.qp0_tunnel[sqp->qp.port - 1]); + + sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); + if (mlx4_is_master(mdev->dev)) { + if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) + return -EINVAL; + } else { + if (vf_get_qp0_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) + return -EINVAL; + } + sqp->ud_header.deth.qkey = cpu_to_be32(qkey); + sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn); + + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + sqp->ud_header.immediate_present = 0; + + header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf); + + /* + * Inline data segments may not cross a 64 byte boundary. If + * our UD header is bigger than the space available up to the + * next 64 byte boundary in the WQE, use two inline data + * segments to hold the UD header. 
+ */ + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (header_size <= spc) { + inl->byte_count = cpu_to_be32(1 << 31 | header_size); + memcpy(inl + 1, sqp->header_buf, header_size); + i = 1; + } else { + inl->byte_count = cpu_to_be32(1 << 31 | spc); + memcpy(inl + 1, sqp->header_buf, spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, sqp->header_buf + spc, header_size - spc); + /* + * Need a barrier here to make sure all the data is + * visible before the byte_count field is set. + * Otherwise the HCA prefetcher could grab the 64-byte + * chunk with this inline segment and get a valid (!= + * 0xffffffff) byte count but stale data, and end up + * generating a packet with bad headers. + * + * The first inline segment's byte_count field doesn't + * need a barrier, because it comes after a + * control/MLX segment and therefore is at an offset + * of 16 mod 64. + */ + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc)); + i = 2; + } + + *mlx_seg_len = + ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16); + return 0; +} + static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, void *wqe, unsigned *mlx_seg_len) { struct ib_device *ib_dev = sqp->qp.ibqp.device; struct mlx4_wqe_mlx_seg *mlx = wqe; + struct mlx4_wqe_ctrl_seg *ctrl = wqe; struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); union ib_gid sgid; @@ -1389,10 +2097,11 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, int header_size; int spc; int i; - int is_eth; - int is_vlan = 0; - int is_grh; - u16 vlan; + int err = 0; + u16 vlan = 0xffff; + bool is_eth; + bool is_vlan = false; + bool is_grh; send_size = 0; for (i = 0; i < wr->num_sge; ++i) @@ -1401,10 +2110,27 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET; is_grh = mlx4_ib_ah_grh_present(ah); if (is_eth) { - ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, - ah->av.ib.gid_index, &sgid); - vlan = rdma_get_vlan_id(&sgid); - is_vlan = vlan < 0x1000; + if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { + /* When multi-function is enabled, the ib_core gid + * indexes don't necessarily match the hw ones, so + * we must use our own cache */ + err = mlx4_get_roce_gid_from_slave(to_mdev(ib_dev)->dev, + be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sgid.raw[0]); + if (err) + return err; + } else { + err = ib_get_cached_gid(ib_dev, + be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sgid); + if (err) + return err; + } + + if (ah->av.eth.vlan != cpu_to_be16(0xffff)) { + vlan = be16_to_cpu(ah->av.eth.vlan) & 0x0fff; + is_vlan = 1; + } } ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header); @@ -1421,8 +2147,25 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, sqp->ud_header.grh.flow_label = ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff); sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit; - ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, - ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid); + if (is_eth) + memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16); + else { + if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { + /* When multi-function is enabled, the ib_core gid + * indexes don't necessarily match the hw ones, so + * we must use our own cache */ + 
sqp->ud_header.grh.source_gid.global.subnet_prefix = + to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. + subnet_prefix; + sqp->ud_header.grh.source_gid.global.interface_id = + to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. + guid_cache[ah->av.ib.gid_index]; + } else + ib_get_cached_gid(ib_dev, + be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, + &sqp->ud_header.grh.source_gid); + } memcpy(sqp->ud_header.grh.destination_gid.raw, ah->av.ib.dgid, 16); } @@ -1434,6 +2177,8 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) | (sqp->ud_header.lrh.service_level << 8)); + if (ah->av.ib.port_pd & cpu_to_be32(0x80000000)) + mlx->flags |= cpu_to_be32(0x1); /* force loopback */ mlx->rlid = sqp->ud_header.lrh.destination_lid; } @@ -1453,13 +2198,23 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, if (is_eth) { u8 *smac; + struct in6_addr in6; + u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13; mlx->sched_prio = cpu_to_be16(pcp); memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6); /* FIXME: cache smac value? */ - smac = to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]->dev_addr; + memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2); + memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4); + memcpy(&in6, sgid.raw, sizeof(in6)); + + if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev)) + smac = to_mdev(sqp->qp.ibqp.device)-> + iboe.netdevs[sqp->qp.port - 1]->dev_addr; + else /* use the src mac of the tunnel */ + smac = ah->av.eth.s_mac; memcpy(sqp->ud_header.eth.smac_h, smac, 6); if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); @@ -1561,9 +2316,12 @@ static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq static __be32 convert_access(int acc) { - return (acc & IB_ACCESS_REMOTE_ATOMIC ? cpu_to_be32(MLX4_WQE_FMR_PERM_ATOMIC) : 0) | - (acc & IB_ACCESS_REMOTE_WRITE ? cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_WRITE) : 0) | - (acc & IB_ACCESS_REMOTE_READ ? cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_READ) : 0) | + return (acc & IB_ACCESS_REMOTE_ATOMIC ? + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC) : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE) : 0) | + (acc & IB_ACCESS_REMOTE_READ ? + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ) : 0) | (acc & IB_ACCESS_LOCAL_WRITE ? 
cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE) : 0) | cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ); } @@ -1589,12 +2347,28 @@ static void set_fmr_seg(struct mlx4_wqe_fmr_seg *fseg, struct ib_send_wr *wr) fseg->reserved[1] = 0; } +static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ib_send_wr *wr) +{ + bseg->flags1 = + convert_access(wr->wr.bind_mw.bind_info.mw_access_flags) & + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ | + MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE | + MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC); + bseg->flags2 = 0; + if (wr->wr.bind_mw.mw->type == IB_MW_TYPE_2) + bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_TYPE_2); + if (wr->wr.bind_mw.bind_info.mw_access_flags & IB_ZERO_BASED) + bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_ZERO_BASED); + bseg->new_rkey = cpu_to_be32(wr->wr.bind_mw.rkey); + bseg->lkey = cpu_to_be32(wr->wr.bind_mw.bind_info.mr->lkey); + bseg->addr = cpu_to_be64(wr->wr.bind_mw.bind_info.addr); + bseg->length = cpu_to_be64(wr->wr.bind_mw.bind_info.length); +} + static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey) { - iseg->flags = 0; - iseg->mem_key = cpu_to_be32(rkey); - iseg->guest_id = 0; - iseg->pa = 0; + memset(iseg, 0, sizeof(*iseg)); + iseg->mem_key = cpu_to_be32(rkey); } static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg, @@ -1639,6 +2413,68 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->av.eth.mac, 6); } +static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev, + struct mlx4_wqe_datagram_seg *dseg, + struct ib_send_wr *wr, + enum mlx4_ib_qp_type qpt) +{ + union mlx4_ext_av *av = &to_mah(wr->wr.ud.ah)->av; + struct mlx4_av sqp_av = {0}; + int port = *((u8 *) &av->ib.port_pd) & 0x3; + + /* force loopback */ + sqp_av.port_pd = av->ib.port_pd | cpu_to_be32(0x80000000); + sqp_av.g_slid = av->ib.g_slid & 0x7f; /* no GRH */ + sqp_av.sl_tclass_flowlabel = av->ib.sl_tclass_flowlabel & + cpu_to_be32(0xf0000000); + + memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av)); + if (qpt == MLX4_IB_QPT_PROXY_GSI) + dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]); + else + dseg->dqpn = cpu_to_be32(dev->dev->caps.qp0_tunnel[port - 1]); + /* Use QKEY from the QP context, which is set by master */ + dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY); +} + +static void build_tunnel_header(struct ib_send_wr *wr, void *wqe, unsigned *mlx_seg_len) +{ + struct mlx4_wqe_inline_seg *inl = wqe; + struct mlx4_ib_tunnel_header hdr; + struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); + int spc; + int i; + + memcpy(&hdr.av, &ah->av, sizeof hdr.av); + hdr.remote_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); + hdr.pkey_index = cpu_to_be16(wr->wr.ud.pkey_index); + hdr.qkey = cpu_to_be32(wr->wr.ud.remote_qkey); + memcpy(hdr.mac, ah->av.eth.mac, 6); + hdr.vlan = ah->av.eth.vlan; + + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (sizeof (hdr) <= spc) { + memcpy(inl + 1, &hdr, sizeof (hdr)); + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | sizeof (hdr)); + i = 1; + } else { + memcpy(inl + 1, &hdr, spc); + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, (void *) &hdr + spc, sizeof (hdr) - spc); + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (sizeof (hdr) - spc)); + i = 2; + } + + *mlx_seg_len = + ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + sizeof (hdr), 16); +} + static void set_mlx_icrc_seg(void *dseg) { u32 *t = dseg; @@ -1720,6 +2556,13 @@ static __be32 send_ieth(struct 
ib_send_wr *wr) } } +static void add_zero_len_inline(void *wqe) +{ + struct mlx4_wqe_inline_seg *inl = wqe; + memset(wqe, 0, 16); + inl->byte_count = cpu_to_be32(1 << 31); +} + int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr) { @@ -1778,9 +2621,9 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, wqe += sizeof *ctrl; size = sizeof *ctrl / 16; - switch (ibqp->qp_type) { - case IB_QPT_RC: - case IB_QPT_UC: + switch (qp->mlx4_ib_qp_type) { + case MLX4_IB_QPT_RC: + case MLX4_IB_QPT_UC: switch (wr->opcode) { case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: @@ -1835,13 +2678,38 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, size += sizeof (struct mlx4_wqe_fmr_seg) / 16; break; + case IB_WR_BIND_MW: + ctrl->srcrb_flags |= + cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); + set_bind_seg(wqe, wr); + wqe += sizeof(struct mlx4_wqe_bind_seg); + size += sizeof(struct mlx4_wqe_bind_seg) / 16; + break; default: /* No extra segments required for sends */ break; } break; - case IB_QPT_UD: + case MLX4_IB_QPT_TUN_SMI_OWNER: + err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + wqe += seglen; + size += seglen / 16; + break; + case MLX4_IB_QPT_TUN_SMI: + case MLX4_IB_QPT_TUN_GSI: + /* this is a UD qp used in MAD responses to slaves. */ + set_datagram_seg(wqe, wr); + /* set the forced-loopback bit in the data seg av */ + *(__be32 *) wqe |= cpu_to_be32(0x80000000); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + break; + case MLX4_IB_QPT_UD: set_datagram_seg(wqe, wr); wqe += sizeof (struct mlx4_wqe_datagram_seg); size += sizeof (struct mlx4_wqe_datagram_seg) / 16; @@ -1858,8 +2726,39 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } break; - case IB_QPT_SMI: - case IB_QPT_GSI: + case MLX4_IB_QPT_PROXY_SMI_OWNER: + err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + wqe += seglen; + size += seglen / 16; + /* to start tunnel header on a cache-line boundary */ + add_zero_len_inline(wqe); + wqe += 16; + size++; + build_tunnel_header(wr, wqe, &seglen); + wqe += seglen; + size += seglen / 16; + break; + case MLX4_IB_QPT_PROXY_SMI: + case MLX4_IB_QPT_PROXY_GSI: + /* If we are tunneling special qps, this is a UD qp. 
+ * In this case we first add a UD segment targeting + * the tunnel qp, and then add a header with address + * information */ + set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr, + qp->mlx4_ib_qp_type); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + build_tunnel_header(wr, wqe, &seglen); + wqe += seglen; + size += seglen / 16; + break; + + case MLX4_IB_QPT_SMI: + case MLX4_IB_QPT_GSI: err = build_mlx_header(to_msqp(qp), wr, ctrl, &seglen); if (unlikely(err)) { *bad_wr = wr; @@ -1885,8 +2784,10 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16); /* Add one more inline data segment for ICRC for MLX sends */ - if (unlikely(qp->ibqp.qp_type == IB_QPT_SMI || - qp->ibqp.qp_type == IB_QPT_GSI)) { + if (unlikely(qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI || + qp->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))) { set_mlx_icrc_seg(dseg + 1); size += sizeof (struct mlx4_wqe_data_seg) / 16; } @@ -1978,8 +2879,10 @@ int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, int err = 0; int nreq; int ind; + int max_gs; int i; + max_gs = qp->rq.max_gs; spin_lock_irqsave(&qp->rq.lock, flags); ind = qp->rq.head & (qp->rq.wqe_cnt - 1); @@ -1999,10 +2902,25 @@ int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, scat = get_recv_wqe(qp, ind); + if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) { + ib_dma_sync_single_for_device(ibqp->device, + qp->sqp_proxy_rcv[ind].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + scat->byte_count = + cpu_to_be32(sizeof (struct mlx4_ib_proxy_sqp_hdr)); + /* use dma lkey from upper layer entry */ + scat->lkey = cpu_to_be32(wr->sg_list->lkey); + scat->addr = cpu_to_be64(qp->sqp_proxy_rcv[ind].map); + scat++; + max_gs--; + } + for (i = 0; i < wr->num_sge; ++i) __set_data_seg(scat + i, wr->sg_list + i); - if (i < qp->rq.max_gs) { + if (i < max_gs) { scat[i].byte_count = 0; scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY); scat[i].addr = 0; @@ -2197,6 +3115,13 @@ done: if (qp->flags & MLX4_IB_QP_LSO) qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; + if (qp->flags & MLX4_IB_QP_NETIF) + qp_init_attr->create_flags |= IB_QP_CREATE_NETIF_QP; + + qp_init_attr->sq_sig_type = + qp->sq_signal_bits == cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) ? 
+ IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + out: mutex_unlock(&qp->mutex); return err; diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 60c5fb025fc..62d9285300a 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -134,13 +134,14 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, if (err) goto err_mtt; } else { - err = mlx4_db_alloc(dev->dev, &srq->db, 0); + err = mlx4_db_alloc(dev->dev, &srq->db, 0, GFP_KERNEL); if (err) goto err_srq; *srq->db.db = 0; - if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf)) { + if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf, + GFP_KERNEL)) { err = -ENOMEM; goto err_db; } @@ -165,7 +166,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf); + err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf, GFP_KERNEL); if (err) goto err_mtt; diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c new file mode 100644 index 00000000000..cb4c66e723b --- /dev/null +++ b/drivers/infiniband/hw/mlx4/sysfs.c @@ -0,0 +1,906 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/*#include "core_priv.h"*/ +#include "mlx4_ib.h" +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/stat.h> + +#include <rdma/ib_mad.h> +/*show_admin_alias_guid returns the administratively assigned value of that GUID. + * Values returned in buf parameter string: + * 0 - requests opensm to assign a value. + * ffffffffffffffff - delete this entry. + * other - value assigned by administrator. 
+ */ +static ssize_t show_admin_alias_guid(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int record_num;/*0-15*/ + int guid_index_in_rec; /*0 - 7*/ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + + record_num = mlx4_ib_iov_dentry->entry_num / 8 ; + guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8 ; + + return sprintf(buf, "%llx\n", + be64_to_cpu(*(__be64 *)&mdev->sriov.alias_guid. + ports_guid[port->num - 1]. + all_rec_per_port[record_num]. + all_recs[8 * guid_index_in_rec])); +} + +/* store_admin_alias_guid stores the (new) administratively assigned value of that GUID. + * Values in buf parameter string: + * 0 - requests opensm to assign a value. + * 0xffffffffffffffff - delete this entry. + * other - guid value assigned by the administrator. + */ +static ssize_t store_admin_alias_guid(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int record_num;/*0-15*/ + int guid_index_in_rec; /*0 - 7*/ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + u64 sysadmin_ag_val; + + record_num = mlx4_ib_iov_dentry->entry_num / 8; + guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8; + if (0 == record_num && 0 == guid_index_in_rec) { + pr_err("GUID 0 block 0 is RO\n"); + return count; + } + sscanf(buf, "%llx", &sysadmin_ag_val); + *(__be64 *)&mdev->sriov.alias_guid.ports_guid[port->num - 1]. + all_rec_per_port[record_num]. + all_recs[GUID_REC_SIZE * guid_index_in_rec] = + cpu_to_be64(sysadmin_ag_val); + + /* Change the state to be pending for update */ + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].status + = MLX4_GUID_INFO_STATUS_IDLE ; + + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].method + = MLX4_GUID_INFO_RECORD_SET; + + switch (sysadmin_ag_val) { + case MLX4_GUID_FOR_DELETE_VAL: + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].method + = MLX4_GUID_INFO_RECORD_DELETE; + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership + = MLX4_GUID_SYSADMIN_ASSIGN; + break; + /* The sysadmin requests the SM to re-assign */ + case MLX4_NOT_SET_GUID: + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership + = MLX4_GUID_DRIVER_ASSIGN; + break; + /* The sysadmin requests a specific value.*/ + default: + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership + = MLX4_GUID_SYSADMIN_ASSIGN; + break; + } + + /* set the record index */ + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].guid_indexes + = mlx4_ib_get_aguid_comp_mask_from_ix(guid_index_in_rec); + + mlx4_ib_init_alias_guid_work(mdev, port->num - 1); + + return count; +} + +static ssize_t show_port_gid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + union ib_gid gid; + ssize_t ret; + + ret = __mlx4_ib_query_gid(&mdev->ib_dev, port->num, + mlx4_ib_iov_dentry->entry_num, &gid, 1); + if (ret) + return ret; + 
ret = sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + be16_to_cpu(((__be16 *) gid.raw)[0]), + be16_to_cpu(((__be16 *) gid.raw)[1]), + be16_to_cpu(((__be16 *) gid.raw)[2]), + be16_to_cpu(((__be16 *) gid.raw)[3]), + be16_to_cpu(((__be16 *) gid.raw)[4]), + be16_to_cpu(((__be16 *) gid.raw)[5]), + be16_to_cpu(((__be16 *) gid.raw)[6]), + be16_to_cpu(((__be16 *) gid.raw)[7])); + return ret; +} + +static ssize_t show_phys_port_pkey(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + u16 pkey; + ssize_t ret; + + ret = __mlx4_ib_query_pkey(&mdev->ib_dev, port->num, + mlx4_ib_iov_dentry->entry_num, &pkey, 1); + if (ret) + return ret; + + return sprintf(buf, "0x%04x\n", pkey); +} + +#define DENTRY_REMOVE(_dentry) \ +do { \ + sysfs_remove_file((_dentry)->kobj, &(_dentry)->dentry.attr); \ +} while (0); + +static int create_sysfs_entry(void *_ctx, struct mlx4_ib_iov_sysfs_attr *_dentry, + char *_name, struct kobject *_kobj, + ssize_t (*show)(struct device *dev, + struct device_attribute *attr, + char *buf), + ssize_t (*store)(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) + ) +{ + int ret = 0; + struct mlx4_ib_iov_sysfs_attr *vdentry = _dentry; + + vdentry->ctx = _ctx; + vdentry->dentry.show = show; + vdentry->dentry.store = store; + sysfs_attr_init(&vdentry->dentry.attr); + vdentry->dentry.attr.name = vdentry->name; + vdentry->dentry.attr.mode = 0; + vdentry->kobj = _kobj; + snprintf(vdentry->name, 15, "%s", _name); + + if (vdentry->dentry.store) + vdentry->dentry.attr.mode |= S_IWUSR; + + if (vdentry->dentry.show) + vdentry->dentry.attr.mode |= S_IRUGO; + + ret = sysfs_create_file(vdentry->kobj, &vdentry->dentry.attr); + if (ret) { + pr_err("failed to create %s\n", vdentry->dentry.attr.name); + vdentry->ctx = NULL; + return ret; + } + + return ret; +} + +int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr) +{ + struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1]; + int ret; + + ret = sysfs_create_file(port->mcgs_parent, attr); + if (ret) + pr_err("failed to create %s\n", attr->name); + + return ret; +} + +void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr) +{ + struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1]; + + sysfs_remove_file(port->mcgs_parent, attr); +} + +static int add_port_entries(struct mlx4_ib_dev *device, int port_num) +{ + int i; + char buff[10]; + struct mlx4_ib_iov_port *port = NULL; + int ret = 0 ; + struct ib_port_attr attr; + + /* get the physical gid and pkey table sizes.*/ + ret = __mlx4_ib_query_port(&device->ib_dev, port_num, &attr, 1); + if (ret) + goto err; + + port = &device->iov_ports[port_num - 1]; + port->dev = device; + port->num = port_num; + /* Directory structure: + * iov - + * port num - + * admin_guids + * gids (operational) + * mcg_table + */ + port->dentr_ar = kzalloc(sizeof (struct mlx4_ib_iov_sysfs_attr_ar), + GFP_KERNEL); + if (!port->dentr_ar) { + ret = -ENOMEM; + goto err; + } + sprintf(buff, "%d", port_num); + port->cur_port = kobject_create_and_add(buff, + kobject_get(device->ports_parent)); + if (!port->cur_port) { + ret = -ENOMEM; + goto kobj_create_err; + } + /* admin GUIDs */ + port->admin_alias_parent = kobject_create_and_add("admin_guids", + 
kobject_get(port->cur_port)); + if (!port->admin_alias_parent) { + ret = -ENOMEM; + goto err_admin_guids; + } + for (i = 0 ; i < attr.gid_tbl_len; i++) { + sprintf(buff, "%d", i); + port->dentr_ar->dentries[i].entry_num = i; + ret = create_sysfs_entry(port, &port->dentr_ar->dentries[i], + buff, port->admin_alias_parent, + show_admin_alias_guid, store_admin_alias_guid); + if (ret) + goto err_admin_alias_parent; + } + + /* gids subdirectory (operational gids) */ + port->gids_parent = kobject_create_and_add("gids", + kobject_get(port->cur_port)); + if (!port->gids_parent) { + ret = -ENOMEM; + goto err_gids; + } + + for (i = 0 ; i < attr.gid_tbl_len; i++) { + sprintf(buff, "%d", i); + port->dentr_ar->dentries[attr.gid_tbl_len + i].entry_num = i; + ret = create_sysfs_entry(port, + &port->dentr_ar->dentries[attr.gid_tbl_len + i], + buff, + port->gids_parent, show_port_gid, NULL); + if (ret) + goto err_gids_parent; + } + + /* physical port pkey table */ + port->pkeys_parent = + kobject_create_and_add("pkeys", kobject_get(port->cur_port)); + if (!port->pkeys_parent) { + ret = -ENOMEM; + goto err_pkeys; + } + + for (i = 0 ; i < attr.pkey_tbl_len; i++) { + sprintf(buff, "%d", i); + port->dentr_ar->dentries[2 * attr.gid_tbl_len + i].entry_num = i; + ret = create_sysfs_entry(port, + &port->dentr_ar->dentries[2 * attr.gid_tbl_len + i], + buff, port->pkeys_parent, + show_phys_port_pkey, NULL); + if (ret) + goto err_pkeys_parent; + } + + /* MCGs table */ + port->mcgs_parent = + kobject_create_and_add("mcgs", kobject_get(port->cur_port)); + if (!port->mcgs_parent) { + ret = -ENOMEM; + goto err_mcgs; + } + return 0; + +err_mcgs: + kobject_put(port->cur_port); + +err_pkeys_parent: + kobject_put(port->pkeys_parent); + +err_pkeys: + kobject_put(port->cur_port); + +err_gids_parent: + kobject_put(port->gids_parent); + +err_gids: + kobject_put(port->cur_port); + +err_admin_alias_parent: + kobject_put(port->admin_alias_parent); + +err_admin_guids: + kobject_put(port->cur_port); + kobject_put(port->cur_port); /* once more for create_and_add buff */ + +kobj_create_err: + kobject_put(device->ports_parent); + kfree(port->dentr_ar); + +err: + pr_err("add_port_entries FAILED: for port:%d, error: %d\n", + port_num, ret); + return ret; +} + +static void get_name(struct mlx4_ib_dev *dev, char *name, int i, int max) +{ + char base_name[9]; + + /* pci_name format is: bus:dev:func -> xxxx:yy:zz.n */ + strlcpy(name, pci_name(dev->dev->pdev), max); + strncpy(base_name, name, 8); /*till xxxx:yy:*/ + base_name[8] = '\0'; + /* with no ARI only 3 last bits are used so when the fn is higher than 8 + * need to add it to the dev num, so count in the last number will be + * modulo 8 */ + sprintf(name, "%s%.2d.%d", base_name, (i/8), (i%8)); +} + +struct mlx4_port { + struct kobject kobj; + struct mlx4_ib_dev *dev; + struct attribute_group pkey_group; + struct attribute_group gid_group; + struct device_attribute enable_smi_admin; + struct device_attribute smi_enabled; + int slave; + u8 port_num; +}; + + +static void mlx4_port_release(struct kobject *kobj) +{ + struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj); + struct attribute *a; + int i; + + for (i = 0; (a = p->pkey_group.attrs[i]); ++i) + kfree(a); + kfree(p->pkey_group.attrs); + for (i = 0; (a = p->gid_group.attrs[i]); ++i) + kfree(a); + kfree(p->gid_group.attrs); + kfree(p); +} + +struct port_attribute { + struct attribute attr; + ssize_t (*show)(struct mlx4_port *, struct port_attribute *, char *buf); + ssize_t (*store)(struct mlx4_port *, struct port_attribute *, 
+ const char *buf, size_t count); +}; + +static ssize_t port_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj); + + if (!port_attr->show) + return -EIO; + return port_attr->show(p, port_attr, buf); +} + +static ssize_t port_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t size) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj); + + if (!port_attr->store) + return -EIO; + return port_attr->store(p, port_attr, buf, size); +} + +static const struct sysfs_ops port_sysfs_ops = { + .show = port_attr_show, + .store = port_attr_store, +}; + +static struct kobj_type port_type = { + .release = mlx4_port_release, + .sysfs_ops = &port_sysfs_ops, +}; + +struct port_table_attribute { + struct port_attribute attr; + char name[8]; + int index; +}; + +static ssize_t show_port_pkey(struct mlx4_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + ssize_t ret = -ENODEV; + + if (p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1][tab_attr->index] >= + (p->dev->dev->caps.pkey_table_len[p->port_num])) + ret = sprintf(buf, "none\n"); + else + ret = sprintf(buf, "%d\n", + p->dev->pkeys.virt2phys_pkey[p->slave] + [p->port_num - 1][tab_attr->index]); + return ret; +} + +static ssize_t store_port_pkey(struct mlx4_port *p, struct port_attribute *attr, + const char *buf, size_t count) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + int idx; + int err; + + /* do not allow remapping Dom0 virtual pkey table */ + if (p->slave == mlx4_master_func_num(p->dev->dev)) + return -EINVAL; + + if (!strncasecmp(buf, "no", 2)) + idx = p->dev->dev->phys_caps.pkey_phys_table_len[p->port_num] - 1; + else if (sscanf(buf, "%i", &idx) != 1 || + idx >= p->dev->dev->caps.pkey_table_len[p->port_num] || + idx < 0) + return -EINVAL; + + p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1] + [tab_attr->index] = idx; + mlx4_sync_pkey_table(p->dev->dev, p->slave, p->port_num, + tab_attr->index, idx); + err = mlx4_gen_pkey_eqe(p->dev->dev, p->slave, p->port_num); + if (err) { + pr_err("mlx4_gen_pkey_eqe failed for slave %d," + " port %d, index %d\n", p->slave, p->port_num, idx); + return err; + } + return count; +} + +static ssize_t show_port_gid_idx(struct mlx4_port *p, + struct port_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", p->slave); +} + +static struct attribute ** +alloc_group_attrs(ssize_t (*show)(struct mlx4_port *, + struct port_attribute *, char *buf), + ssize_t (*store)(struct mlx4_port *, struct port_attribute *, + const char *buf, size_t count), + int len) +{ + struct attribute **tab_attr; + struct port_table_attribute *element; + int i; + + tab_attr = kcalloc(1 + len, sizeof (struct attribute *), GFP_KERNEL); + if (!tab_attr) + return NULL; + + for (i = 0; i < len; i++) { + element = kzalloc(sizeof (struct port_table_attribute), + GFP_KERNEL); + if (!element) + goto err; + if (snprintf(element->name, sizeof (element->name), + "%d", i) >= sizeof (element->name)) { + kfree(element); + goto err; + } + sysfs_attr_init(&element->attr.attr); + element->attr.attr.name = element->name; + if (store) { + element->attr.attr.mode = S_IWUSR | S_IRUGO; + 
element->attr.store = store; + } else + element->attr.attr.mode = S_IRUGO; + + element->attr.show = show; + element->index = i; + tab_attr[i] = &element->attr.attr; + } + return tab_attr; + +err: + while (--i >= 0) + kfree(tab_attr[i]); + kfree(tab_attr); + return NULL; +} + +static ssize_t sysfs_show_smi_enabled(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, smi_enabled); + ssize_t len = 0; + + if (mlx4_vf_smi_enabled(p->dev->dev, p->slave, p->port_num)) + len = sprintf(buf, "%d\n", 1); + else + len = sprintf(buf, "%d\n", 0); + + return len; +} + +static ssize_t sysfs_show_enable_smi_admin(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, enable_smi_admin); + ssize_t len = 0; + + if (mlx4_vf_get_enable_smi_admin(p->dev->dev, p->slave, p->port_num)) + len = sprintf(buf, "%d\n", 1); + else + len = sprintf(buf, "%d\n", 0); + + return len; +} + +static ssize_t sysfs_store_enable_smi_admin(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, enable_smi_admin); + int enable; + + if (sscanf(buf, "%i", &enable) != 1 || + enable < 0 || enable > 1) + return -EINVAL; + + if (mlx4_vf_set_enable_smi_admin(p->dev->dev, p->slave, p->port_num, enable)) + return -EINVAL; + return count; +} + +static int add_vf_smi_entries(struct mlx4_port *p) +{ + int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) == + IB_LINK_LAYER_ETHERNET; + int ret; + + /* do not display entries if eth transport, or if master */ + if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev)) + return 0; + + sysfs_attr_init(&p->smi_enabled.attr); + p->smi_enabled.show = sysfs_show_smi_enabled; + p->smi_enabled.store = NULL; + p->smi_enabled.attr.name = "smi_enabled"; + p->smi_enabled.attr.mode = 0444; + ret = sysfs_create_file(&p->kobj, &p->smi_enabled.attr); + if (ret) { + pr_err("failed to create smi_enabled\n"); + return ret; + } + + sysfs_attr_init(&p->enable_smi_admin.attr); + p->enable_smi_admin.show = sysfs_show_enable_smi_admin; + p->enable_smi_admin.store = sysfs_store_enable_smi_admin; + p->enable_smi_admin.attr.name = "enable_smi_admin"; + p->enable_smi_admin.attr.mode = 0644; + ret = sysfs_create_file(&p->kobj, &p->enable_smi_admin.attr); + if (ret) { + pr_err("failed to create enable_smi_admin\n"); + sysfs_remove_file(&p->kobj, &p->smi_enabled.attr); + return ret; + } + return 0; +} + +static void remove_vf_smi_entries(struct mlx4_port *p) +{ + int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) == + IB_LINK_LAYER_ETHERNET; + + if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev)) + return; + + sysfs_remove_file(&p->kobj, &p->smi_enabled.attr); + sysfs_remove_file(&p->kobj, &p->enable_smi_admin.attr); +} + +static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave) +{ + struct mlx4_port *p; + int i; + int ret; + + p = kzalloc(sizeof *p, GFP_KERNEL); + if (!p) + return -ENOMEM; + + p->dev = dev; + p->port_num = port_num; + p->slave = slave; + + ret = kobject_init_and_add(&p->kobj, &port_type, + kobject_get(dev->dev_ports_parent[slave]), + "%d", port_num); + if (ret) + goto err_alloc; + + p->pkey_group.name = "pkey_idx"; + p->pkey_group.attrs = + alloc_group_attrs(show_port_pkey, store_port_pkey, + dev->dev->caps.pkey_table_len[port_num]); + if (!p->pkey_group.attrs) { + ret = -ENOMEM; + goto err_alloc; + } + + ret = 
sysfs_create_group(&p->kobj, &p->pkey_group); + if (ret) + goto err_free_pkey; + + p->gid_group.name = "gid_idx"; + p->gid_group.attrs = alloc_group_attrs(show_port_gid_idx, NULL, 1); + if (!p->gid_group.attrs) { + ret = -ENOMEM; + goto err_free_pkey; + } + + ret = sysfs_create_group(&p->kobj, &p->gid_group); + if (ret) + goto err_free_gid; + + ret = add_vf_smi_entries(p); + if (ret) + goto err_free_gid; + + list_add_tail(&p->kobj.entry, &dev->pkeys.pkey_port_list[slave]); + return 0; + +err_free_gid: + kfree(p->gid_group.attrs[0]); + kfree(p->gid_group.attrs); + +err_free_pkey: + for (i = 0; i < dev->dev->caps.pkey_table_len[port_num]; ++i) + kfree(p->pkey_group.attrs[i]); + kfree(p->pkey_group.attrs); + +err_alloc: + kobject_put(dev->dev_ports_parent[slave]); + kfree(p); + return ret; +} + +static int register_one_pkey_tree(struct mlx4_ib_dev *dev, int slave) +{ + char name[32]; + int err; + int port; + struct kobject *p, *t; + struct mlx4_port *mport; + struct mlx4_active_ports actv_ports; + + get_name(dev, name, slave, sizeof name); + + dev->pkeys.device_parent[slave] = + kobject_create_and_add(name, kobject_get(dev->iov_parent)); + + if (!dev->pkeys.device_parent[slave]) { + err = -ENOMEM; + goto fail_dev; + } + + INIT_LIST_HEAD(&dev->pkeys.pkey_port_list[slave]); + + dev->dev_ports_parent[slave] = + kobject_create_and_add("ports", + kobject_get(dev->pkeys.device_parent[slave])); + + if (!dev->dev_ports_parent[slave]) { + err = -ENOMEM; + goto err_ports; + } + + actv_ports = mlx4_get_active_ports(dev->dev, slave); + + for (port = 1; port <= dev->dev->caps.num_ports; ++port) { + if (!test_bit(port - 1, actv_ports.ports)) + continue; + err = add_port(dev, port, slave); + if (err) + goto err_add; + } + return 0; + +err_add: + list_for_each_entry_safe(p, t, + &dev->pkeys.pkey_port_list[slave], + entry) { + list_del(&p->entry); + mport = container_of(p, struct mlx4_port, kobj); + sysfs_remove_group(p, &mport->pkey_group); + sysfs_remove_group(p, &mport->gid_group); + remove_vf_smi_entries(mport); + kobject_put(p); + } + kobject_put(dev->dev_ports_parent[slave]); + +err_ports: + kobject_put(dev->pkeys.device_parent[slave]); + /* extra put for the device_parent create_and_add */ + kobject_put(dev->pkeys.device_parent[slave]); + +fail_dev: + kobject_put(dev->iov_parent); + return err; +} + +static int register_pkey_tree(struct mlx4_ib_dev *device) +{ + int i; + + if (!mlx4_is_master(device->dev)) + return 0; + + for (i = 0; i <= device->dev->num_vfs; ++i) + register_one_pkey_tree(device, i); + + return 0; +} + +static void unregister_pkey_tree(struct mlx4_ib_dev *device) +{ + int slave; + struct kobject *p, *t; + struct mlx4_port *port; + + if (!mlx4_is_master(device->dev)) + return; + + for (slave = device->dev->num_vfs; slave >= 0; --slave) { + list_for_each_entry_safe(p, t, + &device->pkeys.pkey_port_list[slave], + entry) { + list_del(&p->entry); + port = container_of(p, struct mlx4_port, kobj); + sysfs_remove_group(p, &port->pkey_group); + sysfs_remove_group(p, &port->gid_group); + remove_vf_smi_entries(port); + kobject_put(p); + kobject_put(device->dev_ports_parent[slave]); + } + kobject_put(device->dev_ports_parent[slave]); + kobject_put(device->pkeys.device_parent[slave]); + kobject_put(device->pkeys.device_parent[slave]); + kobject_put(device->iov_parent); + } +} + +int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *dev) +{ + int i; + int ret = 0; + + if (!mlx4_is_master(dev->dev)) + return 0; + + dev->iov_parent = + kobject_create_and_add("iov", + 
kobject_get(dev->ib_dev.ports_parent->parent)); + if (!dev->iov_parent) { + ret = -ENOMEM; + goto err; + } + dev->ports_parent = + kobject_create_and_add("ports", + kobject_get(dev->iov_parent)); + if (!dev->ports_parent) { + ret = -ENOMEM; + goto err_ports; + } + + for (i = 1; i <= dev->ib_dev.phys_port_cnt; ++i) { + ret = add_port_entries(dev, i); + if (ret) + goto err_add_entries; + } + + ret = register_pkey_tree(dev); + if (ret) + goto err_add_entries; + return 0; + +err_add_entries: + kobject_put(dev->ports_parent); + +err_ports: + kobject_put(dev->iov_parent); +err: + kobject_put(dev->ib_dev.ports_parent->parent); + pr_err("mlx4_ib_device_register_sysfs error (%d)\n", ret); + return ret; +} + +static void unregister_alias_guid_tree(struct mlx4_ib_dev *device) +{ + struct mlx4_ib_iov_port *p; + int i; + + if (!mlx4_is_master(device->dev)) + return; + + for (i = 0; i < device->dev->caps.num_ports; i++) { + p = &device->iov_ports[i]; + kobject_put(p->admin_alias_parent); + kobject_put(p->gids_parent); + kobject_put(p->pkeys_parent); + kobject_put(p->mcgs_parent); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->dev->ports_parent); + kfree(p->dentr_ar); + } +} + +void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device) +{ + unregister_alias_guid_tree(device); + unregister_pkey_tree(device); + kobject_put(device->ports_parent); + kobject_put(device->iov_parent); + kobject_put(device->iov_parent); + kobject_put(device->ib_dev.ports_parent->parent); +} diff --git a/drivers/infiniband/hw/mlx4/user.h b/drivers/infiniband/hw/mlx4/user.h index 13beedeeef9..07e6769ef43 100644 --- a/drivers/infiniband/hw/mlx4/user.h +++ b/drivers/infiniband/hw/mlx4/user.h @@ -40,7 +40,9 @@ * Increment this value if any changes that break userspace ABI * compatibility are made. */ -#define MLX4_IB_UVERBS_ABI_VERSION 3 + +#define MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION 3 +#define MLX4_IB_UVERBS_ABI_VERSION 4 /* * Make sure that all structs defined in this file remain laid out so @@ -50,10 +52,18 @@ * instead. */ +struct mlx4_ib_alloc_ucontext_resp_v3 { + __u32 qp_tab_size; + __u16 bf_reg_size; + __u16 bf_regs_per_page; +}; + struct mlx4_ib_alloc_ucontext_resp { + __u32 dev_caps; __u32 qp_tab_size; __u16 bf_reg_size; __u16 bf_regs_per_page; + __u32 cqe_size; }; struct mlx4_ib_alloc_pd_resp { |
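
The get_name() helper in the sysfs.c hunk above folds the slave number into the PCI device/function pair because, without ARI, only the low three bits of the function number are usable. Below is a standalone, illustrative sketch of that naming (the vf_name() helper, the base name "0000:08:" and slave number 10 are made up for the example and are not part of the driver):

#include <stdio.h>

/* Mirror the "%s%.2d.%d" naming used by get_name(): slave i becomes
 * <bus>:<dev + i/8>.<i%8>, since a non-ARI function number only has
 * three usable bits. */
static void vf_name(char *out, size_t max, const char *pci_base, int i)
{
	snprintf(out, max, "%s%.2d.%d", pci_base, i / 8, i % 8);
}

int main(void)
{
	char name[32];

	/* e.g. slave 10 behind a device whose pci_name() starts "0000:08:" */
	vf_name(name, sizeof(name), "0000:08:", 10);
	printf("%s\n", name);		/* prints 0000:08:01.2 */
	return 0;
}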
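
store_port_pkey() above accepts either the string "none" (which parks the slave's virtual slot on the last physical entry, i.e. an invalid pkey) or a numeric index into the pkey table. A minimal userspace-style sketch of that parsing, assuming a single made-up table length (the real code distinguishes caps.pkey_table_len from phys_caps.pkey_phys_table_len); parse_pkey_idx() is illustrative only:

#include <stdio.h>
#include <strings.h>

#define PKEY_TABLE_LEN	128	/* made-up length; the driver uses per-port caps */

/* Return 0 and fill *idx on success, -1 on a malformed or out-of-range value. */
static int parse_pkey_idx(const char *buf, int *idx)
{
	if (!strncasecmp(buf, "no", 2)) {
		*idx = PKEY_TABLE_LEN - 1;	/* "none": use the invalid last slot */
		return 0;
	}
	if (sscanf(buf, "%i", idx) != 1 || *idx < 0 || *idx >= PKEY_TABLE_LEN)
		return -1;
	return 0;
}

int main(void)
{
	int idx;

	if (!parse_pkey_idx("none\n", &idx))
		printf("virtual slot -> physical %d\n", idx);	/* 127 */
	if (!parse_pkey_idx("0x5\n", &idx))
		printf("virtual slot -> physical %d\n", idx);	/* 5 */
	return 0;
}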
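
The user.h hunk bumps MLX4_IB_UVERBS_ABI_VERSION to 4 and keeps the pre-existing response layout around as mlx4_ib_alloc_ucontext_resp_v3. A minimal sketch of version-based response sizing follows, assuming the provider simply selects the struct size from the consumer's negotiated ABI version; pick_resp_size() is hypothetical and not the driver's actual interface, though the two struct layouts mirror the hunk above:

#include <linux/types.h>
#include <stddef.h>

#define MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION	3
#define MLX4_IB_UVERBS_ABI_VERSION		4

struct mlx4_ib_alloc_ucontext_resp_v3 {		/* pre-ABI-4 layout */
	__u32 qp_tab_size;
	__u16 bf_reg_size;
	__u16 bf_regs_per_page;
};

struct mlx4_ib_alloc_ucontext_resp {		/* ABI 4 layout */
	__u32 dev_caps;
	__u32 qp_tab_size;
	__u16 bf_reg_size;
	__u16 bf_regs_per_page;
	__u32 cqe_size;
};

/* Hypothetical helper: a version-3 consumer knows nothing about dev_caps or
 * cqe_size, so it must only ever be handed the shorter layout. */
static size_t pick_resp_size(int uverbs_abi_ver)
{
	if (uverbs_abi_ver <= MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION)
		return sizeof(struct mlx4_ib_alloc_ucontext_resp_v3);
	return sizeof(struct mlx4_ib_alloc_ucontext_resp);
}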
