Diffstat (limited to 'net')
788 files changed, 50890 insertions, 25780 deletions
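A change that recurs throughout the hunks below is the replacement of open-coded MAC address copies, memcpy(dst, src, ETH_ALEN), with the ether_addr_copy() helper declared in <linux/etherdevice.h>. The sketch that follows is a minimal userspace illustration of that before/after pattern, not kernel code: ETH_ALEN, the simplified ether_addr_copy() body and struct example_entry are re-declared or invented here so the sample stands alone, and the in-kernel helper additionally assumes both addresses are 16-bit aligned so it can copy as three u16 accesses on suitable architectures.

/* Illustration only: userspace sketch of the memcpy() -> ether_addr_copy()
 * conversion seen in many hunks below. ETH_ALEN and the helper are
 * re-declared so this compiles outside the kernel tree; the real
 * definitions live in <linux/if_ether.h> and <linux/etherdevice.h>.
 */
#include <stdio.h>
#include <string.h>

#define ETH_ALEN 6

/* Simplified stand-in for the kernel helper: copy one MAC address.
 * The kernel version requires both pointers to be 2-byte aligned;
 * plain memcpy() keeps this sketch portable while preserving the
 * byte-for-byte semantics.
 */
static inline void ether_addr_copy(unsigned char *dst, const unsigned char *src)
{
	memcpy(dst, src, ETH_ALEN);
}

struct example_entry {			/* hypothetical stand-in for a real struct */
	unsigned char mac_addr[ETH_ALEN];
};

int main(void)
{
	const unsigned char src[ETH_ALEN] = { 0x02, 0x00, 0x5e, 0x10, 0x20, 0x30 };
	struct example_entry e;

	/* old style, as removed by the hunks below */
	memcpy(e.mac_addr, src, ETH_ALEN);

	/* new style, as introduced by the hunks below */
	ether_addr_copy(e.mac_addr, src);

	printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
	       e.mac_addr[0], e.mac_addr[1], e.mac_addr[2],
	       e.mac_addr[3], e.mac_addr[4], e.mac_addr[5]);
	return 0;
}

The conversions in this diff (for example in net/8021q/vlan_dev.c, net/appletalk/aarp.c and net/atm/lec.c) are behaviour-preserving; the helper mainly documents intent and lets suitably aligned callers use a cheaper copy.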
diff --git a/net/802/garp.c b/net/802/garp.c index 5d9630a0eb9..b38ee6dcba4 100644 --- a/net/802/garp.c +++ b/net/802/garp.c @@ -397,7 +397,7 @@ static void garp_join_timer_arm(struct garp_applicant *app) { unsigned long delay; - delay = (u64)msecs_to_jiffies(garp_join_time) * net_random() >> 32; + delay = (u64)msecs_to_jiffies(garp_join_time) * prandom_u32() >> 32; mod_timer(&app->join_timer, jiffies + delay); } diff --git a/net/802/hippi.c b/net/802/hippi.c index a97a3bde77b..5ff2a718ddc 100644 --- a/net/802/hippi.c +++ b/net/802/hippi.c @@ -172,14 +172,14 @@ EXPORT_SYMBOL(hippi_mac_addr); int hippi_neigh_setup_dev(struct net_device *dev, struct neigh_parms *p) { /* Never send broadcast/multicast ARP messages */ - NEIGH_VAR_SET(p, MCAST_PROBES, 0); + NEIGH_VAR_INIT(p, MCAST_PROBES, 0); /* In IPv6 unicast probes are valid even on NBMA, * because they are encapsulated in normal IPv6 protocol. * Should be a generic flag. */ if (p->tbl->family != AF_INET6) - NEIGH_VAR_SET(p, UCAST_PROBES, 0); + NEIGH_VAR_INIT(p, UCAST_PROBES, 0); return 0; } EXPORT_SYMBOL(hippi_neigh_setup_dev); diff --git a/net/802/mrp.c b/net/802/mrp.c index 3ed61621587..72db2785ef2 100644 --- a/net/802/mrp.c +++ b/net/802/mrp.c @@ -583,7 +583,7 @@ static void mrp_join_timer_arm(struct mrp_applicant *app) { unsigned long delay; - delay = (u64)msecs_to_jiffies(mrp_join_time) * net_random() >> 32; + delay = (u64)msecs_to_jiffies(mrp_join_time) * prandom_u32() >> 32; mod_timer(&app->join_timer, jiffies + delay); } diff --git a/net/8021q/Kconfig b/net/8021q/Kconfig index b85a91fa61f..42320180967 100644 --- a/net/8021q/Kconfig +++ b/net/8021q/Kconfig @@ -6,11 +6,11 @@ config VLAN_8021Q tristate "802.1Q/802.1ad VLAN Support" ---help--- Select this and you will be able to create 802.1Q VLAN interfaces - on your ethernet interfaces. 802.1Q VLAN supports almost - everything a regular ethernet interface does, including - firewalling, bridging, and of course IP traffic. You will need - the 'vconfig' tool from the VLAN project in order to effectively - use VLANs. See the VLAN web page for more information: + on your Ethernet interfaces. 802.1Q VLAN supports almost + everything a regular Ethernet interface does, including + firewalling, bridging, and of course IP traffic. You will need + the 'ip' utility in order to effectively use VLANs. 
+ See the VLAN web page for more information: <http://www.candelatech.com/~greear/vlan.html> To compile this code as a module, choose M here: the module diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index b3d17d1c49c..44ebd5c2cd4 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -169,6 +169,7 @@ int register_vlan_dev(struct net_device *dev) if (err < 0) goto out_uninit_mvrp; + vlan->nest_level = dev_get_nest_level(real_dev, is_vlan_dev) + 1; err = register_netdevice(dev); if (err < 0) goto out_uninit_mvrp; @@ -301,15 +302,17 @@ static void vlan_sync_address(struct net_device *dev, !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_add(dev, vlandev->dev_addr); - memcpy(vlan->real_dev_addr, dev->dev_addr, ETH_ALEN); + ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); } static void vlan_transfer_features(struct net_device *dev, struct net_device *vlandev) { + struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); + vlandev->gso_max_size = dev->gso_max_size; - if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) + if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto)) vlandev->hard_header_len = dev->hard_header_len; else vlandev->hard_header_len = dev->hard_header_len + VLAN_HLEN; diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index 5704ed9c3a2..9d010a09ab9 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -38,9 +38,9 @@ struct vlan_info { static inline unsigned int vlan_proto_idx(__be16 proto) { switch (proto) { - case __constant_htons(ETH_P_8021Q): + case htons(ETH_P_8021Q): return VLAN_PROTO_8021Q; - case __constant_htons(ETH_P_8021AD): + case htons(ETH_P_8021AD): return VLAN_PROTO_8021AD; default: BUG(); diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 6ee48aac776..75d42776399 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -22,11 +22,11 @@ bool vlan_do_receive(struct sk_buff **skbp) return false; skb->dev = vlan_dev; - if (skb->pkt_type == PACKET_OTHERHOST) { + if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) { /* Our lower layer thinks this is not local, let's make sure. * This allows the VLAN to have a different MAC than the * underlying device, and still route correctly. */ - if (ether_addr_equal(eth_hdr(skb)->h_dest, vlan_dev->dev_addr)) + if (ether_addr_equal_64bits(eth_hdr(skb)->h_dest, vlan_dev->dev_addr)) skb->pkt_type = PACKET_HOST; } @@ -63,7 +63,7 @@ bool vlan_do_receive(struct sk_buff **skbp) } /* Must be invoked with rcu_read_lock. 
*/ -struct net_device *__vlan_find_dev_deep(struct net_device *dev, +struct net_device *__vlan_find_dev_deep_rcu(struct net_device *dev, __be16 vlan_proto, u16 vlan_id) { struct vlan_info *vlan_info = rcu_dereference(dev->vlan_info); @@ -81,13 +81,13 @@ struct net_device *__vlan_find_dev_deep(struct net_device *dev, upper_dev = netdev_master_upper_dev_get_rcu(dev); if (upper_dev) - return __vlan_find_dev_deep(upper_dev, + return __vlan_find_dev_deep_rcu(upper_dev, vlan_proto, vlan_id); } return NULL; } -EXPORT_SYMBOL(__vlan_find_dev_deep); +EXPORT_SYMBOL(__vlan_find_dev_deep_rcu); struct net_device *vlan_dev_real_dev(const struct net_device *dev) { @@ -106,10 +106,19 @@ u16 vlan_dev_vlan_id(const struct net_device *dev) } EXPORT_SYMBOL(vlan_dev_vlan_id); +__be16 vlan_dev_vlan_proto(const struct net_device *dev) +{ + return vlan_dev_priv(dev)->vlan_proto; +} +EXPORT_SYMBOL(vlan_dev_vlan_proto); + static struct sk_buff *vlan_reorder_header(struct sk_buff *skb) { - if (skb_cow(skb, skb_headroom(skb)) < 0) + if (skb_cow(skb, skb_headroom(skb)) < 0) { + kfree_skb(skb); return NULL; + } + memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN); skb->mac_header += VLAN_HLEN; return skb; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 762896ebfcf..dd11f612e03 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -61,7 +61,7 @@ static int vlan_dev_rebuild_header(struct sk_buff *skb) pr_debug("%s: unable to resolve type %X addresses\n", dev->name, ntohs(veth->h_vlan_encapsulated_proto)); - memcpy(veth->h_source, dev->dev_addr, ETH_ALEN); + ether_addr_copy(veth->h_source, dev->dev_addr); break; } @@ -303,7 +303,7 @@ static int vlan_dev_open(struct net_device *dev) goto clear_allmulti; } - memcpy(vlan->real_dev_addr, real_dev->dev_addr, ETH_ALEN); + ether_addr_copy(vlan->real_dev_addr, real_dev->dev_addr); if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_join(dev); @@ -367,7 +367,7 @@ static int vlan_dev_set_mac_address(struct net_device *dev, void *p) dev_uc_del(real_dev, dev->dev_addr); out: - memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); + ether_addr_copy(dev->dev_addr, addr->sa_data); return 0; } @@ -524,12 +524,37 @@ static void vlan_dev_set_lockdep_class(struct net_device *dev, int subclass) netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, &subclass); } +static int vlan_dev_get_lock_subclass(struct net_device *dev) +{ + return vlan_dev_priv(dev)->nest_level; +} + static const struct header_ops vlan_header_ops = { .create = vlan_dev_hard_header, .rebuild = vlan_dev_rebuild_header, .parse = eth_header_parse, }; +static int vlan_passthru_hard_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, + const void *daddr, const void *saddr, + unsigned int len) +{ + struct vlan_dev_priv *vlan = vlan_dev_priv(dev); + struct net_device *real_dev = vlan->real_dev; + + if (saddr == NULL) + saddr = dev->dev_addr; + + return dev_hard_header(skb, real_dev, type, daddr, saddr, len); +} + +static const struct header_ops vlan_passthru_header_ops = { + .create = vlan_passthru_hard_header, + .rebuild = dev_rebuild_header, + .parse = eth_header_parse, +}; + static struct device_type vlan_type = { .name = "vlan", }; @@ -539,7 +564,6 @@ static const struct net_device_ops vlan_netdev_ops; static int vlan_dev_init(struct net_device *dev) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; - int subclass = 0, i; netif_carrier_off(dev); @@ -558,6 +582,9 @@ static int vlan_dev_init(struct net_device *dev) dev->features |= 
real_dev->vlan_features | NETIF_F_LLTX; dev->gso_max_size = real_dev->gso_max_size; + if (dev->features & NETIF_F_VLAN_FEATURES) + netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n"); + /* ipv6 shared card related stuff */ dev->dev_id = real_dev->dev_id; @@ -572,8 +599,9 @@ static int vlan_dev_init(struct net_device *dev) #endif dev->needed_headroom = real_dev->needed_headroom; - if (real_dev->features & NETIF_F_HW_VLAN_CTAG_TX) { - dev->header_ops = real_dev->header_ops; + if (vlan_hw_offload_capable(real_dev->features, + vlan_dev_priv(dev)->vlan_proto)) { + dev->header_ops = &vlan_passthru_header_ops; dev->hard_header_len = real_dev->hard_header_len; } else { dev->header_ops = &vlan_header_ops; @@ -584,22 +612,12 @@ static int vlan_dev_init(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &vlan_type); - if (is_vlan_dev(real_dev)) - subclass = 1; - - vlan_dev_set_lockdep_class(dev, subclass); + vlan_dev_set_lockdep_class(dev, vlan_dev_get_lock_subclass(dev)); - vlan_dev_priv(dev)->vlan_pcpu_stats = alloc_percpu(struct vlan_pcpu_stats); + vlan_dev_priv(dev)->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan_dev_priv(dev)->vlan_pcpu_stats) return -ENOMEM; - for_each_possible_cpu(i) { - struct vlan_pcpu_stats *vlan_stat; - vlan_stat = per_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats, i); - u64_stats_init(&vlan_stat->syncp); - } - - return 0; } @@ -609,8 +627,6 @@ static void vlan_dev_uninit(struct net_device *dev) struct vlan_dev_priv *vlan = vlan_dev_priv(dev); int i; - free_percpu(vlan->vlan_pcpu_stats); - vlan->vlan_pcpu_stats = NULL; for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) { while ((pm = vlan->egress_priority_map[i]) != NULL) { vlan->egress_priority_map[i] = pm->next; @@ -625,9 +641,9 @@ static netdev_features_t vlan_dev_fix_features(struct net_device *dev, struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; netdev_features_t old_features = features; - features &= real_dev->vlan_features; + features = netdev_intersect_features(features, real_dev->vlan_features); features |= NETIF_F_RXCSUM; - features &= real_dev->features; + features = netdev_intersect_features(features, real_dev->features); features |= old_features & NETIF_F_SOFT_FEATURES; features |= NETIF_F_LLTX; @@ -653,38 +669,36 @@ static void vlan_ethtool_get_drvinfo(struct net_device *dev, static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { + struct vlan_pcpu_stats *p; + u32 rx_errors = 0, tx_dropped = 0; + int i; - if (vlan_dev_priv(dev)->vlan_pcpu_stats) { - struct vlan_pcpu_stats *p; - u32 rx_errors = 0, tx_dropped = 0; - int i; - - for_each_possible_cpu(i) { - u64 rxpackets, rxbytes, rxmulticast, txpackets, txbytes; - unsigned int start; - - p = per_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats, i); - do { - start = u64_stats_fetch_begin_bh(&p->syncp); - rxpackets = p->rx_packets; - rxbytes = p->rx_bytes; - rxmulticast = p->rx_multicast; - txpackets = p->tx_packets; - txbytes = p->tx_bytes; - } while (u64_stats_fetch_retry_bh(&p->syncp, start)); - - stats->rx_packets += rxpackets; - stats->rx_bytes += rxbytes; - stats->multicast += rxmulticast; - stats->tx_packets += txpackets; - stats->tx_bytes += txbytes; - /* rx_errors & tx_dropped are u32 */ - rx_errors += p->rx_errors; - tx_dropped += p->tx_dropped; - } - stats->rx_errors = rx_errors; - stats->tx_dropped = tx_dropped; + for_each_possible_cpu(i) { + u64 rxpackets, rxbytes, rxmulticast, txpackets, txbytes; + 
unsigned int start; + + p = per_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats, i); + do { + start = u64_stats_fetch_begin_irq(&p->syncp); + rxpackets = p->rx_packets; + rxbytes = p->rx_bytes; + rxmulticast = p->rx_multicast; + txpackets = p->tx_packets; + txbytes = p->tx_bytes; + } while (u64_stats_fetch_retry_irq(&p->syncp, start)); + + stats->rx_packets += rxpackets; + stats->rx_bytes += rxbytes; + stats->multicast += rxmulticast; + stats->tx_packets += txpackets; + stats->tx_bytes += txbytes; + /* rx_errors & tx_dropped are u32 */ + rx_errors += p->rx_errors; + tx_dropped += p->tx_dropped; } + stats->rx_errors = rx_errors; + stats->tx_dropped = tx_dropped; + return stats; } @@ -694,20 +708,19 @@ static void vlan_dev_poll_controller(struct net_device *dev) return; } -static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo, - gfp_t gfp) +static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; struct netpoll *netpoll; int err = 0; - netpoll = kzalloc(sizeof(*netpoll), gfp); + netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); err = -ENOMEM; if (!netpoll) goto out; - err = __netpoll_setup(netpoll, real_dev, gfp); + err = __netpoll_setup(netpoll, real_dev); if (err) { kfree(netpoll); goto out; @@ -767,8 +780,18 @@ static const struct net_device_ops vlan_netdev_ops = { .ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup, #endif .ndo_fix_features = vlan_dev_fix_features, + .ndo_get_lock_subclass = vlan_dev_get_lock_subclass, }; +static void vlan_dev_free(struct net_device *dev) +{ + struct vlan_dev_priv *vlan = vlan_dev_priv(dev); + + free_percpu(vlan->vlan_pcpu_stats); + vlan->vlan_pcpu_stats = NULL; + free_netdev(dev); +} + void vlan_setup(struct net_device *dev) { ether_setup(dev); @@ -778,7 +801,7 @@ void vlan_setup(struct net_device *dev) dev->tx_queue_len = 0; dev->netdev_ops = &vlan_netdev_ops; - dev->destructor = free_netdev; + dev->destructor = vlan_dev_free; dev->ethtool_ops = &vlan_ethtool_ops; memset(dev->broadcast, 0, ETH_ALEN); diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index c7e634af851..8ac8a5cc214 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -56,8 +56,8 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[]) if (data[IFLA_VLAN_PROTOCOL]) { switch (nla_get_be16(data[IFLA_VLAN_PROTOCOL])) { - case __constant_htons(ETH_P_8021Q): - case __constant_htons(ETH_P_8021AD): + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): break; default: return -EPROTONOSUPPORT; diff --git a/net/9p/client.c b/net/9p/client.c index ee8fd6bd403..0004cbaac4a 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -204,7 +204,7 @@ free_and_return: return ret; } -struct p9_fcall *p9_fcall_alloc(int alloc_msize) +static struct p9_fcall *p9_fcall_alloc(int alloc_msize) { struct p9_fcall *fc; fc = kmalloc(sizeof(struct p9_fcall) + alloc_msize, GFP_NOFS); @@ -415,9 +415,17 @@ static void p9_free_req(struct p9_client *c, struct p9_req_t *r) * req: request received * */ -void p9_client_cb(struct p9_client *c, struct p9_req_t *req) +void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status) { p9_debug(P9_DEBUG_MUX, " tag %d\n", req->tc->tag); + + /* + * This barrier is needed to make sure any change made to req before + * the other thread wakes up will indeed be seen by the waiting side. 
+ */ + smp_wmb(); + req->status = status; + wake_up(req->wq); p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc->tag); } @@ -655,16 +663,13 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) if (IS_ERR(req)) return PTR_ERR(req); - /* * if we haven't received a response for oldreq, * remove it from the list */ - if (oldreq->status == REQ_STATUS_FLSH) { - spin_lock(&c->lock); - list_del(&oldreq->req_list); - spin_unlock(&c->lock); - } + if (oldreq->status == REQ_STATUS_SENT) + if (c->trans_mod->cancelled) + c->trans_mod->cancelled(c, oldreq); p9_free_req(c, req); return 0; @@ -751,6 +756,12 @@ again: err = wait_event_interruptible(*req->wq, req->status >= REQ_STATUS_RCVD); + /* + * Make sure our req is coherent with regard to updates in other + * threads - echoes to wmb() in the callback + */ + smp_rmb(); + if ((err == -ERESTARTSYS) && (c->status == Connected) && (type == P9_TFLUSH)) { sigpending = 1; @@ -1012,9 +1023,6 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) goto destroy_tagpool; if (!clnt->trans_mod) - clnt->trans_mod = v9fs_get_trans_by_name("virtio"); - - if (!clnt->trans_mod) clnt->trans_mod = v9fs_get_default_trans(); if (clnt->trans_mod == NULL) { diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 9321a776306..80d08f6664c 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -66,20 +66,6 @@ struct p9_fd_opts { int privport; }; -/** - * struct p9_trans_fd - transport state - * @rd: reference to file to read from - * @wr: reference of file to write to - * @conn: connection state reference - * - */ - -struct p9_trans_fd { - struct file *rd; - struct file *wr; - struct p9_conn *conn; -}; - /* * Option Parsing (code inspired by NFS code) * - a little lazy - parse all fd-transport options @@ -159,6 +145,20 @@ struct p9_conn { unsigned long wsched; }; +/** + * struct p9_trans_fd - transport state + * @rd: reference to file to read from + * @wr: reference of file to write to + * @conn: connection state reference + * + */ + +struct p9_trans_fd { + struct file *rd; + struct file *wr; + struct p9_conn conn; +}; + static void p9_poll_workfn(struct work_struct *work); static DEFINE_SPINLOCK(p9_poll_lock); @@ -212,15 +212,9 @@ static void p9_conn_cancel(struct p9_conn *m, int err) m->err = err; list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) { - req->status = REQ_STATUS_ERROR; - if (!req->t_err) - req->t_err = err; list_move(&req->req_list, &cancel_list); } list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) { - req->status = REQ_STATUS_ERROR; - if (!req->t_err) - req->t_err = err; list_move(&req->req_list, &cancel_list); } spin_unlock_irqrestore(&m->client->lock, flags); @@ -228,7 +222,9 @@ static void p9_conn_cancel(struct p9_conn *m, int err) list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) { p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req); list_del(&req->req_list); - p9_client_cb(m->client, req); + if (!req->t_err) + req->t_err = err; + p9_client_cb(m->client, req, REQ_STATUS_ERROR); } } @@ -302,6 +298,7 @@ static void p9_read_work(struct work_struct *work) { int n, err; struct p9_conn *m; + int status = REQ_STATUS_ERROR; m = container_of(work, struct p9_conn, rq); @@ -348,8 +345,7 @@ static void p9_read_work(struct work_struct *work) "mux %p pkt: size: %d bytes tag: %d\n", m, n, tag); m->req = p9_tag_lookup(m->client, tag); - if (!m->req || (m->req->status != REQ_STATUS_SENT && - m->req->status != REQ_STATUS_FLSH)) { + if (!m->req || (m->req->status != REQ_STATUS_SENT)) { 
p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n", tag); err = -EIO; @@ -375,10 +371,10 @@ static void p9_read_work(struct work_struct *work) p9_debug(P9_DEBUG_TRANS, "got new packet\n"); spin_lock(&m->client->lock); if (m->req->status != REQ_STATUS_ERROR) - m->req->status = REQ_STATUS_RCVD; + status = REQ_STATUS_RCVD; list_del(&m->req->req_list); spin_unlock(&m->client->lock); - p9_client_cb(m->client, m->req); + p9_client_cb(m->client, m->req, status); m->rbuf = NULL; m->rpos = 0; m->rsize = 0; @@ -573,21 +569,19 @@ p9_pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) } /** - * p9_conn_create - allocate and initialize the per-session mux data + * p9_conn_create - initialize the per-session mux data * @client: client instance * * Note: Creates the polling task if this is the first session. */ -static struct p9_conn *p9_conn_create(struct p9_client *client) +static void p9_conn_create(struct p9_client *client) { int n; - struct p9_conn *m; + struct p9_trans_fd *ts = client->trans; + struct p9_conn *m = &ts->conn; p9_debug(P9_DEBUG_TRANS, "client %p msize %d\n", client, client->msize); - m = kzalloc(sizeof(struct p9_conn), GFP_KERNEL); - if (!m) - return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&m->mux_list); m->client = client; @@ -609,8 +603,6 @@ static struct p9_conn *p9_conn_create(struct p9_client *client) p9_debug(P9_DEBUG_TRANS, "mux %p can write\n", m); set_bit(Wpending, &m->wsched); } - - return m; } /** @@ -669,7 +661,7 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) { int n; struct p9_trans_fd *ts = client->trans; - struct p9_conn *m = ts->conn; + struct p9_conn *m = &ts->conn; p9_debug(P9_DEBUG_TRANS, "mux %p task %p tcall %p id %d\n", m, current, req->tc, req->tc->id); @@ -704,14 +696,26 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req) list_del(&req->req_list); req->status = REQ_STATUS_FLSHD; ret = 0; - } else if (req->status == REQ_STATUS_SENT) - req->status = REQ_STATUS_FLSH; - + } spin_unlock(&client->lock); return ret; } +static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req) +{ + p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req); + + /* we haven't received a response for oldreq, + * remove it from the list. 
+ */ + spin_lock(&client->lock); + list_del(&req->req_list); + spin_unlock(&client->lock); + + return 0; +} + /** * parse_opts - parse mount options into p9_fd_opts structure * @params: options string passed from mount @@ -780,7 +784,7 @@ static int parse_opts(char *params, struct p9_fd_opts *opts) static int p9_fd_open(struct p9_client *client, int rfd, int wfd) { - struct p9_trans_fd *ts = kmalloc(sizeof(struct p9_trans_fd), + struct p9_trans_fd *ts = kzalloc(sizeof(struct p9_trans_fd), GFP_KERNEL); if (!ts) return -ENOMEM; @@ -806,9 +810,8 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket) { struct p9_trans_fd *p; struct file *file; - int ret; - p = kmalloc(sizeof(struct p9_trans_fd), GFP_KERNEL); + p = kzalloc(sizeof(struct p9_trans_fd), GFP_KERNEL); if (!p) return -ENOMEM; @@ -829,20 +832,12 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket) p->rd->f_flags |= O_NONBLOCK; - p->conn = p9_conn_create(client); - if (IS_ERR(p->conn)) { - ret = PTR_ERR(p->conn); - p->conn = NULL; - kfree(p); - sockfd_put(csocket); - sockfd_put(csocket); - return ret; - } + p9_conn_create(client); return 0; } /** - * p9_mux_destroy - cancels all pending requests and frees mux resources + * p9_mux_destroy - cancels all pending requests of mux * @m: mux to destroy * */ @@ -859,7 +854,6 @@ static void p9_conn_destroy(struct p9_conn *m) p9_conn_cancel(m, -ECONNRESET); m->client = NULL; - kfree(m); } /** @@ -881,7 +875,7 @@ static void p9_fd_close(struct p9_client *client) client->status = Disconnected; - p9_conn_destroy(ts->conn); + p9_conn_destroy(&ts->conn); if (ts->rd) fput(ts->rd); @@ -1033,14 +1027,7 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args) return err; p = (struct p9_trans_fd *) client->trans; - p->conn = p9_conn_create(client); - if (IS_ERR(p->conn)) { - err = PTR_ERR(p->conn); - p->conn = NULL; - fput(p->rd); - fput(p->wr); - return err; - } + p9_conn_create(client); return 0; } @@ -1048,11 +1035,12 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args) static struct p9_trans_module p9_tcp_trans = { .name = "tcp", .maxsize = MAX_SOCK_BUF, - .def = 1, + .def = 0, .create = p9_fd_create_tcp, .close = p9_fd_close, .request = p9_fd_request, .cancel = p9_fd_cancel, + .cancelled = p9_fd_cancelled, .owner = THIS_MODULE, }; @@ -1064,6 +1052,7 @@ static struct p9_trans_module p9_unix_trans = { .close = p9_fd_close, .request = p9_fd_request, .cancel = p9_fd_cancel, + .cancelled = p9_fd_cancelled, .owner = THIS_MODULE, }; @@ -1075,6 +1064,7 @@ static struct p9_trans_module p9_fd_trans = { .close = p9_fd_close, .request = p9_fd_request, .cancel = p9_fd_cancel, + .cancelled = p9_fd_cancelled, .owner = THIS_MODULE, }; diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 8f68df5d297..14ad43b5cf8 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -193,6 +193,8 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts) if (!*p) continue; token = match_token(p, tokens, args); + if (token == Opt_err) + continue; r = match_int(&args[0], &option); if (r < 0) { p9_debug(P9_DEBUG_ERROR, @@ -305,8 +307,7 @@ handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma, } req->rc = c->rc; - req->status = REQ_STATUS_RCVD; - p9_client_cb(client, req); + p9_client_cb(client, req, REQ_STATUS_RCVD); return; @@ -511,6 +512,11 @@ dont_need_post_recv: goto send_error; } + /* Mark request as `sent' *before* we actually send it, + * because doing if after could erase the REQ_STATUS_RCVD + * status in case of 
a very fast reply. + */ + req->status = REQ_STATUS_SENT; err = ib_post_send(rdma->qp, &wr, &bad_wr); if (err) goto send_error; @@ -520,6 +526,7 @@ dont_need_post_recv: /* Handle errors that happened during or while preparing the send: */ send_error: + req->status = REQ_STATUS_ERROR; kfree(c); p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err); @@ -582,12 +589,24 @@ static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts) return rdma; } -/* its not clear to me we can do anything after send has been posted */ static int rdma_cancel(struct p9_client *client, struct p9_req_t *req) { + /* Nothing to do here. + * We will take care of it (if we have to) in rdma_cancelled() + */ return 1; } +/* A request has been fully flushed without a reply. + * That means we have posted one buffer in excess. + */ +static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req) +{ + struct p9_trans_rdma *rdma = client->trans; + atomic_inc(&rdma->excess_rc); + return 0; +} + /** * trans_create_rdma - Transport method for creating atransport instance * @client: client instance @@ -721,6 +740,7 @@ static struct p9_trans_module p9_rdma_trans = { .close = rdma_close, .request = rdma_request, .cancel = rdma_cancel, + .cancelled = rdma_cancelled, }; /** diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 9c5a1aa34d1..6940d8fe897 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -164,8 +164,7 @@ static void req_done(struct virtqueue *vq) p9_debug(P9_DEBUG_TRANS, ": rc %p\n", rc); p9_debug(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag); req = p9_tag_lookup(chan->client, rc->tag); - req->status = REQ_STATUS_RCVD; - p9_client_cb(chan->client, req); + p9_client_cb(chan->client, req, REQ_STATUS_RCVD); } } @@ -340,7 +339,10 @@ static int p9_get_mapped_pages(struct virtio_chan *chan, int count = nr_pages; while (nr_pages) { s = rest_of_page(data); - pages[index++] = kmap_to_page(data); + if (is_vmalloc_addr(data)) + pages[index++] = vmalloc_to_page(data); + else + pages[index++] = kmap_to_page(data); data += s; nr_pages--; } @@ -698,7 +700,7 @@ static struct p9_trans_module p9_virtio_trans = { * page in zero copy. */ .maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3), - .def = 0, + .def = 1, .owner = THIS_MODULE, }; diff --git a/net/Kconfig b/net/Kconfig index d334678c0bd..d92afe4204d 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -89,8 +89,12 @@ config NETWORK_SECMARK to nfmark, but designated for security purposes. If you are unsure how to answer this question, answer N. +config NET_PTP_CLASSIFY + def_bool n + config NETWORK_PHY_TIMESTAMPING bool "Timestamping in PHY devices" + select NET_PTP_CLASSIFY help This allows timestamping of network packets by PHYs with hardware timestamping capabilities. This option adds some @@ -238,12 +242,19 @@ config XPS depends on SMP default y -config NETPRIO_CGROUP - tristate "Network priority cgroup" +config CGROUP_NET_PRIO + bool "Network priority cgroup" depends on CGROUPS ---help--- Cgroup subsystem for use in assigning processes to network priorities on - a per-interface basis + a per-interface basis. + +config CGROUP_NET_CLASSID + boolean "Network classid cgroup" + depends on CGROUPS + ---help--- + Cgroup subsystem for use as general purpose socket classid marker that is + being used in cls_cgroup and for netfilter matching. 
config NET_RX_BUSY_POLL boolean diff --git a/net/Makefile b/net/Makefile index 8fa2f91517f..cbbbe6d657c 100644 --- a/net/Makefile +++ b/net/Makefile @@ -57,7 +57,7 @@ obj-$(CONFIG_CAIF) += caif/ ifneq ($(CONFIG_DCB),) obj-y += dcb/ endif -obj-$(CONFIG_IEEE802154) += ieee802154/ +obj-y += ieee802154/ obj-$(CONFIG_MAC802154) += mac802154/ ifeq ($(CONFIG_NET),y) diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index 690356fa52b..d1c55d8dd0a 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -40,6 +40,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/export.h> +#include <linux/etherdevice.h> int sysctl_aarp_expiry_time = AARP_EXPIRY_TIME; int sysctl_aarp_tick_time = AARP_TICK_TIME; @@ -67,7 +68,7 @@ struct aarp_entry { unsigned long expires_at; struct atalk_addr target_addr; struct net_device *dev; - char hwaddr[6]; + char hwaddr[ETH_ALEN]; unsigned short xmit_count; struct aarp_entry *next; }; @@ -134,7 +135,7 @@ static void __aarp_send_query(struct aarp_entry *a) eah->pa_len = AARP_PA_ALEN; eah->function = htons(AARP_REQUEST); - memcpy(eah->hw_src, dev->dev_addr, ETH_ALEN); + ether_addr_copy(eah->hw_src, dev->dev_addr); eah->pa_src_zero = 0; eah->pa_src_net = sat->s_net; @@ -181,7 +182,7 @@ static void aarp_send_reply(struct net_device *dev, struct atalk_addr *us, eah->pa_len = AARP_PA_ALEN; eah->function = htons(AARP_REPLY); - memcpy(eah->hw_src, dev->dev_addr, ETH_ALEN); + ether_addr_copy(eah->hw_src, dev->dev_addr); eah->pa_src_zero = 0; eah->pa_src_net = us->s_net; @@ -190,7 +191,7 @@ static void aarp_send_reply(struct net_device *dev, struct atalk_addr *us, if (!sha) memset(eah->hw_dst, '\0', ETH_ALEN); else - memcpy(eah->hw_dst, sha, ETH_ALEN); + ether_addr_copy(eah->hw_dst, sha); eah->pa_dst_zero = 0; eah->pa_dst_net = them->s_net; @@ -232,7 +233,7 @@ static void aarp_send_probe(struct net_device *dev, struct atalk_addr *us) eah->pa_len = AARP_PA_ALEN; eah->function = htons(AARP_PROBE); - memcpy(eah->hw_src, dev->dev_addr, ETH_ALEN); + ether_addr_copy(eah->hw_src, dev->dev_addr); eah->pa_src_zero = 0; eah->pa_src_net = us->s_net; @@ -790,7 +791,7 @@ static int aarp_rcv(struct sk_buff *skb, struct net_device *dev, break; /* We can fill one in - this is good. */ - memcpy(a->hwaddr, ea->hw_src, ETH_ALEN); + ether_addr_copy(a->hwaddr, ea->hw_src); __aarp_resolved(&unresolved[hash], a, hash); if (!unresolved_count) mod_timer(&aarp_timer, @@ -925,7 +926,7 @@ static struct aarp_entry *iter_next(struct aarp_iter_state *iter, loff_t *pos) struct aarp_entry *entry; rescan: - while(ct < AARP_HASH_SIZE) { + while (ct < AARP_HASH_SIZE) { for (entry = table[ct]; entry; entry = entry->next) { if (!pos || ++off == *pos) { iter->table = table; @@ -994,7 +995,7 @@ static const char *dt2str(unsigned long ticks) { static char buf[32]; - sprintf(buf, "%ld.%02ld", ticks / HZ, ((ticks % HZ) * 100 ) / HZ); + sprintf(buf, "%ld.%02ld", ticks / HZ, ((ticks % HZ) * 100) / HZ); return buf; } diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 7d424ac6e76..bfcf6be1d66 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -293,7 +293,7 @@ static int atif_probe_device(struct atalk_iface *atif) /* Perform AARP probing for a proxy address */ static int atif_proxy_probe_device(struct atalk_iface *atif, - struct atalk_addr* proxy_addr) + struct atalk_addr *proxy_addr) { int netrange = ntohs(atif->nets.nr_lastnet) - ntohs(atif->nets.nr_firstnet) + 1; @@ -581,7 +581,7 @@ out: } /* Delete a route. 
Find it and discard it */ -static int atrtr_delete(struct atalk_addr * addr) +static int atrtr_delete(struct atalk_addr *addr) { struct atalk_route **r = &atalk_routes; int retval = 0; @@ -936,11 +936,11 @@ static unsigned long atalk_sum_skb(const struct sk_buff *skb, int offset, int i, copy; /* checksum stuff in header space */ - if ( (copy = start - offset) > 0) { + if ((copy = start - offset) > 0) { if (copy > len) copy = len; sum = atalk_sum_partial(skb->data + offset, copy, sum); - if ( (len -= copy) == 0) + if ((len -= copy) == 0) return sum; offset += copy; @@ -1151,7 +1151,7 @@ static int atalk_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; at->src_net = addr->sat_addr.s_net = ap->s_net; - at->src_node = addr->sat_addr.s_node= ap->s_node; + at->src_node = addr->sat_addr.s_node = ap->s_node; } else { err = -EADDRNOTAVAIL; if (!atalk_find_interface(addr->sat_addr.s_net, @@ -1489,8 +1489,6 @@ static int atalk_rcv(struct sk_buff *skb, struct net_device *dev, goto drop; /* Queue packet (standard) */ - skb->sk = sock; - if (sock_queue_rcv_skb(sock, skb) < 0) goto drop; @@ -1566,7 +1564,7 @@ static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr { struct sock *sk = sock->sk; struct atalk_sock *at = at_sk(sk); - struct sockaddr_at *usat = (struct sockaddr_at *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_at *, usat, msg->msg_name); int flags = msg->msg_flags; int loopback = 0; struct sockaddr_at local_satalk, gsat; @@ -1644,7 +1642,6 @@ static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr if (!skb) goto out; - skb->sk = sk; skb_reserve(skb, ddp_dl->header_length); skb_reserve(skb, dev->hard_header_len); skb->dev = dev; @@ -1669,7 +1666,7 @@ static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr goto out; } - if (sk->sk_no_check == 1) + if (sk->sk_no_check_tx) ddp->deh_sum = 0; else ddp->deh_sum = atalk_checksum(skb, len + sizeof(*ddp)); @@ -1764,7 +1761,7 @@ static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copied); if (!err && msg->msg_name) { - struct sockaddr_at *sat = msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_at *, sat, msg->msg_name); sat->sat_family = AF_APPLETALK; sat->sat_port = ddp->deh_sport; sat->sat_addr.s_node = ddp->deh_snode; @@ -1790,53 +1787,53 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) void __user *argp = (void __user *)arg; switch (cmd) { - /* Protocol layer */ - case TIOCOUTQ: { - long amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); + /* Protocol layer */ + case TIOCOUTQ: { + long amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); - if (amount < 0) - amount = 0; - rc = put_user(amount, (int __user *)argp); - break; - } - case TIOCINQ: { - /* - * These two are safe on a single CPU system as only - * user tasks fiddle here - */ - struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); - long amount = 0; + if (amount < 0) + amount = 0; + rc = put_user(amount, (int __user *)argp); + break; + } + case TIOCINQ: { + /* + * These two are safe on a single CPU system as only + * user tasks fiddle here + */ + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + long amount = 0; - if (skb) - amount = skb->len - sizeof(struct ddpehdr); - rc = put_user(amount, (int __user *)argp); - break; - } - case SIOCGSTAMP: - rc = sock_get_timestamp(sk, argp); - break; - case SIOCGSTAMPNS: - rc = sock_get_timestampns(sk, argp); - break; - /* Routing */ - 
case SIOCADDRT: - case SIOCDELRT: - rc = -EPERM; - if (capable(CAP_NET_ADMIN)) - rc = atrtr_ioctl(cmd, argp); - break; - /* Interface */ - case SIOCGIFADDR: - case SIOCSIFADDR: - case SIOCGIFBRDADDR: - case SIOCATALKDIFADDR: - case SIOCDIFADDR: - case SIOCSARP: /* proxy AARP */ - case SIOCDARP: /* proxy AARP */ - rtnl_lock(); - rc = atif_ioctl(cmd, argp); - rtnl_unlock(); - break; + if (skb) + amount = skb->len - sizeof(struct ddpehdr); + rc = put_user(amount, (int __user *)argp); + break; + } + case SIOCGSTAMP: + rc = sock_get_timestamp(sk, argp); + break; + case SIOCGSTAMPNS: + rc = sock_get_timestampns(sk, argp); + break; + /* Routing */ + case SIOCADDRT: + case SIOCDELRT: + rc = -EPERM; + if (capable(CAP_NET_ADMIN)) + rc = atrtr_ioctl(cmd, argp); + break; + /* Interface */ + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFBRDADDR: + case SIOCATALKDIFADDR: + case SIOCDIFADDR: + case SIOCSARP: /* proxy AARP */ + case SIOCDARP: /* proxy AARP */ + rtnl_lock(); + rc = atif_ioctl(cmd, argp); + rtnl_unlock(); + break; } return rc; diff --git a/net/atm/clip.c b/net/atm/clip.c index 8215f7cb170..ba291ce4bdf 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -68,7 +68,7 @@ static int to_atmarpd(enum atmarp_ctrl_type type, int itf, __be32 ip) sk = sk_atm(atmarpd); skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); return 0; } diff --git a/net/atm/lec.c b/net/atm/lec.c index f23916be18f..4c5b8ba0f84 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -152,7 +152,7 @@ static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev) atm_force_charge(priv->lecd, skb2->truesize); sk = sk_atm(priv->lecd); skb_queue_tail(&sk->sk_receive_queue, skb2); - sk->sk_data_ready(sk, skb2->len); + sk->sk_data_ready(sk); } } #endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */ @@ -447,7 +447,7 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) atm_force_charge(priv->lecd, skb2->truesize); sk = sk_atm(priv->lecd); skb_queue_tail(&sk->sk_receive_queue, skb2); - sk->sk_data_ready(sk, skb2->len); + sk->sk_data_ready(sk); } } #endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */ @@ -521,7 +521,7 @@ send_to_lecd(struct lec_priv *priv, atmlec_msg_type type, if (data != NULL) mesg->sizeoftlvs = data->len; if (mac_addr) - memcpy(&mesg->content.normal.mac_addr, mac_addr, ETH_ALEN); + ether_addr_copy(mesg->content.normal.mac_addr, mac_addr); else mesg->content.normal.targetless_le_arp = 1; if (atm_addr) @@ -530,13 +530,13 @@ send_to_lecd(struct lec_priv *priv, atmlec_msg_type type, atm_force_charge(priv->lecd, skb->truesize); sk = sk_atm(priv->lecd); skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); if (data != NULL) { pr_debug("about to send %d bytes of data\n", data->len); atm_force_charge(priv->lecd, data->truesize); skb_queue_tail(&sk->sk_receive_queue, data); - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); } return 0; @@ -616,7 +616,7 @@ static void lec_push(struct atm_vcc *vcc, struct sk_buff *skb) pr_debug("%s: To daemon\n", dev->name); skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); } else { /* Data frame, queue to protocol handlers */ struct lec_arp_table *entry; unsigned char *src, *dst; @@ -1565,7 +1565,7 @@ static struct lec_arp_table *make_entry(struct lec_priv *priv, pr_info("LEC: Arp entry kmalloc failed\n"); return NULL; } - memcpy(to_return->mac_addr, mac_addr, ETH_ALEN); + 
ether_addr_copy(to_return->mac_addr, mac_addr); INIT_HLIST_NODE(&to_return->next); setup_timer(&to_return->timer, lec_arp_expire_arp, (unsigned long)to_return); @@ -1887,7 +1887,8 @@ lec_arp_update(struct lec_priv *priv, const unsigned char *mac_addr, entry = tmp; } else { entry->status = ESI_FORWARD_DIRECT; - memcpy(entry->mac_addr, mac_addr, ETH_ALEN); + ether_addr_copy(entry->mac_addr, + mac_addr); entry->last_used = jiffies; lec_arp_add(priv, entry); } @@ -2263,7 +2264,7 @@ lec_arp_check_empties(struct lec_priv *priv, &priv->lec_arp_empty_ones, next) { if (vcc == entry->vcc) { del_timer(&entry->timer); - memcpy(entry->mac_addr, src, ETH_ALEN); + ether_addr_copy(entry->mac_addr, src); entry->status = ESI_FORWARD_DIRECT; entry->last_used = jiffies; /* We might have got an entry */ diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 3af12755cd0..e8e0e7a8a23 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -478,7 +478,7 @@ static const uint8_t *copy_macs(struct mpoa_client *mpc, return NULL; } } - memcpy(mpc->mps_macs, router_mac, ETH_ALEN); + ether_addr_copy(mpc->mps_macs, router_mac); tlvs += 20; if (device_type == MPS_AND_MPC) tlvs += 20; if (mps_macs > 0) memcpy(mpc->mps_macs, tlvs, mps_macs*ETH_ALEN); @@ -706,7 +706,7 @@ static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb) dprintk("(%s) control packet arrived\n", dev->name); /* Pass control packets to daemon */ skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); return; } @@ -992,7 +992,7 @@ int msg_to_mpoad(struct k_message *mesg, struct mpoa_client *mpc) sk = sk_atm(mpc->mpoad_vcc); skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); return 0; } @@ -1273,7 +1273,7 @@ static void purge_egress_shortcut(struct atm_vcc *vcc, eg_cache_entry *entry) sk = sk_atm(vcc); skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); dprintk("exiting\n"); } @@ -1492,7 +1492,7 @@ static void __exit atm_mpoa_cleanup(void) mpc_proc_clean(); - del_timer(&mpc_timer); + del_timer_sync(&mpc_timer); unregister_netdevice_notifier(&mpoa_notifier); deregister_atm_ioctl(&atm_ioctl_ops); diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c index 8c93267ce96..c4e09846d1d 100644 --- a/net/atm/pppoatm.c +++ b/net/atm/pppoatm.c @@ -252,7 +252,7 @@ static int pppoatm_may_send(struct pppoatm_vcc *pvcc, int size) * we need to ensure there's a memory barrier after it. The bit * *must* be set before we do the atomic_inc() on pvcc->inflight. * There's no smp_mb__after_set_bit(), so it's this or abuse - * smp_mb__after_clear_bit(). + * smp_mb__after_atomic(). 
*/ test_and_set_bit(BLOCKED, &pvcc->blocked); diff --git a/net/atm/raw.c b/net/atm/raw.c index b4f7b9ff3c7..2e17e97a7a8 100644 --- a/net/atm/raw.c +++ b/net/atm/raw.c @@ -25,7 +25,7 @@ static void atm_push_raw(struct atm_vcc *vcc, struct sk_buff *skb) struct sock *sk = sk_atm(vcc); skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); } } diff --git a/net/atm/signaling.c b/net/atm/signaling.c index 4176887e72e..523bce72f69 100644 --- a/net/atm/signaling.c +++ b/net/atm/signaling.c @@ -51,7 +51,7 @@ static void sigd_put_skb(struct sk_buff *skb) #endif atm_force_charge(sigd, skb->truesize); skb_queue_tail(&sk_atm(sigd)->sk_receive_queue, skb); - sk_atm(sigd)->sk_data_ready(sk_atm(sigd), skb->len); + sk_atm(sigd)->sk_data_ready(sk_atm(sigd)); } static void modify_qos(struct atm_vcc *vcc, struct atmsvc_msg *msg) diff --git a/net/atm/svc.c b/net/atm/svc.c index 1281049c135..d8e5d0c2ebb 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -263,17 +263,11 @@ static int svc_connect(struct socket *sock, struct sockaddr *sockaddr, goto out; } } -/* - * Not supported yet - * - * #ifndef CONFIG_SINGLE_SIGITF - */ + vcc->qos.txtp.max_pcr = SELECT_TOP_PCR(vcc->qos.txtp); vcc->qos.txtp.pcr = 0; vcc->qos.txtp.min_pcr = 0; -/* - * #endif - */ + error = vcc_connect(sock, vcc->itf, vcc->vpi, vcc->vci); if (!error) sock->state = SS_CONNECTED; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 7bb1605bdfd..c35c3f48fc0 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1435,7 +1435,7 @@ out: static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len) { - struct sockaddr_ax25 *usax = (struct sockaddr_ax25 *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_ax25 *, usax, msg->msg_name); struct sock *sk = sock->sk; struct sockaddr_ax25 sax; struct sk_buff *skb; @@ -1640,7 +1640,7 @@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock, ax25_digi digi; ax25_address src; const unsigned char *mac = skb_mac_header(skb); - struct sockaddr_ax25 *sax = msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_ax25 *, sax, msg->msg_name); memset(sax, 0, sizeof(struct full_sockaddr_ax25)); ax25_addr_parse(mac + 1, skb->data - mac - 1, &src, NULL, diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index 96f4cab3a2f..7ed8ab72481 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -422,7 +422,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, if (sk) { if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); sock_put(sk); } else { free: diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index fa780b76630..11660a3aab5 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -50,6 +50,15 @@ config BATMAN_ADV_NC If you think that your network does not need this feature you can safely disable it and save some space. +config BATMAN_ADV_MCAST + bool "Multicast optimisation" + depends on BATMAN_ADV + default n + help + This option enables the multicast optimisation which aims to + reduce the air overhead while improving the reliability of + multicast messages. + config BATMAN_ADV_DEBUG bool "B.A.T.M.A.N. debugging" depends on BATMAN_ADV diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 4f4aabbd8ea..eb7d8c0388e 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -1,5 +1,5 @@ # -# Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2014 B.A.T.M.A.N. 
contributors: # # Marek Lindner, Simon Wunderlich # @@ -13,9 +13,7 @@ # General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA -# 02110-1301, USA +# along with this program; if not, see <http://www.gnu.org/licenses/>. # obj-$(CONFIG_BATMAN_ADV) += batman-adv.o @@ -38,3 +36,4 @@ batman-adv-y += send.o batman-adv-y += soft-interface.o batman-adv-y += sysfs.o batman-adv-y += translation-table.o +batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index a4808c29ea3..4e49666f8c6 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_BAT_ALGO_H_ diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index a2b480a9087..f04224c3200 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ #include "main.h" @@ -243,19 +241,19 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const uint8_t *addr) size = bat_priv->num_ifaces * sizeof(uint8_t); orig_node->bat_iv.bcast_own_sum = kzalloc(size, GFP_ATOMIC); if (!orig_node->bat_iv.bcast_own_sum) - goto free_bcast_own; + goto free_orig_node; hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig, batadv_choose_orig, orig_node, &orig_node->hash_entry); if (hash_added != 0) - goto free_bcast_own; + goto free_orig_node; return orig_node; -free_bcast_own: - kfree(orig_node->bat_iv.bcast_own); free_orig_node: + /* free twice, as batadv_orig_node_new sets refcount to 2 */ + batadv_orig_node_free_ref(orig_node); batadv_orig_node_free_ref(orig_node); return NULL; @@ -268,22 +266,39 @@ batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, struct batadv_orig_node *orig_neigh) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); - struct batadv_neigh_node *neigh_node; + struct batadv_neigh_node *neigh_node, *tmp_neigh_node; neigh_node = batadv_neigh_node_new(hard_iface, neigh_addr, orig_node); if (!neigh_node) goto out; - spin_lock_init(&neigh_node->bat_iv.lq_update_lock); + if (!atomic_inc_not_zero(&hard_iface->refcount)) { + kfree(neigh_node); + neigh_node = NULL; + goto out; + } - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Creating new neighbor %pM for orig_node %pM on interface %s\n", - neigh_addr, orig_node->orig, hard_iface->net_dev->name); + neigh_node->orig_node = orig_neigh; + neigh_node->if_incoming = hard_iface; spin_lock_bh(&orig_node->neigh_list_lock); - hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); + tmp_neigh_node = batadv_neigh_node_get(orig_node, hard_iface, + neigh_addr); + if (!tmp_neigh_node) { + hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); + } else { + kfree(neigh_node); + batadv_hardif_free_ref(hard_iface); + neigh_node = tmp_neigh_node; + } spin_unlock_bh(&orig_node->neigh_list_lock); + if (!tmp_neigh_node) + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Creating new neighbor %pM for orig_node %pM on interface %s\n", + neigh_addr, orig_node->orig, + hard_iface->net_dev->name); + out: return neigh_node; } @@ -307,9 +322,9 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface) hard_iface->bat_iv.ogm_buff = ogm_buff; batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff; - batadv_ogm_packet->header.packet_type = BATADV_IV_OGM; - batadv_ogm_packet->header.version = BATADV_COMPAT_VERSION; - batadv_ogm_packet->header.ttl = 2; + batadv_ogm_packet->packet_type = BATADV_IV_OGM; + batadv_ogm_packet->version = BATADV_COMPAT_VERSION; + batadv_ogm_packet->ttl = 2; batadv_ogm_packet->flags = BATADV_NO_FLAGS; batadv_ogm_packet->reserved = 0; batadv_ogm_packet->tq = BATADV_TQ_MAX_VALUE; @@ -332,10 +347,10 @@ static void batadv_iv_ogm_iface_update_mac(struct batadv_hard_iface *hard_iface) unsigned char *ogm_buff = hard_iface->bat_iv.ogm_buff; batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff; - memcpy(batadv_ogm_packet->orig, - hard_iface->net_dev->dev_addr, ETH_ALEN); - memcpy(batadv_ogm_packet->prev_sender, - hard_iface->net_dev->dev_addr, ETH_ALEN); + ether_addr_copy(batadv_ogm_packet->orig, + hard_iface->net_dev->dev_addr); + ether_addr_copy(batadv_ogm_packet->prev_sender, + hard_iface->net_dev->dev_addr); } static void @@ -346,7 +361,7 @@ batadv_iv_ogm_primary_iface_set(struct batadv_hard_iface *hard_iface) batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff; batadv_ogm_packet->flags = BATADV_PRIMARIES_FIRST_HOP; - 
batadv_ogm_packet->header.ttl = BATADV_TTL; + batadv_ogm_packet->ttl = BATADV_TTL; } /* when do we schedule our own ogm to be sent */ @@ -435,7 +450,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, fwd_str, (packet_num > 0 ? "aggregated " : ""), batadv_ogm_packet->orig, ntohl(batadv_ogm_packet->seqno), - batadv_ogm_packet->tq, batadv_ogm_packet->header.ttl, + batadv_ogm_packet->tq, batadv_ogm_packet->ttl, (batadv_ogm_packet->flags & BATADV_DIRECTLINK ? "on" : "off"), hard_iface->net_dev->name, @@ -461,17 +476,9 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, /* send a batman ogm packet */ static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet) { - struct batadv_hard_iface *hard_iface; struct net_device *soft_iface; struct batadv_priv *bat_priv; struct batadv_hard_iface *primary_if = NULL; - struct batadv_ogm_packet *batadv_ogm_packet; - unsigned char directlink; - uint8_t *packet_pos; - - packet_pos = forw_packet->skb->data; - batadv_ogm_packet = (struct batadv_ogm_packet *)packet_pos; - directlink = (batadv_ogm_packet->flags & BATADV_DIRECTLINK ? 1 : 0); if (!forw_packet->if_incoming) { pr_err("Error - can't forward packet: incoming iface not specified\n"); @@ -481,59 +488,48 @@ static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet) soft_iface = forw_packet->if_incoming->soft_iface; bat_priv = netdev_priv(soft_iface); - if (forw_packet->if_incoming->if_status != BATADV_IF_ACTIVE) + if (WARN_ON(!forw_packet->if_outgoing)) goto out; - primary_if = batadv_primary_if_get_selected(bat_priv); - if (!primary_if) + if (WARN_ON(forw_packet->if_outgoing->soft_iface != soft_iface)) goto out; - /* multihomed peer assumed - * non-primary OGMs are only broadcasted on their interface - */ - if ((directlink && (batadv_ogm_packet->header.ttl == 1)) || - (forw_packet->own && (forw_packet->if_incoming != primary_if))) { - /* FIXME: what about aggregated packets ? */ - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "%s packet (originator %pM, seqno %u, TTL %d) on interface %s [%pM]\n", - (forw_packet->own ? 
"Sending own" : "Forwarding"), - batadv_ogm_packet->orig, - ntohl(batadv_ogm_packet->seqno), - batadv_ogm_packet->header.ttl, - forw_packet->if_incoming->net_dev->name, - forw_packet->if_incoming->net_dev->dev_addr); - - /* skb is only used once and than forw_packet is free'd */ - batadv_send_skb_packet(forw_packet->skb, - forw_packet->if_incoming, - batadv_broadcast_addr); - forw_packet->skb = NULL; - + if (forw_packet->if_incoming->if_status != BATADV_IF_ACTIVE) goto out; - } - /* broadcast on every interface */ - rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->soft_iface != soft_iface) - continue; + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if) + goto out; - batadv_iv_ogm_send_to_if(forw_packet, hard_iface); - } - rcu_read_unlock(); + /* only for one specific outgoing interface */ + batadv_iv_ogm_send_to_if(forw_packet, forw_packet->if_outgoing); out: if (primary_if) batadv_hardif_free_ref(primary_if); } -/* return true if new_packet can be aggregated with forw_packet */ +/** + * batadv_iv_ogm_can_aggregate - find out if an OGM can be aggregated on an + * existing forward packet + * @new_bat_ogm_packet: OGM packet to be aggregated + * @bat_priv: the bat priv with all the soft interface information + * @packet_len: (total) length of the OGM + * @send_time: timestamp (jiffies) when the packet is to be sent + * @direktlink: true if this is a direct link packet + * @if_incoming: interface where the packet was received + * @if_outgoing: interface for which the retransmission should be considered + * @forw_packet: the forwarded packet which should be checked + * + * Returns true if new_packet can be aggregated with forw_packet + */ static bool batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, struct batadv_priv *bat_priv, int packet_len, unsigned long send_time, bool directlink, const struct batadv_hard_iface *if_incoming, + const struct batadv_hard_iface *if_outgoing, const struct batadv_forw_packet *forw_packet) { struct batadv_ogm_packet *batadv_ogm_packet; @@ -567,12 +563,16 @@ batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, if (!primary_if) goto out; + /* packet is not leaving on the same interface. */ + if (forw_packet->if_outgoing != if_outgoing) + goto out; + /* packets without direct link flag and high TTL * are flooded through the net */ if ((!directlink) && (!(batadv_ogm_packet->flags & BATADV_DIRECTLINK)) && - (batadv_ogm_packet->header.ttl != 1) && + (batadv_ogm_packet->ttl != 1) && /* own packets originating non-primary * interfaces leave only that interface @@ -587,7 +587,7 @@ batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, * interface only - we still can aggregate */ if ((directlink) && - (new_bat_ogm_packet->header.ttl == 1) && + (new_bat_ogm_packet->ttl == 1) && (forw_packet->if_incoming == if_incoming) && /* packets from direct neighbors or @@ -608,11 +608,22 @@ out: return res; } -/* create a new aggregated packet and add this packet to it */ +/** + * batadv_iv_ogm_aggregate_new - create a new aggregated packet and add this + * packet to it. 
+ * @packet_buff: pointer to the OGM + * @packet_len: (total) length of the OGM + * @send_time: timestamp (jiffies) when the packet is to be sent + * @direct_link: whether this OGM has direct link status + * @if_incoming: interface where the packet was received + * @if_outgoing: interface for which the retransmission should be considered + * @own_packet: true if it is a self-generated ogm + */ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, int packet_len, unsigned long send_time, bool direct_link, struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing, int own_packet) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); @@ -623,6 +634,9 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, if (!atomic_inc_not_zero(&if_incoming->refcount)) return; + if (!atomic_inc_not_zero(&if_outgoing->refcount)) + goto out_free_incoming; + /* own packet should always be scheduled */ if (!own_packet) { if (!batadv_atomic_dec_not_zero(&bat_priv->batman_queue_left)) { @@ -663,6 +677,7 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, forw_packet_aggr->own = own_packet; forw_packet_aggr->if_incoming = if_incoming; + forw_packet_aggr->if_outgoing = if_outgoing; forw_packet_aggr->num_packets = 0; forw_packet_aggr->direct_link_flags = BATADV_NO_FLAGS; forw_packet_aggr->send_time = send_time; @@ -685,6 +700,8 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, return; out: + batadv_hardif_free_ref(if_outgoing); +out_free_incoming: batadv_hardif_free_ref(if_incoming); } @@ -708,10 +725,21 @@ static void batadv_iv_ogm_aggregate(struct batadv_forw_packet *forw_packet_aggr, } } +/** + * batadv_iv_ogm_queue_add - queue up an OGM for transmission + * @bat_priv: the bat priv with all the soft interface information + * @packet_buff: pointer to the OGM + * @packet_len: (total) length of the OGM + * @if_incoming: interface where the packet was received + * @if_outgoing: interface for which the retransmission should be considered + * @own_packet: true if it is a self-generated ogm + * @send_time: timestamp (jiffies) when the packet is to be sent + */ static void batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv, unsigned char *packet_buff, int packet_len, struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing, int own_packet, unsigned long send_time) { /* _aggr -> pointer to the packet we want to aggregate with @@ -737,6 +765,7 @@ static void batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv, bat_priv, packet_len, send_time, direct_link, if_incoming, + if_outgoing, forw_packet_pos)) { forw_packet_aggr = forw_packet_pos; break; @@ -760,7 +789,8 @@ static void batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv, batadv_iv_ogm_aggregate_new(packet_buff, packet_len, send_time, direct_link, - if_incoming, own_packet); + if_incoming, if_outgoing, + own_packet); } else { batadv_iv_ogm_aggregate(forw_packet_aggr, packet_buff, packet_len, direct_link); @@ -773,12 +803,13 @@ static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node, struct batadv_ogm_packet *batadv_ogm_packet, bool is_single_hop_neigh, bool is_from_best_next_hop, - struct batadv_hard_iface *if_incoming) + struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); uint16_t tvlv_len; - if (batadv_ogm_packet->header.ttl <= 1) { + if (batadv_ogm_packet->ttl <= 1) { batadv_dbg(BATADV_DBG_BATMAN, 
bat_priv, "ttl exceeded\n"); return; } @@ -798,8 +829,8 @@ static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node, tvlv_len = ntohs(batadv_ogm_packet->tvlv_len); - batadv_ogm_packet->header.ttl--; - memcpy(batadv_ogm_packet->prev_sender, ethhdr->h_source, ETH_ALEN); + batadv_ogm_packet->ttl--; + ether_addr_copy(batadv_ogm_packet->prev_sender, ethhdr->h_source); /* apply hop penalty */ batadv_ogm_packet->tq = batadv_hop_penalty(batadv_ogm_packet->tq, @@ -807,7 +838,7 @@ static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node, batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Forwarding packet: tq: %i, ttl: %i\n", - batadv_ogm_packet->tq, batadv_ogm_packet->header.ttl); + batadv_ogm_packet->tq, batadv_ogm_packet->ttl); /* switch of primaries first hop flag when forwarding */ batadv_ogm_packet->flags &= ~BATADV_PRIMARIES_FIRST_HOP; @@ -818,7 +849,8 @@ static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node, batadv_iv_ogm_queue_add(bat_priv, (unsigned char *)batadv_ogm_packet, BATADV_OGM_HLEN + tvlv_len, - if_incoming, 0, batadv_iv_ogm_fwd_send_time()); + if_incoming, if_outgoing, 0, + batadv_iv_ogm_fwd_send_time()); } /** @@ -863,10 +895,11 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); unsigned char **ogm_buff = &hard_iface->bat_iv.ogm_buff; struct batadv_ogm_packet *batadv_ogm_packet; - struct batadv_hard_iface *primary_if; + struct batadv_hard_iface *primary_if, *tmp_hard_iface; int *ogm_buff_len = &hard_iface->bat_iv.ogm_buff_len; uint32_t seqno; uint16_t tvlv_len = 0; + unsigned long send_time; primary_if = batadv_primary_if_get_selected(bat_priv); @@ -889,23 +922,60 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) atomic_inc(&hard_iface->bat_iv.ogm_seqno); batadv_iv_ogm_slide_own_bcast_window(hard_iface); - batadv_iv_ogm_queue_add(bat_priv, hard_iface->bat_iv.ogm_buff, - hard_iface->bat_iv.ogm_buff_len, hard_iface, 1, - batadv_iv_ogm_emit_send_time(bat_priv)); + send_time = batadv_iv_ogm_emit_send_time(bat_priv); + + if (hard_iface != primary_if) { + /* OGMs from secondary interfaces are only scheduled on their + * respective interfaces. + */ + batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, *ogm_buff_len, + hard_iface, hard_iface, 1, send_time); + goto out; + } + + /* OGMs from primary interfaces are scheduled on all + * interfaces. + */ + rcu_read_lock(); + list_for_each_entry_rcu(tmp_hard_iface, &batadv_hardif_list, list) { + if (tmp_hard_iface->soft_iface != hard_iface->soft_iface) + continue; + batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, + *ogm_buff_len, hard_iface, + tmp_hard_iface, 1, send_time); + } + rcu_read_unlock(); + +out: if (primary_if) batadv_hardif_free_ref(primary_if); } +/** + * batadv_iv_ogm_orig_update - use OGM to update corresponding data in an + * originator + * @bat_priv: the bat priv with all the soft interface information + * @orig_node: the orig node who originally emitted the ogm packet + * @orig_ifinfo: ifinfo for the outgoing interface of the orig_node + * @ethhdr: Ethernet header of the OGM + * @batadv_ogm_packet: the ogm packet + * @if_incoming: interface where the packet was received + * @if_outgoing: interface for which the retransmission should be considered + * @dup_status: the duplicate status of this ogm packet. 
+ */ static void batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, + struct batadv_orig_ifinfo *orig_ifinfo, const struct ethhdr *ethhdr, const struct batadv_ogm_packet *batadv_ogm_packet, struct batadv_hard_iface *if_incoming, - const unsigned char *tt_buff, + struct batadv_hard_iface *if_outgoing, enum batadv_dup_status dup_status) { + struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; + struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node = NULL; struct batadv_neigh_node *router = NULL; struct batadv_orig_node *orig_node_tmp; @@ -933,12 +1003,21 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, if (dup_status != BATADV_NO_DUP) continue; - spin_lock_bh(&tmp_neigh_node->bat_iv.lq_update_lock); - batadv_ring_buffer_set(tmp_neigh_node->bat_iv.tq_recv, - &tmp_neigh_node->bat_iv.tq_index, 0); - tq_avg = batadv_ring_buffer_avg(tmp_neigh_node->bat_iv.tq_recv); - tmp_neigh_node->bat_iv.tq_avg = tq_avg; - spin_unlock_bh(&tmp_neigh_node->bat_iv.lq_update_lock); + /* only update the entry for this outgoing interface */ + neigh_ifinfo = batadv_neigh_ifinfo_get(tmp_neigh_node, + if_outgoing); + if (!neigh_ifinfo) + continue; + + spin_lock_bh(&tmp_neigh_node->ifinfo_lock); + batadv_ring_buffer_set(neigh_ifinfo->bat_iv.tq_recv, + &neigh_ifinfo->bat_iv.tq_index, 0); + tq_avg = batadv_ring_buffer_avg(neigh_ifinfo->bat_iv.tq_recv); + neigh_ifinfo->bat_iv.tq_avg = tq_avg; + spin_unlock_bh(&tmp_neigh_node->ifinfo_lock); + + batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + neigh_ifinfo = NULL; } if (!neigh_node) { @@ -960,39 +1039,49 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, "Updating existing last-hop neighbor of originator\n"); rcu_read_unlock(); + neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); + if (!neigh_ifinfo) + goto out; neigh_node->last_seen = jiffies; - spin_lock_bh(&neigh_node->bat_iv.lq_update_lock); - batadv_ring_buffer_set(neigh_node->bat_iv.tq_recv, - &neigh_node->bat_iv.tq_index, + spin_lock_bh(&neigh_node->ifinfo_lock); + batadv_ring_buffer_set(neigh_ifinfo->bat_iv.tq_recv, + &neigh_ifinfo->bat_iv.tq_index, batadv_ogm_packet->tq); - tq_avg = batadv_ring_buffer_avg(neigh_node->bat_iv.tq_recv); - neigh_node->bat_iv.tq_avg = tq_avg; - spin_unlock_bh(&neigh_node->bat_iv.lq_update_lock); + tq_avg = batadv_ring_buffer_avg(neigh_ifinfo->bat_iv.tq_recv); + neigh_ifinfo->bat_iv.tq_avg = tq_avg; + spin_unlock_bh(&neigh_node->ifinfo_lock); if (dup_status == BATADV_NO_DUP) { - orig_node->last_ttl = batadv_ogm_packet->header.ttl; - neigh_node->last_ttl = batadv_ogm_packet->header.ttl; + orig_ifinfo->last_ttl = batadv_ogm_packet->ttl; + neigh_ifinfo->last_ttl = batadv_ogm_packet->ttl; } - batadv_bonding_candidate_add(bat_priv, orig_node, neigh_node); - /* if this neighbor already is our next hop there is nothing * to change */ - router = batadv_orig_node_get_router(orig_node); + router = batadv_orig_router_get(orig_node, if_outgoing); if (router == neigh_node) goto out; - /* if this neighbor does not offer a better TQ we won't consider it */ - if (router && (router->bat_iv.tq_avg > neigh_node->bat_iv.tq_avg)) - goto out; + if (router) { + router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); + if (!router_ifinfo) + goto out; + + /* if this neighbor does not offer a better TQ we won't + * consider it + */ + if (router_ifinfo->bat_iv.tq_avg > neigh_ifinfo->bat_iv.tq_avg) + goto out; + } /* if the TQ is the same and the link not more symmetric we * won't consider 
it either */ - if (router && (neigh_node->bat_iv.tq_avg == router->bat_iv.tq_avg)) { + if (router_ifinfo && + (neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg)) { orig_node_tmp = router->orig_node; spin_lock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock); if_num = router->if_incoming->if_num; @@ -1009,7 +1098,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, goto out; } - batadv_update_route(bat_priv, orig_node, neigh_node); + batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node); goto out; unlock: @@ -1019,20 +1108,37 @@ out: batadv_neigh_node_free_ref(neigh_node); if (router) batadv_neigh_node_free_ref(router); + if (neigh_ifinfo) + batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + if (router_ifinfo) + batadv_neigh_ifinfo_free_ref(router_ifinfo); } +/** + * batadv_iv_ogm_calc_tq - calculate tq for current received ogm packet + * @orig_node: the orig node who originally emitted the ogm packet + * @orig_neigh_node: the orig node struct of the neighbor who sent the packet + * @batadv_ogm_packet: the ogm packet + * @if_incoming: interface where the packet was received + * @if_outgoing: interface for which the retransmission should be considered + * + * Returns 1 if the link can be considered bidirectional, 0 otherwise + */ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh_node, struct batadv_ogm_packet *batadv_ogm_packet, - struct batadv_hard_iface *if_incoming) + struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node; + struct batadv_neigh_ifinfo *neigh_ifinfo; uint8_t total_count; uint8_t orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own; unsigned int neigh_rq_inv_cube, neigh_rq_max_cube; int tq_asym_penalty, inv_asym_penalty, if_num, ret = 0; unsigned int combined_tq; + int tq_iface_penalty; /* find corresponding one hop neighbor */ rcu_read_lock(); @@ -1072,7 +1178,13 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); if_num = if_incoming->if_num; orig_eq_count = orig_neigh_node->bat_iv.bcast_own_sum[if_num]; - neigh_rq_count = neigh_node->bat_iv.real_packet_count; + neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); + if (neigh_ifinfo) { + neigh_rq_count = neigh_ifinfo->bat_iv.real_packet_count; + batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + } else { + neigh_rq_count = 0; + } spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); /* pay attention to not get a value bigger than 100 % */ @@ -1108,15 +1220,31 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, inv_asym_penalty /= neigh_rq_max_cube; tq_asym_penalty = BATADV_TQ_MAX_VALUE - inv_asym_penalty; - combined_tq = batadv_ogm_packet->tq * tq_own * tq_asym_penalty; - combined_tq /= BATADV_TQ_MAX_VALUE * BATADV_TQ_MAX_VALUE; + /* penalize if the OGM is forwarded on the same interface. WiFi + * interfaces and other half duplex devices suffer from throughput + * drops as they can't send and receive at the same time. 
+ */ + tq_iface_penalty = BATADV_TQ_MAX_VALUE; + if (if_outgoing && (if_incoming == if_outgoing) && + batadv_is_wifi_netdev(if_outgoing->net_dev)) + tq_iface_penalty = batadv_hop_penalty(BATADV_TQ_MAX_VALUE, + bat_priv); + + combined_tq = batadv_ogm_packet->tq * + tq_own * + tq_asym_penalty * + tq_iface_penalty; + combined_tq /= BATADV_TQ_MAX_VALUE * + BATADV_TQ_MAX_VALUE * + BATADV_TQ_MAX_VALUE; batadv_ogm_packet->tq = combined_tq; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "bidirectional: orig = %-15pM neigh = %-15pM => own_bcast = %2i, real recv = %2i, local tq: %3i, asym_penalty: %3i, total tq: %3i\n", + "bidirectional: orig = %-15pM neigh = %-15pM => own_bcast = %2i, real recv = %2i, local tq: %3i, asym_penalty: %3i, iface_penalty: %3i, total tq: %3i, if_incoming = %s, if_outgoing = %s\n", orig_node->orig, orig_neigh_node->orig, total_count, - neigh_rq_count, tq_own, - tq_asym_penalty, batadv_ogm_packet->tq); + neigh_rq_count, tq_own, tq_asym_penalty, tq_iface_penalty, + batadv_ogm_packet->tq, if_incoming->net_dev->name, + if_outgoing ? if_outgoing->net_dev->name : "DEFAULT"); /* if link has the minimum required transmission quality * consider it bidirectional @@ -1136,17 +1264,21 @@ out: * @ethhdr: ethernet header of the packet * @batadv_ogm_packet: OGM packet to be considered * @if_incoming: interface on which the OGM packet was received + * @if_outgoing: interface for which the retransmission should be considered * * Returns duplicate status as enum batadv_dup_status */ static enum batadv_dup_status batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, const struct batadv_ogm_packet *batadv_ogm_packet, - const struct batadv_hard_iface *if_incoming) + const struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_orig_node *orig_node; - struct batadv_neigh_node *tmp_neigh_node; + struct batadv_orig_ifinfo *orig_ifinfo = NULL; + struct batadv_neigh_node *neigh_node; + struct batadv_neigh_ifinfo *neigh_ifinfo; int is_dup; int32_t seq_diff; int need_update = 0; @@ -1161,27 +1293,37 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, if (!orig_node) return BATADV_NO_DUP; + orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); + if (WARN_ON(!orig_ifinfo)) { + batadv_orig_node_free_ref(orig_node); + return 0; + } + spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); - seq_diff = seqno - orig_node->last_real_seqno; + seq_diff = seqno - orig_ifinfo->last_real_seqno; /* signalize caller that the packet is to be dropped. 
*/ if (!hlist_empty(&orig_node->neigh_list) && batadv_window_protected(bat_priv, seq_diff, - &orig_node->batman_seqno_reset)) { + &orig_ifinfo->batman_seqno_reset)) { ret = BATADV_PROTECTED; goto out; } rcu_read_lock(); - hlist_for_each_entry_rcu(tmp_neigh_node, - &orig_node->neigh_list, list) { - neigh_addr = tmp_neigh_node->addr; - is_dup = batadv_test_bit(tmp_neigh_node->bat_iv.real_bits, - orig_node->last_real_seqno, + hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { + neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, + if_outgoing); + if (!neigh_ifinfo) + continue; + + neigh_addr = neigh_node->addr; + is_dup = batadv_test_bit(neigh_ifinfo->bat_iv.real_bits, + orig_ifinfo->last_real_seqno, seqno); if (batadv_compare_eth(neigh_addr, ethhdr->h_source) && - tmp_neigh_node->if_incoming == if_incoming) { + neigh_node->if_incoming == if_incoming) { set_mark = 1; if (is_dup) ret = BATADV_NEIGH_DUP; @@ -1192,173 +1334,78 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, } /* if the window moved, set the update flag. */ - bitmap = tmp_neigh_node->bat_iv.real_bits; + bitmap = neigh_ifinfo->bat_iv.real_bits; need_update |= batadv_bit_get_packet(bat_priv, bitmap, seq_diff, set_mark); - packet_count = bitmap_weight(tmp_neigh_node->bat_iv.real_bits, + packet_count = bitmap_weight(bitmap, BATADV_TQ_LOCAL_WINDOW_SIZE); - tmp_neigh_node->bat_iv.real_packet_count = packet_count; + neigh_ifinfo->bat_iv.real_packet_count = packet_count; + batadv_neigh_ifinfo_free_ref(neigh_ifinfo); } rcu_read_unlock(); if (need_update) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "updating last_seqno: old %u, new %u\n", - orig_node->last_real_seqno, seqno); - orig_node->last_real_seqno = seqno; + "%s updating last_seqno: old %u, new %u\n", + if_outgoing ? 
if_outgoing->net_dev->name : "DEFAULT", + orig_ifinfo->last_real_seqno, seqno); + orig_ifinfo->last_real_seqno = seqno; } out: spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); batadv_orig_node_free_ref(orig_node); + if (orig_ifinfo) + batadv_orig_ifinfo_free_ref(orig_ifinfo); return ret; } -static void batadv_iv_ogm_process(const struct ethhdr *ethhdr, - struct batadv_ogm_packet *batadv_ogm_packet, - const unsigned char *tt_buff, - struct batadv_hard_iface *if_incoming) + +/** + * batadv_iv_ogm_process_per_outif - process a batman iv OGM for an outgoing if + * @skb: the skb containing the OGM + * @orig_node: the (cached) orig node for the originator of this OGM + * @if_incoming: the interface where this packet was received + * @if_outgoing: the interface for which the packet should be considered + */ +static void +batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, + struct batadv_orig_node *orig_node, + struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); - struct batadv_hard_iface *hard_iface; - struct batadv_orig_node *orig_neigh_node, *orig_node, *orig_node_tmp; struct batadv_neigh_node *router = NULL, *router_router = NULL; + struct batadv_orig_node *orig_neigh_node; + struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_node *orig_neigh_router = NULL; - int has_directlink_flag; - int is_my_addr = 0, is_my_orig = 0, is_my_oldorig = 0; - int is_bidirect; - bool is_single_hop_neigh = false; - bool is_from_best_next_hop = false; - int sameseq, similar_ttl; + struct batadv_neigh_ifinfo *router_ifinfo = NULL; + struct batadv_ogm_packet *ogm_packet; enum batadv_dup_status dup_status; - uint32_t if_incoming_seqno; + bool is_from_best_next_hop = false; + bool is_single_hop_neigh = false; + bool sameseq, similar_ttl; + struct sk_buff *skb_priv; + struct ethhdr *ethhdr; uint8_t *prev_sender; + int is_bidirect; - /* Silently drop when the batman packet is actually not a - * correct packet. - * - * This might happen if a packet is padded (e.g. Ethernet has a - * minimum frame length of 64 byte) and the aggregation interprets - * it as an additional length. - * - * TODO: A more sane solution would be to have a bit in the - * batadv_ogm_packet to detect whether the packet is the last - * packet in an aggregation. Here we expect that the padding - * is always zero (or not 0x01) + /* create a private copy of the skb, as some functions change tq value + * and/or flags. 
*/ - if (batadv_ogm_packet->header.packet_type != BATADV_IV_OGM) + skb_priv = skb_copy(skb, GFP_ATOMIC); + if (!skb_priv) return; - /* could be changed by schedule_own_packet() */ - if_incoming_seqno = atomic_read(&if_incoming->bat_iv.ogm_seqno); - - if (batadv_ogm_packet->flags & BATADV_DIRECTLINK) - has_directlink_flag = 1; - else - has_directlink_flag = 0; + ethhdr = eth_hdr(skb_priv); + ogm_packet = (struct batadv_ogm_packet *)(skb_priv->data + ogm_offset); - if (batadv_compare_eth(ethhdr->h_source, batadv_ogm_packet->orig)) + dup_status = batadv_iv_ogm_update_seqnos(ethhdr, ogm_packet, + if_incoming, if_outgoing); + if (batadv_compare_eth(ethhdr->h_source, ogm_packet->orig)) is_single_hop_neigh = true; - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Received BATMAN packet via NB: %pM, IF: %s [%pM] (from OG: %pM, via prev OG: %pM, seqno %u, tq %d, TTL %d, V %d, IDF %d)\n", - ethhdr->h_source, if_incoming->net_dev->name, - if_incoming->net_dev->dev_addr, batadv_ogm_packet->orig, - batadv_ogm_packet->prev_sender, - ntohl(batadv_ogm_packet->seqno), batadv_ogm_packet->tq, - batadv_ogm_packet->header.ttl, - batadv_ogm_packet->header.version, has_directlink_flag); - - rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->if_status != BATADV_IF_ACTIVE) - continue; - - if (hard_iface->soft_iface != if_incoming->soft_iface) - continue; - - if (batadv_compare_eth(ethhdr->h_source, - hard_iface->net_dev->dev_addr)) - is_my_addr = 1; - - if (batadv_compare_eth(batadv_ogm_packet->orig, - hard_iface->net_dev->dev_addr)) - is_my_orig = 1; - - if (batadv_compare_eth(batadv_ogm_packet->prev_sender, - hard_iface->net_dev->dev_addr)) - is_my_oldorig = 1; - } - rcu_read_unlock(); - - if (is_my_addr) { - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Drop packet: received my own broadcast (sender: %pM)\n", - ethhdr->h_source); - return; - } - - if (is_my_orig) { - unsigned long *word; - int offset; - int32_t bit_pos; - int16_t if_num; - uint8_t *weight; - - orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv, - ethhdr->h_source); - if (!orig_neigh_node) - return; - - /* neighbor has to indicate direct link and it has to - * come via the corresponding interface - * save packet seqno for bidirectional check - */ - if (has_directlink_flag && - batadv_compare_eth(if_incoming->net_dev->dev_addr, - batadv_ogm_packet->orig)) { - if_num = if_incoming->if_num; - offset = if_num * BATADV_NUM_WORDS; - - spin_lock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); - word = &(orig_neigh_node->bat_iv.bcast_own[offset]); - bit_pos = if_incoming_seqno - 2; - bit_pos -= ntohl(batadv_ogm_packet->seqno); - batadv_set_bit(word, bit_pos); - weight = &orig_neigh_node->bat_iv.bcast_own_sum[if_num]; - *weight = bitmap_weight(word, - BATADV_TQ_LOCAL_WINDOW_SIZE); - spin_unlock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); - } - - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Drop packet: originator packet from myself (via neighbor)\n"); - batadv_orig_node_free_ref(orig_neigh_node); - return; - } - - if (is_my_oldorig) { - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Drop packet: ignoring all rebroadcast echos (sender: %pM)\n", - ethhdr->h_source); - return; - } - - if (batadv_ogm_packet->flags & BATADV_NOT_BEST_NEXT_HOP) { - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Drop packet: ignoring all packets not forwarded from the best next hop (sender: %pM)\n", - ethhdr->h_source); - return; - } - - orig_node = batadv_iv_ogm_orig_get(bat_priv, batadv_ogm_packet->orig); - if (!orig_node) - return; - - dup_status = 
batadv_iv_ogm_update_seqnos(ethhdr, batadv_ogm_packet, - if_incoming); - if (dup_status == BATADV_PROTECTED) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: packet within seqno protection time (sender: %pM)\n", @@ -1366,27 +1413,28 @@ static void batadv_iv_ogm_process(const struct ethhdr *ethhdr, goto out; } - if (batadv_ogm_packet->tq == 0) { + if (ogm_packet->tq == 0) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet with tq equal 0\n"); goto out; } - router = batadv_orig_node_get_router(orig_node); + router = batadv_orig_router_get(orig_node, if_outgoing); if (router) { - orig_node_tmp = router->orig_node; - router_router = batadv_orig_node_get_router(orig_node_tmp); + router_router = batadv_orig_router_get(router->orig_node, + if_outgoing); + router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); } - if ((router && router->bat_iv.tq_avg != 0) && + if ((router_ifinfo && router_ifinfo->bat_iv.tq_avg != 0) && (batadv_compare_eth(router->addr, ethhdr->h_source))) is_from_best_next_hop = true; - prev_sender = batadv_ogm_packet->prev_sender; + prev_sender = ogm_packet->prev_sender; /* avoid temporary routing loops */ if (router && router_router && (batadv_compare_eth(router->addr, prev_sender)) && - !(batadv_compare_eth(batadv_ogm_packet->orig, prev_sender)) && + !(batadv_compare_eth(ogm_packet->orig, prev_sender)) && (batadv_compare_eth(router->addr, router_router->addr))) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: ignoring all rebroadcast packets that may make me loop (sender: %pM)\n", @@ -1394,7 +1442,8 @@ static void batadv_iv_ogm_process(const struct ethhdr *ethhdr, goto out; } - batadv_tvlv_ogm_receive(bat_priv, batadv_ogm_packet, orig_node); + if (if_outgoing == BATADV_IF_DEFAULT) + batadv_tvlv_ogm_receive(bat_priv, ogm_packet, orig_node); /* if sender is a direct neighbor the sender mac equals * originator mac @@ -1410,9 +1459,10 @@ static void batadv_iv_ogm_process(const struct ethhdr *ethhdr, /* Update nc_nodes of the originator */ batadv_nc_update_nc_node(bat_priv, orig_node, orig_neigh_node, - batadv_ogm_packet, is_single_hop_neigh); + ogm_packet, is_single_hop_neigh); - orig_neigh_router = batadv_orig_node_get_router(orig_neigh_node); + orig_neigh_router = batadv_orig_router_get(orig_neigh_node, + if_outgoing); /* drop packet if sender is not a direct neighbor and if we * don't route towards it @@ -1424,28 +1474,48 @@ static void batadv_iv_ogm_process(const struct ethhdr *ethhdr, } is_bidirect = batadv_iv_ogm_calc_tq(orig_node, orig_neigh_node, - batadv_ogm_packet, if_incoming); - - batadv_bonding_save_primary(orig_node, orig_neigh_node, - batadv_ogm_packet); + ogm_packet, if_incoming, + if_outgoing); /* update ranking if it is not a duplicate or has the same * seqno and similar ttl as the non-duplicate */ - sameseq = orig_node->last_real_seqno == ntohl(batadv_ogm_packet->seqno); - similar_ttl = orig_node->last_ttl - 3 <= batadv_ogm_packet->header.ttl; + orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); + if (!orig_ifinfo) + goto out_neigh; + + sameseq = orig_ifinfo->last_real_seqno == ntohl(ogm_packet->seqno); + similar_ttl = (orig_ifinfo->last_ttl - 3) <= ogm_packet->ttl; + if (is_bidirect && ((dup_status == BATADV_NO_DUP) || - (sameseq && similar_ttl))) - batadv_iv_ogm_orig_update(bat_priv, orig_node, ethhdr, - batadv_ogm_packet, if_incoming, - tt_buff, dup_status); + (sameseq && similar_ttl))) { + batadv_iv_ogm_orig_update(bat_priv, orig_node, + orig_ifinfo, ethhdr, + ogm_packet, if_incoming, + if_outgoing, 
dup_status); + } + batadv_orig_ifinfo_free_ref(orig_ifinfo); + + /* only forward for specific interface, not for the default one. */ + if (if_outgoing == BATADV_IF_DEFAULT) + goto out_neigh; /* is single hop (direct) neighbor */ if (is_single_hop_neigh) { + /* OGMs from secondary interfaces should only be scheduled once + * per interface where it has been received, not multiple times + */ + if ((ogm_packet->ttl <= 2) && + (if_incoming != if_outgoing)) { + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Drop packet: OGM from secondary interface and wrong outgoing interface\n"); + goto out_neigh; + } /* mark direct link on incoming interface */ - batadv_iv_ogm_forward(orig_node, ethhdr, batadv_ogm_packet, + batadv_iv_ogm_forward(orig_node, ethhdr, ogm_packet, is_single_hop_neigh, - is_from_best_next_hop, if_incoming); + is_from_best_next_hop, if_incoming, + if_outgoing); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Forwarding packet: rebroadcast neighbor packet with direct link flag\n"); @@ -1467,14 +1537,16 @@ static void batadv_iv_ogm_process(const struct ethhdr *ethhdr, batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Forwarding packet: rebroadcast originator packet\n"); - batadv_iv_ogm_forward(orig_node, ethhdr, batadv_ogm_packet, + batadv_iv_ogm_forward(orig_node, ethhdr, ogm_packet, is_single_hop_neigh, is_from_best_next_hop, - if_incoming); + if_incoming, if_outgoing); out_neigh: if ((orig_neigh_node) && (!is_single_hop_neigh)) batadv_orig_node_free_ref(orig_neigh_node); out: + if (router_ifinfo) + batadv_neigh_ifinfo_free_ref(router_ifinfo); if (router) batadv_neigh_node_free_ref(router); if (router_router) @@ -1482,6 +1554,165 @@ out: if (orig_neigh_router) batadv_neigh_node_free_ref(orig_neigh_router); + kfree_skb(skb_priv); +} + +/** + * batadv_iv_ogm_process - process an incoming batman iv OGM + * @skb: the skb containing the OGM + * @ogm_offset: offset to the OGM which should be processed (for aggregates) + * @if_incoming: the interface where this packet was received + */ +static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, + struct batadv_hard_iface *if_incoming) +{ + struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct batadv_orig_node *orig_neigh_node, *orig_node; + struct batadv_hard_iface *hard_iface; + struct batadv_ogm_packet *ogm_packet; + uint32_t if_incoming_seqno; + bool has_directlink_flag; + struct ethhdr *ethhdr; + bool is_my_oldorig = false; + bool is_my_addr = false; + bool is_my_orig = false; + + ogm_packet = (struct batadv_ogm_packet *)(skb->data + ogm_offset); + ethhdr = eth_hdr(skb); + + /* Silently drop when the batman packet is actually not a + * correct packet. + * + * This might happen if a packet is padded (e.g. Ethernet has a + * minimum frame length of 64 byte) and the aggregation interprets + * it as an additional length. + * + * TODO: A more sane solution would be to have a bit in the + * batadv_ogm_packet to detect whether the packet is the last + * packet in an aggregation. 
Here we expect that the padding + * is always zero (or not 0x01) + */ + if (ogm_packet->packet_type != BATADV_IV_OGM) + return; + + /* could be changed by schedule_own_packet() */ + if_incoming_seqno = atomic_read(&if_incoming->bat_iv.ogm_seqno); + + if (ogm_packet->flags & BATADV_DIRECTLINK) + has_directlink_flag = true; + else + has_directlink_flag = false; + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Received BATMAN packet via NB: %pM, IF: %s [%pM] (from OG: %pM, via prev OG: %pM, seqno %u, tq %d, TTL %d, V %d, IDF %d)\n", + ethhdr->h_source, if_incoming->net_dev->name, + if_incoming->net_dev->dev_addr, ogm_packet->orig, + ogm_packet->prev_sender, ntohl(ogm_packet->seqno), + ogm_packet->tq, ogm_packet->ttl, + ogm_packet->version, has_directlink_flag); + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->if_status != BATADV_IF_ACTIVE) + continue; + + if (hard_iface->soft_iface != if_incoming->soft_iface) + continue; + + if (batadv_compare_eth(ethhdr->h_source, + hard_iface->net_dev->dev_addr)) + is_my_addr = true; + + if (batadv_compare_eth(ogm_packet->orig, + hard_iface->net_dev->dev_addr)) + is_my_orig = true; + + if (batadv_compare_eth(ogm_packet->prev_sender, + hard_iface->net_dev->dev_addr)) + is_my_oldorig = true; + } + rcu_read_unlock(); + + if (is_my_addr) { + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Drop packet: received my own broadcast (sender: %pM)\n", + ethhdr->h_source); + return; + } + + if (is_my_orig) { + unsigned long *word; + int offset; + int32_t bit_pos; + int16_t if_num; + uint8_t *weight; + + orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv, + ethhdr->h_source); + if (!orig_neigh_node) + return; + + /* neighbor has to indicate direct link and it has to + * come via the corresponding interface + * save packet seqno for bidirectional check + */ + if (has_directlink_flag && + batadv_compare_eth(if_incoming->net_dev->dev_addr, + ogm_packet->orig)) { + if_num = if_incoming->if_num; + offset = if_num * BATADV_NUM_WORDS; + + spin_lock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); + word = &(orig_neigh_node->bat_iv.bcast_own[offset]); + bit_pos = if_incoming_seqno - 2; + bit_pos -= ntohl(ogm_packet->seqno); + batadv_set_bit(word, bit_pos); + weight = &orig_neigh_node->bat_iv.bcast_own_sum[if_num]; + *weight = bitmap_weight(word, + BATADV_TQ_LOCAL_WINDOW_SIZE); + spin_unlock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); + } + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Drop packet: originator packet from myself (via neighbor)\n"); + batadv_orig_node_free_ref(orig_neigh_node); + return; + } + + if (is_my_oldorig) { + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Drop packet: ignoring all rebroadcast echos (sender: %pM)\n", + ethhdr->h_source); + return; + } + + if (ogm_packet->flags & BATADV_NOT_BEST_NEXT_HOP) { + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Drop packet: ignoring all packets not forwarded from the best next hop (sender: %pM)\n", + ethhdr->h_source); + return; + } + + orig_node = batadv_iv_ogm_orig_get(bat_priv, ogm_packet->orig); + if (!orig_node) + return; + + batadv_iv_ogm_process_per_outif(skb, ogm_offset, orig_node, + if_incoming, BATADV_IF_DEFAULT); + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->if_status != BATADV_IF_ACTIVE) + continue; + + if (hard_iface->soft_iface != bat_priv->soft_iface) + continue; + + batadv_iv_ogm_process_per_outif(skb, ogm_offset, orig_node, + if_incoming, hard_iface); + } + rcu_read_unlock(); + 
batadv_orig_node_free_ref(orig_node); } @@ -1489,11 +1720,9 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb, struct batadv_hard_iface *if_incoming) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); - struct batadv_ogm_packet *batadv_ogm_packet; - struct ethhdr *ethhdr; - int buff_pos = 0, packet_len; - unsigned char *tvlv_buff, *packet_buff; + struct batadv_ogm_packet *ogm_packet; uint8_t *packet_pos; + int ogm_offset; bool ret; ret = batadv_check_management_packet(skb, if_incoming, BATADV_OGM_HLEN); @@ -1510,24 +1739,19 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb, batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES, skb->len + ETH_HLEN); - packet_len = skb_headlen(skb); - ethhdr = eth_hdr(skb); - packet_buff = skb->data; - batadv_ogm_packet = (struct batadv_ogm_packet *)packet_buff; + ogm_offset = 0; + ogm_packet = (struct batadv_ogm_packet *)skb->data; /* unpack the aggregated packets and process them one by one */ - while (batadv_iv_ogm_aggr_packet(buff_pos, packet_len, - batadv_ogm_packet->tvlv_len)) { - tvlv_buff = packet_buff + buff_pos + BATADV_OGM_HLEN; + while (batadv_iv_ogm_aggr_packet(ogm_offset, skb_headlen(skb), + ogm_packet->tvlv_len)) { + batadv_iv_ogm_process(skb, ogm_offset, if_incoming); - batadv_iv_ogm_process(ethhdr, batadv_ogm_packet, - tvlv_buff, if_incoming); + ogm_offset += BATADV_OGM_HLEN; + ogm_offset += ntohs(ogm_packet->tvlv_len); - buff_pos += BATADV_OGM_HLEN; - buff_pos += ntohs(batadv_ogm_packet->tvlv_len); - - packet_pos = packet_buff + buff_pos; - batadv_ogm_packet = (struct batadv_ogm_packet *)packet_pos; + packet_pos = skb->data + ogm_offset; + ogm_packet = (struct batadv_ogm_packet *)packet_pos; } kfree_skb(skb); @@ -1535,17 +1759,49 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb, } /** + * batadv_iv_ogm_orig_print_neigh - print neighbors for the originator table + * @orig_node: the orig_node for which the neighbors are printed + * @if_outgoing: outgoing interface for these entries + * @seq: debugfs table seq_file struct + * + * Must be called while holding an rcu lock. 
+ */ +static void +batadv_iv_ogm_orig_print_neigh(struct batadv_orig_node *orig_node, + struct batadv_hard_iface *if_outgoing, + struct seq_file *seq) +{ + struct batadv_neigh_node *neigh_node; + struct batadv_neigh_ifinfo *n_ifinfo; + + hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { + n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); + if (!n_ifinfo) + continue; + + seq_printf(seq, " %pM (%3i)", + neigh_node->addr, + n_ifinfo->bat_iv.tq_avg); + + batadv_neigh_ifinfo_free_ref(n_ifinfo); + } +} + +/** * batadv_iv_ogm_orig_print - print the originator table * @bat_priv: the bat priv with all the soft interface information * @seq: debugfs table seq_file struct + * @if_outgoing: the outgoing interface for which this should be printed */ static void batadv_iv_ogm_orig_print(struct batadv_priv *bat_priv, - struct seq_file *seq) + struct seq_file *seq, + struct batadv_hard_iface *if_outgoing) { - struct batadv_neigh_node *neigh_node, *neigh_node_tmp; + struct batadv_neigh_node *neigh_node; struct batadv_hashtable *hash = bat_priv->orig_hash; int last_seen_msecs, last_seen_secs; struct batadv_orig_node *orig_node; + struct batadv_neigh_ifinfo *n_ifinfo; unsigned long last_seen_jiffies; struct hlist_head *head; int batman_count = 0; @@ -1560,11 +1816,17 @@ static void batadv_iv_ogm_orig_print(struct batadv_priv *bat_priv, rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { - neigh_node = batadv_orig_node_get_router(orig_node); + neigh_node = batadv_orig_router_get(orig_node, + if_outgoing); if (!neigh_node) continue; - if (neigh_node->bat_iv.tq_avg == 0) + n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, + if_outgoing); + if (!n_ifinfo) + goto next; + + if (n_ifinfo->bat_iv.tq_avg == 0) goto next; last_seen_jiffies = jiffies - orig_node->last_seen; @@ -1574,22 +1836,19 @@ static void batadv_iv_ogm_orig_print(struct batadv_priv *bat_priv, seq_printf(seq, "%pM %4i.%03is (%3i) %pM [%10s]:", orig_node->orig, last_seen_secs, - last_seen_msecs, neigh_node->bat_iv.tq_avg, + last_seen_msecs, n_ifinfo->bat_iv.tq_avg, neigh_node->addr, neigh_node->if_incoming->net_dev->name); - hlist_for_each_entry_rcu(neigh_node_tmp, - &orig_node->neigh_list, list) { - seq_printf(seq, " %pM (%3i)", - neigh_node_tmp->addr, - neigh_node_tmp->bat_iv.tq_avg); - } - + batadv_iv_ogm_orig_print_neigh(orig_node, if_outgoing, + seq); seq_puts(seq, "\n"); batman_count++; next: batadv_neigh_node_free_ref(neigh_node); + if (n_ifinfo) + batadv_neigh_ifinfo_free_ref(n_ifinfo); } rcu_read_unlock(); } @@ -1601,37 +1860,84 @@ next: /** * batadv_iv_ogm_neigh_cmp - compare the metrics of two neighbors * @neigh1: the first neighbor object of the comparison + * @if_outgoing1: outgoing interface for the first neighbor * @neigh2: the second neighbor object of the comparison + * @if_outgoing2: outgoing interface for the second neighbor * * Returns a value less, equal to or greater than 0 if the metric via neigh1 is * lower, the same as or higher than the metric via neigh2 */ static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1, - struct batadv_neigh_node *neigh2) + struct batadv_hard_iface *if_outgoing1, + struct batadv_neigh_node *neigh2, + struct batadv_hard_iface *if_outgoing2) { + struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo; uint8_t tq1, tq2; + int diff; + + neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); + neigh2_ifinfo = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); - tq1 = neigh1->bat_iv.tq_avg; - tq2 = neigh2->bat_iv.tq_avg; + if 
(!neigh1_ifinfo || !neigh2_ifinfo) { + diff = 0; + goto out; + } + + tq1 = neigh1_ifinfo->bat_iv.tq_avg; + tq2 = neigh2_ifinfo->bat_iv.tq_avg; + diff = tq1 - tq2; - return tq1 - tq2; +out: + if (neigh1_ifinfo) + batadv_neigh_ifinfo_free_ref(neigh1_ifinfo); + if (neigh2_ifinfo) + batadv_neigh_ifinfo_free_ref(neigh2_ifinfo); + + return diff; } /** * batadv_iv_ogm_neigh_is_eob - check if neigh1 is equally good or better than * neigh2 from the metric prospective * @neigh1: the first neighbor object of the comparison + * @if_outgoing1: outgoing interface for the first neighbor * @neigh2: the second neighbor object of the comparison - * - * Returns true if the metric via neigh1 is equally good or better than the - * metric via neigh2, false otherwise. + * @if_outgoing2: outgoing interface for the second neighbor + * + * Returns true if the metric via neigh1 is equally good or better than + * the metric via neigh2, false otherwise. */ -static bool batadv_iv_ogm_neigh_is_eob(struct batadv_neigh_node *neigh1, - struct batadv_neigh_node *neigh2) +static bool +batadv_iv_ogm_neigh_is_eob(struct batadv_neigh_node *neigh1, + struct batadv_hard_iface *if_outgoing1, + struct batadv_neigh_node *neigh2, + struct batadv_hard_iface *if_outgoing2) { - int diff = batadv_iv_ogm_neigh_cmp(neigh1, neigh2); + struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo; + uint8_t tq1, tq2; + bool ret; + + neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); + neigh2_ifinfo = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); + + /* we can't say that the metric is better */ + if (!neigh1_ifinfo || !neigh2_ifinfo) { + ret = false; + goto out; + } + + tq1 = neigh1_ifinfo->bat_iv.tq_avg; + tq2 = neigh2_ifinfo->bat_iv.tq_avg; + ret = (tq1 - tq2) > -BATADV_TQ_SIMILARITY_THRESHOLD; - return diff > -BATADV_TQ_SIMILARITY_THRESHOLD; +out: + if (neigh1_ifinfo) + batadv_neigh_ifinfo_free_ref(neigh1_ifinfo); + if (neigh2_ifinfo) + batadv_neigh_ifinfo_free_ref(neigh2_ifinfo); + + return ret; } static struct batadv_algo_ops batadv_batman_iv __read_mostly = { diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index 973982414d5..9586750022f 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "main.h" diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index a81b9322e38..cc2407351d3 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ #ifndef _NET_BATMAN_ADV_BITARRAY_H_ diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 28eb5e6d0a0..a957c814072 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors: * * Simon Wunderlich * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "main.h" @@ -132,7 +130,9 @@ static void batadv_claim_free_ref(struct batadv_bla_claim *claim) call_rcu(&claim->rcu, batadv_claim_free_rcu); } -/* @bat_priv: the bat priv with all the soft interface information +/** + * batadv_claim_hash_find + * @bat_priv: the bat priv with all the soft interface information * @data: search data (may be local/static data) * * looks for a claim in the hash, and returns it if found @@ -191,7 +191,7 @@ batadv_backbone_hash_find(struct batadv_priv *bat_priv, if (!hash) return NULL; - memcpy(search_entry.orig, addr, ETH_ALEN); + ether_addr_copy(search_entry.orig, addr); search_entry.vid = vid; index = batadv_choose_backbone_gw(&search_entry, hash->size); @@ -305,7 +305,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, uint8_t *mac, /* normal claim frame * set Ethernet SRC to the clients mac */ - memcpy(ethhdr->h_source, mac, ETH_ALEN); + ether_addr_copy(ethhdr->h_source, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_send_claim(): CLAIM %pM on vid %d\n", mac, BATADV_PRINT_VID(vid)); @@ -314,7 +314,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, uint8_t *mac, /* unclaim frame * set HW SRC to the clients mac */ - memcpy(hw_src, mac, ETH_ALEN); + ether_addr_copy(hw_src, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_send_claim(): UNCLAIM %pM on vid %d\n", mac, BATADV_PRINT_VID(vid)); @@ -323,7 +323,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, uint8_t *mac, /* announcement frame * set HW SRC to the special mac containg the crc */ - memcpy(hw_src, mac, ETH_ALEN); + ether_addr_copy(hw_src, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_send_claim(): ANNOUNCE of %pM on vid %d\n", ethhdr->h_source, BATADV_PRINT_VID(vid)); @@ -333,8 +333,8 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, uint8_t *mac, * set HW SRC and header destination to the receiving backbone * gws mac */ - memcpy(hw_src, mac, ETH_ALEN); - memcpy(ethhdr->h_dest, mac, ETH_ALEN); + ether_addr_copy(hw_src, mac); + ether_addr_copy(ethhdr->h_dest, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_send_claim(): REQUEST of %pM to %pM on vid %d\n", ethhdr->h_source, ethhdr->h_dest, @@ -395,7 +395,7 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, uint8_t *orig, entry->bat_priv = bat_priv; atomic_set(&entry->request_sent, 0); atomic_set(&entry->wait_periods, 0); - memcpy(entry->orig, orig, ETH_ALEN); + ether_addr_copy(entry->orig, orig); /* one for the hash, one for returning */ atomic_set(&entry->refcount, 2); @@ -451,7 +451,9 @@ batadv_bla_update_own_backbone_gw(struct batadv_priv *bat_priv, batadv_backbone_gw_free_ref(backbone_gw); } -/* @bat_priv: the bat priv with all the soft interface information +/** + * batadv_bla_answer_request - answer a bla 
request by sending own claims + * @bat_priv: the bat priv with all the soft interface information * @vid: the vid where the request came on * * Repeat all of our own claims, and finally send an ANNOUNCE frame @@ -497,7 +499,9 @@ static void batadv_bla_answer_request(struct batadv_priv *bat_priv, batadv_backbone_gw_free_ref(backbone_gw); } -/* @backbone_gw: the backbone gateway from whom we are out of sync +/** + * batadv_bla_send_request - send a request to repeat claims + * @backbone_gw: the backbone gateway from whom we are out of sync * * When the crc is wrong, ask the backbone gateway for a full table update. * After the request, it will repeat all of his own claims and finally @@ -522,7 +526,9 @@ static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw) } } -/* @bat_priv: the bat priv with all the soft interface information +/** + * batadv_bla_send_announce + * @bat_priv: the bat priv with all the soft interface information * @backbone_gw: our backbone gateway which should be announced * * This function sends an announcement. It is called from multiple @@ -557,7 +563,7 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, struct batadv_bla_claim search_claim; int hash_added; - memcpy(search_claim.addr, mac, ETH_ALEN); + ether_addr_copy(search_claim.addr, mac); search_claim.vid = vid; claim = batadv_claim_hash_find(bat_priv, &search_claim); @@ -567,7 +573,7 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, if (!claim) return; - memcpy(claim->addr, mac, ETH_ALEN); + ether_addr_copy(claim->addr, mac); claim->vid = vid; claim->lasttime = jiffies; claim->backbone_gw = backbone_gw; @@ -618,7 +624,7 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, { struct batadv_bla_claim search_claim, *claim; - memcpy(search_claim.addr, mac, ETH_ALEN); + ether_addr_copy(search_claim.addr, mac); search_claim.vid = vid; claim = batadv_claim_hash_find(bat_priv, &search_claim); if (!claim) @@ -794,11 +800,6 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv, bla_dst = (struct batadv_bla_claim_dst *)hw_dst; bla_dst_own = &bat_priv->bla.claim_dest; - /* check if it is a claim packet in general */ - if (memcmp(bla_dst->magic, bla_dst_own->magic, - sizeof(bla_dst->magic)) != 0) - return 0; - /* if announcement packet, use the source, * otherwise assume it is in the hw_src */ @@ -846,7 +847,9 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv, } -/* @bat_priv: the bat priv with all the soft interface information +/** + * batadv_bla_process_claim + * @bat_priv: the bat priv with all the soft interface information * @skb: the frame to be checked * * Check if this is a claim frame, and process it accordingly. @@ -858,12 +861,13 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, struct sk_buff *skb) { - struct batadv_bla_claim_dst *bla_dst; + struct batadv_bla_claim_dst *bla_dst, *bla_dst_own; uint8_t *hw_src, *hw_dst; - struct vlan_ethhdr *vhdr; + struct vlan_hdr *vhdr, vhdr_buf; struct ethhdr *ethhdr; struct arphdr *arphdr; unsigned short vid; + int vlan_depth = 0; __be16 proto; int headlen; int ret; @@ -874,9 +878,24 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, proto = ethhdr->h_proto; headlen = ETH_HLEN; if (vid & BATADV_VLAN_HAS_TAG) { - vhdr = (struct vlan_ethhdr *)ethhdr; - proto = vhdr->h_vlan_encapsulated_proto; - headlen += VLAN_HLEN; + /* Traverse the VLAN/Ethertypes. 
+ * + * At this point it is known that the first protocol is a VLAN + * header, so start checking at the encapsulated protocol. + * + * The depth of the VLAN headers is recorded to drop BLA claim + * frames encapsulated into multiple VLAN headers (QinQ). + */ + do { + vhdr = skb_header_pointer(skb, headlen, VLAN_HLEN, + &vhdr_buf); + if (!vhdr) + return 0; + + proto = vhdr->h_vlan_encapsulated_proto; + headlen += VLAN_HLEN; + vlan_depth++; + } while (proto == htons(ETH_P_8021Q)); } if (proto != htons(ETH_P_ARP)) @@ -906,6 +925,19 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, hw_src = (uint8_t *)arphdr + sizeof(struct arphdr); hw_dst = hw_src + ETH_ALEN + 4; bla_dst = (struct batadv_bla_claim_dst *)hw_dst; + bla_dst_own = &bat_priv->bla.claim_dest; + + /* check if it is a claim frame in general */ + if (memcmp(bla_dst->magic, bla_dst_own->magic, + sizeof(bla_dst->magic)) != 0) + return 0; + + /* check if there is a claim frame encapsulated deeper in (QinQ) and + * drop that, as this is not supported by BLA but should also not be + * sent via the mesh. + */ + if (vlan_depth > 1) + return 1; /* check if it is a claim frame. */ ret = batadv_check_claim_group(bat_priv, primary_if, hw_src, hw_dst, @@ -1095,8 +1127,8 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, oldif->net_dev->dev_addr)) continue; - memcpy(backbone_gw->orig, - primary_if->net_dev->dev_addr, ETH_ALEN); + ether_addr_copy(backbone_gw->orig, + primary_if->net_dev->dev_addr); /* send an announce frame so others will ask for our * claims and update their tables. */ @@ -1302,7 +1334,7 @@ int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, entry = &bat_priv->bla.bcast_duplist[curr]; entry->crc = crc; entry->entrytime = jiffies; - memcpy(entry->orig, bcast_packet->orig, ETH_ALEN); + ether_addr_copy(entry->orig, bcast_packet->orig); bat_priv->bla.bcast_duplist_curr = curr; out: @@ -1313,7 +1345,9 @@ out: -/* @bat_priv: the bat priv with all the soft interface information +/** + * batadv_bla_is_backbone_gw_orig + * @bat_priv: the bat priv with all the soft interface information * @orig: originator mac address * @vid: VLAN identifier * @@ -1448,7 +1482,7 @@ int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, if (is_multicast_ether_addr(ethhdr->h_dest) && is_bcast) goto handled; - memcpy(search_claim.addr, ethhdr->h_source, ETH_ALEN); + ether_addr_copy(search_claim.addr, ethhdr->h_source); search_claim.vid = vid; claim = batadv_claim_hash_find(bat_priv, &search_claim); @@ -1537,9 +1571,6 @@ int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, if (!atomic_read(&bat_priv->bridge_loop_avoidance)) goto allow; - /* in VLAN case, the mac header might not be set. */ - skb_reset_mac_header(skb); - if (batadv_bla_process_claim(bat_priv, primary_if, skb)) goto handled; @@ -1550,7 +1581,7 @@ int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, if (is_multicast_ether_addr(ethhdr->h_dest)) goto handled; - memcpy(search_claim.addr, ethhdr->h_source, ETH_ALEN); + ether_addr_copy(search_claim.addr, ethhdr->h_source); search_claim.vid = vid; claim = batadv_claim_hash_find(bat_priv, &search_claim); diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index da173e760e7..43c985d92c3 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2014 B.A.T.M.A.N. 
 contributors: * * Simon Wunderlich * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_BLA_H_ diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 049a7a2ac5b..a12e25efaf6 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "main.h" @@ -247,18 +245,35 @@ static int batadv_algorithms_open(struct inode *inode, struct file *file) static int batadv_originators_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + return single_open(file, batadv_orig_seq_print_text, net_dev); } +/** + * batadv_originators_hardif_open - handles debugfs output for the + * originator table of a hard interface + * @inode: inode pointer to debugfs file + * @file: pointer to the seq_file + */ +static int batadv_originators_hardif_open(struct inode *inode, + struct file *file) +{ + struct net_device *net_dev = (struct net_device *)inode->i_private; + + return single_open(file, batadv_orig_hardif_seq_print_text, net_dev); +} + static int batadv_gateways_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + return single_open(file, batadv_gw_client_seq_print_text, net_dev); } static int batadv_transtable_global_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + return single_open(file, batadv_tt_global_seq_print_text, net_dev); } @@ -266,6 +281,7 @@ static int batadv_transtable_global_open(struct inode *inode, struct file *file) static int batadv_bla_claim_table_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + return single_open(file, batadv_bla_claim_table_seq_print_text, net_dev); } @@ -274,6 +290,7 @@ static int batadv_bla_backbone_table_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + return single_open(file, batadv_bla_backbone_table_seq_print_text, net_dev); } @@ -289,6 +306,7 @@ static int batadv_bla_backbone_table_open(struct inode *inode, static int batadv_dat_cache_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + return single_open(file, batadv_dat_cache_seq_print_text, net_dev); } #endif @@ -296,6 +314,7 @@ static int batadv_dat_cache_open(struct inode *inode, struct file *file) static int batadv_transtable_local_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + return single_open(file, batadv_tt_local_seq_print_text, net_dev); } @@ -308,6 +327,7 @@ struct batadv_debuginfo { static int 
batadv_nc_nodes_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + return single_open(file, batadv_nc_nodes_seq_print_text, net_dev); } #endif @@ -322,7 +342,7 @@ struct batadv_debuginfo batadv_debuginfo_##_name = { \ .llseek = seq_lseek, \ .release = single_release, \ } \ -}; +} /* the following attributes are general and therefore they will be directly * placed in the BATADV_DEBUGFS_SUBDIR subdirectory of debugfs @@ -371,6 +391,28 @@ static struct batadv_debuginfo *batadv_mesh_debuginfos[] = { NULL, }; +#define BATADV_HARDIF_DEBUGINFO(_name, _mode, _open) \ +struct batadv_debuginfo batadv_hardif_debuginfo_##_name = { \ + .attr = { \ + .name = __stringify(_name), \ + .mode = _mode, \ + }, \ + .fops = { \ + .owner = THIS_MODULE, \ + .open = _open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ + }, \ +} +static BATADV_HARDIF_DEBUGINFO(originators, S_IRUGO, + batadv_originators_hardif_open); + +static struct batadv_debuginfo *batadv_hardif_debuginfos[] = { + &batadv_hardif_debuginfo_originators, + NULL, +}; + void batadv_debugfs_init(void) { struct batadv_debuginfo **bat_debug; @@ -398,6 +440,7 @@ void batadv_debugfs_init(void) return; err: debugfs_remove_recursive(batadv_debugfs); + batadv_debugfs = NULL; } void batadv_debugfs_destroy(void) @@ -406,6 +449,59 @@ void batadv_debugfs_destroy(void) batadv_debugfs = NULL; } +/** + * batadv_debugfs_add_hardif - creates the base directory for a hard interface + * in debugfs. + * @hard_iface: hard interface which should be added. + */ +int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) +{ + struct batadv_debuginfo **bat_debug; + struct dentry *file; + + if (!batadv_debugfs) + goto out; + + hard_iface->debug_dir = debugfs_create_dir(hard_iface->net_dev->name, + batadv_debugfs); + if (!hard_iface->debug_dir) + goto out; + + for (bat_debug = batadv_hardif_debuginfos; *bat_debug; ++bat_debug) { + file = debugfs_create_file(((*bat_debug)->attr).name, + S_IFREG | ((*bat_debug)->attr).mode, + hard_iface->debug_dir, + hard_iface->net_dev, + &(*bat_debug)->fops); + if (!file) + goto rem_attr; + } + + return 0; +rem_attr: + debugfs_remove_recursive(hard_iface->debug_dir); + hard_iface->debug_dir = NULL; +out: +#ifdef CONFIG_DEBUG_FS + return -ENOMEM; +#else + return 0; +#endif /* CONFIG_DEBUG_FS */ +} + +/** + * batadv_debugfs_del_hardif - delete the base directory for a hard interface + * in debugfs. + * @hard_iface: hard interface which is deleted. + */ +void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface) +{ + if (batadv_debugfs) { + debugfs_remove_recursive(hard_iface->debug_dir); + hard_iface->debug_dir = NULL; + } +} + int batadv_debugfs_add_meshif(struct net_device *dev) { struct batadv_priv *bat_priv = netdev_priv(dev); diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index f8c3849edff..37c4d6ddd04 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ #ifndef _NET_BATMAN_ADV_DEBUGFS_H_ @@ -26,5 +24,7 @@ void batadv_debugfs_init(void); void batadv_debugfs_destroy(void); int batadv_debugfs_add_meshif(struct net_device *dev); void batadv_debugfs_del_meshif(struct net_device *dev); +int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface); +void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface); #endif /* _NET_BATMAN_ADV_DEBUGFS_H_ */ diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 6c8c3934bd7..f2c066b2171 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors: * * Antonio Quartulli * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include <linux/if_ether.h> @@ -141,7 +139,7 @@ static int batadv_compare_dat(const struct hlist_node *node, const void *data2) const void *data1 = container_of(node, struct batadv_dat_entry, hash_entry); - return (memcmp(data1, data2, sizeof(__be32)) == 0 ? 1 : 0); + return memcmp(data1, data2, sizeof(__be32)) == 0 ? 1 : 0; } /** @@ -279,7 +277,7 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip, /* if this entry is already known, just update it */ if (dat_entry) { if (!batadv_compare_eth(dat_entry->mac_addr, mac_addr)) - memcpy(dat_entry->mac_addr, mac_addr, ETH_ALEN); + ether_addr_copy(dat_entry->mac_addr, mac_addr); dat_entry->last_update = jiffies; batadv_dbg(BATADV_DBG_DAT, bat_priv, "Entry updated: %pI4 %pM (vid: %d)\n", @@ -294,7 +292,7 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip, dat_entry->ip = ip; dat_entry->vid = vid; - memcpy(dat_entry->mac_addr, mac_addr, ETH_ALEN); + ether_addr_copy(dat_entry->mac_addr, mac_addr); dat_entry->last_update = jiffies; atomic_set(&dat_entry->refcount, 2); @@ -349,7 +347,7 @@ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb, unicast_4addr_packet = (struct batadv_unicast_4addr_packet *)skb->data; - switch (unicast_4addr_packet->u.header.packet_type) { + switch (unicast_4addr_packet->u.packet_type) { case BATADV_UNICAST: batadv_dbg(BATADV_DBG_DAT, bat_priv, "* encapsulated within a UNICAST packet\n"); @@ -374,7 +372,7 @@ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb, break; default: batadv_dbg(BATADV_DBG_DAT, bat_priv, "* type: Unknown (%u)!\n", - unicast_4addr_packet->u.header.packet_type); + unicast_4addr_packet->u.packet_type); } break; case BATADV_BCAST: @@ -387,7 +385,7 @@ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb, default: batadv_dbg(BATADV_DBG_DAT, bat_priv, "* encapsulated within an unknown packet type (0x%x)\n", - unicast_4addr_packet->u.header.packet_type); + unicast_4addr_packet->u.packet_type); } } @@ -591,11 +589,12 @@ static bool batadv_dat_send_data(struct batadv_priv *bat_priv, if (cand[i].type == BATADV_DAT_CANDIDATE_NOT_FOUND) continue; - neigh_node = batadv_orig_node_get_router(cand[i].orig_node); + neigh_node = batadv_orig_router_get(cand[i].orig_node, + BATADV_IF_DEFAULT); if (!neigh_node) goto free_orig; - tmp_skb = pskb_copy(skb, GFP_ATOMIC); + tmp_skb = 
pskb_copy_for_clone(skb, GFP_ATOMIC); if (!batadv_send_skb_prepare_unicast_4addr(bat_priv, tmp_skb, cand[i].orig_node, packet_subtype)) { @@ -663,6 +662,7 @@ static void batadv_dat_tvlv_container_update(struct batadv_priv *bat_priv) void batadv_dat_status_update(struct net_device *net_dev) { struct batadv_priv *bat_priv = netdev_priv(net_dev); + batadv_dat_tvlv_container_update(bat_priv); } @@ -941,8 +941,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, * additional DAT answer may trigger kernel warnings about * a packet coming from the wrong port. */ - if (batadv_is_my_client(bat_priv, dat_entry->mac_addr, - BATADV_NO_FLAGS)) { + if (batadv_is_my_client(bat_priv, dat_entry->mac_addr, vid)) { ret = true; goto out; } @@ -1028,6 +1027,11 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, if (!skb_new) goto out; + /* the rest of the TX path assumes that the mac_header offset pointing + * to the inner Ethernet header has been set, therefore reset it now. + */ + skb_reset_mac_header(skb_new); + if (vid & BATADV_VLAN_HAS_TAG) skb_new = vlan_insert_tag(skb_new, htons(ETH_P_8021Q), vid & VLAN_VID_MASK); @@ -1039,9 +1043,9 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, if (hdr_size == sizeof(struct batadv_unicast_4addr_packet)) err = batadv_send_skb_via_tt_4addr(bat_priv, skb_new, BATADV_P_DAT_CACHE_REPLY, - vid); + NULL, vid); else - err = batadv_send_skb_via_tt(bat_priv, skb_new, vid); + err = batadv_send_skb_via_tt(bat_priv, skb_new, NULL, vid); if (err != NET_XMIT_DROP) { batadv_inc_counter(bat_priv, BATADV_CNT_DAT_CACHED_REPLY_TX); diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 60d853beb8d..d76e1d06c5b 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors: * * Antonio Quartulli * @@ -12,13 +12,11 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ -#ifndef _NET_BATMAN_ADV_ARP_H_ -#define _NET_BATMAN_ADV_ARP_H_ +#ifndef _NET_BATMAN_ADV_DISTRIBUTED_ARP_TABLE_H_ +#define _NET_BATMAN_ADV_DISTRIBUTED_ARP_TABLE_H_ #ifdef CONFIG_BATMAN_ADV_DAT @@ -27,6 +25,9 @@ #include <linux/if_arp.h> +/** + * BATADV_DAT_ADDR_MAX - maximum address value in the DHT space + */ #define BATADV_DAT_ADDR_MAX ((batadv_dat_addr_t)~(batadv_dat_addr_t)0) void batadv_dat_status_update(struct net_device *net_dev); @@ -169,4 +170,4 @@ static inline void batadv_dat_inc_counter(struct batadv_priv *bat_priv, #endif /* CONFIG_BATMAN_ADV_DAT */ -#endif /* _NET_BATMAN_ADV_ARP_H_ */ +#endif /* _NET_BATMAN_ADV_DISTRIBUTED_ARP_TABLE_H_ */ diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 271d321b3a0..f14e54a0569 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2014 B.A.T.M.A.N. contributors: * * Martin Hundebøll <martin@hundeboll.net> * @@ -12,9 +12,7 @@ * General Public License for more details. 
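A small aside on the BATADV_DAT_ADDR_MAX kernel-doc added above: the expression is simply the all-ones value of batadv_dat_addr_t, i.e. the highest address in the DAT DHT space. Assuming the usual 16-bit definition of that type (an assumption, the typedef is not part of this hunk):

/* illustration only: with a 16-bit batadv_dat_addr_t ...            */
typedef uint16_t batadv_dat_addr_t;
#define BATADV_DAT_ADDR_MAX ((batadv_dat_addr_t)~(batadv_dat_addr_t)0)
/* ... BATADV_DAT_ADDR_MAX evaluates to 0xffff (65535)               */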
* * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "main.h" @@ -355,7 +353,7 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb, batadv_add_counter(bat_priv, BATADV_CNT_FRAG_FWD_BYTES, skb->len + ETH_HLEN); - packet->header.ttl--; + packet->ttl--; batadv_send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr); ret = true; @@ -420,12 +418,13 @@ bool batadv_frag_send_packet(struct sk_buff *skb, struct batadv_neigh_node *neigh_node) { struct batadv_priv *bat_priv; - struct batadv_hard_iface *primary_if; + struct batadv_hard_iface *primary_if = NULL; struct batadv_frag_packet frag_header; struct sk_buff *skb_fragment; unsigned mtu = neigh_node->if_incoming->net_dev->mtu; unsigned header_size = sizeof(frag_header); unsigned max_fragment_size, max_packet_size; + bool ret = false; /* To avoid merge and refragmentation at next-hops we never send * fragments larger than BATADV_FRAG_MAX_FRAG_SIZE @@ -444,15 +443,15 @@ bool batadv_frag_send_packet(struct sk_buff *skb, goto out_err; /* Create one header to be copied to all fragments */ - frag_header.header.packet_type = BATADV_UNICAST_FRAG; - frag_header.header.version = BATADV_COMPAT_VERSION; - frag_header.header.ttl = BATADV_TTL; + frag_header.packet_type = BATADV_UNICAST_FRAG; + frag_header.version = BATADV_COMPAT_VERSION; + frag_header.ttl = BATADV_TTL; frag_header.seqno = htons(atomic_inc_return(&bat_priv->frag_seqno)); frag_header.reserved = 0; frag_header.no = 0; frag_header.total_size = htons(skb->len); - memcpy(frag_header.orig, primary_if->net_dev->dev_addr, ETH_ALEN); - memcpy(frag_header.dest, orig_node->orig, ETH_ALEN); + ether_addr_copy(frag_header.orig, primary_if->net_dev->dev_addr); + ether_addr_copy(frag_header.dest, orig_node->orig); /* Eat and send fragments from the tail of skb */ while (skb->len > max_fragment_size) { @@ -485,7 +484,11 @@ bool batadv_frag_send_packet(struct sk_buff *skb, skb->len + ETH_HLEN); batadv_send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr); - return true; + ret = true; + out_err: - return false; + if (primary_if) + batadv_hardif_free_ref(primary_if); + + return ret; } diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index ca029e2676e..5d7a0e66a22 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2014 B.A.T.M.A.N. contributors: * * Martin Hundebøll <martin@hundeboll.net> * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_FRAGMENTATION_H_ diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 2449afaa763..90cff585b37 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -12,9 +12,7 @@ * General Public License for more details. 
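On the sizing in batadv_frag_send_packet() above: each fragment is bounded by both the outgoing interface MTU and BATADV_FRAG_MAX_FRAG_SIZE, and the whole packet must fit into BATADV_FRAG_MAX_FRAGMENTS such fragments. A worked sketch of how the declared variables are combined (the context lines with the actual arithmetic are not shown in this hunk; 1400 and 16 are assumptions for the two defines, and the 20-byte frag header matches the size check added to main.c later in this patch):

unsigned mtu = 1500;					/* outgoing slave MTU        */
unsigned header_size = sizeof(struct batadv_frag_packet);	/* assumed 20 bytes  */
unsigned max_fragment_size, max_packet_size;

mtu = min_t(unsigned, mtu, BATADV_FRAG_MAX_FRAG_SIZE);	/* -> 1400 (assumed)         */
max_fragment_size = mtu - header_size;			/* -> 1380 payload/fragment  */
max_packet_size = max_fragment_size * BATADV_FRAG_MAX_FRAGMENTS; /* -> 22080 bytes   */
/* skbs longer than max_packet_size cannot be fragmented and are dropped */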
* * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "main.h" @@ -30,16 +28,24 @@ #include <linux/udp.h> #include <linux/if_vlan.h> -/* This is the offset of the options field in a dhcp packet starting at - * the beginning of the dhcp header +/* These are the offsets of the "hw type" and "hw address length" in the dhcp + * packet starting at the beginning of the dhcp header */ -#define BATADV_DHCP_OPTIONS_OFFSET 240 -#define BATADV_DHCP_REQUEST 3 +#define BATADV_DHCP_HTYPE_OFFSET 1 +#define BATADV_DHCP_HLEN_OFFSET 2 +/* Value of htype representing Ethernet */ +#define BATADV_DHCP_HTYPE_ETHERNET 0x01 +/* This is the offset of the "chaddr" field in the dhcp packet starting at the + * beginning of the dhcp header + */ +#define BATADV_DHCP_CHADDR_OFFSET 28 static void batadv_gw_node_free_ref(struct batadv_gw_node *gw_node) { - if (atomic_dec_and_test(&gw_node->refcount)) + if (atomic_dec_and_test(&gw_node->refcount)) { + batadv_orig_node_free_ref(gw_node->orig_node); kfree_rcu(gw_node, rcu); + } } static struct batadv_gw_node * @@ -105,7 +111,18 @@ static void batadv_gw_select(struct batadv_priv *bat_priv, spin_unlock_bh(&bat_priv->gw.list_lock); } -void batadv_gw_deselect(struct batadv_priv *bat_priv) +/** + * batadv_gw_reselect - force a gateway reselection + * @bat_priv: the bat priv with all the soft interface information + * + * Set a flag to remind the GW component to perform a new gateway reselection. + * However this function does not ensure that the current gateway is going to be + * deselected. The reselection mechanism may elect the same gateway once again. + * + * This means that invoking batadv_gw_reselect() does not guarantee a gateway + * change and therefore a uevent is not necessarily expected. 
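The three offsets defined above come straight from the fixed BOOTP/DHCP header layout of RFC 2131. Counting bytes from the start of the DHCP header (the struct below is purely illustrative, the code reads the raw offsets instead):

/* fixed-size prefix of a DHCP/BOOTP message, offsets in bytes */
struct dhcp_hdr_sketch {
	uint8_t  op;		/* 0                                   */
	uint8_t  htype;		/* 1  -> BATADV_DHCP_HTYPE_OFFSET      */
	uint8_t  hlen;		/* 2  -> BATADV_DHCP_HLEN_OFFSET       */
	uint8_t  hops;		/* 3                                   */
	__be32   xid;		/* 4                                   */
	__be16   secs;		/* 8                                   */
	__be16   flags;		/* 10                                  */
	__be32   ciaddr;	/* 12                                  */
	__be32   yiaddr;	/* 16                                  */
	__be32   siaddr;	/* 20                                  */
	__be32   giaddr;	/* 24                                  */
	uint8_t  chaddr[16];	/* 28 -> BATADV_DHCP_CHADDR_OFFSET     */
};

For an Ethernet client, htype is 1 (BATADV_DHCP_HTYPE_ETHERNET) and hlen is 6 (ETH_ALEN), which is exactly what the recipient helper further below verifies before copying chaddr.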
+ */ +void batadv_gw_reselect(struct batadv_priv *bat_priv) { atomic_set(&bat_priv->gw.reselect, 1); } @@ -114,6 +131,7 @@ static struct batadv_gw_node * batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) { struct batadv_neigh_node *router; + struct batadv_neigh_ifinfo *router_ifinfo; struct batadv_gw_node *gw_node, *curr_gw = NULL; uint32_t max_gw_factor = 0, tmp_gw_factor = 0; uint32_t gw_divisor; @@ -130,14 +148,19 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) continue; orig_node = gw_node->orig_node; - router = batadv_orig_node_get_router(orig_node); + router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); if (!router) continue; + router_ifinfo = batadv_neigh_ifinfo_get(router, + BATADV_IF_DEFAULT); + if (!router_ifinfo) + goto next; + if (!atomic_inc_not_zero(&gw_node->refcount)) goto next; - tq_avg = router->bat_iv.tq_avg; + tq_avg = router_ifinfo->bat_iv.tq_avg; switch (atomic_read(&bat_priv->gw_sel_class)) { case 1: /* fast connection */ @@ -182,6 +205,8 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) next: batadv_neigh_node_free_ref(router); + if (router_ifinfo) + batadv_neigh_ifinfo_free_ref(router_ifinfo); } rcu_read_unlock(); @@ -207,6 +232,11 @@ void batadv_gw_check_client_stop(struct batadv_priv *bat_priv) if (!curr_gw) return; + /* deselect the current gateway so that next time that client mode is + * enabled a proper GW_ADD event can be sent + */ + batadv_gw_select(bat_priv, NULL); + /* if batman-adv is switching the gw client mode off and a gateway was * already selected, send a DEL uevent */ @@ -219,6 +249,7 @@ void batadv_gw_election(struct batadv_priv *bat_priv) { struct batadv_gw_node *curr_gw = NULL, *next_gw = NULL; struct batadv_neigh_node *router = NULL; + struct batadv_neigh_ifinfo *router_ifinfo = NULL; char gw_addr[18] = { '\0' }; if (atomic_read(&bat_priv->gw_mode) != BATADV_GW_MODE_CLIENT) @@ -237,9 +268,17 @@ void batadv_gw_election(struct batadv_priv *bat_priv) if (next_gw) { sprintf(gw_addr, "%pM", next_gw->orig_node->orig); - router = batadv_orig_node_get_router(next_gw->orig_node); + router = batadv_orig_router_get(next_gw->orig_node, + BATADV_IF_DEFAULT); if (!router) { - batadv_gw_deselect(bat_priv); + batadv_gw_reselect(bat_priv); + goto out; + } + + router_ifinfo = batadv_neigh_ifinfo_get(router, + BATADV_IF_DEFAULT); + if (!router_ifinfo) { + batadv_gw_reselect(bat_priv); goto out; } } @@ -256,7 +295,8 @@ void batadv_gw_election(struct batadv_priv *bat_priv) next_gw->bandwidth_down / 10, next_gw->bandwidth_down % 10, next_gw->bandwidth_up / 10, - next_gw->bandwidth_up % 10, router->bat_iv.tq_avg); + next_gw->bandwidth_up % 10, + router_ifinfo->bat_iv.tq_avg); batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_ADD, gw_addr); } else { @@ -266,7 +306,8 @@ void batadv_gw_election(struct batadv_priv *bat_priv) next_gw->bandwidth_down / 10, next_gw->bandwidth_down % 10, next_gw->bandwidth_up / 10, - next_gw->bandwidth_up % 10, router->bat_iv.tq_avg); + next_gw->bandwidth_up % 10, + router_ifinfo->bat_iv.tq_avg); batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_CHANGE, gw_addr); } @@ -280,33 +321,47 @@ out: batadv_gw_node_free_ref(next_gw); if (router) batadv_neigh_node_free_ref(router); + if (router_ifinfo) + batadv_neigh_ifinfo_free_ref(router_ifinfo); } void batadv_gw_check_election(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { + struct batadv_neigh_ifinfo *router_orig_tq = NULL; + struct batadv_neigh_ifinfo *router_gw_tq = NULL; struct batadv_orig_node *curr_gw_orig; struct 
batadv_neigh_node *router_gw = NULL, *router_orig = NULL; uint8_t gw_tq_avg, orig_tq_avg; curr_gw_orig = batadv_gw_get_selected_orig(bat_priv); if (!curr_gw_orig) - goto deselect; + goto reselect; - router_gw = batadv_orig_node_get_router(curr_gw_orig); + router_gw = batadv_orig_router_get(curr_gw_orig, BATADV_IF_DEFAULT); if (!router_gw) - goto deselect; + goto reselect; + + router_gw_tq = batadv_neigh_ifinfo_get(router_gw, + BATADV_IF_DEFAULT); + if (!router_gw_tq) + goto reselect; /* this node already is the gateway */ if (curr_gw_orig == orig_node) goto out; - router_orig = batadv_orig_node_get_router(orig_node); + router_orig = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); if (!router_orig) goto out; - gw_tq_avg = router_gw->bat_iv.tq_avg; - orig_tq_avg = router_orig->bat_iv.tq_avg; + router_orig_tq = batadv_neigh_ifinfo_get(router_orig, + BATADV_IF_DEFAULT); + if (!router_orig_tq) + goto out; + + gw_tq_avg = router_gw_tq->bat_iv.tq_avg; + orig_tq_avg = router_orig_tq->bat_iv.tq_avg; /* the TQ value has to be better */ if (orig_tq_avg < gw_tq_avg) @@ -323,8 +378,8 @@ void batadv_gw_check_election(struct batadv_priv *bat_priv, "Restarting gateway selection: better gateway found (tq curr: %i, tq new: %i)\n", gw_tq_avg, orig_tq_avg); -deselect: - batadv_gw_deselect(bat_priv); +reselect: + batadv_gw_reselect(bat_priv); out: if (curr_gw_orig) batadv_orig_node_free_ref(curr_gw_orig); @@ -332,8 +387,10 @@ out: batadv_neigh_node_free_ref(router_gw); if (router_orig) batadv_neigh_node_free_ref(router_orig); - - return; + if (router_gw_tq) + batadv_neigh_ifinfo_free_ref(router_gw_tq); + if (router_orig_tq) + batadv_neigh_ifinfo_free_ref(router_orig_tq); } /** @@ -351,9 +408,14 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, if (gateway->bandwidth_down == 0) return; + if (!atomic_inc_not_zero(&orig_node->refcount)) + return; + gw_node = kzalloc(sizeof(*gw_node), GFP_ATOMIC); - if (!gw_node) + if (!gw_node) { + batadv_orig_node_free_ref(orig_node); return; + } INIT_HLIST_NODE(&gw_node->list); gw_node->orig_node = orig_node; @@ -454,7 +516,7 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, */ curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (gw_node == curr_gw) - batadv_gw_deselect(bat_priv); + batadv_gw_reselect(bat_priv); } out: @@ -480,7 +542,7 @@ void batadv_gw_node_purge(struct batadv_priv *bat_priv) struct batadv_gw_node *gw_node, *curr_gw; struct hlist_node *node_tmp; unsigned long timeout = msecs_to_jiffies(2 * BATADV_PURGE_TIMEOUT); - int do_deselect = 0; + int do_reselect = 0; curr_gw = batadv_gw_get_selected_gw_node(bat_priv); @@ -494,7 +556,7 @@ void batadv_gw_node_purge(struct batadv_priv *bat_priv) continue; if (curr_gw == gw_node) - do_deselect = 1; + do_reselect = 1; hlist_del_rcu(&gw_node->list); batadv_gw_node_free_ref(gw_node); @@ -502,9 +564,9 @@ void batadv_gw_node_purge(struct batadv_priv *bat_priv) spin_unlock_bh(&bat_priv->gw.list_lock); - /* gw_deselect() needs to acquire the gw_list_lock */ - if (do_deselect) - batadv_gw_deselect(bat_priv); + /* gw_reselect() needs to acquire the gw_list_lock */ + if (do_reselect) + batadv_gw_reselect(bat_priv); if (curr_gw) batadv_gw_node_free_ref(curr_gw); @@ -517,28 +579,36 @@ static int batadv_write_buffer_text(struct batadv_priv *bat_priv, { struct batadv_gw_node *curr_gw; struct batadv_neigh_node *router; + struct batadv_neigh_ifinfo *router_ifinfo = NULL; int ret = -1; - router = batadv_orig_node_get_router(gw_node->orig_node); + router = batadv_orig_router_get(gw_node->orig_node, 
BATADV_IF_DEFAULT); if (!router) goto out; + router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); + if (!router_ifinfo) + goto out; + curr_gw = batadv_gw_get_selected_gw_node(bat_priv); ret = seq_printf(seq, "%s %pM (%3i) %pM [%10s]: %u.%u/%u.%u MBit\n", (curr_gw == gw_node ? "=>" : " "), gw_node->orig_node->orig, - router->bat_iv.tq_avg, router->addr, + router_ifinfo->bat_iv.tq_avg, router->addr, router->if_incoming->net_dev->name, gw_node->bandwidth_down / 10, gw_node->bandwidth_down % 10, gw_node->bandwidth_up / 10, gw_node->bandwidth_up % 10); - batadv_neigh_node_free_ref(router); if (curr_gw) batadv_gw_node_free_ref(curr_gw); out: + if (router_ifinfo) + batadv_neigh_ifinfo_free_ref(router_ifinfo); + if (router) + batadv_neigh_node_free_ref(router); return ret; } @@ -582,90 +652,49 @@ out: return 0; } -/* this call might reallocate skb data */ -static bool batadv_is_type_dhcprequest(struct sk_buff *skb, int header_len) -{ - int ret = false; - unsigned char *p; - int pkt_len; - - if (skb_linearize(skb) < 0) - goto out; - - pkt_len = skb_headlen(skb); - - if (pkt_len < header_len + BATADV_DHCP_OPTIONS_OFFSET + 1) - goto out; - - p = skb->data + header_len + BATADV_DHCP_OPTIONS_OFFSET; - pkt_len -= header_len + BATADV_DHCP_OPTIONS_OFFSET + 1; - - /* Access the dhcp option lists. Each entry is made up by: - * - octet 1: option type - * - octet 2: option data len (only if type != 255 and 0) - * - octet 3: option data - */ - while (*p != 255 && !ret) { - /* p now points to the first octet: option type */ - if (*p == 53) { - /* type 53 is the message type option. - * Jump the len octet and go to the data octet - */ - if (pkt_len < 2) - goto out; - p += 2; - - /* check if the message type is what we need */ - if (*p == BATADV_DHCP_REQUEST) - ret = true; - break; - } else if (*p == 0) { - /* option type 0 (padding), just go forward */ - if (pkt_len < 1) - goto out; - pkt_len--; - p++; - } else { - /* This is any other option. So we get the length... */ - if (pkt_len < 1) - goto out; - pkt_len--; - p++; - - /* ...and then we jump over the data */ - if (pkt_len < 1 + (*p)) - goto out; - pkt_len -= 1 + (*p); - p += 1 + (*p); - } - } -out: - return ret; -} - -/* this call might reallocate skb data */ -bool batadv_gw_is_dhcp_target(struct sk_buff *skb, unsigned int *header_len) +/** + * batadv_gw_dhcp_recipient_get - check if a packet is a DHCP message + * @skb: the packet to check + * @header_len: a pointer to the batman-adv header size + * @chaddr: buffer where the client address will be stored. Valid + * only if the function returns BATADV_DHCP_TO_CLIENT + * + * Returns: + * - BATADV_DHCP_NO if the packet is not a dhcp message or if there was an error + * while parsing it + * - BATADV_DHCP_TO_SERVER if this is a message going to the DHCP server + * - BATADV_DHCP_TO_CLIENT if this is a message going to a DHCP client + * + * This function may re-allocate the data buffer of the skb passed as argument. 
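The kernel-doc above describes the replacement for the removed option-walking DHCP detection: instead of parsing DHCP options, the new helper (in the hunk that follows) classifies the message purely by its UDP ports, since the server port identifies the direction. A condensed sketch of that decision (the helper name is illustrative, not part of the patch):

static enum batadv_dhcp_recipient
dhcp_classify_sketch(__be16 proto, const struct udphdr *udphdr)
{
	switch (proto) {
	case htons(ETH_P_IP):		/* DHCPv4 server port: 67  */
		if (udphdr->dest == htons(67))
			return BATADV_DHCP_TO_SERVER;
		if (udphdr->source == htons(67))
			return BATADV_DHCP_TO_CLIENT;
		break;
	case htons(ETH_P_IPV6):		/* DHCPv6 server port: 547 */
		if (udphdr->dest == htons(547))
			return BATADV_DHCP_TO_SERVER;
		if (udphdr->source == htons(547))
			return BATADV_DHCP_TO_CLIENT;
		break;
	}
	return BATADV_DHCP_NO;
}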
+ */ +enum batadv_dhcp_recipient +batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, + uint8_t *chaddr) { + enum batadv_dhcp_recipient ret = BATADV_DHCP_NO; struct ethhdr *ethhdr; struct iphdr *iphdr; struct ipv6hdr *ipv6hdr; struct udphdr *udphdr; struct vlan_ethhdr *vhdr; + int chaddr_offset; __be16 proto; + uint8_t *p; /* check for ethernet header */ if (!pskb_may_pull(skb, *header_len + ETH_HLEN)) - return false; - ethhdr = (struct ethhdr *)skb->data; + return BATADV_DHCP_NO; + + ethhdr = eth_hdr(skb); proto = ethhdr->h_proto; *header_len += ETH_HLEN; /* check for initial vlan header */ if (proto == htons(ETH_P_8021Q)) { if (!pskb_may_pull(skb, *header_len + VLAN_HLEN)) - return false; + return BATADV_DHCP_NO; - vhdr = (struct vlan_ethhdr *)skb->data; + vhdr = vlan_eth_hdr(skb); proto = vhdr->h_vlan_encapsulated_proto; *header_len += VLAN_HLEN; } @@ -674,35 +703,37 @@ bool batadv_gw_is_dhcp_target(struct sk_buff *skb, unsigned int *header_len) switch (proto) { case htons(ETH_P_IP): if (!pskb_may_pull(skb, *header_len + sizeof(*iphdr))) - return false; + return BATADV_DHCP_NO; + iphdr = (struct iphdr *)(skb->data + *header_len); *header_len += iphdr->ihl * 4; /* check for udp header */ if (iphdr->protocol != IPPROTO_UDP) - return false; + return BATADV_DHCP_NO; break; case htons(ETH_P_IPV6): if (!pskb_may_pull(skb, *header_len + sizeof(*ipv6hdr))) - return false; + return BATADV_DHCP_NO; + ipv6hdr = (struct ipv6hdr *)(skb->data + *header_len); *header_len += sizeof(*ipv6hdr); /* check for udp header */ if (ipv6hdr->nexthdr != IPPROTO_UDP) - return false; + return BATADV_DHCP_NO; break; default: - return false; + return BATADV_DHCP_NO; } if (!pskb_may_pull(skb, *header_len + sizeof(*udphdr))) - return false; + return BATADV_DHCP_NO; /* skb->data might have been reallocated by pskb_may_pull() */ - ethhdr = (struct ethhdr *)skb->data; + ethhdr = eth_hdr(skb); if (ntohs(ethhdr->h_proto) == ETH_P_8021Q) ethhdr = (struct ethhdr *)(skb->data + VLAN_HLEN); @@ -710,17 +741,40 @@ bool batadv_gw_is_dhcp_target(struct sk_buff *skb, unsigned int *header_len) *header_len += sizeof(*udphdr); /* check for bootp port */ - if ((proto == htons(ETH_P_IP)) && - (udphdr->dest != htons(67))) - return false; + switch (proto) { + case htons(ETH_P_IP): + if (udphdr->dest == htons(67)) + ret = BATADV_DHCP_TO_SERVER; + else if (udphdr->source == htons(67)) + ret = BATADV_DHCP_TO_CLIENT; + break; + case htons(ETH_P_IPV6): + if (udphdr->dest == htons(547)) + ret = BATADV_DHCP_TO_SERVER; + else if (udphdr->source == htons(547)) + ret = BATADV_DHCP_TO_CLIENT; + break; + } - if ((proto == htons(ETH_P_IPV6)) && - (udphdr->dest != htons(547))) - return false; + chaddr_offset = *header_len + BATADV_DHCP_CHADDR_OFFSET; + /* store the client address if the message is going to a client */ + if (ret == BATADV_DHCP_TO_CLIENT && + pskb_may_pull(skb, chaddr_offset + ETH_ALEN)) { + /* check if the DHCP packet carries an Ethernet DHCP */ + p = skb->data + *header_len + BATADV_DHCP_HTYPE_OFFSET; + if (*p != BATADV_DHCP_HTYPE_ETHERNET) + return BATADV_DHCP_NO; + + /* check if the DHCP packet carries a valid Ethernet address */ + p = skb->data + *header_len + BATADV_DHCP_HLEN_OFFSET; + if (*p != ETH_ALEN) + return BATADV_DHCP_NO; + + ether_addr_copy(chaddr, skb->data + chaddr_offset); + } - return true; + return ret; } - /** * batadv_gw_out_of_range - check if the dhcp request destination is the best gw * @bat_priv: the bat priv with all the soft interface information @@ -734,6 +788,7 @@ bool 
batadv_gw_is_dhcp_target(struct sk_buff *skb, unsigned int *header_len) * false otherwise. * * This call might reallocate skb data. + * Must be invoked only when the DHCP packet is going TO a DHCP SERVER. */ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb) @@ -741,19 +796,14 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct batadv_neigh_node *neigh_curr = NULL, *neigh_old = NULL; struct batadv_orig_node *orig_dst_node = NULL; struct batadv_gw_node *gw_node = NULL, *curr_gw = NULL; - struct ethhdr *ethhdr; - bool ret, out_of_range = false; - unsigned int header_len = 0; + struct batadv_neigh_ifinfo *curr_ifinfo, *old_ifinfo; + struct ethhdr *ethhdr = (struct ethhdr *)skb->data; + bool out_of_range = false; uint8_t curr_tq_avg; unsigned short vid; vid = batadv_get_vid(skb, 0); - ret = batadv_gw_is_dhcp_target(skb, &header_len); - if (!ret) - goto out; - - ethhdr = (struct ethhdr *)skb->data; orig_dst_node = batadv_transtable_search(bat_priv, ethhdr->h_source, ethhdr->h_dest, vid); if (!orig_dst_node) @@ -763,10 +813,6 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, if (!gw_node->bandwidth_down == 0) goto out; - ret = batadv_is_type_dhcprequest(skb, header_len); - if (!ret) - goto out; - switch (atomic_read(&bat_priv->gw_mode)) { case BATADV_GW_MODE_SERVER: /* If we are a GW then we are our best GW. We can artificially @@ -792,7 +838,14 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, if (!neigh_curr) goto out; - curr_tq_avg = neigh_curr->bat_iv.tq_avg; + curr_ifinfo = batadv_neigh_ifinfo_get(neigh_curr, + BATADV_IF_DEFAULT); + if (!curr_ifinfo) + goto out; + + curr_tq_avg = curr_ifinfo->bat_iv.tq_avg; + batadv_neigh_ifinfo_free_ref(curr_ifinfo); + break; case BATADV_GW_MODE_OFF: default: @@ -803,8 +856,13 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, if (!neigh_old) goto out; - if (curr_tq_avg - neigh_old->bat_iv.tq_avg > BATADV_GW_THRESHOLD) + old_ifinfo = batadv_neigh_ifinfo_get(neigh_old, BATADV_IF_DEFAULT); + if (!old_ifinfo) + goto out; + + if ((curr_tq_avg - old_ifinfo->bat_iv.tq_avg) > BATADV_GW_THRESHOLD) out_of_range = true; + batadv_neigh_ifinfo_free_ref(old_ifinfo); out: if (orig_dst_node) diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index d95c2d23195..7ee53bb7d50 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -12,16 +12,14 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
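With the per-interface ifinfo lookups added above, the core decision of batadv_gw_out_of_range() is unchanged: the gateway the DHCP request is addressed to is only bypassed when this node's own best gateway beats it by more than BATADV_GW_THRESHOLD TQ points. A small numeric illustration (the threshold value of 50 is an assumption about the define, which lies outside this hunk):

uint8_t curr_tq_avg = 200;	/* TQ towards the gateway this node would pick        */
uint8_t old_tq_avg  = 120;	/* TQ towards the gateway the request is addressed to */

/* 200 - 120 = 80 > 50, so out_of_range becomes true; with old_tq_avg = 180 the
 * difference (20) stays within the threshold and the request is left untouched
 */
bool out_of_range = (curr_tq_avg - old_tq_avg) > BATADV_GW_THRESHOLD;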
*/ #ifndef _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ #define _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ void batadv_gw_check_client_stop(struct batadv_priv *bat_priv); -void batadv_gw_deselect(struct batadv_priv *bat_priv); +void batadv_gw_reselect(struct batadv_priv *bat_priv); void batadv_gw_election(struct batadv_priv *bat_priv); struct batadv_orig_node * batadv_gw_get_selected_orig(struct batadv_priv *bat_priv); @@ -34,7 +32,9 @@ void batadv_gw_node_delete(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node); void batadv_gw_node_purge(struct batadv_priv *bat_priv); int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset); -bool batadv_gw_is_dhcp_target(struct sk_buff *skb, unsigned int *header_len); bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb); +enum batadv_dhcp_recipient +batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, + uint8_t *chaddr); #endif /* _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ */ diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index b211b0f9cb7..6f5e621f220 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "main.h" @@ -164,7 +162,7 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff, if ((down_curr == down_new) && (up_curr == up_new)) return count; - batadv_gw_deselect(bat_priv); + batadv_gw_reselect(bat_priv); batadv_info(net_dev, "Changing gateway bandwidth from: '%u.%u/%u.%u MBit' to: '%u.%u/%u.%u MBit'\n", down_curr / 10, down_curr % 10, up_curr / 10, up_curr % 10, diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index 56384a4cd18..aa511656194 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_GATEWAY_COMMON_H_ diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 57c2a19dcb5..fbda6b54baf 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ #include "main.h" @@ -25,6 +23,7 @@ #include "translation-table.h" #include "routing.h" #include "sysfs.h" +#include "debugfs.h" #include "originator.h" #include "hash.h" #include "bridge_loop_avoidance.h" @@ -84,19 +83,17 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) return true; /* no more parents..stop recursion */ - if (net_dev->iflink == net_dev->ifindex) + if (net_dev->iflink == 0 || net_dev->iflink == net_dev->ifindex) return false; /* recurse over the parent device */ - parent_dev = dev_get_by_index(&init_net, net_dev->iflink); + parent_dev = __dev_get_by_index(&init_net, net_dev->iflink); /* if we got a NULL parent_dev there is something broken.. */ if (WARN(!parent_dev, "Cannot find parent device")) return false; ret = batadv_is_on_batman_iface(parent_dev); - if (parent_dev) - dev_put(parent_dev); return ret; } @@ -244,7 +241,7 @@ int batadv_hardif_min_mtu(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); const struct batadv_hard_iface *hard_iface; - int min_mtu = ETH_DATA_LEN; + int min_mtu = INT_MAX; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { @@ -259,8 +256,6 @@ int batadv_hardif_min_mtu(struct net_device *soft_iface) } rcu_read_unlock(); - atomic_set(&bat_priv->packet_size_max, min_mtu); - if (atomic_read(&bat_priv->fragmentation) == 0) goto out; @@ -271,13 +266,21 @@ int batadv_hardif_min_mtu(struct net_device *soft_iface) min_mtu = min_t(int, min_mtu, BATADV_FRAG_MAX_FRAG_SIZE); min_mtu -= sizeof(struct batadv_frag_packet); min_mtu *= BATADV_FRAG_MAX_FRAGMENTS; - atomic_set(&bat_priv->packet_size_max, min_mtu); - - /* with fragmentation enabled we can fragment external packets easily */ - min_mtu = min_t(int, min_mtu, ETH_DATA_LEN); out: - return min_mtu - batadv_max_header_len(); + /* report to the other components the maximum amount of bytes that + * batman-adv can send over the wire (without considering the payload + * overhead). For example, this value is used by TT to compute the + * maximum local table table size + */ + atomic_set(&bat_priv->packet_size_max, min_mtu); + + /* the real soft-interface MTU is computed by removing the payload + * overhead from the maximum amount of bytes that was just computed. + * + * However batman-adv does not support MTUs bigger than ETH_DATA_LEN + */ + return min_t(int, min_mtu - batadv_max_header_len(), ETH_DATA_LEN); } /* adjusts the MTU if a new interface with a smaller MTU appeared. 
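The reworked batadv_hardif_min_mtu() above now always stores the raw transport capacity in packet_size_max and only clamps the soft-interface MTU on return. A worked example under stated assumptions (slave interfaces with MTU 1500, fragmentation enabled, sizeof(struct batadv_frag_packet) = 20, and 1400/16 assumed for BATADV_FRAG_MAX_FRAG_SIZE/BATADV_FRAG_MAX_FRAGMENTS):

int min_mtu = 1500;				/* smallest slave MTU               */

min_mtu = min_t(int, min_mtu, BATADV_FRAG_MAX_FRAG_SIZE);	/* -> 1400          */
min_mtu -= sizeof(struct batadv_frag_packet);			/* -> 1380          */
min_mtu *= BATADV_FRAG_MAX_FRAGMENTS;				/* -> 22080         */

atomic_set(&bat_priv->packet_size_max, min_mtu);	/* e.g. used for TT table sizing */

/* reported soft-interface MTU: strip the batman-adv header overhead, but never
 * exceed a regular Ethernet MTU -> min(22080 - batadv_max_header_len(), 1500) = 1500
 */

With fragmentation disabled the multiplication is skipped, so the soft interface ends up with 1500 minus the batman-adv header length instead.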
*/ @@ -541,6 +544,7 @@ static void batadv_hardif_remove_interface_finish(struct work_struct *work) hard_iface = container_of(work, struct batadv_hard_iface, cleanup_work); + batadv_debugfs_del_hardif(hard_iface); batadv_sysfs_del_hardif(&hard_iface->hardif_obj); batadv_hardif_free_ref(hard_iface); } @@ -571,6 +575,11 @@ batadv_hardif_add_interface(struct net_device *net_dev) hard_iface->net_dev = net_dev; hard_iface->soft_iface = NULL; hard_iface->if_status = BATADV_IF_NOT_IN_USE; + + ret = batadv_debugfs_add_hardif(hard_iface); + if (ret) + goto free_sysfs; + INIT_LIST_HEAD(&hard_iface->list); INIT_WORK(&hard_iface->cleanup_work, batadv_hardif_remove_interface_finish); @@ -587,6 +596,8 @@ batadv_hardif_add_interface(struct net_device *net_dev) return hard_iface; +free_sysfs: + batadv_sysfs_del_hardif(&hard_iface->hardif_obj); free_if: kfree(hard_iface); release_dev: diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index df4c8bd45c4..1918cd50b62 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_HARD_INTERFACE_H_ @@ -42,6 +40,7 @@ enum batadv_hard_if_cleanup { extern struct notifier_block batadv_hard_if_notifier; bool batadv_is_wifi_netdev(struct net_device *net_device); +bool batadv_is_wifi_iface(int ifindex); struct batadv_hard_iface* batadv_hardif_get_by_netdev(const struct net_device *net_dev); int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, @@ -53,6 +52,11 @@ int batadv_hardif_min_mtu(struct net_device *soft_iface); void batadv_update_min_mtu(struct net_device *soft_iface); void batadv_hardif_free_rcu(struct rcu_head *rcu); +/** + * batadv_hardif_free_ref - decrement the hard interface refcounter and + * possibly free it + * @hard_iface: the hard interface to free + */ static inline void batadv_hardif_free_ref(struct batadv_hard_iface *hard_iface) { @@ -60,6 +64,18 @@ batadv_hardif_free_ref(struct batadv_hard_iface *hard_iface) call_rcu(&hard_iface->rcu, batadv_hardif_free_rcu); } +/** + * batadv_hardif_free_ref_now - decrement the hard interface refcounter and + * possibly free it (without rcu callback) + * @hard_iface: the hard interface to free + */ +static inline void +batadv_hardif_free_ref_now(struct batadv_hard_iface *hard_iface) +{ + if (atomic_dec_and_test(&hard_iface->refcount)) + batadv_hardif_free_rcu(&hard_iface->rcu); +} + static inline struct batadv_hard_iface * batadv_primary_if_get_selected(struct batadv_priv *bat_priv) { diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index 7198dafd3bf..63bdf7e94f1 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * @@ -12,9 +12,7 @@ * General Public License for more details. 
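The new batadv_hardif_free_ref_now() above differs from the existing batadv_hardif_free_ref() only in when the memory goes away: the existing helper defers the free through call_rcu(), the _now variant frees immediately and is therefore only safe where no RCU reader can still hold the pointer (typically from inside another RCU callback). A generic sketch of the two flavours of this refcount pattern, on a hypothetical struct rather than batman-adv types:

struct foo {
	atomic_t refcount;
	struct rcu_head rcu;
};

static void foo_free_rcu(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct foo, rcu));
}

static void foo_put(struct foo *f)	/* deferred: waits for a grace period   */
{
	if (atomic_dec_and_test(&f->refcount))
		call_rcu(&f->rcu, foo_free_rcu);
}

static void foo_put_now(struct foo *f)	/* immediate: no RCU readers remaining  */
{
	if (atomic_dec_and_test(&f->refcount))
		foo_free_rcu(&f->rcu);
}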
* * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "main.h" diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 1b4da72f209..539fc126679 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_HASH_H_ diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 29ae4efe354..161ef8f17d2 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "main.h" @@ -160,6 +158,7 @@ static ssize_t batadv_socket_write(struct file *file, const char __user *buff, struct batadv_orig_node *orig_node = NULL; struct batadv_neigh_node *neigh_node = NULL; size_t packet_len = sizeof(struct batadv_icmp_packet); + uint8_t *addr; if (len < sizeof(struct batadv_icmp_header)) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, @@ -194,7 +193,7 @@ static ssize_t batadv_socket_write(struct file *file, const char __user *buff, goto free_skb; } - if (icmp_header->header.packet_type != BATADV_ICMP) { + if (icmp_header->packet_type != BATADV_ICMP) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Error - can't send packet from char device: got bogus packet type (expected: BAT_ICMP)\n"); len = -EINVAL; @@ -217,7 +216,8 @@ static ssize_t batadv_socket_write(struct file *file, const char __user *buff, if (!orig_node) goto dst_unreach; - neigh_node = batadv_orig_node_get_router(orig_node); + neigh_node = batadv_orig_router_get(orig_node, + BATADV_IF_DEFAULT); if (!neigh_node) goto dst_unreach; @@ -228,10 +228,10 @@ static ssize_t batadv_socket_write(struct file *file, const char __user *buff, goto dst_unreach; icmp_packet_rr = (struct batadv_icmp_packet_rr *)icmp_header; - if (packet_len == sizeof(*icmp_packet_rr)) - memcpy(icmp_packet_rr->rr, - neigh_node->if_incoming->net_dev->dev_addr, - ETH_ALEN); + if (packet_len == sizeof(*icmp_packet_rr)) { + addr = neigh_node->if_incoming->net_dev->dev_addr; + ether_addr_copy(icmp_packet_rr->rr[0], addr); + } break; default: @@ -243,15 +243,15 @@ static ssize_t batadv_socket_write(struct file *file, const char __user *buff, icmp_header->uid = socket_client->index; - if (icmp_header->header.version != BATADV_COMPAT_VERSION) { + if (icmp_header->version != BATADV_COMPAT_VERSION) { icmp_header->msg_type = BATADV_PARAMETER_PROBLEM; - icmp_header->header.version = 
BATADV_COMPAT_VERSION; + icmp_header->version = BATADV_COMPAT_VERSION; batadv_socket_add_packet(socket_client, icmp_header, packet_len); goto free_skb; } - memcpy(icmp_header->orig, primary_if->net_dev->dev_addr, ETH_ALEN); + ether_addr_copy(icmp_header->orig, primary_if->net_dev->dev_addr); batadv_send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr); goto out; diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index 6665080dff7..0c33950aa4a 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_ICMP_SOCKET_H_ diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index c51a5e568f0..d1183e88216 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include <linux/crc32c.h> @@ -36,6 +34,7 @@ #include "gateway_client.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" +#include "multicast.h" #include "gateway_common.h" #include "hash.h" #include "bat_algo.h" @@ -112,6 +111,9 @@ int batadv_mesh_init(struct net_device *soft_iface) spin_lock_init(&bat_priv->tt.last_changeset_lock); spin_lock_init(&bat_priv->tt.commit_lock); spin_lock_init(&bat_priv->gw.list_lock); +#ifdef CONFIG_BATMAN_ADV_MCAST + spin_lock_init(&bat_priv->mcast.want_lists_lock); +#endif spin_lock_init(&bat_priv->tvlv.container_list_lock); spin_lock_init(&bat_priv->tvlv.handler_list_lock); spin_lock_init(&bat_priv->softif_vlan_list_lock); @@ -119,9 +121,17 @@ int batadv_mesh_init(struct net_device *soft_iface) INIT_HLIST_HEAD(&bat_priv->forw_bat_list); INIT_HLIST_HEAD(&bat_priv->forw_bcast_list); INIT_HLIST_HEAD(&bat_priv->gw.list); +#ifdef CONFIG_BATMAN_ADV_MCAST + INIT_HLIST_HEAD(&bat_priv->mcast.want_all_unsnoopables_list); + INIT_HLIST_HEAD(&bat_priv->mcast.want_all_ipv4_list); + INIT_HLIST_HEAD(&bat_priv->mcast.want_all_ipv6_list); +#endif INIT_LIST_HEAD(&bat_priv->tt.changes_list); INIT_LIST_HEAD(&bat_priv->tt.req_list); INIT_LIST_HEAD(&bat_priv->tt.roam_list); +#ifdef CONFIG_BATMAN_ADV_MCAST + INIT_HLIST_HEAD(&bat_priv->mcast.mla_list); +#endif INIT_HLIST_HEAD(&bat_priv->tvlv.container_list); INIT_HLIST_HEAD(&bat_priv->tvlv.handler_list); INIT_HLIST_HEAD(&bat_priv->softif_vlan_list); @@ -147,6 +157,7 @@ int batadv_mesh_init(struct net_device *soft_iface) goto err; batadv_gw_init(bat_priv); + batadv_mcast_init(bat_priv); atomic_set(&bat_priv->gw.reselect, 0); atomic_set(&bat_priv->mesh_state, BATADV_MESH_ACTIVE); @@ -171,6 +182,8 @@ void batadv_mesh_free(struct net_device *soft_iface) 
batadv_dat_free(bat_priv); batadv_bla_free(bat_priv); + batadv_mcast_free(bat_priv); + /* Free the TT and the originator tables only after having terminated * all the other depending components which may use these structures for * their purposes. @@ -277,7 +290,7 @@ int batadv_max_header_len(void) sizeof(struct batadv_coded_packet)); #endif - return header_len; + return header_len + ETH_HLEN; } /** @@ -383,17 +396,17 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, batadv_ogm_packet = (struct batadv_ogm_packet *)skb->data; - if (batadv_ogm_packet->header.version != BATADV_COMPAT_VERSION) { + if (batadv_ogm_packet->version != BATADV_COMPAT_VERSION) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: incompatible batman version (%i)\n", - batadv_ogm_packet->header.version); + batadv_ogm_packet->version); goto err_free; } /* all receive handlers return whether they received or reused * the supplied skb. if not, we have to free the skb. */ - idx = batadv_ogm_packet->header.packet_type; + idx = batadv_ogm_packet->packet_type; ret = (*batadv_rx_handler[idx])(skb, hard_iface); if (ret == NET_RX_DROP) @@ -421,13 +434,23 @@ static void batadv_recv_handler_init(void) for (i = BATADV_UNICAST_MIN; i <= BATADV_UNICAST_MAX; i++) batadv_rx_handler[i] = batadv_recv_unhandled_unicast_packet; - /* compile time checks for struct member offsets */ - BUILD_BUG_ON(offsetof(struct batadv_unicast_4addr_packet, src) != 10); - BUILD_BUG_ON(offsetof(struct batadv_unicast_packet, dest) != 4); - BUILD_BUG_ON(offsetof(struct batadv_unicast_tvlv_packet, dst) != 4); - BUILD_BUG_ON(offsetof(struct batadv_frag_packet, dest) != 4); - BUILD_BUG_ON(offsetof(struct batadv_icmp_packet, icmph.dst) != 4); - BUILD_BUG_ON(offsetof(struct batadv_icmp_packet_rr, icmph.dst) != 4); + /* compile time checks for sizes */ + BUILD_BUG_ON(sizeof(struct batadv_bla_claim_dst) != 6); + BUILD_BUG_ON(sizeof(struct batadv_ogm_packet) != 24); + BUILD_BUG_ON(sizeof(struct batadv_icmp_header) != 20); + BUILD_BUG_ON(sizeof(struct batadv_icmp_packet) != 20); + BUILD_BUG_ON(sizeof(struct batadv_icmp_packet_rr) != 116); + BUILD_BUG_ON(sizeof(struct batadv_unicast_packet) != 10); + BUILD_BUG_ON(sizeof(struct batadv_unicast_4addr_packet) != 18); + BUILD_BUG_ON(sizeof(struct batadv_frag_packet) != 20); + BUILD_BUG_ON(sizeof(struct batadv_bcast_packet) != 14); + BUILD_BUG_ON(sizeof(struct batadv_coded_packet) != 46); + BUILD_BUG_ON(sizeof(struct batadv_unicast_tvlv_packet) != 20); + BUILD_BUG_ON(sizeof(struct batadv_tvlv_hdr) != 4); + BUILD_BUG_ON(sizeof(struct batadv_tvlv_gateway_data) != 8); + BUILD_BUG_ON(sizeof(struct batadv_tvlv_tt_vlan_data) != 8); + BUILD_BUG_ON(sizeof(struct batadv_tvlv_tt_change) != 12); + BUILD_BUG_ON(sizeof(struct batadv_tvlv_roam_adv) != 8); /* broadcast packet */ batadv_rx_handler[BATADV_BCAST] = batadv_recv_bcast_packet; @@ -1119,14 +1142,14 @@ void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, uint8_t *src, skb_reserve(skb, ETH_HLEN); tvlv_buff = skb_put(skb, sizeof(*unicast_tvlv_packet) + tvlv_len); unicast_tvlv_packet = (struct batadv_unicast_tvlv_packet *)tvlv_buff; - unicast_tvlv_packet->header.packet_type = BATADV_UNICAST_TVLV; - unicast_tvlv_packet->header.version = BATADV_COMPAT_VERSION; - unicast_tvlv_packet->header.ttl = BATADV_TTL; + unicast_tvlv_packet->packet_type = BATADV_UNICAST_TVLV; + unicast_tvlv_packet->version = BATADV_COMPAT_VERSION; + unicast_tvlv_packet->ttl = BATADV_TTL; unicast_tvlv_packet->reserved = 0; unicast_tvlv_packet->tvlv_len = htons(tvlv_len); 
unicast_tvlv_packet->align = 0; - memcpy(unicast_tvlv_packet->src, src, ETH_ALEN); - memcpy(unicast_tvlv_packet->dst, dst, ETH_ALEN); + ether_addr_copy(unicast_tvlv_packet->src, src); + ether_addr_copy(unicast_tvlv_packet->dst, dst); tvlv_buff = (unsigned char *)(unicast_tvlv_packet + 1); tvlv_hdr = (struct batadv_tvlv_hdr *)tvlv_buff; @@ -1173,6 +1196,32 @@ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len) return vid; } +/** + * batadv_vlan_ap_isola_get - return the AP isolation status for the given vlan + * @bat_priv: the bat priv with all the soft interface information + * @vid: the VLAN identifier for which the AP isolation attributed as to be + * looked up + * + * Returns true if AP isolation is on for the VLAN idenfied by vid, false + * otherwise + */ +bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid) +{ + bool ap_isolation_enabled = false; + struct batadv_softif_vlan *vlan; + + /* if the AP isolation is requested on a VLAN, then check for its + * setting in the proper VLAN private data structure + */ + vlan = batadv_softif_vlan_get(bat_priv, vid); + if (vlan) { + ap_isolation_enabled = atomic_read(&vlan->ap_isolation); + batadv_softif_vlan_free_ref(vlan); + } + + return ap_isolation_enabled; +} + static int batadv_param_set_ra(const char *val, const struct kernel_param *kp) { struct batadv_algo_ops *bat_algo_ops; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index f94f287b867..118b990bae2 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_MAIN_H_ @@ -26,7 +24,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2013.5.0" +#define BATADV_SOURCE_VERSION "2014.3.0" #endif /* B.A.T.M.A.N. parameters */ @@ -72,6 +70,14 @@ #define BATADV_NULL_IFINDEX 0 /* dummy ifindex used to avoid iface checks */ +#define BATADV_NO_MARK 0 + +/* default interface for multi interface operation. The default interface is + * used for communication which originated locally (i.e. is not forwarded) + * or where special forwarding is not desired/necessary. + */ +#define BATADV_IF_DEFAULT ((struct batadv_hard_iface *)NULL) + #define BATADV_NUM_WORDS BITS_TO_LONGS(BATADV_TQ_LOCAL_WINDOW_SIZE) #define BATADV_LOG_BUF_LEN 8192 /* has to be a power of 2 */ @@ -170,6 +176,8 @@ enum batadv_uev_type { #include <linux/percpu.h> #include <linux/slab.h> #include <net/sock.h> /* struct sock */ +#include <net/addrconf.h> /* ipv6 address stuff */ +#include <linux/ip.h> #include <net/rtnetlink.h> #include <linux/jiffies.h> #include <linux/seq_file.h> @@ -266,7 +274,7 @@ static inline void batadv_dbg(int type __always_unused, */ static inline int batadv_compare_eth(const void *data1, const void *data2) { - return (memcmp(data1, data2, ETH_ALEN) == 0 ? 
1 : 0); + return ether_addr_equal_unaligned(data1, data2); } /** @@ -369,5 +377,6 @@ void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, uint8_t *src, uint8_t *dst, uint8_t type, uint8_t version, void *tvlv_value, uint16_t tvlv_value_len); unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len); +bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid); #endif /* _NET_BATMAN_ADV_MAIN_H_ */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c new file mode 100644 index 00000000000..96b66fd30f9 --- /dev/null +++ b/net/batman-adv/multicast.c @@ -0,0 +1,748 @@ +/* Copyright (C) 2014 B.A.T.M.A.N. contributors: + * + * Linus Lüssing + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "main.h" +#include "multicast.h" +#include "originator.h" +#include "hard-interface.h" +#include "translation-table.h" +#include "multicast.h" + +/** + * batadv_mcast_mla_softif_get - get softif multicast listeners + * @dev: the device to collect multicast addresses from + * @mcast_list: a list to put found addresses into + * + * Collect multicast addresses of the local multicast listeners + * on the given soft interface, dev, in the given mcast_list. + * + * Returns -ENOMEM on memory allocation error or the number of + * items added to the mcast_list otherwise. + */ +static int batadv_mcast_mla_softif_get(struct net_device *dev, + struct hlist_head *mcast_list) +{ + struct netdev_hw_addr *mc_list_entry; + struct batadv_hw_addr *new; + int ret = 0; + + netif_addr_lock_bh(dev); + netdev_for_each_mc_addr(mc_list_entry, dev) { + new = kmalloc(sizeof(*new), GFP_ATOMIC); + if (!new) { + ret = -ENOMEM; + break; + } + + ether_addr_copy(new->addr, mc_list_entry->addr); + hlist_add_head(&new->list, mcast_list); + ret++; + } + netif_addr_unlock_bh(dev); + + return ret; +} + +/** + * batadv_mcast_mla_is_duplicate - check whether an address is in a list + * @mcast_addr: the multicast address to check + * @mcast_list: the list with multicast addresses to search in + * + * Returns true if the given address is already in the given list. + * Otherwise returns false. + */ +static bool batadv_mcast_mla_is_duplicate(uint8_t *mcast_addr, + struct hlist_head *mcast_list) +{ + struct batadv_hw_addr *mcast_entry; + + hlist_for_each_entry(mcast_entry, mcast_list, list) + if (batadv_compare_eth(mcast_entry->addr, mcast_addr)) + return true; + + return false; +} + +/** + * batadv_mcast_mla_list_free - free a list of multicast addresses + * @mcast_list: the list to free + * + * Removes and frees all items in the given mcast_list. 
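The list helpers above (softif_get, is_duplicate, list_free) are combined with the tt_retract/tt_add pair further down by batadv_mcast_mla_update(), which also appears later in this patch. The intended call sequence, roughly condensed from that function:

ret = batadv_mcast_mla_softif_get(soft_iface, &mcast_list);	/* snapshot local listeners */
if (ret < 0)
	goto out;
batadv_mcast_mla_tt_retract(bat_priv, &mcast_list);	/* drop TT entries no longer listened to */
batadv_mcast_mla_tt_add(bat_priv, &mcast_list);		/* announce new listeners, adopt entries */
out:
batadv_mcast_mla_list_free(&mcast_list);		/* free whatever was not adopted         */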
+ */ +static void batadv_mcast_mla_list_free(struct hlist_head *mcast_list) +{ + struct batadv_hw_addr *mcast_entry; + struct hlist_node *tmp; + + hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) { + hlist_del(&mcast_entry->list); + kfree(mcast_entry); + } +} + +/** + * batadv_mcast_mla_tt_retract - clean up multicast listener announcements + * @bat_priv: the bat priv with all the soft interface information + * @mcast_list: a list of addresses which should _not_ be removed + * + * Retracts the announcement of any multicast listener from the + * translation table except the ones listed in the given mcast_list. + * + * If mcast_list is NULL then all are retracted. + */ +static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv, + struct hlist_head *mcast_list) +{ + struct batadv_hw_addr *mcast_entry; + struct hlist_node *tmp; + + hlist_for_each_entry_safe(mcast_entry, tmp, &bat_priv->mcast.mla_list, + list) { + if (mcast_list && + batadv_mcast_mla_is_duplicate(mcast_entry->addr, + mcast_list)) + continue; + + batadv_tt_local_remove(bat_priv, mcast_entry->addr, + BATADV_NO_FLAGS, + "mcast TT outdated", false); + + hlist_del(&mcast_entry->list); + kfree(mcast_entry); + } +} + +/** + * batadv_mcast_mla_tt_add - add multicast listener announcements + * @bat_priv: the bat priv with all the soft interface information + * @mcast_list: a list of addresses which are going to get added + * + * Adds multicast listener announcements from the given mcast_list to the + * translation table if they have not been added yet. + */ +static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv, + struct hlist_head *mcast_list) +{ + struct batadv_hw_addr *mcast_entry; + struct hlist_node *tmp; + + if (!mcast_list) + return; + + hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) { + if (batadv_mcast_mla_is_duplicate(mcast_entry->addr, + &bat_priv->mcast.mla_list)) + continue; + + if (!batadv_tt_local_add(bat_priv->soft_iface, + mcast_entry->addr, BATADV_NO_FLAGS, + BATADV_NULL_IFINDEX, BATADV_NO_MARK)) + continue; + + hlist_del(&mcast_entry->list); + hlist_add_head(&mcast_entry->list, &bat_priv->mcast.mla_list); + } +} + +/** + * batadv_mcast_has_bridge - check whether the soft-iface is bridged + * @bat_priv: the bat priv with all the soft interface information + * + * Checks whether there is a bridge on top of our soft interface. Returns + * true if so, false otherwise. + */ +static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv) +{ + struct net_device *upper = bat_priv->soft_iface; + + rcu_read_lock(); + do { + upper = netdev_master_upper_dev_get_rcu(upper); + } while (upper && !(upper->priv_flags & IFF_EBRIDGE)); + rcu_read_unlock(); + + return upper; +} + +/** + * batadv_mcast_mla_tvlv_update - update multicast tvlv + * @bat_priv: the bat priv with all the soft interface information + * + * Updates the own multicast tvlv with our current multicast related settings, + * capabilities and inabilities. + * + * Returns true if the tvlv container is registered afterwards. Otherwise + * returns false. 
+ */ +static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv) +{ + struct batadv_tvlv_mcast_data mcast_data; + + mcast_data.flags = BATADV_NO_FLAGS; + memset(mcast_data.reserved, 0, sizeof(mcast_data.reserved)); + + /* Avoid attaching MLAs, if there is a bridge on top of our soft + * interface, we don't support that yet (TODO) + */ + if (batadv_mcast_has_bridge(bat_priv)) { + if (bat_priv->mcast.enabled) { + batadv_tvlv_container_unregister(bat_priv, + BATADV_TVLV_MCAST, 1); + bat_priv->mcast.enabled = false; + } + + return false; + } + + if (!bat_priv->mcast.enabled || + mcast_data.flags != bat_priv->mcast.flags) { + batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 1, + &mcast_data, sizeof(mcast_data)); + bat_priv->mcast.flags = mcast_data.flags; + bat_priv->mcast.enabled = true; + } + + return true; +} + +/** + * batadv_mcast_mla_update - update the own MLAs + * @bat_priv: the bat priv with all the soft interface information + * + * Updates the own multicast listener announcements in the translation + * table as well as the own, announced multicast tvlv container. + */ +void batadv_mcast_mla_update(struct batadv_priv *bat_priv) +{ + struct net_device *soft_iface = bat_priv->soft_iface; + struct hlist_head mcast_list = HLIST_HEAD_INIT; + int ret; + + if (!batadv_mcast_mla_tvlv_update(bat_priv)) + goto update; + + ret = batadv_mcast_mla_softif_get(soft_iface, &mcast_list); + if (ret < 0) + goto out; + +update: + batadv_mcast_mla_tt_retract(bat_priv, &mcast_list); + batadv_mcast_mla_tt_add(bat_priv, &mcast_list); + +out: + batadv_mcast_mla_list_free(&mcast_list); +} + +/** + * batadv_mcast_forw_mode_check_ipv4 - check for optimized forwarding potential + * @bat_priv: the bat priv with all the soft interface information + * @skb: the IPv4 packet to check + * @is_unsnoopable: stores whether the destination is snoopable + * + * Checks whether the given IPv4 packet has the potential to be forwarded with a + * mode more optimal than classic flooding. + * + * If so then returns 0. Otherwise -EINVAL is returned or -ENOMEM in case of + * memory allocation failure. + */ +static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, + struct sk_buff *skb, + bool *is_unsnoopable) +{ + struct iphdr *iphdr; + + /* We might fail due to out-of-memory -> drop it */ + if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*iphdr))) + return -ENOMEM; + + iphdr = ip_hdr(skb); + + /* TODO: Implement Multicast Router Discovery (RFC4286), + * then allow scope > link local, too + */ + if (!ipv4_is_local_multicast(iphdr->daddr)) + return -EINVAL; + + /* link-local multicast listeners behind a bridge are + * not snoopable (see RFC4541, section 2.1.2.2) + */ + *is_unsnoopable = true; + + return 0; +} + +/** + * batadv_mcast_forw_mode_check_ipv6 - check for optimized forwarding potential + * @bat_priv: the bat priv with all the soft interface information + * @skb: the IPv6 packet to check + * @is_unsnoopable: stores whether the destination is snoopable + * + * Checks whether the given IPv6 packet has the potential to be forwarded with a + * mode more optimal than classic flooding. + * + * If so then returns 0. Otherwise -EINVAL is returned or -ENOMEM if we are out + * of memory. 
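+ *
+ * For illustration (example addresses, mirroring the checks below):
+ *	ff02::fb - link-local scope, snoopable  -> 0
+ *	ff02::1  - link-local, all-nodes        -> 0, *is_unsnoopable = true
+ *	ff05::fb - site-local scope             -> -EINVAL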
+ */ +static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, + struct sk_buff *skb, + bool *is_unsnoopable) +{ + struct ipv6hdr *ip6hdr; + + /* We might fail due to out-of-memory -> drop it */ + if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*ip6hdr))) + return -ENOMEM; + + ip6hdr = ipv6_hdr(skb); + + /* TODO: Implement Multicast Router Discovery (RFC4286), + * then allow scope > link local, too + */ + if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) != IPV6_ADDR_SCOPE_LINKLOCAL) + return -EINVAL; + + /* link-local-all-nodes multicast listeners behind a bridge are + * not snoopable (see RFC4541, section 3, paragraph 3) + */ + if (ipv6_addr_is_ll_all_nodes(&ip6hdr->daddr)) + *is_unsnoopable = true; + + return 0; +} + +/** + * batadv_mcast_forw_mode_check - check for optimized forwarding potential + * @bat_priv: the bat priv with all the soft interface information + * @skb: the multicast frame to check + * @is_unsnoopable: stores whether the destination is snoopable + * + * Checks whether the given multicast ethernet frame has the potential to be + * forwarded with a mode more optimal than classic flooding. + * + * If so then returns 0. Otherwise -EINVAL is returned or -ENOMEM if we are out + * of memory. + */ +static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, + struct sk_buff *skb, + bool *is_unsnoopable) +{ + struct ethhdr *ethhdr = eth_hdr(skb); + + if (!atomic_read(&bat_priv->multicast_mode)) + return -EINVAL; + + if (atomic_read(&bat_priv->mcast.num_disabled)) + return -EINVAL; + + switch (ntohs(ethhdr->h_proto)) { + case ETH_P_IP: + return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb, + is_unsnoopable); + case ETH_P_IPV6: + return batadv_mcast_forw_mode_check_ipv6(bat_priv, skb, + is_unsnoopable); + default: + return -EINVAL; + } +} + +/** + * batadv_mcast_want_all_ip_count - count nodes with unspecific mcast interest + * @bat_priv: the bat priv with all the soft interface information + * @ethhdr: ethernet header of a packet + * + * Returns the number of nodes which want all IPv4 multicast traffic if the + * given ethhdr is from an IPv4 packet or the number of nodes which want all + * IPv6 traffic if it matches an IPv6 packet. + */ +static int batadv_mcast_forw_want_all_ip_count(struct batadv_priv *bat_priv, + struct ethhdr *ethhdr) +{ + switch (ntohs(ethhdr->h_proto)) { + case ETH_P_IP: + return atomic_read(&bat_priv->mcast.num_want_all_ipv4); + case ETH_P_IPV6: + return atomic_read(&bat_priv->mcast.num_want_all_ipv6); + default: + /* we shouldn't be here... */ + return 0; + } +} + +/** + * batadv_mcast_forw_tt_node_get - get a multicast tt node + * @bat_priv: the bat priv with all the soft interface information + * @ethhdr: the ether header containing the multicast destination + * + * Returns an orig_node matching the multicast address provided by ethhdr + * via a translation table lookup. This increases the returned nodes refcount. + */ +static struct batadv_orig_node * +batadv_mcast_forw_tt_node_get(struct batadv_priv *bat_priv, + struct ethhdr *ethhdr) +{ + return batadv_transtable_search(bat_priv, ethhdr->h_source, + ethhdr->h_dest, BATADV_NO_FLAGS); +} + +/** + * batadv_mcast_want_forw_ipv4_node_get - get a node with an ipv4 flag + * @bat_priv: the bat priv with all the soft interface information + * + * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and + * increases its refcount. 
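+ *
+ * The caller owns the returned reference, e.g. (illustrative sketch,
+ * not part of this patch):
+ *
+ *	orig_node = batadv_mcast_forw_ipv4_node_get(bat_priv);
+ *	if (orig_node) {
+ *		... send towards orig_node ...
+ *		batadv_orig_node_free_ref(orig_node);
+ *	}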
+ */ +static struct batadv_orig_node * +batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv) +{ + struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tmp_orig_node, + &bat_priv->mcast.want_all_ipv4_list, + mcast_want_all_ipv4_node) { + if (!atomic_inc_not_zero(&tmp_orig_node->refcount)) + continue; + + orig_node = tmp_orig_node; + break; + } + rcu_read_unlock(); + + return orig_node; +} + +/** + * batadv_mcast_want_forw_ipv6_node_get - get a node with an ipv6 flag + * @bat_priv: the bat priv with all the soft interface information + * + * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set + * and increases its refcount. + */ +static struct batadv_orig_node * +batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv) +{ + struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tmp_orig_node, + &bat_priv->mcast.want_all_ipv6_list, + mcast_want_all_ipv6_node) { + if (!atomic_inc_not_zero(&tmp_orig_node->refcount)) + continue; + + orig_node = tmp_orig_node; + break; + } + rcu_read_unlock(); + + return orig_node; +} + +/** + * batadv_mcast_want_forw_ip_node_get - get a node with an ipv4/ipv6 flag + * @bat_priv: the bat priv with all the soft interface information + * @ethhdr: an ethernet header to determine the protocol family from + * + * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or + * BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, set and + * increases its refcount. + */ +static struct batadv_orig_node * +batadv_mcast_forw_ip_node_get(struct batadv_priv *bat_priv, + struct ethhdr *ethhdr) +{ + switch (ntohs(ethhdr->h_proto)) { + case ETH_P_IP: + return batadv_mcast_forw_ipv4_node_get(bat_priv); + case ETH_P_IPV6: + return batadv_mcast_forw_ipv6_node_get(bat_priv); + default: + /* we shouldn't be here... */ + return NULL; + } +} + +/** + * batadv_mcast_want_forw_unsnoop_node_get - get a node with an unsnoopable flag + * @bat_priv: the bat priv with all the soft interface information + * + * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag + * set and increases its refcount. + */ +static struct batadv_orig_node * +batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv) +{ + struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tmp_orig_node, + &bat_priv->mcast.want_all_unsnoopables_list, + mcast_want_all_unsnoopables_node) { + if (!atomic_inc_not_zero(&tmp_orig_node->refcount)) + continue; + + orig_node = tmp_orig_node; + break; + } + rcu_read_unlock(); + + return orig_node; +} + +/** + * batadv_mcast_forw_mode - check on how to forward a multicast packet + * @bat_priv: the bat priv with all the soft interface information + * @skb: The multicast packet to check + * @orig: an originator to be set to forward the skb to + * + * Returns the forwarding mode as enum batadv_forw_mode and in case of + * BATADV_FORW_SINGLE set the orig to the single originator the skb + * should be forwarded to. 
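+ *
+ * A hypothetical caller sketch (illustrative; error handling elided):
+ *
+ *	struct batadv_orig_node *orig = NULL;
+ *
+ *	switch (batadv_mcast_forw_mode(bat_priv, skb, &orig)) {
+ *	case BATADV_FORW_ALL:
+ *		... flood the frame as before ...
+ *		break;
+ *	case BATADV_FORW_SINGLE:
+ *		... unicast towards orig, then batadv_orig_node_free_ref(orig) ...
+ *		break;
+ *	case BATADV_FORW_NONE:
+ *		... drop the frame ...
+ *		break;
+ *	}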
+ */ +enum batadv_forw_mode +batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, + struct batadv_orig_node **orig) +{ + int ret, tt_count, ip_count, unsnoop_count, total_count; + bool is_unsnoopable = false; + struct ethhdr *ethhdr; + + ret = batadv_mcast_forw_mode_check(bat_priv, skb, &is_unsnoopable); + if (ret == -ENOMEM) + return BATADV_FORW_NONE; + else if (ret < 0) + return BATADV_FORW_ALL; + + ethhdr = eth_hdr(skb); + + tt_count = batadv_tt_global_hash_count(bat_priv, ethhdr->h_dest, + BATADV_NO_FLAGS); + ip_count = batadv_mcast_forw_want_all_ip_count(bat_priv, ethhdr); + unsnoop_count = !is_unsnoopable ? 0 : + atomic_read(&bat_priv->mcast.num_want_all_unsnoopables); + + total_count = tt_count + ip_count + unsnoop_count; + + switch (total_count) { + case 1: + if (tt_count) + *orig = batadv_mcast_forw_tt_node_get(bat_priv, ethhdr); + else if (ip_count) + *orig = batadv_mcast_forw_ip_node_get(bat_priv, ethhdr); + else if (unsnoop_count) + *orig = batadv_mcast_forw_unsnoop_node_get(bat_priv); + + if (*orig) + return BATADV_FORW_SINGLE; + + /* fall through */ + case 0: + return BATADV_FORW_NONE; + default: + return BATADV_FORW_ALL; + } +} + +/** + * batadv_mcast_want_unsnoop_update - update unsnoop counter and list + * @bat_priv: the bat priv with all the soft interface information + * @orig: the orig_node which multicast state might have changed of + * @mcast_flags: flags indicating the new multicast state + * + * If the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag of this originator, + * orig, has toggled then this method updates counter and list accordingly. + */ +static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig, + uint8_t mcast_flags) +{ + /* switched from flag unset to set */ + if (mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && + !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES)) { + atomic_inc(&bat_priv->mcast.num_want_all_unsnoopables); + + spin_lock_bh(&bat_priv->mcast.want_lists_lock); + hlist_add_head_rcu(&orig->mcast_want_all_unsnoopables_node, + &bat_priv->mcast.want_all_unsnoopables_list); + spin_unlock_bh(&bat_priv->mcast.want_lists_lock); + /* switched from flag set to unset */ + } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) && + orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) { + atomic_dec(&bat_priv->mcast.num_want_all_unsnoopables); + + spin_lock_bh(&bat_priv->mcast.want_lists_lock); + hlist_del_rcu(&orig->mcast_want_all_unsnoopables_node); + spin_unlock_bh(&bat_priv->mcast.want_lists_lock); + } +} + +/** + * batadv_mcast_want_ipv4_update - update want-all-ipv4 counter and list + * @bat_priv: the bat priv with all the soft interface information + * @orig: the orig_node which multicast state might have changed of + * @mcast_flags: flags indicating the new multicast state + * + * If the BATADV_MCAST_WANT_ALL_IPV4 flag of this originator, orig, has + * toggled then this method updates counter and list accordingly. 
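+ *
+ * In short (illustrative summary of the cases handled below):
+ *	flag newly set     -> num_want_all_ipv4++, add orig to want_all_ipv4_list
+ *	flag newly cleared -> num_want_all_ipv4--, remove orig from that list
+ *	flag unchanged     -> nothing to do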
+ */ +static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig, + uint8_t mcast_flags) +{ + /* switched from flag unset to set */ + if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4 && + !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4)) { + atomic_inc(&bat_priv->mcast.num_want_all_ipv4); + + spin_lock_bh(&bat_priv->mcast.want_lists_lock); + hlist_add_head_rcu(&orig->mcast_want_all_ipv4_node, + &bat_priv->mcast.want_all_ipv4_list); + spin_unlock_bh(&bat_priv->mcast.want_lists_lock); + /* switched from flag set to unset */ + } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) && + orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) { + atomic_dec(&bat_priv->mcast.num_want_all_ipv4); + + spin_lock_bh(&bat_priv->mcast.want_lists_lock); + hlist_del_rcu(&orig->mcast_want_all_ipv4_node); + spin_unlock_bh(&bat_priv->mcast.want_lists_lock); + } +} + +/** + * batadv_mcast_want_ipv6_update - update want-all-ipv6 counter and list + * @bat_priv: the bat priv with all the soft interface information + * @orig: the orig_node which multicast state might have changed of + * @mcast_flags: flags indicating the new multicast state + * + * If the BATADV_MCAST_WANT_ALL_IPV6 flag of this originator, orig, has + * toggled then this method updates counter and list accordingly. + */ +static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig, + uint8_t mcast_flags) +{ + /* switched from flag unset to set */ + if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6 && + !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6)) { + atomic_inc(&bat_priv->mcast.num_want_all_ipv6); + + spin_lock_bh(&bat_priv->mcast.want_lists_lock); + hlist_add_head_rcu(&orig->mcast_want_all_ipv6_node, + &bat_priv->mcast.want_all_ipv6_list); + spin_unlock_bh(&bat_priv->mcast.want_lists_lock); + /* switched from flag set to unset */ + } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) && + orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) { + atomic_dec(&bat_priv->mcast.num_want_all_ipv6); + + spin_lock_bh(&bat_priv->mcast.want_lists_lock); + hlist_del_rcu(&orig->mcast_want_all_ipv6_node); + spin_unlock_bh(&bat_priv->mcast.want_lists_lock); + } +} + +/** + * batadv_mcast_tvlv_ogm_handler_v1 - process incoming multicast tvlv container + * @bat_priv: the bat priv with all the soft interface information + * @orig: the orig_node of the ogm + * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) + * @tvlv_value: tvlv buffer containing the multicast data + * @tvlv_value_len: tvlv buffer length + */ +static void batadv_mcast_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig, + uint8_t flags, + void *tvlv_value, + uint16_t tvlv_value_len) +{ + bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND); + uint8_t mcast_flags = BATADV_NO_FLAGS; + bool orig_initialized; + + orig_initialized = orig->capa_initialized & BATADV_ORIG_CAPA_HAS_MCAST; + + /* If mcast support is turned on decrease the disabled mcast node + * counter only if we had increased it for this node before. If this + * is a completely new orig_node no need to decrease the counter. + */ + if (orig_mcast_enabled && + !(orig->capabilities & BATADV_ORIG_CAPA_HAS_MCAST)) { + if (orig_initialized) + atomic_dec(&bat_priv->mcast.num_disabled); + orig->capabilities |= BATADV_ORIG_CAPA_HAS_MCAST; + /* If mcast support is being switched off increase the disabled + * mcast node counter. 
+ */ + } else if (!orig_mcast_enabled && + orig->capabilities & BATADV_ORIG_CAPA_HAS_MCAST) { + atomic_inc(&bat_priv->mcast.num_disabled); + orig->capabilities &= ~BATADV_ORIG_CAPA_HAS_MCAST; + } + + orig->capa_initialized |= BATADV_ORIG_CAPA_HAS_MCAST; + + if (orig_mcast_enabled && tvlv_value && + (tvlv_value_len >= sizeof(mcast_flags))) + mcast_flags = *(uint8_t *)tvlv_value; + + batadv_mcast_want_unsnoop_update(bat_priv, orig, mcast_flags); + batadv_mcast_want_ipv4_update(bat_priv, orig, mcast_flags); + batadv_mcast_want_ipv6_update(bat_priv, orig, mcast_flags); + + orig->mcast_flags = mcast_flags; +} + +/** + * batadv_mcast_init - initialize the multicast optimizations structures + * @bat_priv: the bat priv with all the soft interface information + */ +void batadv_mcast_init(struct batadv_priv *bat_priv) +{ + batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler_v1, + NULL, BATADV_TVLV_MCAST, 1, + BATADV_TVLV_HANDLER_OGM_CIFNOTFND); +} + +/** + * batadv_mcast_free - free the multicast optimizations structures + * @bat_priv: the bat priv with all the soft interface information + */ +void batadv_mcast_free(struct batadv_priv *bat_priv) +{ + batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 1); + batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 1); + + batadv_mcast_mla_tt_retract(bat_priv, NULL); +} + +/** + * batadv_mcast_purge_orig - reset originator global mcast state modifications + * @orig: the originator which is going to get purged + */ +void batadv_mcast_purge_orig(struct batadv_orig_node *orig) +{ + struct batadv_priv *bat_priv = orig->bat_priv; + + if (!(orig->capabilities & BATADV_ORIG_CAPA_HAS_MCAST)) + atomic_dec(&bat_priv->mcast.num_disabled); + + batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS); + batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS); + batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS); +} diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h new file mode 100644 index 00000000000..73b5d45819c --- /dev/null +++ b/net/batman-adv/multicast.h @@ -0,0 +1,80 @@ +/* Copyright (C) 2014 B.A.T.M.A.N. contributors: + * + * Linus Lüssing + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef _NET_BATMAN_ADV_MULTICAST_H_ +#define _NET_BATMAN_ADV_MULTICAST_H_ + +/** + * batadv_forw_mode - the way a packet should be forwarded as + * @BATADV_FORW_ALL: forward the packet to all nodes (currently via classic + * flooding) + * @BATADV_FORW_SINGLE: forward the packet to a single node (currently via the + * BATMAN unicast routing protocol) + * @BATADV_FORW_NONE: don't forward, drop it + */ +enum batadv_forw_mode { + BATADV_FORW_ALL, + BATADV_FORW_SINGLE, + BATADV_FORW_NONE, +}; + +#ifdef CONFIG_BATMAN_ADV_MCAST + +void batadv_mcast_mla_update(struct batadv_priv *bat_priv); + +enum batadv_forw_mode +batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, + struct batadv_orig_node **mcast_single_orig); + +void batadv_mcast_init(struct batadv_priv *bat_priv); + +void batadv_mcast_free(struct batadv_priv *bat_priv); + +void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node); + +#else + +static inline void batadv_mcast_mla_update(struct batadv_priv *bat_priv) +{ + return; +} + +static inline enum batadv_forw_mode +batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, + struct batadv_orig_node **mcast_single_orig) +{ + return BATADV_FORW_ALL; +} + +static inline int batadv_mcast_init(struct batadv_priv *bat_priv) +{ + return 0; +} + +static inline void batadv_mcast_free(struct batadv_priv *bat_priv) +{ + return; +} + +static inline void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node) +{ + return; +} + +#endif /* CONFIG_BATMAN_ADV_MCAST */ + +#endif /* _NET_BATMAN_ADV_MULTICAST_H_ */ diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 351e199bc0a..8d04d174669 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2014 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ #include <linux/debugfs.h> @@ -88,6 +86,7 @@ static void batadv_nc_tvlv_container_update(struct batadv_priv *bat_priv) void batadv_nc_status_update(struct net_device *net_dev) { struct batadv_priv *bat_priv = netdev_priv(net_dev); + batadv_nc_tvlv_container_update(bat_priv); } @@ -720,9 +719,21 @@ static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_ogm_packet *ogm_packet) { - if (orig_node->last_real_seqno != ntohl(ogm_packet->seqno)) + struct batadv_orig_ifinfo *orig_ifinfo; + uint32_t last_real_seqno; + uint8_t last_ttl; + + orig_ifinfo = batadv_orig_ifinfo_get(orig_node, BATADV_IF_DEFAULT); + if (!orig_ifinfo) + return false; + + last_ttl = orig_ifinfo->last_ttl; + last_real_seqno = orig_ifinfo->last_real_seqno; + batadv_orig_ifinfo_free_ref(orig_ifinfo); + + if (last_real_seqno != ntohl(ogm_packet->seqno)) return false; - if (orig_node->last_ttl != ogm_packet->header.ttl + 1) + if (last_ttl != ogm_packet->ttl + 1) return false; if (!batadv_compare_eth(ogm_packet->orig, ogm_packet->prev_sender)) return false; @@ -809,7 +820,7 @@ static struct batadv_nc_node /* Initialize nc_node */ INIT_LIST_HEAD(&nc_node->list); - memcpy(nc_node->addr, orig_node->orig, ETH_ALEN); + ether_addr_copy(nc_node->addr, orig_node->orig); nc_node->orig_node = orig_neigh_node; atomic_set(&nc_node->refcount, 2); @@ -931,8 +942,8 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, spin_lock_init(&nc_path->packet_list_lock); atomic_set(&nc_path->refcount, 2); nc_path->last_valid = jiffies; - memcpy(nc_path->next_hop, dst, ETH_ALEN); - memcpy(nc_path->prev_hop, src, ETH_ALEN); + ether_addr_copy(nc_path->next_hop, dst); + ether_addr_copy(nc_path->prev_hop, src); batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_path %pM -> %pM\n", nc_path->prev_hop, @@ -1010,6 +1021,8 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, struct batadv_coded_packet *coded_packet; struct batadv_neigh_node *neigh_tmp, *router_neigh; struct batadv_neigh_node *router_coding = NULL; + struct batadv_neigh_ifinfo *router_neigh_ifinfo = NULL; + struct batadv_neigh_ifinfo *router_coding_ifinfo = NULL; uint8_t *first_source, *first_dest, *second_source, *second_dest; __be32 packet_id1, packet_id2; size_t count; @@ -1019,19 +1032,34 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, int coded_size = sizeof(*coded_packet); int header_add = coded_size - unicast_size; - router_neigh = batadv_orig_node_get_router(neigh_node->orig_node); + /* TODO: do we need to consider the outgoing interface for + * coded packets? 
+ */ + router_neigh = batadv_orig_router_get(neigh_node->orig_node, + BATADV_IF_DEFAULT); if (!router_neigh) goto out; + router_neigh_ifinfo = batadv_neigh_ifinfo_get(router_neigh, + BATADV_IF_DEFAULT); + if (!router_neigh_ifinfo) + goto out; + neigh_tmp = nc_packet->neigh_node; - router_coding = batadv_orig_node_get_router(neigh_tmp->orig_node); + router_coding = batadv_orig_router_get(neigh_tmp->orig_node, + BATADV_IF_DEFAULT); if (!router_coding) goto out; - tq_tmp = batadv_nc_random_weight_tq(router_neigh->bat_iv.tq_avg); - tq_weighted_neigh = tq_tmp; - tq_tmp = batadv_nc_random_weight_tq(router_coding->bat_iv.tq_avg); - tq_weighted_coding = tq_tmp; + router_coding_ifinfo = batadv_neigh_ifinfo_get(router_coding, + BATADV_IF_DEFAULT); + if (!router_coding_ifinfo) + goto out; + + tq_tmp = router_neigh_ifinfo->bat_iv.tq_avg; + tq_weighted_neigh = batadv_nc_random_weight_tq(tq_tmp); + tq_tmp = router_coding_ifinfo->bat_iv.tq_avg; + tq_weighted_coding = batadv_nc_random_weight_tq(tq_tmp); /* Select one destination for the MAC-header dst-field based on * weighted TQ-values. @@ -1082,22 +1110,22 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, coded_packet = (struct batadv_coded_packet *)skb_dest->data; skb_reset_mac_header(skb_dest); - coded_packet->header.packet_type = BATADV_CODED; - coded_packet->header.version = BATADV_COMPAT_VERSION; - coded_packet->header.ttl = packet1->header.ttl; + coded_packet->packet_type = BATADV_CODED; + coded_packet->version = BATADV_COMPAT_VERSION; + coded_packet->ttl = packet1->ttl; /* Info about first unicast packet */ - memcpy(coded_packet->first_source, first_source, ETH_ALEN); - memcpy(coded_packet->first_orig_dest, packet1->dest, ETH_ALEN); + ether_addr_copy(coded_packet->first_source, first_source); + ether_addr_copy(coded_packet->first_orig_dest, packet1->dest); coded_packet->first_crc = packet_id1; coded_packet->first_ttvn = packet1->ttvn; /* Info about second unicast packet */ - memcpy(coded_packet->second_dest, second_dest, ETH_ALEN); - memcpy(coded_packet->second_source, second_source, ETH_ALEN); - memcpy(coded_packet->second_orig_dest, packet2->dest, ETH_ALEN); + ether_addr_copy(coded_packet->second_dest, second_dest); + ether_addr_copy(coded_packet->second_source, second_source); + ether_addr_copy(coded_packet->second_orig_dest, packet2->dest); coded_packet->second_crc = packet_id2; - coded_packet->second_ttl = packet2->header.ttl; + coded_packet->second_ttl = packet2->ttl; coded_packet->second_ttvn = packet2->ttvn; coded_packet->coded_len = htons(coding_len); @@ -1155,6 +1183,10 @@ out: batadv_neigh_node_free_ref(router_neigh); if (router_coding) batadv_neigh_node_free_ref(router_coding); + if (router_neigh_ifinfo) + batadv_neigh_ifinfo_free_ref(router_neigh_ifinfo); + if (router_coding_ifinfo) + batadv_neigh_ifinfo_free_ref(router_coding_ifinfo); return res; } @@ -1312,14 +1344,14 @@ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv, struct ethhdr *ethhdr; /* Copy skb header to change the mac header */ - skb = pskb_copy(skb, GFP_ATOMIC); + skb = pskb_copy_for_clone(skb, GFP_ATOMIC); if (!skb) return; /* Set the mac header as if we actually sent the packet uncoded */ ethhdr = eth_hdr(skb); - memcpy(ethhdr->h_source, ethhdr->h_dest, ETH_ALEN); - memcpy(ethhdr->h_dest, eth_dst_new, ETH_ALEN); + ether_addr_copy(ethhdr->h_source, ethhdr->h_dest); + ether_addr_copy(ethhdr->h_dest, eth_dst_new); /* Set data pointer to MAC header to mimic packets from our tx path */ skb_push(skb, ETH_HLEN); @@ -1452,7 +1484,7 
@@ bool batadv_nc_skb_forward(struct sk_buff *skb, /* We only handle unicast packets */ payload = skb_network_header(skb); packet = (struct batadv_unicast_packet *)payload; - if (packet->header.packet_type != BATADV_UNICAST) + if (packet->packet_type != BATADV_UNICAST) goto out; /* Try to find a coding opportunity and send the skb if one is found */ @@ -1505,7 +1537,7 @@ void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, /* Check for supported packet type */ payload = skb_network_header(skb); packet = (struct batadv_unicast_packet *)payload; - if (packet->header.packet_type != BATADV_UNICAST) + if (packet->packet_type != BATADV_UNICAST) goto out; /* Find existing nc_path or create a new */ @@ -1605,7 +1637,7 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, /* Reconstruct original mac header */ ethhdr = eth_hdr(skb); - memcpy(ethhdr, ðhdr_tmp, sizeof(*ethhdr)); + *ethhdr = ethhdr_tmp; /* Select the correct unicast header information based on the location * of our mac address in the coded_packet header @@ -1615,7 +1647,7 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, * so the Ethernet address must be copied to h_dest and * pkt_type changed from PACKET_OTHERHOST to PACKET_HOST */ - memcpy(ethhdr->h_dest, coded_packet_tmp.second_dest, ETH_ALEN); + ether_addr_copy(ethhdr->h_dest, coded_packet_tmp.second_dest); skb->pkt_type = PACKET_HOST; orig_dest = coded_packet_tmp.second_orig_dest; @@ -1623,7 +1655,7 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, ttvn = coded_packet_tmp.second_ttvn; } else { orig_dest = coded_packet_tmp.first_orig_dest; - ttl = coded_packet_tmp.header.ttl; + ttl = coded_packet_tmp.ttl; ttvn = coded_packet_tmp.first_ttvn; } @@ -1648,10 +1680,10 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, /* Create decoded unicast packet */ unicast_packet = (struct batadv_unicast_packet *)skb->data; - unicast_packet->header.packet_type = BATADV_UNICAST; - unicast_packet->header.version = BATADV_COMPAT_VERSION; - unicast_packet->header.ttl = ttl; - memcpy(unicast_packet->dest, orig_dest, ETH_ALEN); + unicast_packet->packet_type = BATADV_UNICAST; + unicast_packet->version = BATADV_COMPAT_VERSION; + unicast_packet->ttl = ttl; + ether_addr_copy(unicast_packet->dest, orig_dest); unicast_packet->ttvn = ttvn; batadv_nc_packet_free(nc_packet); diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index d4fd315b526..358c0d686ab 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2014 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ #ifndef _NET_BATMAN_ADV_NETWORK_CODING_H_ @@ -64,7 +62,6 @@ static inline int batadv_nc_mesh_init(struct batadv_priv *bat_priv) static inline void batadv_nc_mesh_free(struct batadv_priv *bat_priv) { - return; } static inline void @@ -74,7 +71,6 @@ batadv_nc_update_nc_node(struct batadv_priv *bat_priv, struct batadv_ogm_packet *ogm_packet, int is_single_hop_neigh) { - return; } static inline void @@ -83,17 +79,14 @@ batadv_nc_purge_orig(struct batadv_priv *bat_priv, bool (*to_purge)(struct batadv_priv *, struct batadv_nc_node *)) { - return; } static inline void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv) { - return; } static inline void batadv_nc_init_orig(struct batadv_orig_node *orig_node) { - return; } static inline bool batadv_nc_skb_forward(struct sk_buff *skb, @@ -106,14 +99,12 @@ static inline void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, struct sk_buff *skb) { - return; } static inline void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, struct sk_buff *skb) { - return; } static inline int batadv_nc_nodes_seq_print_text(struct seq_file *seq, diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 8ab14340d10..6a484514cd3 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "main.h" @@ -29,6 +27,7 @@ #include "bridge_loop_avoidance.h" #include "network-coding.h" #include "fragmentation.h" +#include "multicast.h" /* hash class keys */ static struct lock_class_key batadv_orig_hash_lock_class_key; @@ -41,7 +40,7 @@ int batadv_compare_orig(const struct hlist_node *node, const void *data2) const void *data1 = container_of(node, struct batadv_orig_node, hash_entry); - return (memcmp(data1, data2, ETH_ALEN) == 0 ? 
1 : 0); + return batadv_compare_eth(data1, data2); } /** @@ -150,20 +149,114 @@ err: return -ENOMEM; } +/** + * batadv_neigh_ifinfo_free_rcu - free the neigh_ifinfo object + * @rcu: rcu pointer of the neigh_ifinfo object + */ +static void batadv_neigh_ifinfo_free_rcu(struct rcu_head *rcu) +{ + struct batadv_neigh_ifinfo *neigh_ifinfo; + + neigh_ifinfo = container_of(rcu, struct batadv_neigh_ifinfo, rcu); + + if (neigh_ifinfo->if_outgoing != BATADV_IF_DEFAULT) + batadv_hardif_free_ref_now(neigh_ifinfo->if_outgoing); + + kfree(neigh_ifinfo); +} + +/** + * batadv_neigh_ifinfo_free_now - decrement the refcounter and possibly free + * the neigh_ifinfo (without rcu callback) + * @neigh_ifinfo: the neigh_ifinfo object to release + */ +static void +batadv_neigh_ifinfo_free_ref_now(struct batadv_neigh_ifinfo *neigh_ifinfo) +{ + if (atomic_dec_and_test(&neigh_ifinfo->refcount)) + batadv_neigh_ifinfo_free_rcu(&neigh_ifinfo->rcu); +} + +/** + * batadv_neigh_ifinfo_free_ref - decrement the refcounter and possibly free + * the neigh_ifinfo + * @neigh_ifinfo: the neigh_ifinfo object to release + */ +void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo) +{ + if (atomic_dec_and_test(&neigh_ifinfo->refcount)) + call_rcu(&neigh_ifinfo->rcu, batadv_neigh_ifinfo_free_rcu); +} + +/** + * batadv_neigh_node_free_rcu - free the neigh_node + * @rcu: rcu pointer of the neigh_node + */ +static void batadv_neigh_node_free_rcu(struct rcu_head *rcu) +{ + struct hlist_node *node_tmp; + struct batadv_neigh_node *neigh_node; + struct batadv_neigh_ifinfo *neigh_ifinfo; + + neigh_node = container_of(rcu, struct batadv_neigh_node, rcu); + + hlist_for_each_entry_safe(neigh_ifinfo, node_tmp, + &neigh_node->ifinfo_list, list) { + batadv_neigh_ifinfo_free_ref_now(neigh_ifinfo); + } + batadv_hardif_free_ref_now(neigh_node->if_incoming); + + kfree(neigh_node); +} + +/** + * batadv_neigh_node_free_ref_now - decrement the neighbors refcounter + * and possibly free it (without rcu callback) + * @neigh_node: neigh neighbor to free + */ +static void +batadv_neigh_node_free_ref_now(struct batadv_neigh_node *neigh_node) +{ + if (atomic_dec_and_test(&neigh_node->refcount)) + batadv_neigh_node_free_rcu(&neigh_node->rcu); +} + +/** + * batadv_neigh_node_free_ref - decrement the neighbors refcounter + * and possibly free it + * @neigh_node: neigh neighbor to free + */ void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node) { if (atomic_dec_and_test(&neigh_node->refcount)) - kfree_rcu(neigh_node, rcu); + call_rcu(&neigh_node->rcu, batadv_neigh_node_free_rcu); } -/* increases the refcounter of a found router */ +/** + * batadv_orig_node_get_router - router to the originator depending on iface + * @orig_node: the orig node for the router + * @if_outgoing: the interface where the payload packet has been received or + * the OGM should be sent to + * + * Returns the neighbor which should be router for this orig_node/iface. + * + * The object is returned with refcounter increased by 1. 
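+ *
+ * Illustrative caller sketch (the reference must be dropped again):
+ *
+ *	router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT);
+ *	if (!router)
+ *		goto out;
+ *	... route via router ...
+ *	batadv_neigh_node_free_ref(router);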
+ */ struct batadv_neigh_node * -batadv_orig_node_get_router(struct batadv_orig_node *orig_node) +batadv_orig_router_get(struct batadv_orig_node *orig_node, + const struct batadv_hard_iface *if_outgoing) { - struct batadv_neigh_node *router; + struct batadv_orig_ifinfo *orig_ifinfo; + struct batadv_neigh_node *router = NULL; rcu_read_lock(); - router = rcu_dereference(orig_node->router); + hlist_for_each_entry_rcu(orig_ifinfo, &orig_node->ifinfo_list, list) { + if (orig_ifinfo->if_outgoing != if_outgoing) + continue; + + router = rcu_dereference(orig_ifinfo->router); + break; + } if (router && !atomic_inc_not_zero(&router->refcount)) router = NULL; @@ -173,6 +266,164 @@ batadv_orig_node_get_router(struct batadv_orig_node *orig_node) } /** + * batadv_orig_ifinfo_get - find the ifinfo from an orig_node + * @orig_node: the orig node to be queried + * @if_outgoing: the interface for which the ifinfo should be acquired + * + * Returns the requested orig_ifinfo or NULL if not found. + * + * The object is returned with refcounter increased by 1. + */ +struct batadv_orig_ifinfo * +batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_orig_ifinfo *tmp, *orig_ifinfo = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tmp, &orig_node->ifinfo_list, + list) { + if (tmp->if_outgoing != if_outgoing) + continue; + + if (!atomic_inc_not_zero(&tmp->refcount)) + continue; + + orig_ifinfo = tmp; + break; + } + rcu_read_unlock(); + + return orig_ifinfo; +} + +/** + * batadv_orig_ifinfo_new - search and possibly create an orig_ifinfo object + * @orig_node: the orig node to be queried + * @if_outgoing: the interface for which the ifinfo should be acquired + * + * Returns NULL in case of failure or the orig_ifinfo object for the if_outgoing + * interface otherwise. The object is created and added to the list + * if it does not exist. + * + * The object is returned with refcounter increased by 1. + */ +struct batadv_orig_ifinfo * +batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_orig_ifinfo *orig_ifinfo = NULL; + unsigned long reset_time; + + spin_lock_bh(&orig_node->neigh_list_lock); + + orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_outgoing); + if (orig_ifinfo) + goto out; + + orig_ifinfo = kzalloc(sizeof(*orig_ifinfo), GFP_ATOMIC); + if (!orig_ifinfo) + goto out; + + if (if_outgoing != BATADV_IF_DEFAULT && + !atomic_inc_not_zero(&if_outgoing->refcount)) { + kfree(orig_ifinfo); + orig_ifinfo = NULL; + goto out; + } + + reset_time = jiffies - 1; + reset_time -= msecs_to_jiffies(BATADV_RESET_PROTECTION_MS); + orig_ifinfo->batman_seqno_reset = reset_time; + orig_ifinfo->if_outgoing = if_outgoing; + INIT_HLIST_NODE(&orig_ifinfo->list); + atomic_set(&orig_ifinfo->refcount, 2); + hlist_add_head_rcu(&orig_ifinfo->list, + &orig_node->ifinfo_list); +out: + spin_unlock_bh(&orig_node->neigh_list_lock); + return orig_ifinfo; +} + +/** + * batadv_neigh_ifinfo_get - find the ifinfo from an neigh_node + * @neigh_node: the neigh node to be queried + * @if_outgoing: the interface for which the ifinfo should be acquired + * + * The object is returned with refcounter increased by 1. 
+ * + * Returns the requested neigh_ifinfo or NULL if not found + */ +struct batadv_neigh_ifinfo * +batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_neigh_ifinfo *neigh_ifinfo = NULL, + *tmp_neigh_ifinfo; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tmp_neigh_ifinfo, &neigh->ifinfo_list, + list) { + if (tmp_neigh_ifinfo->if_outgoing != if_outgoing) + continue; + + if (!atomic_inc_not_zero(&tmp_neigh_ifinfo->refcount)) + continue; + + neigh_ifinfo = tmp_neigh_ifinfo; + break; + } + rcu_read_unlock(); + + return neigh_ifinfo; +} + +/** + * batadv_neigh_ifinfo_new - search and possibly create an neigh_ifinfo object + * @neigh_node: the neigh node to be queried + * @if_outgoing: the interface for which the ifinfo should be acquired + * + * Returns NULL in case of failure or the neigh_ifinfo object for the + * if_outgoing interface otherwise. The object is created and added to the list + * if it does not exist. + * + * The object is returned with refcounter increased by 1. + */ +struct batadv_neigh_ifinfo * +batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_neigh_ifinfo *neigh_ifinfo; + + spin_lock_bh(&neigh->ifinfo_lock); + + neigh_ifinfo = batadv_neigh_ifinfo_get(neigh, if_outgoing); + if (neigh_ifinfo) + goto out; + + neigh_ifinfo = kzalloc(sizeof(*neigh_ifinfo), GFP_ATOMIC); + if (!neigh_ifinfo) + goto out; + + if (if_outgoing && !atomic_inc_not_zero(&if_outgoing->refcount)) { + kfree(neigh_ifinfo); + neigh_ifinfo = NULL; + goto out; + } + + INIT_HLIST_NODE(&neigh_ifinfo->list); + atomic_set(&neigh_ifinfo->refcount, 2); + neigh_ifinfo->if_outgoing = if_outgoing; + + hlist_add_head_rcu(&neigh_ifinfo->list, &neigh->ifinfo_list); + +out: + spin_unlock_bh(&neigh->ifinfo_lock); + + return neigh_ifinfo; +} + +/** * batadv_neigh_node_new - create and init a new neigh_node object * @hard_iface: the interface where the neighbour is connected to * @neigh_addr: the mac address of the neighbour interface @@ -193,13 +444,13 @@ batadv_neigh_node_new(struct batadv_hard_iface *hard_iface, goto out; INIT_HLIST_NODE(&neigh_node->list); + INIT_HLIST_HEAD(&neigh_node->ifinfo_list); + spin_lock_init(&neigh_node->ifinfo_lock); - memcpy(neigh_node->addr, neigh_addr, ETH_ALEN); + ether_addr_copy(neigh_node->addr, neigh_addr); neigh_node->if_incoming = hard_iface; neigh_node->orig_node = orig_node; - INIT_LIST_HEAD(&neigh_node->bonding_list); - /* extra reference for return */ atomic_set(&neigh_node->refcount, 2); @@ -207,32 +458,113 @@ out: return neigh_node; } +/** + * batadv_neigh_node_get - retrieve a neighbour from the list + * @orig_node: originator which the neighbour belongs to + * @hard_iface: the interface where this neighbour is connected to + * @addr: the address of the neighbour + * + * Looks for and possibly returns a neighbour belonging to this originator list + * which is connected through the provided hard interface. + * Returns NULL if the neighbour is not found. 
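+ *
+ * Typical lookup-or-create sketch (illustrative, not part of this patch;
+ * neigh_addr is a hypothetical MAC address variable):
+ *
+ *	neigh = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr);
+ *	if (!neigh)
+ *		neigh = batadv_neigh_node_new(hard_iface, neigh_addr, orig_node);
+ *	if (neigh)
+ *		... use neigh, then batadv_neigh_node_free_ref(neigh) ...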
+ */ +struct batadv_neigh_node * +batadv_neigh_node_get(const struct batadv_orig_node *orig_node, + const struct batadv_hard_iface *hard_iface, + const uint8_t *addr) +{ + struct batadv_neigh_node *tmp_neigh_node, *res = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tmp_neigh_node, &orig_node->neigh_list, list) { + if (!batadv_compare_eth(tmp_neigh_node->addr, addr)) + continue; + + if (tmp_neigh_node->if_incoming != hard_iface) + continue; + + if (!atomic_inc_not_zero(&tmp_neigh_node->refcount)) + continue; + + res = tmp_neigh_node; + break; + } + rcu_read_unlock(); + + return res; +} + +/** + * batadv_orig_ifinfo_free_rcu - free the orig_ifinfo object + * @rcu: rcu pointer of the orig_ifinfo object + */ +static void batadv_orig_ifinfo_free_rcu(struct rcu_head *rcu) +{ + struct batadv_orig_ifinfo *orig_ifinfo; + struct batadv_neigh_node *router; + + orig_ifinfo = container_of(rcu, struct batadv_orig_ifinfo, rcu); + + if (orig_ifinfo->if_outgoing != BATADV_IF_DEFAULT) + batadv_hardif_free_ref_now(orig_ifinfo->if_outgoing); + + /* this is the last reference to this object */ + router = rcu_dereference_protected(orig_ifinfo->router, true); + if (router) + batadv_neigh_node_free_ref_now(router); + kfree(orig_ifinfo); +} + +/** + * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly free + * the orig_ifinfo (without rcu callback) + * @orig_ifinfo: the orig_ifinfo object to release + */ +static void +batadv_orig_ifinfo_free_ref_now(struct batadv_orig_ifinfo *orig_ifinfo) +{ + if (atomic_dec_and_test(&orig_ifinfo->refcount)) + batadv_orig_ifinfo_free_rcu(&orig_ifinfo->rcu); +} + +/** + * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly free + * the orig_ifinfo + * @orig_ifinfo: the orig_ifinfo object to release + */ +void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo) +{ + if (atomic_dec_and_test(&orig_ifinfo->refcount)) + call_rcu(&orig_ifinfo->rcu, batadv_orig_ifinfo_free_rcu); +} + static void batadv_orig_node_free_rcu(struct rcu_head *rcu) { struct hlist_node *node_tmp; - struct batadv_neigh_node *neigh_node, *tmp_neigh_node; + struct batadv_neigh_node *neigh_node; struct batadv_orig_node *orig_node; + struct batadv_orig_ifinfo *orig_ifinfo; orig_node = container_of(rcu, struct batadv_orig_node, rcu); spin_lock_bh(&orig_node->neigh_list_lock); - /* for all bonding members ... */ - list_for_each_entry_safe(neigh_node, tmp_neigh_node, - &orig_node->bond_list, bonding_list) { - list_del_rcu(&neigh_node->bonding_list); - batadv_neigh_node_free_ref(neigh_node); - } - /* for all neighbors towards this originator ... 
*/ hlist_for_each_entry_safe(neigh_node, node_tmp, &orig_node->neigh_list, list) { hlist_del_rcu(&neigh_node->list); - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_free_ref_now(neigh_node); } + hlist_for_each_entry_safe(orig_ifinfo, node_tmp, + &orig_node->ifinfo_list, list) { + hlist_del_rcu(&orig_ifinfo->list); + batadv_orig_ifinfo_free_ref_now(orig_ifinfo); + } spin_unlock_bh(&orig_node->neigh_list_lock); + batadv_mcast_purge_orig(orig_node); + /* Free nc_nodes */ batadv_nc_purge_orig(orig_node->bat_priv, orig_node, NULL); @@ -327,8 +659,8 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, return NULL; INIT_HLIST_HEAD(&orig_node->neigh_list); - INIT_LIST_HEAD(&orig_node->bond_list); INIT_LIST_HEAD(&orig_node->vlan_list); + INIT_HLIST_HEAD(&orig_node->ifinfo_list); spin_lock_init(&orig_node->bcast_seqno_lock); spin_lock_init(&orig_node->neigh_list_lock); spin_lock_init(&orig_node->tt_buff_lock); @@ -340,19 +672,17 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, /* extra reference for return */ atomic_set(&orig_node->refcount, 2); - orig_node->tt_initialised = false; orig_node->bat_priv = bat_priv; - memcpy(orig_node->orig, addr, ETH_ALEN); + ether_addr_copy(orig_node->orig, addr); batadv_dat_init_orig_node_addr(orig_node); - orig_node->router = NULL; atomic_set(&orig_node->last_ttvn, 0); orig_node->tt_buff = NULL; orig_node->tt_buff_len = 0; reset_time = jiffies - 1 - msecs_to_jiffies(BATADV_RESET_PROTECTION_MS); orig_node->bcast_seqno_reset = reset_time; - orig_node->batman_seqno_reset = reset_time; - - atomic_set(&orig_node->bond_candidates, 0); +#ifdef CONFIG_BATMAN_ADV_MCAST + orig_node->mcast_flags = BATADV_NO_FLAGS; +#endif /* create a vlan object for the "untagged" LAN */ vlan = batadv_orig_node_vlan_new(orig_node, BATADV_NO_FLAGS); @@ -376,20 +706,117 @@ free_orig_node: return NULL; } +/** + * batadv_purge_neigh_ifinfo - purge obsolete ifinfo entries from neighbor + * @bat_priv: the bat priv with all the soft interface information + * @neigh: orig node which is to be checked + */ +static void +batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv, + struct batadv_neigh_node *neigh) +{ + struct batadv_neigh_ifinfo *neigh_ifinfo; + struct batadv_hard_iface *if_outgoing; + struct hlist_node *node_tmp; + + spin_lock_bh(&neigh->ifinfo_lock); + + /* for all ifinfo objects for this neighinator */ + hlist_for_each_entry_safe(neigh_ifinfo, node_tmp, + &neigh->ifinfo_list, list) { + if_outgoing = neigh_ifinfo->if_outgoing; + + /* always keep the default interface */ + if (if_outgoing == BATADV_IF_DEFAULT) + continue; + + /* don't purge if the interface is not (going) down */ + if ((if_outgoing->if_status != BATADV_IF_INACTIVE) && + (if_outgoing->if_status != BATADV_IF_NOT_IN_USE) && + (if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED)) + continue; + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "neighbor/ifinfo purge: neighbor %pM, iface: %s\n", + neigh->addr, if_outgoing->net_dev->name); + + hlist_del_rcu(&neigh_ifinfo->list); + batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + } + + spin_unlock_bh(&neigh->ifinfo_lock); +} + +/** + * batadv_purge_orig_ifinfo - purge obsolete ifinfo entries from originator + * @bat_priv: the bat priv with all the soft interface information + * @orig_node: orig node which is to be checked + * + * Returns true if any ifinfo entry was purged, false otherwise. 
+ */ +static bool +batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig_node) +{ + struct batadv_orig_ifinfo *orig_ifinfo; + struct batadv_hard_iface *if_outgoing; + struct hlist_node *node_tmp; + bool ifinfo_purged = false; + + spin_lock_bh(&orig_node->neigh_list_lock); + + /* for all ifinfo objects for this originator */ + hlist_for_each_entry_safe(orig_ifinfo, node_tmp, + &orig_node->ifinfo_list, list) { + if_outgoing = orig_ifinfo->if_outgoing; + + /* always keep the default interface */ + if (if_outgoing == BATADV_IF_DEFAULT) + continue; + + /* don't purge if the interface is not (going) down */ + if ((if_outgoing->if_status != BATADV_IF_INACTIVE) && + (if_outgoing->if_status != BATADV_IF_NOT_IN_USE) && + (if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED)) + continue; + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "router/ifinfo purge: originator %pM, iface: %s\n", + orig_node->orig, if_outgoing->net_dev->name); + + ifinfo_purged = true; + + hlist_del_rcu(&orig_ifinfo->list); + batadv_orig_ifinfo_free_ref(orig_ifinfo); + if (orig_node->last_bonding_candidate == orig_ifinfo) { + orig_node->last_bonding_candidate = NULL; + batadv_orig_ifinfo_free_ref(orig_ifinfo); + } + } + + spin_unlock_bh(&orig_node->neigh_list_lock); + + return ifinfo_purged; +} + + +/** + * batadv_purge_orig_neighbors - purges neighbors from originator + * @bat_priv: the bat priv with all the soft interface information + * @orig_node: orig node which is to be checked + * + * Returns true if any neighbor was purged, false otherwise + */ static bool batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_neigh_node **best_neigh) + struct batadv_orig_node *orig_node) { - struct batadv_algo_ops *bao = bat_priv->bat_algo_ops; struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; bool neigh_purged = false; unsigned long last_seen; struct batadv_hard_iface *if_incoming; - *best_neigh = NULL; - spin_lock_bh(&orig_node->neigh_list_lock); /* for all neighbors towards this originator ... */ @@ -418,15 +845,12 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, neigh_purged = true; hlist_del_rcu(&neigh_node->list); - batadv_bonding_candidate_del(orig_node, neigh_node); batadv_neigh_node_free_ref(neigh_node); } else { - /* store the best_neighbour if this is the first - * iteration or if a better neighbor has been found + /* only necessary if not the whole neighbor is to be + * deleted, but some interface has been removed. */ - if (!*best_neigh || - bao->bat_neigh_cmp(neigh_node, *best_neigh) > 0) - *best_neigh = neigh_node; + batadv_purge_neigh_ifinfo(bat_priv, neigh_node); } } @@ -434,10 +858,57 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, return neigh_purged; } +/** + * batadv_find_best_neighbor - finds the best neighbor after purging + * @bat_priv: the bat priv with all the soft interface information + * @orig_node: orig node which is to be checked + * @if_outgoing: the interface for which the metric should be compared + * + * Returns the current best neighbor, with refcount increased. 
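+ *
+ * The reference is dropped by the caller, e.g. (illustrative, mirroring
+ * the purge path further below):
+ *
+ *	best = batadv_find_best_neighbor(bat_priv, orig_node, BATADV_IF_DEFAULT);
+ *	batadv_update_route(bat_priv, orig_node, BATADV_IF_DEFAULT, best);
+ *	if (best)
+ *		batadv_neigh_node_free_ref(best);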
+ */ +static struct batadv_neigh_node * +batadv_find_best_neighbor(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig_node, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_neigh_node *best = NULL, *neigh; + struct batadv_algo_ops *bao = bat_priv->bat_algo_ops; + + rcu_read_lock(); + hlist_for_each_entry_rcu(neigh, &orig_node->neigh_list, list) { + if (best && (bao->bat_neigh_cmp(neigh, if_outgoing, + best, if_outgoing) <= 0)) + continue; + + if (!atomic_inc_not_zero(&neigh->refcount)) + continue; + + if (best) + batadv_neigh_node_free_ref(best); + + best = neigh; + } + rcu_read_unlock(); + + return best; +} + +/** + * batadv_purge_orig_node - purges obsolete information from an orig_node + * @bat_priv: the bat priv with all the soft interface information + * @orig_node: orig node which is to be checked + * + * This function checks if the orig_node or substructures of it have become + * obsolete, and purges this information if that's the case. + * + * Returns true if the orig_node is to be removed, false otherwise. + */ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_neigh_node *best_neigh_node; + struct batadv_hard_iface *hard_iface; + bool changed_ifinfo, changed_neigh; if (batadv_has_timed_out(orig_node->last_seen, 2 * BATADV_PURGE_TIMEOUT)) { @@ -446,12 +917,39 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, orig_node->orig, jiffies_to_msecs(orig_node->last_seen)); return true; - } else { - if (batadv_purge_orig_neighbors(bat_priv, orig_node, - &best_neigh_node)) - batadv_update_route(bat_priv, orig_node, - best_neigh_node); } + changed_ifinfo = batadv_purge_orig_ifinfo(bat_priv, orig_node); + changed_neigh = batadv_purge_orig_neighbors(bat_priv, orig_node); + + if (!changed_ifinfo && !changed_neigh) + return false; + + /* first for NULL ... */ + best_neigh_node = batadv_find_best_neighbor(bat_priv, orig_node, + BATADV_IF_DEFAULT); + batadv_update_route(bat_priv, orig_node, BATADV_IF_DEFAULT, + best_neigh_node); + if (best_neigh_node) + batadv_neigh_node_free_ref(best_neigh_node); + + /* ... then for all other interfaces. 
*/ + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->if_status != BATADV_IF_ACTIVE) + continue; + + if (hard_iface->soft_iface != bat_priv->soft_iface) + continue; + + best_neigh_node = batadv_find_best_neighbor(bat_priv, + orig_node, + hard_iface); + batadv_update_route(bat_priv, orig_node, hard_iface, + best_neigh_node); + if (best_neigh_node) + batadv_neigh_node_free_ref(best_neigh_node); + } + rcu_read_unlock(); return false; } @@ -534,11 +1032,58 @@ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset) return 0; } - bat_priv->bat_algo_ops->bat_orig_print(bat_priv, seq); + bat_priv->bat_algo_ops->bat_orig_print(bat_priv, seq, + BATADV_IF_DEFAULT); return 0; } +/** + * batadv_orig_hardif_seq_print_text - writes originator infos for a specific + * outgoing interface + * @seq: debugfs table seq_file struct + * @offset: not used + * + * Returns 0 + */ +int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset) +{ + struct net_device *net_dev = (struct net_device *)seq->private; + struct batadv_hard_iface *hard_iface; + struct batadv_priv *bat_priv; + + hard_iface = batadv_hardif_get_by_netdev(net_dev); + + if (!hard_iface || !hard_iface->soft_iface) { + seq_puts(seq, "Interface not known to B.A.T.M.A.N.\n"); + goto out; + } + + bat_priv = netdev_priv(hard_iface->soft_iface); + if (!bat_priv->bat_algo_ops->bat_orig_print) { + seq_puts(seq, + "No printing function for this routing protocol\n"); + goto out; + } + + if (hard_iface->if_status != BATADV_IF_ACTIVE) { + seq_puts(seq, "Interface not active\n"); + goto out; + } + + seq_printf(seq, "[B.A.T.M.A.N. adv %s, IF/MAC: %s/%pM (%s %s)]\n", + BATADV_SOURCE_VERSION, hard_iface->net_dev->name, + hard_iface->net_dev->dev_addr, + hard_iface->soft_iface->name, bat_priv->bat_algo_ops->name); + + bat_priv->bat_algo_ops->bat_orig_print(bat_priv, seq, hard_iface); + +out: + if (hard_iface) + batadv_hardif_free_ref(hard_iface); + return 0; +} + int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, int max_if_num) { diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 6f77d808a91..db3a9ed734c 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ #ifndef _NET_BATMAN_ADV_ORIGINATOR_H_ @@ -31,13 +29,35 @@ void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node); struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, const uint8_t *addr); struct batadv_neigh_node * +batadv_neigh_node_get(const struct batadv_orig_node *orig_node, + const struct batadv_hard_iface *hard_iface, + const uint8_t *addr); +struct batadv_neigh_node * batadv_neigh_node_new(struct batadv_hard_iface *hard_iface, const uint8_t *neigh_addr, struct batadv_orig_node *orig_node); void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node); struct batadv_neigh_node * -batadv_orig_node_get_router(struct batadv_orig_node *orig_node); +batadv_orig_router_get(struct batadv_orig_node *orig_node, + const struct batadv_hard_iface *if_outgoing); +struct batadv_neigh_ifinfo * +batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, + struct batadv_hard_iface *if_outgoing); +struct batadv_neigh_ifinfo * +batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, + struct batadv_hard_iface *if_outgoing); +void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo); + +struct batadv_orig_ifinfo * +batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, + struct batadv_hard_iface *if_outgoing); +struct batadv_orig_ifinfo * +batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, + struct batadv_hard_iface *if_outgoing); +void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo); + int batadv_orig_seq_print_text(struct seq_file *seq, void *offset); +int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset); int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, int max_if_num); int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface, diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index 207459b6296..34e096d2dce 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_PACKET_H_ @@ -91,6 +89,19 @@ enum batadv_icmp_packettype { BATADV_PARAMETER_PROBLEM = 12, }; +/** + * enum batadv_mcast_flags - flags for multicast capabilities and settings + * @BATADV_MCAST_WANT_ALL_UNSNOOPABLES: we want all packets destined for + * 224.0.0.0/24 or ff02::1 + * @BATADV_MCAST_WANT_ALL_IPV4: we want all IPv4 multicast packets + * @BATADV_MCAST_WANT_ALL_IPV6: we want all IPv6 multicast packets + */ +enum batadv_mcast_flags { + BATADV_MCAST_WANT_ALL_UNSNOOPABLES = BIT(0), + BATADV_MCAST_WANT_ALL_IPV4 = BIT(1), + BATADV_MCAST_WANT_ALL_IPV6 = BIT(2), +}; + /* tt data subtypes */ #define BATADV_TT_DATA_TYPE_MASK 0x0F @@ -108,15 +119,36 @@ enum batadv_tt_data_flags { BATADV_TT_FULL_TABLE = BIT(4), }; -/* BATADV_TT_CLIENT flags. - * Flags from BIT(0) to BIT(7) are sent on the wire, while flags from BIT(8) to - * BIT(15) are used for local computation only. - * Flags from BIT(4) to BIT(7) are kept in sync with the rest of the network. 
+/** + * enum batadv_tt_client_flags - TT client specific flags + * @BATADV_TT_CLIENT_DEL: the client has to be deleted from the table + * @BATADV_TT_CLIENT_ROAM: the client roamed to/from another node and the new + * update telling its new real location has not been received/sent yet + * @BATADV_TT_CLIENT_WIFI: this client is connected through a wifi interface. + * This information is used by the "AP Isolation" feature + * @BATADV_TT_CLIENT_ISOLA: this client is considered "isolated". This + * information is used by the Extended Isolation feature + * @BATADV_TT_CLIENT_NOPURGE: this client should never be removed from the table + * @BATADV_TT_CLIENT_NEW: this client has been added to the local table but has + * not been announced yet + * @BATADV_TT_CLIENT_PENDING: this client is marked for removal but it is kept + * in the table for one more originator interval for consistency purposes + * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be part of + * the network but no nnode has already announced it + * + * Bits from 0 to 7 are called _remote flags_ because they are sent on the wire. + * Bits from 8 to 15 are called _local flags_ because they are used for local + * computations only. + * + * Bits from 4 to 7 - a subset of remote flags - are ensured to be in sync with + * the other nodes in the network. To achieve this goal these flags are included + * in the TT CRC computation. */ enum batadv_tt_client_flags { BATADV_TT_CLIENT_DEL = BIT(0), BATADV_TT_CLIENT_ROAM = BIT(1), BATADV_TT_CLIENT_WIFI = BIT(4), + BATADV_TT_CLIENT_ISOLA = BIT(5), BATADV_TT_CLIENT_NOPURGE = BIT(8), BATADV_TT_CLIENT_NEW = BIT(9), BATADV_TT_CLIENT_PENDING = BIT(10), @@ -146,6 +178,7 @@ enum batadv_bla_claimframe { * @BATADV_TVLV_NC: network coding tvlv * @BATADV_TVLV_TT: translation table tvlv * @BATADV_TVLV_ROAM: roaming advertisement tvlv + * @BATADV_TVLV_MCAST: multicast capability tvlv */ enum batadv_tvlv_type { BATADV_TVLV_GW = 0x01, @@ -153,8 +186,10 @@ enum batadv_tvlv_type { BATADV_TVLV_NC = 0x03, BATADV_TVLV_TT = 0x04, BATADV_TVLV_ROAM = 0x05, + BATADV_TVLV_MCAST = 0x06, }; +#pragma pack(2) /* the destination hardware field in the ARP frame is used to * transport the claim type and the group id */ @@ -163,24 +198,20 @@ struct batadv_bla_claim_dst { uint8_t type; /* bla_claimframe */ __be16 group; /* group id */ }; - -struct batadv_header { - uint8_t packet_type; - uint8_t version; /* batman version field */ - uint8_t ttl; - /* the parent struct has to add a byte after the header to make - * everything 4 bytes aligned again - */ -}; +#pragma pack() /** * struct batadv_ogm_packet - ogm (routing protocol) packet - * @header: common batman packet header + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @ttl: time to live for this packet, part of the genereal header * @flags: contains routing relevant flags - see enum batadv_iv_flags * @tvlv_len: length of tvlv data following the ogm header */ struct batadv_ogm_packet { - struct batadv_header header; + uint8_t packet_type; + uint8_t version; + uint8_t ttl; uint8_t flags; __be32 seqno; uint8_t orig[ETH_ALEN]; @@ -196,29 +227,51 @@ struct batadv_ogm_packet { #define BATADV_OGM_HLEN sizeof(struct batadv_ogm_packet) /** - * batadv_icmp_header - common ICMP header - * @header: common batman header + * batadv_icmp_header - common members among all the ICMP packets + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv 
protocol version, part of the genereal header + * @ttl: time to live for this packet, part of the genereal header * @msg_type: ICMP packet type * @dst: address of the destination node * @orig: address of the source node * @uid: local ICMP socket identifier + * @align: not used - useful for alignment purposes only + * + * This structure is used for ICMP packets parsing only and it is never sent + * over the wire. The alignment field at the end is there to ensure that + * members are padded the same way as they are in real packets. */ struct batadv_icmp_header { - struct batadv_header header; + uint8_t packet_type; + uint8_t version; + uint8_t ttl; uint8_t msg_type; /* see ICMP message types above */ uint8_t dst[ETH_ALEN]; uint8_t orig[ETH_ALEN]; uint8_t uid; + uint8_t align[3]; }; /** * batadv_icmp_packet - ICMP packet - * @icmph: common ICMP header + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @ttl: time to live for this packet, part of the genereal header + * @msg_type: ICMP packet type + * @dst: address of the destination node + * @orig: address of the source node + * @uid: local ICMP socket identifier * @reserved: not used - useful for alignment * @seqno: ICMP sequence number */ struct batadv_icmp_packet { - struct batadv_icmp_header icmph; + uint8_t packet_type; + uint8_t version; + uint8_t ttl; + uint8_t msg_type; /* see ICMP message types above */ + uint8_t dst[ETH_ALEN]; + uint8_t orig[ETH_ALEN]; + uint8_t uid; uint8_t reserved; __be16 seqno; }; @@ -227,13 +280,25 @@ struct batadv_icmp_packet { /** * batadv_icmp_packet_rr - ICMP RouteRecord packet - * @icmph: common ICMP header + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @ttl: time to live for this packet, part of the genereal header + * @msg_type: ICMP packet type + * @dst: address of the destination node + * @orig: address of the source node + * @uid: local ICMP socket identifier * @rr_cur: number of entries the rr array * @seqno: ICMP sequence number * @rr: route record array */ struct batadv_icmp_packet_rr { - struct batadv_icmp_header icmph; + uint8_t packet_type; + uint8_t version; + uint8_t ttl; + uint8_t msg_type; /* see ICMP message types above */ + uint8_t dst[ETH_ALEN]; + uint8_t orig[ETH_ALEN]; + uint8_t uid; uint8_t rr_cur; __be16 seqno; uint8_t rr[BATADV_RR_LEN][ETH_ALEN]; @@ -253,8 +318,18 @@ struct batadv_icmp_packet_rr { */ #pragma pack(2) +/** + * struct batadv_unicast_packet - unicast packet for network payload + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @ttl: time to live for this packet, part of the genereal header + * @ttvn: translation table version number + * @dest: originator destination of the unicast packet + */ struct batadv_unicast_packet { - struct batadv_header header; + uint8_t packet_type; + uint8_t version; + uint8_t ttl; uint8_t ttvn; /* destination translation table version number */ uint8_t dest[ETH_ALEN]; /* "4 bytes boundary + 2 bytes" long to make the payload after the @@ -280,7 +355,9 @@ struct batadv_unicast_4addr_packet { /** * struct batadv_frag_packet - fragmented packet - * @header: common batman packet header with type, compatversion, and ttl + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @ttl: time 
to live for this packet, part of the genereal header * @dest: final destination used when routing fragments * @orig: originator of the fragment used when merging the packet * @no: fragment number within this sequence @@ -289,7 +366,9 @@ struct batadv_unicast_4addr_packet { * @total_size: size of the merged packet */ struct batadv_frag_packet { - struct batadv_header header; + uint8_t packet_type; + uint8_t version; /* batman version field */ + uint8_t ttl; #if defined(__BIG_ENDIAN_BITFIELD) uint8_t no:4; uint8_t reserved:4; @@ -305,8 +384,19 @@ struct batadv_frag_packet { __be16 total_size; }; +/** + * struct batadv_bcast_packet - broadcast packet for network payload + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @ttl: time to live for this packet, part of the genereal header + * @reserved: reserved byte for alignment + * @seqno: sequence identification + * @orig: originator of the broadcast packet + */ struct batadv_bcast_packet { - struct batadv_header header; + uint8_t packet_type; + uint8_t version; /* batman version field */ + uint8_t ttl; uint8_t reserved; __be32 seqno; uint8_t orig[ETH_ALEN]; @@ -315,11 +405,11 @@ struct batadv_bcast_packet { */ }; -#pragma pack() - /** * struct batadv_coded_packet - network coded packet - * @header: common batman packet header and ttl of first included packet + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @ttl: time to live for this packet, part of the genereal header * @reserved: Align following fields to 2-byte boundaries * @first_source: original source of first included packet * @first_orig_dest: original destinal of first included packet @@ -334,7 +424,9 @@ struct batadv_bcast_packet { * @coded_len: length of network coded part of the payload */ struct batadv_coded_packet { - struct batadv_header header; + uint8_t packet_type; + uint8_t version; /* batman version field */ + uint8_t ttl; uint8_t first_ttvn; /* uint8_t first_dest[ETH_ALEN]; - saved in mac header destination */ uint8_t first_source[ETH_ALEN]; @@ -349,9 +441,13 @@ struct batadv_coded_packet { __be16 coded_len; }; +#pragma pack() + /** * struct batadv_unicast_tvlv - generic unicast packet with tvlv payload - * @header: common batman packet header + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @ttl: time to live for this packet, part of the genereal header * @reserved: reserved field (for packet alignment) * @src: address of the source * @dst: address of the destination @@ -359,7 +455,9 @@ struct batadv_coded_packet { * @align: 2 bytes to align the header to a 4 byte boundry */ struct batadv_unicast_tvlv_packet { - struct batadv_header header; + uint8_t packet_type; + uint8_t version; /* batman version field */ + uint8_t ttl; uint8_t reserved; uint8_t dst[ETH_ALEN]; uint8_t src[ETH_ALEN]; @@ -420,13 +518,13 @@ struct batadv_tvlv_tt_vlan_data { * struct batadv_tvlv_tt_change - translation table diff data * @flags: status indicators concerning the non-mesh client (see * batadv_tt_client_flags) - * @reserved: reserved field + * @reserved: reserved field - useful for alignment purposes only * @addr: mac address of non-mesh client that triggered this tt change * @vid: VLAN identifier */ struct batadv_tvlv_tt_change { uint8_t flags; - uint8_t reserved; + uint8_t reserved[3]; uint8_t addr[ETH_ALEN]; 
__be16 vid; }; @@ -441,4 +539,14 @@ struct batadv_tvlv_roam_adv { __be16 vid; }; +/** + * struct batadv_tvlv_mcast_data - payload of a multicast tvlv + * @flags: multicast flags announced by the orig node + * @reserved: reserved field + */ +struct batadv_tvlv_mcast_data { + uint8_t flags; + uint8_t reserved[3]; +}; + #endif /* _NET_BATMAN_ADV_PACKET_H_ */ diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index d4114d775ad..35141534938 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "main.h" @@ -35,13 +33,32 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if); +/** + * _batadv_update_route - set the router for this originator + * @bat_priv: the bat priv with all the soft interface information + * @orig_node: orig node which is to be configured + * @recv_if: the receive interface for which this route is set + * @neigh_node: neighbor which should be the next router + * + * This function does not perform any error checks + */ static void _batadv_update_route(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, + struct batadv_hard_iface *recv_if, struct batadv_neigh_node *neigh_node) { + struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_node *curr_router; - curr_router = batadv_orig_node_get_router(orig_node); + orig_ifinfo = batadv_orig_ifinfo_get(orig_node, recv_if); + if (!orig_ifinfo) + return; + + rcu_read_lock(); + curr_router = rcu_dereference(orig_ifinfo->router); + if (curr_router && !atomic_inc_not_zero(&curr_router->refcount)) + curr_router = NULL; + rcu_read_unlock(); /* route deleted */ if ((curr_router) && (!neigh_node)) { @@ -71,16 +88,25 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, neigh_node = NULL; spin_lock_bh(&orig_node->neigh_list_lock); - rcu_assign_pointer(orig_node->router, neigh_node); + rcu_assign_pointer(orig_ifinfo->router, neigh_node); spin_unlock_bh(&orig_node->neigh_list_lock); + batadv_orig_ifinfo_free_ref(orig_ifinfo); /* decrease refcount of previous best neighbor */ if (curr_router) batadv_neigh_node_free_ref(curr_router); } +/** + * batadv_update_route - set the router for this originator + * @bat_priv: the bat priv with all the soft interface information + * @orig_node: orig node which is to be configured + * @recv_if: the receive interface for which this route is set + * @neigh_node: neighbor which should be the next router + */ void batadv_update_route(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, + struct batadv_hard_iface *recv_if, struct batadv_neigh_node *neigh_node) { struct batadv_neigh_node *router = NULL; @@ -88,125 +114,16 @@ void batadv_update_route(struct batadv_priv *bat_priv, if (!orig_node) goto out; - router = batadv_orig_node_get_router(orig_node); + router = batadv_orig_router_get(orig_node, recv_if); if (router != neigh_node) - _batadv_update_route(bat_priv, orig_node, neigh_node); - -out: - if (router) - batadv_neigh_node_free_ref(router); -} - -/* caller must hold the 
neigh_list_lock */ -void batadv_bonding_candidate_del(struct batadv_orig_node *orig_node, - struct batadv_neigh_node *neigh_node) -{ - /* this neighbor is not part of our candidate list */ - if (list_empty(&neigh_node->bonding_list)) - goto out; - - list_del_rcu(&neigh_node->bonding_list); - INIT_LIST_HEAD(&neigh_node->bonding_list); - batadv_neigh_node_free_ref(neigh_node); - atomic_dec(&orig_node->bond_candidates); - -out: - return; -} - -/** - * batadv_bonding_candidate_add - consider a new link for bonding mode towards - * the given originator - * @bat_priv: the bat priv with all the soft interface information - * @orig_node: the target node - * @neigh_node: the neighbor representing the new link to consider for bonding - * mode - */ -void batadv_bonding_candidate_add(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_neigh_node *neigh_node) -{ - struct batadv_algo_ops *bao = bat_priv->bat_algo_ops; - struct batadv_neigh_node *tmp_neigh_node, *router = NULL; - uint8_t interference_candidate = 0; - - spin_lock_bh(&orig_node->neigh_list_lock); - - /* only consider if it has the same primary address ... */ - if (!batadv_compare_eth(orig_node->orig, - neigh_node->orig_node->primary_addr)) - goto candidate_del; - - router = batadv_orig_node_get_router(orig_node); - if (!router) - goto candidate_del; - - - /* ... and is good enough to be considered */ - if (bao->bat_neigh_is_equiv_or_better(neigh_node, router)) - goto candidate_del; - - /* check if we have another candidate with the same mac address or - * interface. If we do, we won't select this candidate because of - * possible interference. - */ - hlist_for_each_entry_rcu(tmp_neigh_node, - &orig_node->neigh_list, list) { - if (tmp_neigh_node == neigh_node) - continue; - - /* we only care if the other candidate is even - * considered as candidate. - */ - if (list_empty(&tmp_neigh_node->bonding_list)) - continue; - - if ((neigh_node->if_incoming == tmp_neigh_node->if_incoming) || - (batadv_compare_eth(neigh_node->addr, - tmp_neigh_node->addr))) { - interference_candidate = 1; - break; - } - } - - /* don't care further if it is an interference candidate */ - if (interference_candidate) - goto candidate_del; - - /* this neighbor already is part of our candidate list */ - if (!list_empty(&neigh_node->bonding_list)) - goto out; - - if (!atomic_inc_not_zero(&neigh_node->refcount)) - goto out; - - list_add_rcu(&neigh_node->bonding_list, &orig_node->bond_list); - atomic_inc(&orig_node->bond_candidates); - goto out; - -candidate_del: - batadv_bonding_candidate_del(orig_node, neigh_node); + _batadv_update_route(bat_priv, orig_node, recv_if, neigh_node); out: - spin_unlock_bh(&orig_node->neigh_list_lock); - if (router) batadv_neigh_node_free_ref(router); } -/* copy primary address for bonding */ -void -batadv_bonding_save_primary(const struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - const struct batadv_ogm_packet *batman_ogm_packet) -{ - if (!(batman_ogm_packet->flags & BATADV_PRIMARIES_FIRST_HOP)) - return; - - memcpy(orig_neigh_node->primary_addr, orig_node->orig, ETH_ALEN); -} - /* checks whether the host restarted and is in the protection time. 
* returns: * 0 if the packet is to be accepted @@ -305,10 +222,10 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv, icmph = (struct batadv_icmp_header *)skb->data; - memcpy(icmph->dst, icmph->orig, ETH_ALEN); - memcpy(icmph->orig, primary_if->net_dev->dev_addr, ETH_ALEN); + ether_addr_copy(icmph->dst, icmph->orig); + ether_addr_copy(icmph->orig, primary_if->net_dev->dev_addr); icmph->msg_type = BATADV_ECHO_REPLY; - icmph->header.ttl = BATADV_TTL; + icmph->ttl = BATADV_TTL; res = batadv_send_skb_to_orig(skb, orig_node, NULL); if (res != NET_XMIT_DROP) @@ -338,9 +255,9 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv, icmp_packet = (struct batadv_icmp_packet *)skb->data; /* send TTL exceeded if packet is an echo request (traceroute) */ - if (icmp_packet->icmph.msg_type != BATADV_ECHO_REQUEST) { + if (icmp_packet->msg_type != BATADV_ECHO_REQUEST) { pr_debug("Warning - can't forward icmp packet from %pM to %pM: ttl exceeded\n", - icmp_packet->icmph.orig, icmp_packet->icmph.dst); + icmp_packet->orig, icmp_packet->dst); goto out; } @@ -349,7 +266,7 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv, goto out; /* get routing information */ - orig_node = batadv_orig_hash_find(bat_priv, icmp_packet->icmph.orig); + orig_node = batadv_orig_hash_find(bat_priv, icmp_packet->orig); if (!orig_node) goto out; @@ -359,11 +276,10 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv, icmp_packet = (struct batadv_icmp_packet *)skb->data; - memcpy(icmp_packet->icmph.dst, icmp_packet->icmph.orig, ETH_ALEN); - memcpy(icmp_packet->icmph.orig, primary_if->net_dev->dev_addr, - ETH_ALEN); - icmp_packet->icmph.msg_type = BATADV_TTL_EXCEEDED; - icmp_packet->icmph.header.ttl = BATADV_TTL; + ether_addr_copy(icmp_packet->dst, icmp_packet->orig); + ether_addr_copy(icmp_packet->orig, primary_if->net_dev->dev_addr); + icmp_packet->msg_type = BATADV_TTL_EXCEEDED; + icmp_packet->ttl = BATADV_TTL; if (batadv_send_skb_to_orig(skb, orig_node, NULL) != NET_XMIT_DROP) ret = NET_RX_SUCCESS; @@ -424,8 +340,8 @@ int batadv_recv_icmp_packet(struct sk_buff *skb, if (icmp_packet_rr->rr_cur >= BATADV_RR_LEN) goto out; - memcpy(&(icmp_packet_rr->rr[icmp_packet_rr->rr_cur]), - ethhdr->h_dest, ETH_ALEN); + ether_addr_copy(icmp_packet_rr->rr[icmp_packet_rr->rr_cur], + ethhdr->h_dest); icmp_packet_rr->rr_cur++; } @@ -434,7 +350,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb, return batadv_recv_my_icmp_packet(bat_priv, skb); /* TTL exceeded */ - if (icmph->header.ttl < 2) + if (icmph->ttl < 2) return batadv_recv_icmp_ttl_exceeded(bat_priv, skb); /* get routing information */ @@ -449,7 +365,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb, icmph = (struct batadv_icmp_header *)skb->data; /* decrement ttl */ - icmph->header.ttl--; + icmph->ttl--; /* route it */ if (batadv_send_skb_to_orig(skb, orig_node, recv_if) != NET_XMIT_DROP) @@ -461,114 +377,6 @@ out: return ret; } -/* In the bonding case, send the packets in a round - * robin fashion over the remaining interfaces. - * - * This method rotates the bonding list and increases the - * returned router's refcount. 
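Both ICMP paths above follow the same turnaround pattern once the flattened header fields are in place: swap the originator and destination addresses, overwrite the message type, and restart the TTL before the reply is routed back. A minimal, compilable model of that step, using the field layout of the reworked struct batadv_icmp_packet but placeholder constants (the MODEL_* values stand in for the kernel's defines):

#include <stdint.h>
#include <string.h>

#define MODEL_ETH_ALEN	 6
#define MODEL_TTL	 50	/* stands in for BATADV_TTL */
#define MODEL_ECHO_REPLY 0	/* stands in for BATADV_ECHO_REPLY */

struct model_icmp_packet {
	uint8_t packet_type;
	uint8_t version;
	uint8_t ttl;
	uint8_t msg_type;
	uint8_t dst[MODEL_ETH_ALEN];
	uint8_t orig[MODEL_ETH_ALEN];
	uint8_t uid;
	uint8_t reserved;
	uint16_t seqno;
};

/* Turn a received echo request into the reply sent back: the destination
 * becomes the old originator, we become the originator, and the TTL is
 * restarted for the return trip.
 */
static void icmp_turnaround(struct model_icmp_packet *icmp,
			    const uint8_t *my_addr)
{
	memcpy(icmp->dst, icmp->orig, MODEL_ETH_ALEN);
	memcpy(icmp->orig, my_addr, MODEL_ETH_ALEN);
	icmp->msg_type = MODEL_ECHO_REPLY;
	icmp->ttl = MODEL_TTL;
}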
- */ -static struct batadv_neigh_node * -batadv_find_bond_router(struct batadv_orig_node *primary_orig, - const struct batadv_hard_iface *recv_if) -{ - struct batadv_neigh_node *tmp_neigh_node; - struct batadv_neigh_node *router = NULL, *first_candidate = NULL; - - rcu_read_lock(); - list_for_each_entry_rcu(tmp_neigh_node, &primary_orig->bond_list, - bonding_list) { - if (!first_candidate) - first_candidate = tmp_neigh_node; - - /* recv_if == NULL on the first node. */ - if (tmp_neigh_node->if_incoming == recv_if) - continue; - - if (!atomic_inc_not_zero(&tmp_neigh_node->refcount)) - continue; - - router = tmp_neigh_node; - break; - } - - /* use the first candidate if nothing was found. */ - if (!router && first_candidate && - atomic_inc_not_zero(&first_candidate->refcount)) - router = first_candidate; - - if (!router) - goto out; - - /* selected should point to the next element - * after the current router - */ - spin_lock_bh(&primary_orig->neigh_list_lock); - /* this is a list_move(), which unfortunately - * does not exist as rcu version - */ - list_del_rcu(&primary_orig->bond_list); - list_add_rcu(&primary_orig->bond_list, - &router->bonding_list); - spin_unlock_bh(&primary_orig->neigh_list_lock); - -out: - rcu_read_unlock(); - return router; -} - -/** - * batadv_find_ifalter_router - find the best of the remaining candidates which - * are not using this interface - * @bat_priv: the bat priv with all the soft interface information - * @primary_orig: the destination - * @recv_if: the interface that the router returned by this function has to not - * use - * - * Returns the best candidate towards primary_orig that is not using recv_if. - * Increases the returned neighbor's refcount - */ -static struct batadv_neigh_node * -batadv_find_ifalter_router(struct batadv_priv *bat_priv, - struct batadv_orig_node *primary_orig, - const struct batadv_hard_iface *recv_if) -{ - struct batadv_neigh_node *router = NULL, *first_candidate = NULL; - struct batadv_algo_ops *bao = bat_priv->bat_algo_ops; - struct batadv_neigh_node *tmp_neigh_node; - - rcu_read_lock(); - list_for_each_entry_rcu(tmp_neigh_node, &primary_orig->bond_list, - bonding_list) { - if (!first_candidate) - first_candidate = tmp_neigh_node; - - /* recv_if == NULL on the first node. */ - if (tmp_neigh_node->if_incoming == recv_if) - continue; - - if (router && bao->bat_neigh_cmp(tmp_neigh_node, router)) - continue; - - if (!atomic_inc_not_zero(&tmp_neigh_node->refcount)) - continue; - - /* decrement refcount of previously selected router */ - if (router) - batadv_neigh_node_free_ref(router); - - /* we found a better router (or at least one valid router) */ - router = tmp_neigh_node; - } - - /* use the first candidate if nothing was found. */ - if (!router && first_candidate && - atomic_inc_not_zero(&first_candidate->refcount)) - router = first_candidate; - - rcu_read_unlock(); - return router; -} - /** * batadv_check_unicast_packet - Check for malformed unicast packets * @bat_priv: the bat priv with all the soft interface information @@ -606,95 +414,141 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv, return 0; } -/* find a suitable router for this originator, and use - * bonding if possible. increases the found neighbors - * refcount. 
+/** + * batadv_find_router - find a suitable router for this originator + * @bat_priv: the bat priv with all the soft interface information + * @orig_node: the destination node + * @recv_if: pointer to interface this packet was received on + * + * Returns the router which should be used for this orig_node on + * this interface, or NULL if not available. */ struct batadv_neigh_node * batadv_find_router(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, - const struct batadv_hard_iface *recv_if) + struct batadv_hard_iface *recv_if) { - struct batadv_orig_node *primary_orig_node; - struct batadv_orig_node *router_orig; - struct batadv_neigh_node *router; - static uint8_t zero_mac[ETH_ALEN] = {0, 0, 0, 0, 0, 0}; - int bonding_enabled; - uint8_t *primary_addr; + struct batadv_algo_ops *bao = bat_priv->bat_algo_ops; + struct batadv_neigh_node *first_candidate_router = NULL; + struct batadv_neigh_node *next_candidate_router = NULL; + struct batadv_neigh_node *router, *cand_router = NULL; + struct batadv_neigh_node *last_cand_router = NULL; + struct batadv_orig_ifinfo *cand, *first_candidate = NULL; + struct batadv_orig_ifinfo *next_candidate = NULL; + struct batadv_orig_ifinfo *last_candidate; + bool last_candidate_found = false; if (!orig_node) return NULL; - router = batadv_orig_node_get_router(orig_node); - if (!router) - goto err; + router = batadv_orig_router_get(orig_node, recv_if); - /* without bonding, the first node should - * always choose the default router. + /* only consider bonding for recv_if == BATADV_IF_DEFAULT (first hop) + * and if activated. + */ + if (recv_if == BATADV_IF_DEFAULT || !atomic_read(&bat_priv->bonding) || + !router) + return router; + + /* bonding: loop through the list of possible routers found + * for the various outgoing interfaces and find a candidate after + * the last chosen bonding candidate (next_candidate). If no such + * router is found, use the first candidate found (the previously + * chosen bonding candidate might have been the last one in the list). + * If this can't be found either, return the previously choosen + * router - obviously there are no other candidates. */ - bonding_enabled = atomic_read(&bat_priv->bonding); - rcu_read_lock(); - /* select default router to output */ - router_orig = router->orig_node; - if (!router_orig) - goto err_unlock; + last_candidate = orig_node->last_bonding_candidate; + if (last_candidate) + last_cand_router = rcu_dereference(last_candidate->router); - if ((!recv_if) && (!bonding_enabled)) - goto return_router; + hlist_for_each_entry_rcu(cand, &orig_node->ifinfo_list, list) { + /* acquire some structures and references ... */ + if (!atomic_inc_not_zero(&cand->refcount)) + continue; - primary_addr = router_orig->primary_addr; + cand_router = rcu_dereference(cand->router); + if (!cand_router) + goto next; - /* if we have something in the primary_addr, we can search - * for a potential bonding candidate. - */ - if (batadv_compare_eth(primary_addr, zero_mac)) - goto return_router; + if (!atomic_inc_not_zero(&cand_router->refcount)) { + cand_router = NULL; + goto next; + } - /* find the orig_node which has the primary interface. 
might - * even be the same as our router_orig in many cases - */ - if (batadv_compare_eth(primary_addr, router_orig->orig)) { - primary_orig_node = router_orig; - } else { - primary_orig_node = batadv_orig_hash_find(bat_priv, - primary_addr); - if (!primary_orig_node) - goto return_router; + /* alternative candidate should be good enough to be + * considered + */ + if (!bao->bat_neigh_is_equiv_or_better(cand_router, + cand->if_outgoing, + router, recv_if)) + goto next; + + /* don't use the same router twice */ + if (last_cand_router == cand_router) + goto next; + + /* mark the first possible candidate */ + if (!first_candidate) { + atomic_inc(&cand_router->refcount); + atomic_inc(&cand->refcount); + first_candidate = cand; + first_candidate_router = cand_router; + } + + /* check if the loop has already passed the previously selected + * candidate ... this function should select the next candidate + * AFTER the previously used bonding candidate. + */ + if (!last_candidate || last_candidate_found) { + next_candidate = cand; + next_candidate_router = cand_router; + break; + } - batadv_orig_node_free_ref(primary_orig_node); + if (last_candidate == cand) + last_candidate_found = true; +next: + /* free references */ + if (cand_router) { + batadv_neigh_node_free_ref(cand_router); + cand_router = NULL; + } + batadv_orig_ifinfo_free_ref(cand); } + rcu_read_unlock(); - /* with less than 2 candidates, we can't do any - * bonding and prefer the original router. - */ - if (atomic_read(&primary_orig_node->bond_candidates) < 2) - goto return_router; + /* last_bonding_candidate is reset below, remove the old reference. */ + if (orig_node->last_bonding_candidate) + batadv_orig_ifinfo_free_ref(orig_node->last_bonding_candidate); - /* all nodes between should choose a candidate which - * is is not on the interface where the packet came - * in. + /* After finding candidates, handle the three cases: + * 1) there is a next candidate, use that + * 2) there is no next candidate, use the first of the list + * 3) there is no candidate at all, return the default router */ - batadv_neigh_node_free_ref(router); + if (next_candidate) { + batadv_neigh_node_free_ref(router); - if (bonding_enabled) - router = batadv_find_bond_router(primary_orig_node, recv_if); - else - router = batadv_find_ifalter_router(bat_priv, primary_orig_node, - recv_if); + /* remove references to first candidate, we don't need it. */ + if (first_candidate) { + batadv_neigh_node_free_ref(first_candidate_router); + batadv_orig_ifinfo_free_ref(first_candidate); + } + router = next_candidate_router; + orig_node->last_bonding_candidate = next_candidate; + } else if (first_candidate) { + batadv_neigh_node_free_ref(router); -return_router: - if (router && router->if_incoming->if_status != BATADV_IF_ACTIVE) - goto err_unlock; + /* refcounting has already been done in the loop above. 
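The candidate walk above implements a simple rotation: remember which candidate was used for the previous bonding transmission, pick the first acceptable candidate that comes after it, and fall back to the first acceptable candidate (or keep the default router) when none is found. Stripped of RCU and reference counting, the selection logic can be modelled in a few lines of plain C; the array stands in for the originator's ifinfo list.

#include <stddef.h>

struct cand { int usable; };	/* stands in for one orig_ifinfo candidate */

/* Return the index of the candidate to use next, rotating past the one
 * used last time (last < 0 means "none used yet"), or -1 to keep the
 * default router because no other usable candidate exists.
 */
static int pick_bonding_candidate(const struct cand *c, size_t n, int last)
{
	int first = -1;
	int seen_last = (last < 0);
	size_t i;

	for (i = 0; i < n; i++) {
		if (!c[i].usable)
			continue;

		/* never reuse the candidate picked last time */
		if ((int)i == last) {
			seen_last = 1;
			continue;
		}

		if (first < 0)
			first = (int)i;

		/* prefer the first usable entry *after* the previous pick */
		if (seen_last)
			return (int)i;
	}

	/* wrapped around: reuse the first usable candidate, if any */
	return first;
}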
*/ + router = first_candidate_router; + orig_node->last_bonding_candidate = first_candidate; + } else { + orig_node->last_bonding_candidate = NULL; + } - rcu_read_unlock(); return router; -err_unlock: - rcu_read_unlock(); -err: - if (router) - batadv_neigh_node_free_ref(router); - return NULL; } static int batadv_route_unicast_packet(struct sk_buff *skb, @@ -709,7 +563,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, unicast_packet = (struct batadv_unicast_packet *)skb->data; /* TTL exceeded */ - if (unicast_packet->header.ttl < 2) { + if (unicast_packet->ttl < 2) { pr_debug("Warning - can't forward unicast packet from %pM to %pM: ttl exceeded\n", ethhdr->h_source, unicast_packet->dest); goto out; @@ -727,9 +581,9 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, /* decrement ttl */ unicast_packet = (struct batadv_unicast_packet *)skb->data; - unicast_packet->header.ttl--; + unicast_packet->ttl--; - switch (unicast_packet->header.packet_type) { + switch (unicast_packet->packet_type) { case BATADV_UNICAST_4ADDR: hdr_len = sizeof(struct batadv_unicast_4addr_packet); break; @@ -809,7 +663,7 @@ batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, } /* update the packet header */ - memcpy(unicast_packet->dest, orig_addr, ETH_ALEN); + ether_addr_copy(unicast_packet->dest, orig_addr); unicast_packet->ttvn = orig_ttvn; ret = true; @@ -833,7 +687,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, int is_old_ttvn; /* check if there is enough data before accessing it */ - if (pskb_may_pull(skb, hdr_len + ETH_HLEN) < 0) + if (!pskb_may_pull(skb, hdr_len + ETH_HLEN)) return 0; /* create a copy of the skb (in case of for re-routing) to modify it. */ @@ -919,7 +773,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, if (!primary_if) return 0; - memcpy(unicast_packet->dest, primary_if->net_dev->dev_addr, ETH_ALEN); + ether_addr_copy(unicast_packet->dest, primary_if->net_dev->dev_addr); batadv_hardif_free_ref(primary_if); @@ -970,7 +824,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, unicast_packet = (struct batadv_unicast_packet *)skb->data; unicast_4addr_packet = (struct batadv_unicast_4addr_packet *)skb->data; - is4addr = unicast_packet->header.packet_type == BATADV_UNICAST_4ADDR; + is4addr = unicast_packet->packet_type == BATADV_UNICAST_4ADDR; /* the caller function should have already pulled 2 bytes */ if (is4addr) hdr_size = sizeof(*unicast_4addr_packet); @@ -1063,6 +917,8 @@ int batadv_recv_unicast_tvlv(struct sk_buff *skb, if (ret != NET_RX_SUCCESS) ret = batadv_route_unicast_packet(skb, recv_if); + else + consume_skb(skb); return ret; } @@ -1135,6 +991,7 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, int hdr_size = sizeof(*bcast_packet); int ret = NET_RX_DROP; int32_t seq_diff; + uint32_t seqno; /* drop packet if it has not necessary minimum size */ if (unlikely(!pskb_may_pull(skb, hdr_size))) @@ -1160,7 +1017,7 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, if (batadv_is_my_mac(bat_priv, bcast_packet->orig)) goto out; - if (bcast_packet->header.ttl < 2) + if (bcast_packet->ttl < 2) goto out; orig_node = batadv_orig_hash_find(bat_priv, bcast_packet->orig); @@ -1170,12 +1027,13 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, spin_lock_bh(&orig_node->bcast_seqno_lock); + seqno = ntohl(bcast_packet->seqno); /* check whether the packet is a duplicate */ if (batadv_test_bit(orig_node->bcast_bits, orig_node->last_bcast_seqno, - ntohl(bcast_packet->seqno))) + seqno)) goto spin_unlock; - seq_diff = 
ntohl(bcast_packet->seqno) - orig_node->last_bcast_seqno; + seq_diff = seqno - orig_node->last_bcast_seqno; /* check whether the packet is old and the host just restarted. */ if (batadv_window_protected(bat_priv, seq_diff, @@ -1186,7 +1044,7 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, * if required. */ if (batadv_bit_get_packet(bat_priv, orig_node->bcast_bits, seq_diff, 1)) - orig_node->last_bcast_seqno = ntohl(bcast_packet->seqno); + orig_node->last_bcast_seqno = seqno; spin_unlock_bh(&orig_node->bcast_seqno_lock); diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 19544ddb81b..557d3d12a9a 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_ROUTING_H_ @@ -25,6 +23,7 @@ bool batadv_check_management_packet(struct sk_buff *skb, int header_len); void batadv_update_route(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, + struct batadv_hard_iface *recv_if, struct batadv_neigh_node *neigh_node); int batadv_recv_icmp_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if); @@ -45,16 +44,7 @@ int batadv_recv_unhandled_unicast_packet(struct sk_buff *skb, struct batadv_neigh_node * batadv_find_router(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, - const struct batadv_hard_iface *recv_if); -void batadv_bonding_candidate_del(struct batadv_orig_node *orig_node, - struct batadv_neigh_node *neigh_node); -void batadv_bonding_candidate_add(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_neigh_node *neigh_node); -void batadv_bonding_save_primary(const struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - const struct batadv_ogm_packet - *batman_ogm_packet); + struct batadv_hard_iface *recv_if); int batadv_window_protected(struct batadv_priv *bat_priv, int32_t seq_num_diff, unsigned long *last_reset); diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index c83be5ebaa2..3d64ed20c39 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
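batadv_recv_bcast_packet() above compares sequence numbers by subtracting two 32-bit counters and interpreting the result as signed, which stays correct across wrap-around; duplicates are then checked against a per-originator bitmap and large negative jumps are treated as a possible host restart. The wrap-around arithmetic alone is shown in this self-contained snippet; the window constant is an assumption for the example, not the kernel's define.

#include <assert.h>
#include <stdint.h>

#define MODEL_WINDOW_SIZE 64	/* assumed protection window, in packets */

/* Signed difference of two wrapping 32-bit sequence counters. */
static int32_t seq_diff(uint32_t seqno, uint32_t last_seqno)
{
	return (int32_t)(seqno - last_seqno);
}

int main(void)
{
	/* a counter that just wrapped is still "7 ahead" of the old value */
	assert(seq_diff(5, 0xFFFFFFFEu) == 7);

	/* a stale retransmission shows up as a negative difference */
	assert(seq_diff(100, 130) == -30);

	/* anything further back than the window hints at a restarted host */
	assert(seq_diff(100, 100 + MODEL_WINDOW_SIZE + 1) < -MODEL_WINDOW_SIZE);

	return 0;
}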
*/ #include "main.h" @@ -29,6 +27,7 @@ #include "originator.h" #include "network-coding.h" #include "fragmentation.h" +#include "multicast.h" static void batadv_send_outstanding_bcast_packet(struct work_struct *work); @@ -61,8 +60,8 @@ int batadv_send_skb_packet(struct sk_buff *skb, skb_reset_mac_header(skb); ethhdr = eth_hdr(skb); - memcpy(ethhdr->h_source, hard_iface->net_dev->dev_addr, ETH_ALEN); - memcpy(ethhdr->h_dest, dst_addr, ETH_ALEN); + ether_addr_copy(ethhdr->h_source, hard_iface->net_dev->dev_addr); + ether_addr_copy(ethhdr->h_dest, dst_addr); ethhdr->h_proto = htons(ETH_P_BATMAN); skb_set_network_header(skb, ETH_HLEN); @@ -161,13 +160,13 @@ batadv_send_skb_push_fill_unicast(struct sk_buff *skb, int hdr_size, return false; unicast_packet = (struct batadv_unicast_packet *)skb->data; - unicast_packet->header.version = BATADV_COMPAT_VERSION; + unicast_packet->version = BATADV_COMPAT_VERSION; /* batman packet type: unicast */ - unicast_packet->header.packet_type = BATADV_UNICAST; + unicast_packet->packet_type = BATADV_UNICAST; /* set unicast ttl */ - unicast_packet->header.ttl = BATADV_TTL; + unicast_packet->ttl = BATADV_TTL; /* copy the destination for faster routing */ - memcpy(unicast_packet->dest, orig_node->orig, ETH_ALEN); + ether_addr_copy(unicast_packet->dest, orig_node->orig); /* set the destination tt version number */ unicast_packet->ttvn = ttvn; @@ -221,8 +220,8 @@ bool batadv_send_skb_prepare_unicast_4addr(struct batadv_priv *bat_priv, goto out; uc_4addr_packet = (struct batadv_unicast_4addr_packet *)skb->data; - uc_4addr_packet->u.header.packet_type = BATADV_UNICAST_4ADDR; - memcpy(uc_4addr_packet->src, primary_if->net_dev->dev_addr, ETH_ALEN); + uc_4addr_packet->u.packet_type = BATADV_UNICAST_4ADDR; + ether_addr_copy(uc_4addr_packet->src, primary_if->net_dev->dev_addr); uc_4addr_packet->subtype = packet_subtype; uc_4addr_packet->reserved = 0; @@ -250,13 +249,13 @@ out: * * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. 
*/ -static int batadv_send_skb_unicast(struct batadv_priv *bat_priv, - struct sk_buff *skb, int packet_type, - int packet_subtype, - struct batadv_orig_node *orig_node, - unsigned short vid) +int batadv_send_skb_unicast(struct batadv_priv *bat_priv, + struct sk_buff *skb, int packet_type, + int packet_subtype, + struct batadv_orig_node *orig_node, + unsigned short vid) { - struct ethhdr *ethhdr = (struct ethhdr *)skb->data; + struct ethhdr *ethhdr; struct batadv_unicast_packet *unicast_packet; int ret = NET_XMIT_DROP; @@ -281,6 +280,10 @@ static int batadv_send_skb_unicast(struct batadv_priv *bat_priv, goto out; } + /* skb->data might have been reallocated by + * batadv_send_skb_prepare_unicast{,_4addr}() + */ + ethhdr = eth_hdr(skb); unicast_packet = (struct batadv_unicast_packet *)skb->data; /* inform the destination node that we are still missing a correct route @@ -309,6 +312,7 @@ out: * @packet_type: the batman unicast packet type to use * @packet_subtype: the unicast 4addr packet subtype (only relevant for unicast * 4addr packets) + * @dst_hint: can be used to override the destination contained in the skb * @vid: the vid to be used to search the translation table * * Look up the recipient node for the destination address in the ethernet @@ -321,13 +325,23 @@ out: */ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, struct sk_buff *skb, int packet_type, - int packet_subtype, unsigned short vid) + int packet_subtype, uint8_t *dst_hint, + unsigned short vid) { struct ethhdr *ethhdr = (struct ethhdr *)skb->data; struct batadv_orig_node *orig_node; + uint8_t *src, *dst; + + src = ethhdr->h_source; + dst = ethhdr->h_dest; + + /* if we got an hint! let's send the packet to this client (if any) */ + if (dst_hint) { + src = NULL; + dst = dst_hint; + } + orig_node = batadv_transtable_search(bat_priv, src, dst, vid); - orig_node = batadv_transtable_search(bat_priv, ethhdr->h_source, - ethhdr->h_dest, vid); return batadv_send_skb_unicast(bat_priv, skb, packet_type, packet_subtype, orig_node, vid); } @@ -379,6 +393,8 @@ static void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet) kfree_skb(forw_packet->skb); if (forw_packet->if_incoming) batadv_hardif_free_ref(forw_packet->if_incoming); + if (forw_packet->if_outgoing) + batadv_hardif_free_ref(forw_packet->if_outgoing); kfree(forw_packet); } @@ -436,12 +452,13 @@ int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, /* as we have a copy now, it is safe to decrease the TTL */ bcast_packet = (struct batadv_bcast_packet *)newskb->data; - bcast_packet->header.ttl--; + bcast_packet->ttl--; skb_reset_mac_header(newskb); forw_packet->skb = newskb; forw_packet->if_incoming = primary_if; + forw_packet->if_outgoing = NULL; /* how often did we send the bcast packet ? */ forw_packet->num_packets = 0; @@ -537,11 +554,16 @@ void batadv_send_outstanding_bat_ogm_packet(struct work_struct *work) bat_priv->bat_algo_ops->bat_ogm_emit(forw_packet); - /* we have to have at least one packet in the queue - * to determine the queues wake up time unless we are - * shutting down + /* we have to have at least one packet in the queue to determine the + * queues wake up time unless we are shutting down. + * + * only re-schedule if this is the "original" copy, e.g. the OGM of the + * primary interface should only be rescheduled once per period, but + * this function will be called for the forw_packet instances of the + * other secondary interfaces as well. 
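The eth_hdr() re-read added above guards against a classic sk_buff pitfall: a helper that expands headroom to prepend a new header may reallocate skb->data, so pointers taken before the call go stale. The same hazard, reduced to a self-contained userspace model with a hypothetical prepend helper:

#include <stdlib.h>
#include <string.h>

struct model_buf {
	unsigned char *data;
	size_t len;
};

/* Prepend room for a new header; may realloc and move buf->data, much
 * like expanding sk_buff headroom can.
 */
static int prepend_header(struct model_buf *buf, size_t hdr_len)
{
	unsigned char *n = realloc(buf->data, buf->len + hdr_len);

	if (!n)
		return -1;

	memmove(n + hdr_len, n, buf->len);
	memset(n, 0, hdr_len);
	buf->data = n;
	buf->len += hdr_len;
	return 0;
}

static int send_with_header(struct model_buf *buf)
{
	unsigned char *hdr = buf->data;	/* pointer into the old buffer */

	if (prepend_header(buf, 10) < 0)
		return -1;

	/* 'hdr' may now dangle: re-derive every pointer after the call */
	hdr = buf->data;
	hdr[0] = 0x40;	/* e.g. write the packet type into the new header */

	return 0;
}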
*/ - if (forw_packet->own) + if (forw_packet->own && + forw_packet->if_incoming == forw_packet->if_outgoing) batadv_schedule_bat_ogm(forw_packet->if_incoming); out: @@ -602,7 +624,8 @@ batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, * we delete only packets belonging to the given interface */ if ((hard_iface) && - (forw_packet->if_incoming != hard_iface)) + (forw_packet->if_incoming != hard_iface) && + (forw_packet->if_outgoing != hard_iface)) continue; spin_unlock_bh(&bat_priv->forw_bat_list_lock); diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index aa2e2537a73..38d0ec1833a 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_SEND_H_ @@ -38,9 +36,15 @@ bool batadv_send_skb_prepare_unicast_4addr(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_orig_node *orig_node, int packet_subtype); +int batadv_send_skb_unicast(struct batadv_priv *bat_priv, + struct sk_buff *skb, int packet_type, + int packet_subtype, + struct batadv_orig_node *orig_node, + unsigned short vid); int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, struct sk_buff *skb, int packet_type, - int packet_subtype, unsigned short vid); + int packet_subtype, uint8_t *dst_hint, + unsigned short vid); int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid); @@ -48,6 +52,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, * batadv_send_skb_via_tt - send an skb via TT lookup * @bat_priv: the bat priv with all the soft interface information * @skb: the payload to send + * @dst_hint: can be used to override the destination contained in the skb * @vid: the vid to be used to search the translation table * * Look up the recipient node for the destination address in the ethernet @@ -57,11 +62,11 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. 
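The dst_hint argument threaded through the send helpers above lets a caller (for example the DHCP handling in the transmit path) override the lookup key: when a hint is present, the translation-table search uses it as the destination and ignores the frame's own addresses. The decision itself is tiny; a stand-alone model of it, with names chosen for the example:

#include <stddef.h>
#include <stdint.h>

struct model_lookup {
	const uint8_t *src;	/* NULL means "match any source" */
	const uint8_t *dst;
};

/* Pick the translation-table lookup key: prefer an explicit destination
 * hint, otherwise fall back to the addresses in the ethernet header.
 */
static struct model_lookup pick_tt_key(const uint8_t *eth_src,
				       const uint8_t *eth_dst,
				       const uint8_t *dst_hint)
{
	struct model_lookup key = { .src = eth_src, .dst = eth_dst };

	if (dst_hint) {
		key.src = NULL;
		key.dst = dst_hint;
	}

	return key;
}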
*/ static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv, - struct sk_buff *skb, + struct sk_buff *skb, uint8_t *dst_hint, unsigned short vid) { return batadv_send_skb_via_tt_generic(bat_priv, skb, BATADV_UNICAST, 0, - vid); + dst_hint, vid); } /** @@ -69,6 +74,7 @@ static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @skb: the payload to send * @packet_subtype: the unicast 4addr packet subtype to use + * @dst_hint: can be used to override the destination contained in the skb * @vid: the vid to be used to search the translation table * * Look up the recipient node for the destination address in the ethernet @@ -81,11 +87,12 @@ static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv, static inline int batadv_send_skb_via_tt_4addr(struct batadv_priv *bat_priv, struct sk_buff *skb, int packet_subtype, + uint8_t *dst_hint, unsigned short vid) { return batadv_send_skb_via_tt_generic(bat_priv, skb, BATADV_UNICAST_4ADDR, - packet_subtype, vid); + packet_subtype, dst_hint, vid); } #endif /* _NET_BATMAN_ADV_SEND_H_ */ diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 36f050876f8..cbd677f48c0 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ #include "main.h" @@ -34,6 +32,7 @@ #include <linux/ethtool.h> #include <linux/etherdevice.h> #include <linux/if_vlan.h> +#include "multicast.h" #include "bridge_loop_avoidance.h" #include "network-coding.h" @@ -113,15 +112,15 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; - memcpy(old_addr, dev->dev_addr, ETH_ALEN); - memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); + ether_addr_copy(old_addr, dev->dev_addr); + ether_addr_copy(dev->dev_addr, addr->sa_data); /* only modify transtable if it has been initialized before */ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE) { batadv_tt_local_remove(bat_priv, old_addr, BATADV_NO_FLAGS, "mac address changed", false); batadv_tt_local_add(dev, addr->sa_data, BATADV_NO_FLAGS, - BATADV_NULL_IFINDEX); + BATADV_NULL_IFINDEX, BATADV_NO_MARK); } return 0; @@ -162,6 +161,8 @@ static int batadv_interface_tx(struct sk_buff *skb, 0x00, 0x00}; static const uint8_t ectp_addr[ETH_ALEN] = {0xCF, 0x00, 0x00, 0x00, 0x00, 0x00}; + enum batadv_dhcp_recipient dhcp_rcp = BATADV_DHCP_NO; + uint8_t *dst_hint = NULL, chaddr[ETH_ALEN]; struct vlan_ethhdr *vhdr; unsigned int header_len = 0; int data_len = skb->len, ret; @@ -169,17 +170,20 @@ static int batadv_interface_tx(struct sk_buff *skb, bool do_bcast = false, client_added; unsigned short vid; uint32_t seqno; + int gw_mode; + enum batadv_forw_mode forw_mode; + struct batadv_orig_node *mcast_single_orig = NULL; if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) goto dropped; soft_iface->trans_start = jiffies; vid = batadv_get_vid(skb, 0); - ethhdr = (struct ethhdr *)skb->data; + ethhdr = eth_hdr(skb); switch (ntohs(ethhdr->h_proto)) { case ETH_P_8021Q: - vhdr = (struct vlan_ethhdr *)skb->data; + vhdr = vlan_eth_hdr(skb); if (vhdr->h_vlan_encapsulated_proto != ethertype) break; @@ -193,12 +197,13 @@ static int batadv_interface_tx(struct sk_buff *skb, goto dropped; /* skb->data might have been reallocated by batadv_bla_tx() */ - ethhdr = (struct ethhdr *)skb->data; + ethhdr = eth_hdr(skb); /* Register the client MAC in the transtable */ if (!is_multicast_ether_addr(ethhdr->h_source)) { client_added = batadv_tt_local_add(soft_iface, ethhdr->h_source, - vid, skb->skb_iif); + vid, skb->skb_iif, + skb->mark); if (!client_added) goto dropped; } @@ -215,34 +220,47 @@ static int batadv_interface_tx(struct sk_buff *skb, if (batadv_compare_eth(ethhdr->h_dest, ectp_addr)) goto dropped; + gw_mode = atomic_read(&bat_priv->gw_mode); if (is_multicast_ether_addr(ethhdr->h_dest)) { - do_bcast = true; + /* if gw mode is off, broadcast every packet */ + if (gw_mode == BATADV_GW_MODE_OFF) { + do_bcast = true; + goto send; + } - switch (atomic_read(&bat_priv->gw_mode)) { - case BATADV_GW_MODE_SERVER: - /* gateway servers should not send dhcp - * requests into the mesh + dhcp_rcp = batadv_gw_dhcp_recipient_get(skb, &header_len, + chaddr); + /* skb->data may have been modified by + * batadv_gw_dhcp_recipient_get() + */ + ethhdr = eth_hdr(skb); + /* if gw_mode is on, broadcast any non-DHCP message. 
+ * All the DHCP packets are going to be sent as unicast + */ + if (dhcp_rcp == BATADV_DHCP_NO) { + do_bcast = true; + goto send; + } + + if (dhcp_rcp == BATADV_DHCP_TO_CLIENT) + dst_hint = chaddr; + else if ((gw_mode == BATADV_GW_MODE_SERVER) && + (dhcp_rcp == BATADV_DHCP_TO_SERVER)) + /* gateways should not forward any DHCP message if + * directed to a DHCP server */ - ret = batadv_gw_is_dhcp_target(skb, &header_len); - if (ret) + goto dropped; + +send: + if (do_bcast && !is_broadcast_ether_addr(ethhdr->h_dest)) { + forw_mode = batadv_mcast_forw_mode(bat_priv, skb, + &mcast_single_orig); + if (forw_mode == BATADV_FORW_NONE) goto dropped; - break; - case BATADV_GW_MODE_CLIENT: - /* gateway clients should send dhcp requests - * via unicast to their gateway - */ - ret = batadv_gw_is_dhcp_target(skb, &header_len); - if (ret) + + if (forw_mode == BATADV_FORW_SINGLE) do_bcast = false; - break; - case BATADV_GW_MODE_OFF: - default: - break; } - - /* reminder: ethhdr might have become unusable from here on - * (batadv_gw_is_dhcp_target() might have reallocated skb data) - */ } batadv_skb_set_priority(skb, 0); @@ -264,18 +282,18 @@ static int batadv_interface_tx(struct sk_buff *skb, goto dropped; bcast_packet = (struct batadv_bcast_packet *)skb->data; - bcast_packet->header.version = BATADV_COMPAT_VERSION; - bcast_packet->header.ttl = BATADV_TTL; + bcast_packet->version = BATADV_COMPAT_VERSION; + bcast_packet->ttl = BATADV_TTL; /* batman packet type: broadcast */ - bcast_packet->header.packet_type = BATADV_BCAST; + bcast_packet->packet_type = BATADV_BCAST; bcast_packet->reserved = 0; /* hw address of first interface is the orig mac because only * this mac is known throughout the mesh */ - memcpy(bcast_packet->orig, - primary_if->net_dev->dev_addr, ETH_ALEN); + ether_addr_copy(bcast_packet->orig, + primary_if->net_dev->dev_addr); /* set broadcast sequence number */ seqno = atomic_inc_return(&bat_priv->bcast_seqno); @@ -290,22 +308,26 @@ static int batadv_interface_tx(struct sk_buff *skb, /* unicast packet */ } else { - if (atomic_read(&bat_priv->gw_mode) != BATADV_GW_MODE_OFF) { + /* DHCP packets going to a server will use the GW feature */ + if (dhcp_rcp == BATADV_DHCP_TO_SERVER) { ret = batadv_gw_out_of_range(bat_priv, skb); if (ret) goto dropped; - } - - if (batadv_dat_snoop_outgoing_arp_request(bat_priv, skb)) - goto dropped; - - batadv_dat_snoop_outgoing_arp_reply(bat_priv, skb); - - if (is_multicast_ether_addr(ethhdr->h_dest)) ret = batadv_send_skb_via_gw(bat_priv, skb, vid); - else - ret = batadv_send_skb_via_tt(bat_priv, skb, vid); + } else if (mcast_single_orig) { + ret = batadv_send_skb_unicast(bat_priv, skb, + BATADV_UNICAST, 0, + mcast_single_orig, vid); + } else { + if (batadv_dat_snoop_outgoing_arp_request(bat_priv, + skb)) + goto dropped; + + batadv_dat_snoop_outgoing_arp_reply(bat_priv, skb); + ret = batadv_send_skb_via_tt(bat_priv, skb, dst_hint, + vid); + } if (ret == NET_XMIT_DROP) goto dropped_freed; } @@ -328,7 +350,7 @@ void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb, struct batadv_hard_iface *recv_if, int hdr_size, struct batadv_orig_node *orig_node) { - struct batadv_header *batadv_header = (struct batadv_header *)skb->data; + struct batadv_bcast_packet *batadv_bcast_packet; struct batadv_priv *bat_priv = netdev_priv(soft_iface); __be16 ethertype = htons(ETH_P_BATMAN); struct vlan_ethhdr *vhdr; @@ -336,7 +358,8 @@ void batadv_interface_rx(struct net_device *soft_iface, unsigned short vid; bool is_bcast; - is_bcast = (batadv_header->packet_type == 
BATADV_BCAST); + batadv_bcast_packet = (struct batadv_bcast_packet *)skb->data; + is_bcast = (batadv_bcast_packet->packet_type == BATADV_BCAST); /* check if enough space is available for pulling, and pull */ if (!pskb_may_pull(skb, hdr_size)) @@ -345,7 +368,12 @@ void batadv_interface_rx(struct net_device *soft_iface, skb_pull_rcsum(skb, hdr_size); skb_reset_mac_header(skb); - vid = batadv_get_vid(skb, hdr_size); + /* clean the netfilter state now that the batman-adv header has been + * removed + */ + nf_reset(skb); + + vid = batadv_get_vid(skb, 0); ethhdr = eth_hdr(skb); switch (ntohs(ethhdr->h_proto)) { @@ -388,9 +416,23 @@ void batadv_interface_rx(struct net_device *soft_iface, batadv_tt_add_temporary_global_entry(bat_priv, orig_node, ethhdr->h_source, vid); - if (batadv_is_ap_isolated(bat_priv, ethhdr->h_source, ethhdr->h_dest, - vid)) + if (is_multicast_ether_addr(ethhdr->h_dest)) { + /* set the mark on broadcast packets if AP isolation is ON and + * the packet is coming from an "isolated" client + */ + if (batadv_vlan_ap_isola_get(bat_priv, vid) && + batadv_tt_global_is_isolated(bat_priv, ethhdr->h_source, + vid)) { + /* save bits in skb->mark not covered by the mask and + * apply the mark on the rest + */ + skb->mark &= ~bat_priv->isolation_mark_mask; + skb->mark |= bat_priv->isolation_mark; + } + } else if (batadv_is_ap_isolated(bat_priv, ethhdr->h_source, + ethhdr->h_dest, vid)) { goto dropped; + } netif_rx(skb); goto out; @@ -406,10 +448,15 @@ out: * possibly free it * @softif_vlan: the vlan object to release */ -void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *softif_vlan) +void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *vlan) { - if (atomic_dec_and_test(&softif_vlan->refcount)) - kfree_rcu(softif_vlan, rcu); + if (atomic_dec_and_test(&vlan->refcount)) { + spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock); + hlist_del_rcu(&vlan->list); + spin_unlock_bh(&vlan->bat_priv->softif_vlan_list_lock); + + kfree_rcu(vlan, rcu); + } } /** @@ -463,6 +510,7 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) if (!vlan) return -ENOMEM; + vlan->bat_priv = bat_priv; vlan->vid = vid; atomic_set(&vlan->refcount, 1); @@ -474,16 +522,16 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) return err; } + spin_lock_bh(&bat_priv->softif_vlan_list_lock); + hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); + spin_unlock_bh(&bat_priv->softif_vlan_list_lock); + /* add a new TT local entry. 
This one will be marked with the NOPURGE * flag */ batadv_tt_local_add(bat_priv->soft_iface, bat_priv->soft_iface->dev_addr, vid, - BATADV_NULL_IFINDEX); - - spin_lock_bh(&bat_priv->softif_vlan_list_lock); - hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); - spin_unlock_bh(&bat_priv->softif_vlan_list_lock); + BATADV_NULL_IFINDEX, BATADV_NO_MARK); return 0; } @@ -496,18 +544,13 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv, struct batadv_softif_vlan *vlan) { - spin_lock_bh(&bat_priv->softif_vlan_list_lock); - hlist_del_rcu(&vlan->list); - spin_unlock_bh(&bat_priv->softif_vlan_list_lock); - - batadv_sysfs_del_vlan(bat_priv, vlan); - /* explicitly remove the associated TT local entry because it is marked * with the NOPURGE flag */ batadv_tt_local_remove(bat_priv, bat_priv->soft_iface->dev_addr, vlan->vid, "vlan interface destroyed", false); + batadv_sysfs_del_vlan(bat_priv, vlan); batadv_softif_vlan_free_ref(vlan); } @@ -525,6 +568,8 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, unsigned short vid) { struct batadv_priv *bat_priv = netdev_priv(dev); + struct batadv_softif_vlan *vlan; + int ret; /* only 802.1Q vlans are supported. * batman-adv does not know how to handle other types @@ -534,7 +579,36 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, vid |= BATADV_VLAN_HAS_TAG; - return batadv_softif_create_vlan(bat_priv, vid); + /* if a new vlan is getting created and it already exists, it means that + * it was not deleted yet. batadv_softif_vlan_get() increases the + * refcount in order to revive the object. + * + * if it does not exist then create it. + */ + vlan = batadv_softif_vlan_get(bat_priv, vid); + if (!vlan) + return batadv_softif_create_vlan(bat_priv, vid); + + /* recreate the sysfs object if it was already destroyed (and it should + * be since we received a kill_vid() for this vlan + */ + if (!vlan->kobj) { + ret = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan); + if (ret) { + batadv_softif_vlan_free_ref(vlan); + return ret; + } + } + + /* add a new TT local entry. This one will be marked with the NOPURGE + * flag. 
This must be added again, even if the vlan object already + * exists, because the entry was deleted by kill_vid() + */ + batadv_tt_local_add(bat_priv->soft_iface, + bat_priv->soft_iface->dev_addr, vid, + BATADV_NULL_IFINDEX, BATADV_NO_MARK); + + return 0; } /** @@ -627,10 +701,7 @@ static void batadv_softif_destroy_finish(struct work_struct *work) } batadv_sysfs_del_meshif(soft_iface); - - rtnl_lock(); - unregister_netdevice(soft_iface); - rtnl_unlock(); + unregister_netdev(soft_iface); } /** @@ -667,12 +738,20 @@ static int batadv_softif_init_late(struct net_device *dev) #ifdef CONFIG_BATMAN_ADV_DAT atomic_set(&bat_priv->distributed_arp_table, 1); #endif +#ifdef CONFIG_BATMAN_ADV_MCAST + bat_priv->mcast.flags = BATADV_NO_FLAGS; + atomic_set(&bat_priv->multicast_mode, 1); + atomic_set(&bat_priv->mcast.num_disabled, 0); + atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0); + atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0); + atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0); +#endif atomic_set(&bat_priv->gw_mode, BATADV_GW_MODE_OFF); atomic_set(&bat_priv->gw_sel_class, 20); atomic_set(&bat_priv->gw.bandwidth_down, 100); atomic_set(&bat_priv->gw.bandwidth_up, 20); atomic_set(&bat_priv->orig_interval, 1000); - atomic_set(&bat_priv->hop_penalty, 30); + atomic_set(&bat_priv->hop_penalty, 15); #ifdef CONFIG_BATMAN_ADV_DEBUG atomic_set(&bat_priv->log_level, 0); #endif @@ -691,6 +770,8 @@ static int batadv_softif_init_late(struct net_device *dev) #endif bat_priv->tt.last_changeset = NULL; bat_priv->tt.last_changeset_len = 0; + bat_priv->isolation_mark = 0; + bat_priv->isolation_mark_mask = 0; /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); @@ -835,7 +916,7 @@ static void batadv_softif_init_early(struct net_device *dev) /* generate random address */ eth_hw_addr_random(dev); - SET_ETHTOOL_OPS(dev, &batadv_ethtool_ops); + dev->ethtool_ops = &batadv_ethtool_ops; memset(priv, 0, sizeof(*priv)); } diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 06fc91ff5a0..dbab22fd89a 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_SOFT_INTERFACE_H_ diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 6335433310a..fc47baa888c 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ #include "main.h" @@ -31,12 +29,14 @@ static struct net_device *batadv_kobj_to_netdev(struct kobject *obj) { struct device *dev = container_of(obj->parent, struct device, kobj); + return to_net_dev(dev); } static struct batadv_priv *batadv_kobj_to_batpriv(struct kobject *obj) { struct net_device *net_dev = batadv_kobj_to_netdev(obj); + return netdev_priv(net_dev); } @@ -108,7 +108,7 @@ struct batadv_attribute batadv_attr_vlan_##_name = { \ .mode = _mode }, \ .show = _show, \ .store = _store, \ -}; +} /* Use this, if you have customized show and store functions */ #define BATADV_ATTR(_name, _mode, _show, _store) \ @@ -117,7 +117,7 @@ struct batadv_attribute batadv_attr_##_name = { \ .mode = _mode }, \ .show = _show, \ .store = _store, \ -}; +} #define BATADV_ATTR_SIF_STORE_BOOL(_name, _post_func) \ ssize_t batadv_store_##_name(struct kobject *kobj, \ @@ -126,6 +126,7 @@ ssize_t batadv_store_##_name(struct kobject *kobj, \ { \ struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \ struct batadv_priv *bat_priv = netdev_priv(net_dev); \ + \ return __batadv_store_bool_attr(buff, count, _post_func, attr, \ &bat_priv->_name, net_dev); \ } @@ -135,6 +136,7 @@ ssize_t batadv_show_##_name(struct kobject *kobj, \ struct attribute *attr, char *buff) \ { \ struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); \ + \ return sprintf(buff, "%s\n", \ atomic_read(&bat_priv->_name) == 0 ? \ "disabled" : "enabled"); \ @@ -157,6 +159,7 @@ ssize_t batadv_store_##_name(struct kobject *kobj, \ { \ struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \ struct batadv_priv *bat_priv = netdev_priv(net_dev); \ + \ return __batadv_store_uint_attr(buff, count, _min, _max, \ _post_func, attr, \ &bat_priv->_name, net_dev); \ @@ -167,6 +170,7 @@ ssize_t batadv_show_##_name(struct kobject *kobj, \ struct attribute *attr, char *buff) \ { \ struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); \ + \ return sprintf(buff, "%i\n", atomic_read(&bat_priv->_name)); \ } \ @@ -190,6 +194,7 @@ ssize_t batadv_store_vlan_##_name(struct kobject *kobj, \ size_t res = __batadv_store_bool_attr(buff, count, _post_func, \ attr, &vlan->_name, \ bat_priv->soft_iface); \ + \ batadv_softif_vlan_free_ref(vlan); \ return res; \ } @@ -204,6 +209,7 @@ ssize_t batadv_show_vlan_##_name(struct kobject *kobj, \ size_t res = sprintf(buff, "%s\n", \ atomic_read(&vlan->_name) == 0 ? \ "disabled" : "enabled"); \ + \ batadv_softif_vlan_free_ref(vlan); \ return res; \ } @@ -326,13 +332,15 @@ static ssize_t batadv_show_bat_algo(struct kobject *kobj, struct attribute *attr, char *buff) { struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); + return sprintf(buff, "%s\n", bat_priv->bat_algo_ops->name); } -static void batadv_post_gw_deselect(struct net_device *net_dev) +static void batadv_post_gw_reselect(struct net_device *net_dev) { struct batadv_priv *bat_priv = netdev_priv(net_dev); - batadv_gw_deselect(bat_priv); + + batadv_gw_reselect(bat_priv); } static ssize_t batadv_show_gw_mode(struct kobject *kobj, struct attribute *attr, @@ -408,7 +416,16 @@ static ssize_t batadv_store_gw_mode(struct kobject *kobj, batadv_info(net_dev, "Changing gw mode from: %s to: %s\n", curr_gw_mode_str, buff); - batadv_gw_deselect(bat_priv); + /* Invoking batadv_gw_reselect() is not enough to really de-select the + * current GW. It will only instruct the gateway client code to perform + * a re-election the next time that this is needed. 
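The BATADV_ATTR/BATADV_ATTR_VLAN wrappers are token-pasting macros that stamp out a show/store pair plus the attribute descriptor; leaving the trailing semicolon out of the macro body lets each expansion site supply its own. A small userspace sketch of the same pattern (all names invented, not the kernel API) is:

	#include <stdio.h>

	struct attr {
		const char *name;
		int (*show)(char *buf, int len);
	};

	/* the macro body deliberately ends without ';', so the use site writes it */
	#define DEFINE_ATTR(_name)					\
	static int show_##_name(char *buf, int len)			\
	{								\
		return snprintf(buf, len, "%d\n", _name##_value);	\
	}								\
	static struct attr attr_##_name = {				\
		.name = #_name,						\
		.show = show_##_name,					\
	}

	static int hop_penalty_value = 15;

	DEFINE_ATTR(hop_penalty);

	int main(void)
	{
		char buf[16];

		attr_hop_penalty.show(buf, sizeof(buf));
		printf("%s = %s", attr_hop_penalty.name, buf);
		return 0;
	}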
+ * + * When gw client mode is being switched off the current GW must be + * de-selected explicitly otherwise no GW_ADD uevent is thrown on + * client mode re-activation. This is operation is performed in + * batadv_gw_check_client_stop(). + */ + batadv_gw_reselect(bat_priv); /* always call batadv_gw_check_client_stop() before changing the gateway * state */ @@ -443,6 +460,74 @@ static ssize_t batadv_store_gw_bwidth(struct kobject *kobj, return batadv_gw_bandwidth_set(net_dev, buff, count); } +/** + * batadv_show_isolation_mark - print the current isolation mark/mask + * @kobj: kobject representing the private mesh sysfs directory + * @attr: the batman-adv attribute the user is interacting with + * @buff: the buffer that will contain the data to send back to the user + * + * Returns the number of bytes written into 'buff' on success or a negative + * error code in case of failure + */ +static ssize_t batadv_show_isolation_mark(struct kobject *kobj, + struct attribute *attr, char *buff) +{ + struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); + + return sprintf(buff, "%#.8x/%#.8x\n", bat_priv->isolation_mark, + bat_priv->isolation_mark_mask); +} + +/** + * batadv_store_isolation_mark - parse and store the isolation mark/mask entered + * by the user + * @kobj: kobject representing the private mesh sysfs directory + * @attr: the batman-adv attribute the user is interacting with + * @buff: the buffer containing the user data + * @count: number of bytes in the buffer + * + * Returns 'count' on success or a negative error code in case of failure + */ +static ssize_t batadv_store_isolation_mark(struct kobject *kobj, + struct attribute *attr, char *buff, + size_t count) +{ + struct net_device *net_dev = batadv_kobj_to_netdev(kobj); + struct batadv_priv *bat_priv = netdev_priv(net_dev); + uint32_t mark, mask; + char *mask_ptr; + + /* parse the mask if it has been specified, otherwise assume the mask is + * the biggest possible + */ + mask = 0xFFFFFFFF; + mask_ptr = strchr(buff, '/'); + if (mask_ptr) { + *mask_ptr = '\0'; + mask_ptr++; + + /* the mask must be entered in hex base as it is going to be a + * bitmask and not a prefix length + */ + if (kstrtou32(mask_ptr, 16, &mask) < 0) + return -EINVAL; + } + + /* the mark can be entered in any base */ + if (kstrtou32(buff, 0, &mark) < 0) + return -EINVAL; + + bat_priv->isolation_mark_mask = mask; + /* erase bits not covered by the mask */ + bat_priv->isolation_mark = mark & bat_priv->isolation_mark_mask; + + batadv_info(net_dev, + "New skb mark for extended isolation: %#.8x/%#.8x\n", + bat_priv->isolation_mark, bat_priv->isolation_mark_mask); + + return count; +} + BATADV_ATTR_SIF_BOOL(aggregated_ogms, S_IRUGO | S_IWUSR, NULL); BATADV_ATTR_SIF_BOOL(bonding, S_IRUGO | S_IWUSR, NULL); #ifdef CONFIG_BATMAN_ADV_BLA @@ -461,9 +546,12 @@ BATADV_ATTR_SIF_UINT(orig_interval, S_IRUGO | S_IWUSR, 2 * BATADV_JITTER, BATADV_ATTR_SIF_UINT(hop_penalty, S_IRUGO | S_IWUSR, 0, BATADV_TQ_MAX_VALUE, NULL); BATADV_ATTR_SIF_UINT(gw_sel_class, S_IRUGO | S_IWUSR, 1, BATADV_TQ_MAX_VALUE, - batadv_post_gw_deselect); + batadv_post_gw_reselect); static BATADV_ATTR(gw_bandwidth, S_IRUGO | S_IWUSR, batadv_show_gw_bwidth, batadv_store_gw_bwidth); +#ifdef CONFIG_BATMAN_ADV_MCAST +BATADV_ATTR_SIF_BOOL(multicast_mode, S_IRUGO | S_IWUSR, NULL); +#endif #ifdef CONFIG_BATMAN_ADV_DEBUG BATADV_ATTR_SIF_UINT(log_level, S_IRUGO | S_IWUSR, 0, BATADV_DBG_ALL, NULL); #endif @@ -471,6 +559,8 @@ BATADV_ATTR_SIF_UINT(log_level, S_IRUGO | S_IWUSR, 0, BATADV_DBG_ALL, NULL); 
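batadv_store_isolation_mark() accepts input of the form mark[/mask]: the optional mask is parsed as hex, the mark in any base, and bits outside the mask are discarded before the pair is stored. A runnable userspace approximation of that parsing (using strtoul instead of kstrtou32; names and the sample input are made up) is:

	#include <stdio.h>
	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>

	/* parse "mark" or "mark/mask"; mask defaults to all ones when omitted */
	static int parse_isolation_mark(char *buff, uint32_t *mark, uint32_t *mask)
	{
		char *mask_ptr, *end;

		*mask = 0xFFFFFFFF;

		mask_ptr = strchr(buff, '/');
		if (mask_ptr) {
			*mask_ptr++ = '\0';
			*mask = strtoul(mask_ptr, &end, 16); /* mask is always hex */
			if (*end && *end != '\n')
				return -1;
		}

		*mark = strtoul(buff, &end, 0);              /* mark in any base */
		if (*end && *end != '\n')
			return -1;

		*mark &= *mask;               /* erase bits not covered by the mask */
		return 0;
	}

	int main(void)
	{
		char input[] = "0x12345678/ffffff00";
		uint32_t mark, mask;

		if (!parse_isolation_mark(input, &mark, &mask))
			printf("%#.8x/%#.8x\n", mark, mask); /* 0x12345600/0xffffff00 */
		return 0;
	}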
BATADV_ATTR_SIF_BOOL(network_coding, S_IRUGO | S_IWUSR, batadv_nc_status_update); #endif +static BATADV_ATTR(isolation_mark, S_IRUGO | S_IWUSR, + batadv_show_isolation_mark, batadv_store_isolation_mark); static struct batadv_attribute *batadv_mesh_attrs[] = { &batadv_attr_aggregated_ogms, @@ -481,6 +571,9 @@ static struct batadv_attribute *batadv_mesh_attrs[] = { #ifdef CONFIG_BATMAN_ADV_DAT &batadv_attr_distributed_arp_table, #endif +#ifdef CONFIG_BATMAN_ADV_MCAST + &batadv_attr_multicast_mode, +#endif &batadv_attr_fragmentation, &batadv_attr_routing_algo, &batadv_attr_gw_mode, @@ -494,6 +587,7 @@ static struct batadv_attribute *batadv_mesh_attrs[] = { #ifdef CONFIG_BATMAN_ADV_NC &batadv_attr_network_coding, #endif + &batadv_attr_isolation_mark, NULL, }; diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index c7d725de50a..b715b60db7c 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_SYSFS_H_ diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 4add57d4857..5f59e7f899a 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "main.h" @@ -26,6 +24,7 @@ #include "originator.h" #include "routing.h" #include "bridge_loop_avoidance.h" +#include "multicast.h" #include <linux/crc32c.h> @@ -51,7 +50,7 @@ static int batadv_compare_tt(const struct hlist_node *node, const void *data2) const void *data1 = container_of(node, struct batadv_tt_common_entry, hash_entry); - return (memcmp(data1, data2, ETH_ALEN) == 0 ? 1 : 0); + return batadv_compare_eth(data1, data2); } /** @@ -98,7 +97,7 @@ batadv_tt_hash_find(struct batadv_hashtable *hash, const uint8_t *addr, if (!hash) return NULL; - memcpy(to_search.addr, addr, ETH_ALEN); + ether_addr_copy(to_search.addr, addr); to_search.vid = vid; index = batadv_choose_tt(&to_search, hash->size); @@ -194,6 +193,31 @@ batadv_tt_global_entry_free_ref(struct batadv_tt_global_entry *tt_global_entry) } } +/** + * batadv_tt_global_hash_count - count the number of orig entries + * @hash: hash table containing the tt entries + * @addr: the mac address of the client to count entries for + * @vid: VLAN identifier + * + * Return the number of originators advertising the given address/data + * (excluding ourself). 
+ */ +int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, + const uint8_t *addr, unsigned short vid) +{ + struct batadv_tt_global_entry *tt_global_entry; + int count; + + tt_global_entry = batadv_tt_global_hash_find(bat_priv, addr, vid); + if (!tt_global_entry) + return 0; + + count = atomic_read(&tt_global_entry->orig_list_count); + batadv_tt_global_entry_free_ref(tt_global_entry); + + return count; +} + static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu) { struct batadv_tt_orig_list_entry *orig_entry; @@ -333,8 +357,9 @@ static void batadv_tt_local_event(struct batadv_priv *bat_priv, return; tt_change_node->change.flags = flags; - tt_change_node->change.reserved = 0; - memcpy(tt_change_node->change.addr, common->addr, ETH_ALEN); + memset(tt_change_node->change.reserved, 0, + sizeof(tt_change_node->change.reserved)); + ether_addr_copy(tt_change_node->change.addr, common->addr); tt_change_node->change.vid = htons(common->vid); del_op_requested = flags & BATADV_TT_CLIENT_DEL; @@ -475,27 +500,33 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv, * @vid: VLAN identifier * @ifindex: index of the interface where the client is connected to (useful to * identify wireless clients) + * @mark: the value contained in the skb->mark field of the received packet (if + * any) * * Returns true if the client was successfully added, false otherwise. */ bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, - unsigned short vid, int ifindex) + unsigned short vid, int ifindex, uint32_t mark) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_tt_local_entry *tt_local; - struct batadv_tt_global_entry *tt_global; + struct batadv_tt_global_entry *tt_global = NULL; + struct batadv_softif_vlan *vlan; struct net_device *in_dev = NULL; struct hlist_head *head; struct batadv_tt_orig_list_entry *orig_entry; int hash_added, table_size, packet_size_max; bool ret = false, roamed_back = false; uint8_t remote_flags; + uint32_t match_mark; if (ifindex != BATADV_NULL_IFINDEX) in_dev = dev_get_by_index(&init_net, ifindex); tt_local = batadv_tt_local_hash_find(bat_priv, addr, vid); - tt_global = batadv_tt_global_hash_find(bat_priv, addr, vid); + + if (!is_multicast_ether_addr(addr)) + tt_global = batadv_tt_global_hash_find(bat_priv, addr, vid); if (tt_local) { tt_local->last_seen = jiffies; @@ -542,12 +573,15 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, if (!tt_local) goto out; + /* increase the refcounter of the related vlan */ + vlan = batadv_softif_vlan_get(bat_priv, vid); + batadv_dbg(BATADV_DBG_TT, bat_priv, "Creating new local tt entry: %pM (vid: %d, ttvn: %d)\n", addr, BATADV_PRINT_VID(vid), (uint8_t)atomic_read(&bat_priv->tt.vn)); - memcpy(tt_local->common.addr, addr, ETH_ALEN); + ether_addr_copy(tt_local->common.addr, addr); /* The local entry has to be marked as NEW to avoid to send it in * a full table response going out before the next ttvn increment * (consistency check) @@ -560,8 +594,11 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, tt_local->last_seen = jiffies; tt_local->common.added_at = tt_local->last_seen; - /* the batman interface mac address should never be purged */ - if (batadv_compare_eth(addr, soft_iface->dev_addr)) + /* the batman interface mac and multicast addresses should never be + * purged + */ + if (batadv_compare_eth(addr, soft_iface->dev_addr) || + is_multicast_ether_addr(addr)) tt_local->common.flags |= BATADV_TT_CLIENT_NOPURGE; hash_added = 
batadv_hash_add(bat_priv->tt.local_hash, batadv_compare_tt, @@ -571,6 +608,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, if (unlikely(hash_added != 0)) { /* remove the reference for the hash */ batadv_tt_local_entry_free_ref(tt_local); + batadv_softif_vlan_free_ref(vlan); goto out; } @@ -614,6 +652,17 @@ check_roaming: else tt_local->common.flags &= ~BATADV_TT_CLIENT_WIFI; + /* check the mark in the skb: if it's equal to the configured + * isolation_mark, it means the packet is coming from an isolated + * non-mesh client + */ + match_mark = (mark & bat_priv->isolation_mark_mask); + if (bat_priv->isolation_mark_mask && + match_mark == bat_priv->isolation_mark) + tt_local->common.flags |= BATADV_TT_CLIENT_ISOLA; + else + tt_local->common.flags &= ~BATADV_TT_CLIENT_ISOLA; + /* if any "dynamic" flag has been modified, resend an ADD event for this * entry so that all the nodes can get the new flags */ @@ -874,7 +923,7 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) seq_printf(seq, "Locally retrieved addresses (from %s) announced via TT (TTVN: %u):\n", net_dev->name, (uint8_t)atomic_read(&bat_priv->tt.vn)); - seq_printf(seq, " %-13s %s %-7s %-9s (%-10s)\n", "Client", "VID", + seq_printf(seq, " %-13s %s %-8s %-9s (%-10s)\n", "Client", "VID", "Flags", "Last seen", "CRC"); for (i = 0; i < hash->size; i++) { @@ -902,7 +951,7 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) } seq_printf(seq, - " * %pM %4i [%c%c%c%c%c] %3u.%03u (%#.8x)\n", + " * %pM %4i [%c%c%c%c%c%c] %3u.%03u (%#.8x)\n", tt_common_entry->addr, BATADV_PRINT_VID(tt_common_entry->vid), (tt_common_entry->flags & @@ -914,6 +963,8 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) BATADV_TT_CLIENT_PENDING ? 'X' : '.'), (tt_common_entry->flags & BATADV_TT_CLIENT_WIFI ? 'W' : '.'), + (tt_common_entry->flags & + BATADV_TT_CLIENT_ISOLA ? 'I' : '.'), no_purge ? 0 : last_seen_secs, no_purge ? 
0 : last_seen_msecs, vlan->tt.crc); @@ -963,6 +1014,7 @@ uint16_t batadv_tt_local_remove(struct batadv_priv *bat_priv, { struct batadv_tt_local_entry *tt_local_entry; uint16_t flags, curr_flags = BATADV_NO_FLAGS; + struct batadv_softif_vlan *vlan; tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid); if (!tt_local_entry) @@ -993,6 +1045,11 @@ uint16_t batadv_tt_local_remove(struct batadv_priv *bat_priv, hlist_del_rcu(&tt_local_entry->common.hash_entry); batadv_tt_local_entry_free_ref(tt_local_entry); + /* decrease the reference held for this vlan */ + vlan = batadv_softif_vlan_get(bat_priv, vid); + batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_free_ref(vlan); + out: if (tt_local_entry) batadv_tt_local_entry_free_ref(tt_local_entry); @@ -1065,6 +1122,7 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) spinlock_t *list_lock; /* protects write access to the hash lists */ struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_local_entry *tt_local; + struct batadv_softif_vlan *vlan; struct hlist_node *node_tmp; struct hlist_head *head; uint32_t i; @@ -1085,6 +1143,13 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) tt_local = container_of(tt_common_entry, struct batadv_tt_local_entry, common); + + /* decrease the reference held for this vlan */ + vlan = batadv_softif_vlan_get(bat_priv, + tt_common_entry->vid); + batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_free_ref(vlan); + batadv_tt_local_entry_free_ref(tt_local); } spin_unlock_bh(list_lock); @@ -1204,6 +1269,8 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, hlist_add_head_rcu(&orig_entry->list, &tt_global->orig_list); spin_unlock_bh(&tt_global->list_lock); + atomic_inc(&tt_global->orig_list_count); + out: if (orig_entry) batadv_tt_orig_list_entry_free_ref(orig_entry); @@ -1262,7 +1329,7 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, goto out; common = &tt_global_entry->common; - memcpy(common->addr, tt_addr, ETH_ALEN); + ether_addr_copy(common->addr, tt_addr); common->vid = vid; common->flags = flags; @@ -1277,6 +1344,7 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, common->added_at = jiffies; INIT_HLIST_HEAD(&tt_global_entry->orig_list); + atomic_set(&tt_global_entry->orig_list_count, 0); spin_lock_init(&tt_global_entry->list_lock); hash_added = batadv_hash_add(bat_priv->tt.global_hash, @@ -1346,6 +1414,11 @@ add_orig_entry: ret = true; out_remove: + /* Do not remove multicast addresses from the local hash on + * global additions + */ + if (is_multicast_ether_addr(tt_addr)) + goto out; /* remove address from local hash if present */ local_flags = batadv_tt_local_remove(bat_priv, tt_addr, vid, @@ -1367,7 +1440,8 @@ out: return ret; } -/* batadv_transtable_best_orig - Get best originator list entry from tt entry +/** + * batadv_transtable_best_orig - Get best originator list entry from tt entry * @bat_priv: the bat priv with all the soft interface information * @tt_global_entry: global translation table entry to be analyzed * @@ -1385,12 +1459,14 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv, head = &tt_global_entry->orig_list; hlist_for_each_entry_rcu(orig_entry, head, list) { - router = batadv_orig_node_get_router(orig_entry->orig_node); + router = batadv_orig_router_get(orig_entry->orig_node, + BATADV_IF_DEFAULT); if (!router) continue; if (best_router && - bao->bat_neigh_cmp(router, best_router) <= 0) { + bao->bat_neigh_cmp(router, BATADV_IF_DEFAULT, + best_router, 
BATADV_IF_DEFAULT) <= 0) { batadv_neigh_node_free_ref(router); continue; } @@ -1409,8 +1485,9 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv, return best_entry; } -/* batadv_tt_global_print_entry - print all orig nodes who announce the address - * for this global entry +/** + * batadv_tt_global_print_entry - print all orig nodes who announce the address + * for this global entry * @bat_priv: the bat priv with all the soft interface information * @tt_global_entry: global translation table entry to be printed * @seq: debugfs table seq_file struct @@ -1446,13 +1523,14 @@ batadv_tt_global_print_entry(struct batadv_priv *bat_priv, last_ttvn = atomic_read(&best_entry->orig_node->last_ttvn); seq_printf(seq, - " %c %pM %4i (%3u) via %pM (%3u) (%#.8x) [%c%c%c]\n", + " %c %pM %4i (%3u) via %pM (%3u) (%#.8x) [%c%c%c%c]\n", '*', tt_global_entry->common.addr, BATADV_PRINT_VID(tt_global_entry->common.vid), best_entry->ttvn, best_entry->orig_node->orig, last_ttvn, vlan->tt.crc, (flags & BATADV_TT_CLIENT_ROAM ? 'R' : '.'), (flags & BATADV_TT_CLIENT_WIFI ? 'W' : '.'), + (flags & BATADV_TT_CLIENT_ISOLA ? 'I' : '.'), (flags & BATADV_TT_CLIENT_TEMP ? 'T' : '.')); batadv_orig_node_vlan_free_ref(vlan); @@ -1477,13 +1555,14 @@ print_list: last_ttvn = atomic_read(&orig_entry->orig_node->last_ttvn); seq_printf(seq, - " %c %pM %4d (%3u) via %pM (%3u) (%#.8x) [%c%c%c]\n", + " %c %pM %4d (%3u) via %pM (%3u) (%#.8x) [%c%c%c%c]\n", '+', tt_global_entry->common.addr, BATADV_PRINT_VID(tt_global_entry->common.vid), orig_entry->ttvn, orig_entry->orig_node->orig, last_ttvn, vlan->tt.crc, (flags & BATADV_TT_CLIENT_ROAM ? 'R' : '.'), (flags & BATADV_TT_CLIENT_WIFI ? 'W' : '.'), + (flags & BATADV_TT_CLIENT_ISOLA ? 'I' : '.'), (flags & BATADV_TT_CLIENT_TEMP ? 'T' : '.')); batadv_orig_node_vlan_free_ref(vlan); @@ -1531,6 +1610,25 @@ out: return 0; } +/** + * batadv_tt_global_del_orig_entry - remove and free an orig_entry + * @tt_global_entry: the global entry to remove the orig_entry from + * @orig_entry: the orig entry to remove and free + * + * Remove an orig_entry from its list in the given tt_global_entry and + * free this orig_entry afterwards. 
+ */ +static void +batadv_tt_global_del_orig_entry(struct batadv_tt_global_entry *tt_global_entry, + struct batadv_tt_orig_list_entry *orig_entry) +{ + batadv_tt_global_size_dec(orig_entry->orig_node, + tt_global_entry->common.vid); + atomic_dec(&tt_global_entry->orig_list_count); + hlist_del_rcu(&orig_entry->list); + batadv_tt_orig_list_entry_free_ref(orig_entry); +} + /* deletes the orig list of a tt_global_entry */ static void batadv_tt_global_del_orig_list(struct batadv_tt_global_entry *tt_global_entry) @@ -1541,20 +1639,26 @@ batadv_tt_global_del_orig_list(struct batadv_tt_global_entry *tt_global_entry) spin_lock_bh(&tt_global_entry->list_lock); head = &tt_global_entry->orig_list; - hlist_for_each_entry_safe(orig_entry, safe, head, list) { - hlist_del_rcu(&orig_entry->list); - batadv_tt_global_size_dec(orig_entry->orig_node, - tt_global_entry->common.vid); - batadv_tt_orig_list_entry_free_ref(orig_entry); - } + hlist_for_each_entry_safe(orig_entry, safe, head, list) + batadv_tt_global_del_orig_entry(tt_global_entry, orig_entry); spin_unlock_bh(&tt_global_entry->list_lock); } +/** + * batadv_tt_global_del_orig_node - remove orig_node from a global tt entry + * @bat_priv: the bat priv with all the soft interface information + * @tt_global_entry: the global entry to remove the orig_node from + * @orig_node: the originator announcing the client + * @message: message to append to the log on deletion + * + * Remove the given orig_node and its according orig_entry from the given + * global tt entry. + */ static void -batadv_tt_global_del_orig_entry(struct batadv_priv *bat_priv, - struct batadv_tt_global_entry *tt_global_entry, - struct batadv_orig_node *orig_node, - const char *message) +batadv_tt_global_del_orig_node(struct batadv_priv *bat_priv, + struct batadv_tt_global_entry *tt_global_entry, + struct batadv_orig_node *orig_node, + const char *message) { struct hlist_head *head; struct hlist_node *safe; @@ -1571,10 +1675,8 @@ batadv_tt_global_del_orig_entry(struct batadv_priv *bat_priv, orig_node->orig, tt_global_entry->common.addr, BATADV_PRINT_VID(vid), message); - hlist_del_rcu(&orig_entry->list); - batadv_tt_global_size_dec(orig_node, - tt_global_entry->common.vid); - batadv_tt_orig_list_entry_free_ref(orig_entry); + batadv_tt_global_del_orig_entry(tt_global_entry, + orig_entry); } } spin_unlock_bh(&tt_global_entry->list_lock); @@ -1616,8 +1718,8 @@ batadv_tt_global_del_roaming(struct batadv_priv *bat_priv, /* there is another entry, we can simply delete this * one and can still use the other one. 
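batadv_tt_global_del_orig_entry() decrements orig_list_count, mirroring the atomic_inc() performed when an orig entry is attached, so batadv_tt_global_hash_count() can report the number of announcing originators without walking the list. A plain C sketch of that counted-list bookkeeping (no RCU or locking, invented names, illustration only):

	#include <stdio.h>
	#include <stdlib.h>

	struct orig_entry {
		struct orig_entry *next;
		int orig_id;
	};

	struct global_entry {
		struct orig_entry *orig_list;
		int orig_list_count;   /* models atomic_t orig_list_count */
	};

	static void orig_entry_add(struct global_entry *tt, int orig_id)
	{
		struct orig_entry *e = malloc(sizeof(*e));

		if (!e)
			return;
		e->orig_id = orig_id;
		e->next = tt->orig_list;
		tt->orig_list = e;
		tt->orig_list_count++;          /* kept in sync on every add... */
	}

	static void orig_entry_del(struct global_entry *tt, int orig_id)
	{
		struct orig_entry **p;

		for (p = &tt->orig_list; *p; p = &(*p)->next)
			if ((*p)->orig_id == orig_id) {
				struct orig_entry *e = *p;

				*p = e->next;
				free(e);
				tt->orig_list_count--;  /* ...and on every delete */
				return;
			}
	}

	int main(void)
	{
		struct global_entry tt = { 0 };

		orig_entry_add(&tt, 1);
		orig_entry_add(&tt, 2);
		orig_entry_del(&tt, 1);
		/* counter answers "how many originators announce this client"
		 * without traversing orig_list
		 */
		printf("%d\n", tt.orig_list_count);
		return 0;
	}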
*/ - batadv_tt_global_del_orig_entry(bat_priv, tt_global_entry, - orig_node, message); + batadv_tt_global_del_orig_node(bat_priv, tt_global_entry, + orig_node, message); } /** @@ -1643,8 +1745,8 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv, goto out; if (!roaming) { - batadv_tt_global_del_orig_entry(bat_priv, tt_global_entry, - orig_node, message); + batadv_tt_global_del_orig_node(bat_priv, tt_global_entry, + orig_node, message); if (hlist_empty(&tt_global_entry->orig_list)) batadv_tt_global_free(bat_priv, tt_global_entry, @@ -1727,8 +1829,8 @@ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_tt_global_entry, common); - batadv_tt_global_del_orig_entry(bat_priv, tt_global, - orig_node, message); + batadv_tt_global_del_orig_node(bat_priv, tt_global, + orig_node, message); if (hlist_empty(&tt_global->orig_list)) { vid = tt_global->common.vid; @@ -1742,7 +1844,7 @@ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, } spin_unlock_bh(list_lock); } - orig_node->tt_initialised = false; + orig_node->capa_initialized &= ~BATADV_ORIG_CAPA_HAS_TT; } static bool batadv_tt_global_to_purge(struct batadv_tt_global_entry *tt_global, @@ -1852,6 +1954,11 @@ _batadv_is_ap_isolated(struct batadv_tt_local_entry *tt_local_entry, tt_global_entry->common.flags & BATADV_TT_CLIENT_WIFI) ret = true; + /* check if the two clients are marked as isolated */ + if (tt_local_entry->common.flags & BATADV_TT_CLIENT_ISOLA && + tt_global_entry->common.flags & BATADV_TT_CLIENT_ISOLA) + ret = true; + return ret; } @@ -1878,19 +1985,8 @@ struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, struct batadv_tt_global_entry *tt_global_entry = NULL; struct batadv_orig_node *orig_node = NULL; struct batadv_tt_orig_list_entry *best_entry; - bool ap_isolation_enabled = false; - struct batadv_softif_vlan *vlan; - - /* if the AP isolation is requested on a VLAN, then check for its - * setting in the proper VLAN private data structure - */ - vlan = batadv_softif_vlan_get(bat_priv, vid); - if (vlan) { - ap_isolation_enabled = atomic_read(&vlan->ap_isolation); - batadv_softif_vlan_free_ref(vlan); - } - if (src && ap_isolation_enabled) { + if (src && batadv_vlan_ap_isola_get(bat_priv, vid)) { tt_local_entry = batadv_tt_local_hash_find(bat_priv, src, vid); if (!tt_local_entry || (tt_local_entry->common.flags & BATADV_TT_CLIENT_PENDING)) @@ -1960,6 +2056,7 @@ static uint32_t batadv_tt_global_crc(struct batadv_priv *bat_priv, struct hlist_head *head; uint32_t i, crc_tmp, crc = 0; uint8_t flags; + __be16 tmp_vid; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -1996,8 +2093,11 @@ static uint32_t batadv_tt_global_crc(struct batadv_priv *bat_priv, orig_node)) continue; - crc_tmp = crc32c(0, &tt_common->vid, - sizeof(tt_common->vid)); + /* use network order to read the VID: this ensures that + * every node reads the bytes in the same order. 
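The CRC hunks above switch from hashing tt_common->vid in host byte order to hashing htons(vid), so little- and big-endian nodes feed identical bytes into crc32c() and arrive at the same TT checksum. The toy program below (a stand-in checksum, not crc32c; purely illustrative) shows why the conversion matters:

	#include <stdio.h>
	#include <stddef.h>
	#include <stdint.h>
	#include <arpa/inet.h>  /* htons */

	/* toy checksum standing in for crc32c(): what matters is that it
	 * consumes the VID as raw bytes, exactly like the kernel code does
	 */
	static uint32_t toy_csum(uint32_t seed, const void *data, size_t len)
	{
		const uint8_t *p = data;
		uint32_t crc = seed;

		while (len--)
			crc = (crc << 5) + crc + *p++;
		return crc;
	}

	int main(void)
	{
		uint16_t vid = 0x0123;         /* host byte order */
		uint16_t be_vid = htons(vid);  /* network byte order */

		/* hashing host-order bytes differs between little- and big-endian
		 * nodes; hashing the htons() value is identical everywhere, which
		 * is what a distributed TT CRC needs
		 */
		printf("host order   : %#x\n", (unsigned)toy_csum(0, &vid, sizeof(vid)));
		printf("network order: %#x\n", (unsigned)toy_csum(0, &be_vid, sizeof(be_vid)));
		return 0;
	}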
+ */ + tmp_vid = htons(tt_common->vid); + crc_tmp = crc32c(0, &tmp_vid, sizeof(tmp_vid)); /* compute the CRC on flags that have to be kept in sync * among nodes @@ -2031,6 +2131,7 @@ static uint32_t batadv_tt_local_crc(struct batadv_priv *bat_priv, struct hlist_head *head; uint32_t i, crc_tmp, crc = 0; uint8_t flags; + __be16 tmp_vid; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -2049,8 +2150,11 @@ static uint32_t batadv_tt_local_crc(struct batadv_priv *bat_priv, if (tt_common->flags & BATADV_TT_CLIENT_NEW) continue; - crc_tmp = crc32c(0, &tt_common->vid, - sizeof(tt_common->vid)); + /* use network order to read the VID: this ensures that + * every node reads the bytes in the same order. + */ + tmp_vid = htons(tt_common->vid); + crc_tmp = crc32c(0, &tmp_vid, sizeof(tmp_vid)); /* compute the CRC on flags that have to be kept in sync * among nodes @@ -2137,7 +2241,7 @@ batadv_new_tt_req_node(struct batadv_priv *bat_priv, if (!tt_req_node) goto unlock; - memcpy(tt_req_node->addr, orig_node->orig, ETH_ALEN); + ether_addr_copy(tt_req_node->addr, orig_node->orig); tt_req_node->issued_at = jiffies; list_add(&tt_req_node->list, &bat_priv->tt.req_list); @@ -2217,11 +2321,11 @@ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv, if ((valid_cb) && (!valid_cb(tt_common_entry, cb_data))) continue; - memcpy(tt_change->addr, tt_common_entry->addr, - ETH_ALEN); + ether_addr_copy(tt_change->addr, tt_common_entry->addr); tt_change->flags = tt_common_entry->flags; tt_change->vid = htons(tt_common_entry->vid); - tt_change->reserved = 0; + memset(tt_change->reserved, 0, + sizeof(tt_change->reserved)); tt_num_entries++; tt_change++; @@ -2246,6 +2350,7 @@ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node, { struct batadv_tvlv_tt_vlan_data *tt_vlan_tmp; struct batadv_orig_node_vlan *vlan; + uint32_t crc; int i; /* check if each received CRC matches the locally stored one */ @@ -2265,7 +2370,10 @@ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node, if (!vlan) return false; - if (vlan->tt.crc != ntohl(tt_vlan_tmp->crc)) + crc = vlan->tt.crc; + batadv_orig_node_vlan_free_ref(vlan); + + if (crc != ntohl(tt_vlan_tmp->crc)) return false; } @@ -2696,7 +2804,7 @@ static void _batadv_tt_update_changes(struct batadv_priv *bat_priv, return; } } - orig_node->tt_initialised = true; + orig_node->capa_initialized |= BATADV_ORIG_CAPA_HAS_TT; } static void batadv_tt_fill_gtable(struct batadv_priv *bat_priv, @@ -2904,7 +3012,7 @@ static bool batadv_tt_check_roam_count(struct batadv_priv *bat_priv, tt_roam_node->first_time = jiffies; atomic_set(&tt_roam_node->counter, BATADV_ROAMING_MAX_COUNT - 1); - memcpy(tt_roam_node->addr, client, ETH_ALEN); + ether_addr_copy(tt_roam_node->addr, client); list_add(&tt_roam_node->list, &bat_priv->tt.roam_list); ret = true; @@ -3050,6 +3158,7 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common; struct batadv_tt_local_entry *tt_local; + struct batadv_softif_vlan *vlan; struct hlist_node *node_tmp; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ @@ -3078,6 +3187,12 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) tt_local = container_of(tt_common, struct batadv_tt_local_entry, common); + + /* decrease the reference held for this vlan */ + vlan = batadv_softif_vlan_get(bat_priv, tt_common->vid); + 
batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_free_ref(vlan); + batadv_tt_local_entry_free_ref(tt_local); } spin_unlock_bh(list_lock); @@ -3093,6 +3208,9 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) */ static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv) { + /* Update multicast addresses in local translation table */ + batadv_mcast_mla_update(bat_priv); + if (atomic_read(&bat_priv->tt.local_changes) < 1) { if (!batadv_atomic_dec_not_zero(&bat_priv->tt.ogm_append_cnt)) batadv_tt_tvlv_container_update(bat_priv); @@ -3183,13 +3301,15 @@ static void batadv_tt_update_orig(struct batadv_priv *bat_priv, uint8_t orig_ttvn = (uint8_t)atomic_read(&orig_node->last_ttvn); struct batadv_tvlv_tt_vlan_data *tt_vlan; bool full_table = true; + bool has_tt_init; tt_vlan = (struct batadv_tvlv_tt_vlan_data *)tt_buff; + has_tt_init = orig_node->capa_initialized & BATADV_ORIG_CAPA_HAS_TT; + /* orig table not initialised AND first diff is in the OGM OR the ttvn * increased by one -> we can apply the attached changes */ - if ((!orig_node->tt_initialised && ttvn == 1) || - ttvn - orig_ttvn == 1) { + if ((!has_tt_init && ttvn == 1) || ttvn - orig_ttvn == 1) { /* the OGM could not contain the changes due to their size or * because they have already been sent BATADV_TT_OGM_APPEND_MAX * times. @@ -3202,7 +3322,6 @@ static void batadv_tt_update_orig(struct batadv_priv *bat_priv, spin_lock_bh(&orig_node->tt_lock); - tt_change = (struct batadv_tvlv_tt_change *)tt_buff; batadv_tt_update_changes(bat_priv, orig_node, tt_num_changes, ttvn, tt_change); @@ -3230,7 +3349,7 @@ static void batadv_tt_update_orig(struct batadv_priv *bat_priv, /* if we missed more than one change or our tables are not * in sync anymore -> request fresh tt data */ - if (!orig_node->tt_initialised || ttvn != orig_ttvn || + if (!has_tt_init || ttvn != orig_ttvn || !batadv_tt_global_check_crc(orig_node, tt_vlan, tt_num_vlan)) { request_table: @@ -3565,3 +3684,29 @@ int batadv_tt_init(struct batadv_priv *bat_priv) return 1; } + +/** + * batadv_tt_global_is_isolated - check if a client is marked as isolated + * @bat_priv: the bat priv with all the soft interface information + * @addr: the mac address of the client + * @vid: the identifier of the VLAN where this client is connected + * + * Returns true if the client is marked with the TT_CLIENT_ISOLA flag, false + * otherwise + */ +bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, + const uint8_t *addr, unsigned short vid) +{ + struct batadv_tt_global_entry *tt; + bool ret; + + tt = batadv_tt_global_hash_find(bat_priv, addr, vid); + if (!tt) + return false; + + ret = tt->common.flags & BATADV_TT_CLIENT_ISOLA; + + batadv_tt_global_entry_free_ref(tt); + + return ret; +} diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 026b1ffa674..ad84d7b89e3 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
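batadv_tt_global_is_isolated() exposes the TT_CLIENT_ISOLA flag so the receive path can re-mark multicast frames, while _batadv_is_ap_isolated() now blocks unicast traffic when both endpoints carry the flag, in addition to the existing wifi-to-wifi rule. A compact model of that decision (flag bit values are illustrative only, not the kernel constants):

	#include <stdbool.h>
	#include <stdio.h>
	#include <stdint.h>

	#define TT_CLIENT_WIFI   (1 << 4)  /* illustrative bit positions */
	#define TT_CLIENT_ISOLA  (1 << 5)

	/* traffic between two clients is blocked if both are wifi clients
	 * (classic AP isolation) or both carry the "isolated" flag derived
	 * from the skb mark
	 */
	static bool is_ap_isolated(uint16_t src_flags, uint16_t dst_flags)
	{
		if ((src_flags & TT_CLIENT_WIFI) && (dst_flags & TT_CLIENT_WIFI))
			return true;
		if ((src_flags & TT_CLIENT_ISOLA) && (dst_flags & TT_CLIENT_ISOLA))
			return true;
		return false;
	}

	int main(void)
	{
		printf("%d\n", is_ap_isolated(TT_CLIENT_ISOLA, TT_CLIENT_ISOLA)); /* 1 */
		printf("%d\n", is_ap_isolated(TT_CLIENT_ISOLA, TT_CLIENT_WIFI));  /* 0 */
		return 0;
	}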
*/ #ifndef _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ @@ -22,7 +20,7 @@ int batadv_tt_init(struct batadv_priv *bat_priv); bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, - unsigned short vid, int ifindex); + unsigned short vid, int ifindex, uint32_t mark); uint16_t batadv_tt_local_remove(struct batadv_priv *bat_priv, const uint8_t *addr, unsigned short vid, const char *message, bool roaming); @@ -31,6 +29,8 @@ int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset); void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, int32_t match_vid, const char *message); +int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, + const uint8_t *addr, unsigned short vid); struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, const uint8_t *src, const uint8_t *addr, @@ -50,5 +50,7 @@ bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *addr, unsigned short vid); +bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, + const uint8_t *addr, unsigned short vid); #endif /* _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ */ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 91dd369b0ff..8854c05622a 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -12,9 +12,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_TYPES_H_ @@ -26,8 +24,9 @@ #ifdef CONFIG_BATMAN_ADV_DAT -/* batadv_dat_addr_t is the type used for all DHT addresses. If it is changed, - * BATADV_DAT_ADDR_MAX is changed as well. +/** + * batadv_dat_addr_t - it is the type used for all DHT addresses. If it is + * changed, BATADV_DAT_ADDR_MAX is changed as well. 
* * *Please be careful: batadv_dat_addr_t must be UNSIGNED* */ @@ -36,6 +35,18 @@ #endif /* CONFIG_BATMAN_ADV_DAT */ /** + * enum batadv_dhcp_recipient - dhcp destination + * @BATADV_DHCP_NO: packet is not a dhcp message + * @BATADV_DHCP_TO_SERVER: dhcp message is directed to a server + * @BATADV_DHCP_TO_CLIENT: dhcp message is directed to a client + */ +enum batadv_dhcp_recipient { + BATADV_DHCP_NO = 0, + BATADV_DHCP_TO_SERVER, + BATADV_DHCP_TO_CLIENT, +}; + +/** * BATADV_TT_REMOTE_MASK - bitmask selecting the flags that are sent over the * wire only */ @@ -74,6 +85,7 @@ struct batadv_hard_iface_bat_iv { * @rcu: struct used for freeing in an RCU-safe manner * @bat_iv: BATMAN IV specific per hard interface data * @cleanup_work: work queue callback item for hard interface deinit + * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs */ struct batadv_hard_iface { struct list_head list; @@ -88,6 +100,29 @@ struct batadv_hard_iface { struct rcu_head rcu; struct batadv_hard_iface_bat_iv bat_iv; struct work_struct cleanup_work; + struct dentry *debug_dir; +}; + +/** + * struct batadv_orig_ifinfo - originator info per outgoing interface + * @list: list node for orig_node::ifinfo_list + * @if_outgoing: pointer to outgoing hard interface + * @router: router that should be used to reach this originator + * @last_real_seqno: last and best known sequence number + * @last_ttl: ttl of last received packet + * @batman_seqno_reset: time when the batman seqno window was reset + * @refcount: number of contexts the object is used + * @rcu: struct used for freeing in an RCU-safe manner + */ +struct batadv_orig_ifinfo { + struct hlist_node list; + struct batadv_hard_iface *if_outgoing; + struct batadv_neigh_node __rcu *router; /* rcu protected pointer */ + uint32_t last_real_seqno; + uint8_t last_ttl; + unsigned long batman_seqno_reset; + atomic_t refcount; + struct rcu_head rcu; }; /** @@ -129,7 +164,7 @@ struct batadv_vlan_tt { }; /** - * batadv_orig_node_vlan - VLAN specific data per orig_node + * struct batadv_orig_node_vlan - VLAN specific data per orig_node * @vid: the VLAN identifier * @tt: VLAN specific TT attributes * @list: list node for orig_node::vlan_list @@ -165,36 +200,36 @@ struct batadv_orig_bat_iv { * struct batadv_orig_node - structure for orig_list maintaining nodes of mesh * @orig: originator ethernet address * @primary_addr: hosts primary interface address - * @router: router that should be used to reach this originator + * @ifinfo_list: list for routers per outgoing interface + * @last_bonding_candidate: pointer to last ifinfo of last used router * @batadv_dat_addr_t: address of the orig node in the distributed hash * @last_seen: time when last packet from this node was received * @bcast_seqno_reset: time when the broadcast seqno window was reset - * @batman_seqno_reset: time when the batman seqno window was reset + * @mcast_flags: multicast flags announced by the orig node + * @mcast_want_all_unsnoop_node: a list node for the + * mcast.want_all_unsnoopables list + * @mcast_want_all_ipv4_node: a list node for the mcast.want_all_ipv4 list + * @mcast_want_all_ipv6_node: a list node for the mcast.want_all_ipv6 list * @capabilities: announced capabilities of this originator + * @capa_initialized: bitfield to remember whether a capability was initialized * @last_ttvn: last seen translation table version number * @tt_buff: last tt changeset this node received from the orig node * @tt_buff_len: length of the last tt changeset this node received from the * orig node * 
@tt_buff_lock: lock that protects tt_buff and tt_buff_len - * @tt_initialised: bool keeping track of whether or not this node have received - * any translation table information from the orig node yet * @tt_lock: prevents from updating the table while reading it. Table update is * made up by two operations (data structure update and metdata -CRC/TTVN- * recalculation) and they have to be executed atomically in order to avoid * another thread to read the table/metadata between those. - * @last_real_seqno: last and best known sequence number - * @last_ttl: ttl of last received packet * @bcast_bits: bitfield containing the info which payload broadcast originated * from this orig node this host already has seen (relative to * last_bcast_seqno) * @last_bcast_seqno: last broadcast sequence number received by this host * @neigh_list: list of potential next hop neighbor towards this orig node - * @neigh_list_lock: lock protecting neigh_list, router and bonding_list + * @neigh_list_lock: lock protecting neigh_list and router * @hash_entry: hlist node for batadv_priv::orig_hash * @bat_priv: pointer to soft_iface this orig node belongs to * @bcast_seqno_lock: lock protecting bcast_bits & last_bcast_seqno - * @bond_candidates: how many candidates are available - * @bond_list: list of bonding candidates * @refcount: number of contexts the object is used * @rcu: struct used for freeing in an RCU-safe manner * @in_coding_list: list of nodes this orig can hear @@ -210,34 +245,36 @@ struct batadv_orig_bat_iv { struct batadv_orig_node { uint8_t orig[ETH_ALEN]; uint8_t primary_addr[ETH_ALEN]; - struct batadv_neigh_node __rcu *router; /* rcu protected pointer */ + struct hlist_head ifinfo_list; + struct batadv_orig_ifinfo *last_bonding_candidate; #ifdef CONFIG_BATMAN_ADV_DAT batadv_dat_addr_t dat_addr; #endif unsigned long last_seen; unsigned long bcast_seqno_reset; - unsigned long batman_seqno_reset; +#ifdef CONFIG_BATMAN_ADV_MCAST + uint8_t mcast_flags; + struct hlist_node mcast_want_all_unsnoopables_node; + struct hlist_node mcast_want_all_ipv4_node; + struct hlist_node mcast_want_all_ipv6_node; +#endif uint8_t capabilities; + uint8_t capa_initialized; atomic_t last_ttvn; unsigned char *tt_buff; int16_t tt_buff_len; spinlock_t tt_buff_lock; /* protects tt_buff & tt_buff_len */ - bool tt_initialised; /* prevents from changing the table while reading it */ spinlock_t tt_lock; - uint32_t last_real_seqno; - uint8_t last_ttl; DECLARE_BITMAP(bcast_bits, BATADV_TQ_LOCAL_WINDOW_SIZE); uint32_t last_bcast_seqno; struct hlist_head neigh_list; - /* neigh_list_lock protects: neigh_list, router & bonding_list */ + /* neigh_list_lock protects: neigh_list and router */ spinlock_t neigh_list_lock; struct hlist_node hash_entry; struct batadv_priv *bat_priv; /* bcast_seqno_lock protects: bcast_bits & last_bcast_seqno */ spinlock_t bcast_seqno_lock; - atomic_t bond_candidates; - struct list_head bond_list; atomic_t refcount; struct rcu_head rcu; #ifdef CONFIG_BATMAN_ADV_NC @@ -256,10 +293,15 @@ struct batadv_orig_node { * enum batadv_orig_capabilities - orig node capabilities * @BATADV_ORIG_CAPA_HAS_DAT: orig node has distributed arp table enabled * @BATADV_ORIG_CAPA_HAS_NC: orig node has network coding enabled + * @BATADV_ORIG_CAPA_HAS_TT: orig node has tt capability + * @BATADV_ORIG_CAPA_HAS_MCAST: orig node has some multicast capability + * (= orig node announces a tvlv of type BATADV_TVLV_MCAST) */ enum batadv_orig_capabilities { BATADV_ORIG_CAPA_HAS_DAT = BIT(0), BATADV_ORIG_CAPA_HAS_NC = BIT(1), + 
BATADV_ORIG_CAPA_HAS_TT = BIT(2), + BATADV_ORIG_CAPA_HAS_MCAST = BIT(3), }; /** @@ -283,49 +325,64 @@ struct batadv_gw_node { }; /** - * struct batadv_neigh_bat_iv - B.A.T.M.A.N. IV specific structure for single - * hop neighbors + * struct batadv_neigh_node - structure for single hops neighbors + * @list: list node for batadv_orig_node::neigh_list + * @orig_node: pointer to corresponding orig_node + * @addr: the MAC address of the neighboring interface + * @ifinfo_list: list for routing metrics per outgoing interface + * @ifinfo_lock: lock protecting private ifinfo members and list + * @if_incoming: pointer to incoming hard interface + * @last_seen: when last packet via this neighbor was received + * @last_ttl: last received ttl from this neigh node + * @rcu: struct used for freeing in an RCU-safe manner + * @bat_iv: B.A.T.M.A.N. IV private structure + */ +struct batadv_neigh_node { + struct hlist_node list; + struct batadv_orig_node *orig_node; + uint8_t addr[ETH_ALEN]; + struct hlist_head ifinfo_list; + spinlock_t ifinfo_lock; /* protects ifinfo_list and its members */ + struct batadv_hard_iface *if_incoming; + unsigned long last_seen; + atomic_t refcount; + struct rcu_head rcu; +}; + +/** + * struct batadv_neigh_ifinfo_bat_iv - neighbor information per outgoing + * interface for BATMAN IV * @tq_recv: ring buffer of received TQ values from this neigh node * @tq_index: ring buffer index * @tq_avg: averaged tq of all tq values in the ring buffer (tq_recv) * @real_bits: bitfield containing the number of OGMs received from this neigh * node (relative to orig_node->last_real_seqno) * @real_packet_count: counted result of real_bits - * @lq_update_lock: lock protecting tq_recv & tq_index */ -struct batadv_neigh_bat_iv { +struct batadv_neigh_ifinfo_bat_iv { uint8_t tq_recv[BATADV_TQ_GLOBAL_WINDOW_SIZE]; uint8_t tq_index; uint8_t tq_avg; DECLARE_BITMAP(real_bits, BATADV_TQ_LOCAL_WINDOW_SIZE); uint8_t real_packet_count; - spinlock_t lq_update_lock; /* protects tq_recv & tq_index */ }; /** - * struct batadv_neigh_node - structure for single hops neighbors - * @list: list node for batadv_orig_node::neigh_list - * @orig_node: pointer to corresponding orig_node - * @addr: the MAC address of the neighboring interface - * @if_incoming: pointer to incoming hard interface - * @last_seen: when last packet via this neighbor was received + * struct batadv_neigh_ifinfo - neighbor information per outgoing interface + * @list: list node for batadv_neigh_node::ifinfo_list + * @if_outgoing: pointer to outgoing hard interface + * @bat_iv: B.A.T.M.A.N. IV private structure * @last_ttl: last received ttl from this neigh node - * @bonding_list: list node for batadv_orig_node::bond_list * @refcount: number of contexts the object is used - * @rcu: struct used for freeing in an RCU-safe manner - * @bat_iv: B.A.T.M.A.N. 
IV private structure + * @rcu: struct used for freeing in a RCU-safe manner */ -struct batadv_neigh_node { +struct batadv_neigh_ifinfo { struct hlist_node list; - struct batadv_orig_node *orig_node; - uint8_t addr[ETH_ALEN]; - struct batadv_hard_iface *if_incoming; - unsigned long last_seen; + struct batadv_hard_iface *if_outgoing; + struct batadv_neigh_ifinfo_bat_iv bat_iv; uint8_t last_ttl; - struct list_head bonding_list; atomic_t refcount; struct rcu_head rcu; - struct batadv_neigh_bat_iv bat_iv; }; /** @@ -503,7 +560,7 @@ struct batadv_priv_bla { #endif /** - * struct batadv_debug_log - debug logging data + * struct batadv_priv_debug_log - debug logging data * @log_buff: buffer holding the logs (ring bufer) * @log_start: index of next character to read * @log_end: index of next character to write @@ -566,6 +623,39 @@ struct batadv_priv_dat { }; #endif +#ifdef CONFIG_BATMAN_ADV_MCAST +/** + * struct batadv_priv_mcast - per mesh interface mcast data + * @mla_list: list of multicast addresses we are currently announcing via TT + * @want_all_unsnoopables_list: a list of orig_nodes wanting all unsnoopable + * multicast traffic + * @want_all_ipv4_list: a list of orig_nodes wanting all IPv4 multicast traffic + * @want_all_ipv6_list: a list of orig_nodes wanting all IPv6 multicast traffic + * @flags: the flags we have last sent in our mcast tvlv + * @enabled: whether the multicast tvlv is currently enabled + * @num_disabled: number of nodes that have no mcast tvlv + * @num_want_all_unsnoopables: number of nodes wanting unsnoopable IP traffic + * @num_want_all_ipv4: counter for items in want_all_ipv4_list + * @num_want_all_ipv6: counter for items in want_all_ipv6_list + * @want_lists_lock: lock for protecting modifications to mcast want lists + * (traversals are rcu-locked) + */ +struct batadv_priv_mcast { + struct hlist_head mla_list; + struct hlist_head want_all_unsnoopables_list; + struct hlist_head want_all_ipv4_list; + struct hlist_head want_all_ipv6_list; + uint8_t flags; + bool enabled; + atomic_t num_disabled; + atomic_t num_want_all_unsnoopables; + atomic_t num_want_all_ipv4; + atomic_t num_want_all_ipv6; + /* protects want_all_{unsnoopables,ipv4,ipv6}_list */ + spinlock_t want_lists_lock; +}; +#endif + /** * struct batadv_priv_nc - per mesh interface network coding private data * @work: work queue callback item for cleanup @@ -597,6 +687,7 @@ struct batadv_priv_nc { /** * struct batadv_softif_vlan - per VLAN attributes set + * @bat_priv: pointer to the mesh object * @vid: VLAN identifier * @kobj: kobject for sysfs vlan subdirectory * @ap_isolation: AP isolation state @@ -606,6 +697,7 @@ struct batadv_priv_nc { * @rcu: struct used for freeing in a RCU-safe manner */ struct batadv_softif_vlan { + struct batadv_priv *bat_priv; unsigned short vid; struct kobject *kobj; atomic_t ap_isolation; /* boolean */ @@ -631,6 +723,8 @@ struct batadv_softif_vlan { * enabled * @distributed_arp_table: bool indicating whether distributed ARP table is * enabled + * @multicast_mode: Enable or disable multicast optimizations on this node's + * sender/originating side * @gw_mode: gateway operation: off, client or server (see batadv_gw_modes) * @gw_sel_class: gateway selection class (applies if gw_mode client) * @orig_interval: OGM broadcast interval in milliseconds @@ -661,6 +755,7 @@ struct batadv_softif_vlan { * @tt: translation table data * @tvlv: type-version-length-value data * @dat: distributed arp table data + * @mcast: multicast data * @network_coding: bool indicating whether network coding is 
enabled * @batadv_priv_nc: network coding data */ @@ -680,6 +775,9 @@ struct batadv_priv { #ifdef CONFIG_BATMAN_ADV_DAT atomic_t distributed_arp_table; #endif +#ifdef CONFIG_BATMAN_ADV_MCAST + atomic_t multicast_mode; +#endif atomic_t gw_mode; atomic_t gw_sel_class; atomic_t orig_interval; @@ -687,6 +785,8 @@ struct batadv_priv { #ifdef CONFIG_BATMAN_ADV_DEBUG atomic_t log_level; #endif + uint32_t isolation_mark; + uint32_t isolation_mark_mask; atomic_t bcast_seqno; atomic_t bcast_queue_left; atomic_t batman_queue_left; @@ -716,6 +816,9 @@ struct batadv_priv { #ifdef CONFIG_BATMAN_ADV_DAT struct batadv_priv_dat dat; #endif +#ifdef CONFIG_BATMAN_ADV_MCAST + struct batadv_priv_mcast mcast; +#endif #ifdef CONFIG_BATMAN_ADV_NC atomic_t network_coding; struct batadv_priv_nc nc; @@ -838,12 +941,14 @@ struct batadv_tt_local_entry { * struct batadv_tt_global_entry - translation table global entry data * @common: general translation table data * @orig_list: list of orig nodes announcing this non-mesh client + * @orig_list_count: number of items in the orig_list * @list_lock: lock protecting orig_list * @roam_at: time at which TT_GLOBAL_ROAM was set */ struct batadv_tt_global_entry { struct batadv_tt_common_entry common; struct hlist_head orig_list; + atomic_t orig_list_count; spinlock_t list_lock; /* protects orig_list */ unsigned long roam_at; }; @@ -961,8 +1066,8 @@ struct batadv_nc_packet { }; /** - * batadv_skb_cb - control buffer structure used to store private data relevant - * to batman-adv in the skb->cb buffer in skbs. + * struct batadv_skb_cb - control buffer structure used to store private data + * relevant to batman-adv in the skb->cb buffer in skbs. * @decoded: Marks a skb as decoded, which is checked when searching for coding * opportunities in network-coding.c */ @@ -981,8 +1086,10 @@ struct batadv_skb_cb { * @direct_link_flags: direct link flags for aggregated OGM packets * @num_packets: counter for bcast packet retransmission * @delayed_work: work queue callback item for packet sending - * @if_incoming: pointer incoming hard-iface or primary iface if locally - * generated packet + * @if_incoming: pointer to incoming hard-iface or primary iface if + * locally generated packet + * @if_outgoing: packet where the packet should be sent to, or NULL if + * unspecified */ struct batadv_forw_packet { struct hlist_node list; @@ -994,6 +1101,7 @@ struct batadv_forw_packet { uint8_t num_packets; struct delayed_work delayed_work; struct batadv_hard_iface *if_incoming; + struct batadv_hard_iface *if_outgoing; }; /** @@ -1007,9 +1115,11 @@ struct batadv_forw_packet { * @bat_primary_iface_set: called when primary interface is selected / changed * @bat_ogm_schedule: prepare a new outgoing OGM for the send queue * @bat_ogm_emit: send scheduled OGM - * @bat_neigh_cmp: compare the metrics of two neighbors - * @bat_neigh_is_equiv_or_better: check if neigh1 is equally good or - * better than neigh2 from the metric prospective + * @bat_neigh_cmp: compare the metrics of two neighbors for their respective + * outgoing interfaces + * @bat_neigh_is_equiv_or_better: check if neigh1 is equally good or better + * than neigh2 for their respective outgoing interface from the metric + * prospective * @bat_orig_print: print the originator table (optional) * @bat_orig_free: free the resources allocated by the routing algorithm for an * orig_node object @@ -1028,11 +1138,17 @@ struct batadv_algo_ops { void (*bat_ogm_schedule)(struct batadv_hard_iface *hard_iface); void (*bat_ogm_emit)(struct batadv_forw_packet 
*forw_packet); int (*bat_neigh_cmp)(struct batadv_neigh_node *neigh1, - struct batadv_neigh_node *neigh2); - bool (*bat_neigh_is_equiv_or_better)(struct batadv_neigh_node *neigh1, - struct batadv_neigh_node *neigh2); + struct batadv_hard_iface *if_outgoing1, + struct batadv_neigh_node *neigh2, + struct batadv_hard_iface *if_outgoing2); + bool (*bat_neigh_is_equiv_or_better) + (struct batadv_neigh_node *neigh1, + struct batadv_hard_iface *if_outgoing1, + struct batadv_neigh_node *neigh2, + struct batadv_hard_iface *if_outgoing2); /* orig_node handling API */ - void (*bat_orig_print)(struct batadv_priv *priv, struct seq_file *seq); + void (*bat_orig_print)(struct batadv_priv *priv, struct seq_file *seq, + struct batadv_hard_iface *hard_iface); void (*bat_orig_free)(struct batadv_orig_node *orig_node); int (*bat_orig_add_if)(struct batadv_orig_node *orig_node, int max_if_num); @@ -1062,6 +1178,16 @@ struct batadv_dat_entry { }; /** + * struct batadv_hw_addr - a list entry for a MAC address + * @list: list node for the linking of entries + * @addr: the MAC address of this list entry + */ +struct batadv_hw_addr { + struct hlist_node list; + unsigned char addr[ETH_ALEN]; +}; + +/** * struct batadv_dat_candidate - candidate destination for DAT operations * @type: the type of the selected candidate. It can one of the following: * - BATADV_DAT_CANDIDATE_NOT_FOUND diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c new file mode 100644 index 00000000000..8796ffa08b4 --- /dev/null +++ b/net/bluetooth/6lowpan.c @@ -0,0 +1,865 @@ +/* + Copyright (c) 2013 Intel Corp. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 and + only version 2 as published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. +*/ + +#include <linux/if_arp.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> + +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> + +#include <net/af_ieee802154.h> /* to get the address type */ + +#include <net/bluetooth/bluetooth.h> +#include <net/bluetooth/hci_core.h> +#include <net/bluetooth/l2cap.h> + +#include "6lowpan.h" + +#include <net/6lowpan.h> /* for the compression support */ + +#define IFACE_NAME_TEMPLATE "bt%d" +#define EUI64_ADDR_LEN 8 + +struct skb_cb { + struct in6_addr addr; + struct l2cap_conn *conn; +}; +#define lowpan_cb(skb) ((struct skb_cb *)((skb)->cb)) + +/* The devices list contains those devices that we are acting + * as a proxy. The BT 6LoWPAN device is a virtual device that + * connects to the Bluetooth LE device. The real connection to + * BT device is done via l2cap layer. There exists one + * virtual device / one BT 6LoWPAN network (=hciX device). + * The list contains struct lowpan_dev elements. 
+ */ +static LIST_HEAD(bt_6lowpan_devices); +static DEFINE_RWLOCK(devices_lock); + +struct lowpan_peer { + struct list_head list; + struct l2cap_conn *conn; + + /* peer addresses in various formats */ + unsigned char eui64_addr[EUI64_ADDR_LEN]; + struct in6_addr peer_addr; +}; + +struct lowpan_dev { + struct list_head list; + + struct hci_dev *hdev; + struct net_device *netdev; + struct list_head peers; + atomic_t peer_count; /* number of items in peers list */ + + struct work_struct delete_netdev; + struct delayed_work notify_peers; +}; + +static inline struct lowpan_dev *lowpan_dev(const struct net_device *netdev) +{ + return netdev_priv(netdev); +} + +static inline void peer_add(struct lowpan_dev *dev, struct lowpan_peer *peer) +{ + list_add(&peer->list, &dev->peers); + atomic_inc(&dev->peer_count); +} + +static inline bool peer_del(struct lowpan_dev *dev, struct lowpan_peer *peer) +{ + list_del(&peer->list); + + if (atomic_dec_and_test(&dev->peer_count)) { + BT_DBG("last peer"); + return true; + } + + return false; +} + +static inline struct lowpan_peer *peer_lookup_ba(struct lowpan_dev *dev, + bdaddr_t *ba, __u8 type) +{ + struct lowpan_peer *peer, *tmp; + + BT_DBG("peers %d addr %pMR type %d", atomic_read(&dev->peer_count), + ba, type); + + list_for_each_entry_safe(peer, tmp, &dev->peers, list) { + BT_DBG("addr %pMR type %d", + &peer->conn->hcon->dst, peer->conn->hcon->dst_type); + + if (bacmp(&peer->conn->hcon->dst, ba)) + continue; + + if (type == peer->conn->hcon->dst_type) + return peer; + } + + return NULL; +} + +static inline struct lowpan_peer *peer_lookup_conn(struct lowpan_dev *dev, + struct l2cap_conn *conn) +{ + struct lowpan_peer *peer, *tmp; + + list_for_each_entry_safe(peer, tmp, &dev->peers, list) { + if (peer->conn == conn) + return peer; + } + + return NULL; +} + +static struct lowpan_peer *lookup_peer(struct l2cap_conn *conn) +{ + struct lowpan_dev *entry, *tmp; + struct lowpan_peer *peer = NULL; + unsigned long flags; + + read_lock_irqsave(&devices_lock, flags); + + list_for_each_entry_safe(entry, tmp, &bt_6lowpan_devices, list) { + peer = peer_lookup_conn(entry, conn); + if (peer) + break; + } + + read_unlock_irqrestore(&devices_lock, flags); + + return peer; +} + +static struct lowpan_dev *lookup_dev(struct l2cap_conn *conn) +{ + struct lowpan_dev *entry, *tmp; + struct lowpan_dev *dev = NULL; + unsigned long flags; + + read_lock_irqsave(&devices_lock, flags); + + list_for_each_entry_safe(entry, tmp, &bt_6lowpan_devices, list) { + if (conn->hcon->hdev == entry->hdev) { + dev = entry; + break; + } + } + + read_unlock_irqrestore(&devices_lock, flags); + + return dev; +} + +static int give_skb_to_upper(struct sk_buff *skb, struct net_device *dev) +{ + struct sk_buff *skb_cp; + int ret; + + skb_cp = skb_copy(skb, GFP_ATOMIC); + if (!skb_cp) + return -ENOMEM; + + ret = netif_rx(skb_cp); + + BT_DBG("receive skb %d", ret); + if (ret < 0) + return NET_RX_DROP; + + return ret; +} + +static int process_data(struct sk_buff *skb, struct net_device *netdev, + struct l2cap_conn *conn) +{ + const u8 *saddr, *daddr; + u8 iphc0, iphc1; + struct lowpan_dev *dev; + struct lowpan_peer *peer; + unsigned long flags; + + dev = lowpan_dev(netdev); + + read_lock_irqsave(&devices_lock, flags); + peer = peer_lookup_conn(dev, conn); + read_unlock_irqrestore(&devices_lock, flags); + if (!peer) + goto drop; + + saddr = peer->eui64_addr; + daddr = dev->netdev->dev_addr; + + /* at least two bytes will be used for the encoding */ + if (skb->len < 2) + goto drop; + + if (lowpan_fetch_skb_u8(skb, 
&iphc0)) + goto drop; + + if (lowpan_fetch_skb_u8(skb, &iphc1)) + goto drop; + + return lowpan_process_data(skb, netdev, + saddr, IEEE802154_ADDR_LONG, EUI64_ADDR_LEN, + daddr, IEEE802154_ADDR_LONG, EUI64_ADDR_LEN, + iphc0, iphc1, give_skb_to_upper); + +drop: + kfree_skb(skb); + return -EINVAL; +} + +static int recv_pkt(struct sk_buff *skb, struct net_device *dev, + struct l2cap_conn *conn) +{ + struct sk_buff *local_skb; + int ret; + + if (!netif_running(dev)) + goto drop; + + if (dev->type != ARPHRD_6LOWPAN) + goto drop; + + /* check that it's our buffer */ + if (skb->data[0] == LOWPAN_DISPATCH_IPV6) { + /* Copy the packet so that the IPv6 header is + * properly aligned. + */ + local_skb = skb_copy_expand(skb, NET_SKB_PAD - 1, + skb_tailroom(skb), GFP_ATOMIC); + if (!local_skb) + goto drop; + + local_skb->protocol = htons(ETH_P_IPV6); + local_skb->pkt_type = PACKET_HOST; + + skb_reset_network_header(local_skb); + skb_set_transport_header(local_skb, sizeof(struct ipv6hdr)); + + if (give_skb_to_upper(local_skb, dev) != NET_RX_SUCCESS) { + kfree_skb(local_skb); + goto drop; + } + + dev->stats.rx_bytes += skb->len; + dev->stats.rx_packets++; + + kfree_skb(local_skb); + kfree_skb(skb); + } else { + switch (skb->data[0] & 0xe0) { + case LOWPAN_DISPATCH_IPHC: /* ipv6 datagram */ + local_skb = skb_clone(skb, GFP_ATOMIC); + if (!local_skb) + goto drop; + + ret = process_data(local_skb, dev, conn); + if (ret != NET_RX_SUCCESS) + goto drop; + + dev->stats.rx_bytes += skb->len; + dev->stats.rx_packets++; + + kfree_skb(skb); + break; + default: + break; + } + } + + return NET_RX_SUCCESS; + +drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +/* Packet from BT LE device */ +int bt_6lowpan_recv(struct l2cap_conn *conn, struct sk_buff *skb) +{ + struct lowpan_dev *dev; + struct lowpan_peer *peer; + int err; + + peer = lookup_peer(conn); + if (!peer) + return -ENOENT; + + dev = lookup_dev(conn); + if (!dev || !dev->netdev) + return -ENOENT; + + err = recv_pkt(skb, dev->netdev, conn); + BT_DBG("recv pkt %d", err); + + return err; +} + +static inline int skbuff_copy(void *msg, int len, int count, int mtu, + struct sk_buff *skb, struct net_device *dev) +{ + struct sk_buff **frag; + int sent = 0; + + memcpy(skb_put(skb, count), msg, count); + + sent += count; + msg += count; + len -= count; + + dev->stats.tx_bytes += count; + dev->stats.tx_packets++; + + raw_dump_table(__func__, "Sending", skb->data, skb->len); + + /* Continuation fragments (no L2CAP header) */ + frag = &skb_shinfo(skb)->frag_list; + while (len > 0) { + struct sk_buff *tmp; + + count = min_t(unsigned int, mtu, len); + + tmp = bt_skb_alloc(count, GFP_ATOMIC); + if (!tmp) + return -ENOMEM; + + *frag = tmp; + + memcpy(skb_put(*frag, count), msg, count); + + raw_dump_table(__func__, "Sending fragment", + (*frag)->data, count); + + (*frag)->priority = skb->priority; + + sent += count; + msg += count; + len -= count; + + skb->len += (*frag)->len; + skb->data_len += (*frag)->len; + + frag = &(*frag)->next; + + dev->stats.tx_bytes += count; + dev->stats.tx_packets++; + } + + return sent; +} + +static struct sk_buff *create_pdu(struct l2cap_conn *conn, void *msg, + size_t len, u32 priority, + struct net_device *dev) +{ + struct sk_buff *skb; + int err, count; + struct l2cap_hdr *lh; + + /* FIXME: This mtu check should be not needed and atm is only used for + * testing purposes + */ + if (conn->mtu > (L2CAP_LE_MIN_MTU + L2CAP_HDR_SIZE)) + conn->mtu = L2CAP_LE_MIN_MTU + L2CAP_HDR_SIZE; + + count = min_t(unsigned int, (conn->mtu - L2CAP_HDR_SIZE), len); 
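	/* Worked example (a sketch; assumes L2CAP_HDR_SIZE is 4 and
	 * L2CAP_LE_MIN_MTU is 23, as in the l2cap definitions): with the MTU
	 * clamp above, conn->mtu is at most 27, so count here is at most 23.
	 * The head skb carries the 4-byte L2CAP header plus those count
	 * payload bytes, and skbuff_copy() chains the remainder as frag_list
	 * skbs of up to conn->mtu bytes each, while lh->len below still
	 * advertises the full payload length.  For len = 60 that works out
	 * to payload chunks of 23 + 27 + 10 bytes.
	 */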
+ + BT_DBG("conn %p len %zu mtu %d count %d", conn, len, conn->mtu, count); + + skb = bt_skb_alloc(count + L2CAP_HDR_SIZE, GFP_ATOMIC); + if (!skb) + return ERR_PTR(-ENOMEM); + + skb->priority = priority; + + lh = (struct l2cap_hdr *)skb_put(skb, L2CAP_HDR_SIZE); + lh->cid = cpu_to_le16(L2CAP_FC_6LOWPAN); + lh->len = cpu_to_le16(len); + + err = skbuff_copy(msg, len, count, conn->mtu, skb, dev); + if (unlikely(err < 0)) { + kfree_skb(skb); + BT_DBG("skbuff copy %d failed", err); + return ERR_PTR(err); + } + + return skb; +} + +static int conn_send(struct l2cap_conn *conn, + void *msg, size_t len, u32 priority, + struct net_device *dev) +{ + struct sk_buff *skb; + + skb = create_pdu(conn, msg, len, priority, dev); + if (IS_ERR(skb)) + return -EINVAL; + + BT_DBG("conn %p skb %p len %d priority %u", conn, skb, skb->len, + skb->priority); + + hci_send_acl(conn->hchan, skb, ACL_START); + + return 0; +} + +static u8 get_addr_type_from_eui64(u8 byte) +{ + /* Is universal(0) or local(1) bit, */ + if (byte & 0x02) + return ADDR_LE_DEV_RANDOM; + + return ADDR_LE_DEV_PUBLIC; +} + +static void copy_to_bdaddr(struct in6_addr *ip6_daddr, bdaddr_t *addr) +{ + u8 *eui64 = ip6_daddr->s6_addr + 8; + + addr->b[0] = eui64[7]; + addr->b[1] = eui64[6]; + addr->b[2] = eui64[5]; + addr->b[3] = eui64[2]; + addr->b[4] = eui64[1]; + addr->b[5] = eui64[0]; +} + +static void convert_dest_bdaddr(struct in6_addr *ip6_daddr, + bdaddr_t *addr, u8 *addr_type) +{ + copy_to_bdaddr(ip6_daddr, addr); + + /* We need to toggle the U/L bit that we got from IPv6 address + * so that we get the proper address and type of the BD address. + */ + addr->b[5] ^= 0x02; + + *addr_type = get_addr_type_from_eui64(addr->b[5]); +} + +static int header_create(struct sk_buff *skb, struct net_device *netdev, + unsigned short type, const void *_daddr, + const void *_saddr, unsigned int len) +{ + struct ipv6hdr *hdr; + struct lowpan_dev *dev; + struct lowpan_peer *peer; + bdaddr_t addr, *any = BDADDR_ANY; + u8 *saddr, *daddr = any->b; + u8 addr_type; + + if (type != ETH_P_IPV6) + return -EINVAL; + + hdr = ipv6_hdr(skb); + + dev = lowpan_dev(netdev); + + if (ipv6_addr_is_multicast(&hdr->daddr)) { + memcpy(&lowpan_cb(skb)->addr, &hdr->daddr, + sizeof(struct in6_addr)); + lowpan_cb(skb)->conn = NULL; + } else { + unsigned long flags; + + /* Get destination BT device from skb. + * If there is no such peer then discard the packet. + */ + convert_dest_bdaddr(&hdr->daddr, &addr, &addr_type); + + BT_DBG("dest addr %pMR type %s IP %pI6c", &addr, + addr_type == ADDR_LE_DEV_PUBLIC ? 
"PUBLIC" : "RANDOM", + &hdr->daddr); + + read_lock_irqsave(&devices_lock, flags); + peer = peer_lookup_ba(dev, &addr, addr_type); + read_unlock_irqrestore(&devices_lock, flags); + + if (!peer) { + BT_DBG("no such peer %pMR found", &addr); + return -ENOENT; + } + + daddr = peer->eui64_addr; + + memcpy(&lowpan_cb(skb)->addr, &hdr->daddr, + sizeof(struct in6_addr)); + lowpan_cb(skb)->conn = peer->conn; + } + + saddr = dev->netdev->dev_addr; + + return lowpan_header_compress(skb, netdev, type, daddr, saddr, len); +} + +/* Packet to BT LE device */ +static int send_pkt(struct l2cap_conn *conn, const void *saddr, + const void *daddr, struct sk_buff *skb, + struct net_device *netdev) +{ + raw_dump_table(__func__, "raw skb data dump before fragmentation", + skb->data, skb->len); + + return conn_send(conn, skb->data, skb->len, 0, netdev); +} + +static void send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev) +{ + struct sk_buff *local_skb; + struct lowpan_dev *entry, *tmp; + unsigned long flags; + + read_lock_irqsave(&devices_lock, flags); + + list_for_each_entry_safe(entry, tmp, &bt_6lowpan_devices, list) { + struct lowpan_peer *pentry, *ptmp; + struct lowpan_dev *dev; + + if (entry->netdev != netdev) + continue; + + dev = lowpan_dev(entry->netdev); + + list_for_each_entry_safe(pentry, ptmp, &dev->peers, list) { + local_skb = skb_clone(skb, GFP_ATOMIC); + + send_pkt(pentry->conn, netdev->dev_addr, + pentry->eui64_addr, local_skb, netdev); + + kfree_skb(local_skb); + } + } + + read_unlock_irqrestore(&devices_lock, flags); +} + +static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + int err = 0; + unsigned char *eui64_addr; + struct lowpan_dev *dev; + struct lowpan_peer *peer; + bdaddr_t addr; + u8 addr_type; + + if (ipv6_addr_is_multicast(&lowpan_cb(skb)->addr)) { + /* We need to send the packet to every device + * behind this interface. + */ + send_mcast_pkt(skb, netdev); + } else { + unsigned long flags; + + convert_dest_bdaddr(&lowpan_cb(skb)->addr, &addr, &addr_type); + eui64_addr = lowpan_cb(skb)->addr.s6_addr + 8; + dev = lowpan_dev(netdev); + + read_lock_irqsave(&devices_lock, flags); + peer = peer_lookup_ba(dev, &addr, addr_type); + read_unlock_irqrestore(&devices_lock, flags); + + BT_DBG("xmit %s to %pMR type %s IP %pI6c peer %p", + netdev->name, &addr, + addr_type == ADDR_LE_DEV_PUBLIC ? "PUBLIC" : "RANDOM", + &lowpan_cb(skb)->addr, peer); + + if (peer && peer->conn) + err = send_pkt(peer->conn, netdev->dev_addr, + eui64_addr, skb, netdev); + } + dev_kfree_skb(skb); + + if (err) + BT_DBG("ERROR: xmit failed (%d)", err); + + return (err < 0) ? 
NET_XMIT_DROP : err; +} + +static const struct net_device_ops netdev_ops = { + .ndo_start_xmit = bt_xmit, +}; + +static struct header_ops header_ops = { + .create = header_create, +}; + +static void netdev_setup(struct net_device *dev) +{ + dev->addr_len = EUI64_ADDR_LEN; + dev->type = ARPHRD_6LOWPAN; + + dev->hard_header_len = 0; + dev->needed_tailroom = 0; + dev->mtu = IPV6_MIN_MTU; + dev->tx_queue_len = 0; + dev->flags = IFF_RUNNING | IFF_POINTOPOINT; + dev->watchdog_timeo = 0; + + dev->netdev_ops = &netdev_ops; + dev->header_ops = &header_ops; + dev->destructor = free_netdev; +} + +static struct device_type bt_type = { + .name = "bluetooth", +}; + +static void set_addr(u8 *eui, u8 *addr, u8 addr_type) +{ + /* addr is the BT address in little-endian format */ + eui[0] = addr[5]; + eui[1] = addr[4]; + eui[2] = addr[3]; + eui[3] = 0xFF; + eui[4] = 0xFE; + eui[5] = addr[2]; + eui[6] = addr[1]; + eui[7] = addr[0]; + + /* Universal/local bit set, BT 6lowpan draft ch. 3.2.1 */ + if (addr_type == ADDR_LE_DEV_PUBLIC) + eui[0] &= ~0x02; + else + eui[0] |= 0x02; + + BT_DBG("type %d addr %*phC", addr_type, 8, eui); +} + +static void set_dev_addr(struct net_device *netdev, bdaddr_t *addr, + u8 addr_type) +{ + netdev->addr_assign_type = NET_ADDR_PERM; + set_addr(netdev->dev_addr, addr->b, addr_type); +} + +static void ifup(struct net_device *netdev) +{ + int err; + + rtnl_lock(); + err = dev_open(netdev); + if (err < 0) + BT_INFO("iface %s cannot be opened (%d)", netdev->name, err); + rtnl_unlock(); +} + +static void do_notify_peers(struct work_struct *work) +{ + struct lowpan_dev *dev = container_of(work, struct lowpan_dev, + notify_peers.work); + + netdev_notify_peers(dev->netdev); /* send neighbour adv at startup */ +} + +static bool is_bt_6lowpan(struct hci_conn *hcon) +{ + if (hcon->type != LE_LINK) + return false; + + return test_bit(HCI_CONN_6LOWPAN, &hcon->flags); +} + +static int add_peer_conn(struct l2cap_conn *conn, struct lowpan_dev *dev) +{ + struct lowpan_peer *peer; + unsigned long flags; + + peer = kzalloc(sizeof(*peer), GFP_ATOMIC); + if (!peer) + return -ENOMEM; + + peer->conn = conn; + memset(&peer->peer_addr, 0, sizeof(struct in6_addr)); + + /* RFC 2464 ch. 5 */ + peer->peer_addr.s6_addr[0] = 0xFE; + peer->peer_addr.s6_addr[1] = 0x80; + set_addr((u8 *)&peer->peer_addr.s6_addr + 8, conn->hcon->dst.b, + conn->hcon->dst_type); + + memcpy(&peer->eui64_addr, (u8 *)&peer->peer_addr.s6_addr + 8, + EUI64_ADDR_LEN); + + write_lock_irqsave(&devices_lock, flags); + INIT_LIST_HEAD(&peer->list); + peer_add(dev, peer); + write_unlock_irqrestore(&devices_lock, flags); + + /* Notifying peers about us needs to be done without locks held */ + INIT_DELAYED_WORK(&dev->notify_peers, do_notify_peers); + schedule_delayed_work(&dev->notify_peers, msecs_to_jiffies(100)); + + return 0; +} + +/* This gets called when BT LE 6LoWPAN device is connected. We then + * create network device that acts as a proxy between BT LE device + * and kernel network stack. 
+ */ +int bt_6lowpan_add_conn(struct l2cap_conn *conn) +{ + struct lowpan_peer *peer = NULL; + struct lowpan_dev *dev; + struct net_device *netdev; + int err = 0; + unsigned long flags; + + if (!is_bt_6lowpan(conn->hcon)) + return 0; + + peer = lookup_peer(conn); + if (peer) + return -EEXIST; + + dev = lookup_dev(conn); + if (dev) + return add_peer_conn(conn, dev); + + netdev = alloc_netdev(sizeof(*dev), IFACE_NAME_TEMPLATE, netdev_setup); + if (!netdev) + return -ENOMEM; + + set_dev_addr(netdev, &conn->hcon->src, conn->hcon->src_type); + + netdev->netdev_ops = &netdev_ops; + SET_NETDEV_DEV(netdev, &conn->hcon->dev); + SET_NETDEV_DEVTYPE(netdev, &bt_type); + + err = register_netdev(netdev); + if (err < 0) { + BT_INFO("register_netdev failed %d", err); + free_netdev(netdev); + goto out; + } + + BT_DBG("ifindex %d peer bdaddr %pMR my addr %pMR", + netdev->ifindex, &conn->hcon->dst, &conn->hcon->src); + set_bit(__LINK_STATE_PRESENT, &netdev->state); + + dev = netdev_priv(netdev); + dev->netdev = netdev; + dev->hdev = conn->hcon->hdev; + INIT_LIST_HEAD(&dev->peers); + + write_lock_irqsave(&devices_lock, flags); + INIT_LIST_HEAD(&dev->list); + list_add(&dev->list, &bt_6lowpan_devices); + write_unlock_irqrestore(&devices_lock, flags); + + ifup(netdev); + + return add_peer_conn(conn, dev); + +out: + return err; +} + +static void delete_netdev(struct work_struct *work) +{ + struct lowpan_dev *entry = container_of(work, struct lowpan_dev, + delete_netdev); + + unregister_netdev(entry->netdev); + + /* The entry pointer is deleted in device_event() */ +} + +int bt_6lowpan_del_conn(struct l2cap_conn *conn) +{ + struct lowpan_dev *entry, *tmp; + struct lowpan_dev *dev = NULL; + struct lowpan_peer *peer; + int err = -ENOENT; + unsigned long flags; + bool last = false; + + if (!conn || !is_bt_6lowpan(conn->hcon)) + return 0; + + write_lock_irqsave(&devices_lock, flags); + + list_for_each_entry_safe(entry, tmp, &bt_6lowpan_devices, list) { + dev = lowpan_dev(entry->netdev); + peer = peer_lookup_conn(dev, conn); + if (peer) { + last = peer_del(dev, peer); + err = 0; + break; + } + } + + if (!err && last && dev && !atomic_read(&dev->peer_count)) { + write_unlock_irqrestore(&devices_lock, flags); + + cancel_delayed_work_sync(&dev->notify_peers); + + /* bt_6lowpan_del_conn() is called with hci dev lock held which + * means that we must delete the netdevice in worker thread. 
+ */ + INIT_WORK(&entry->delete_netdev, delete_netdev); + schedule_work(&entry->delete_netdev); + } else { + write_unlock_irqrestore(&devices_lock, flags); + } + + return err; +} + +static int device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct lowpan_dev *entry, *tmp; + unsigned long flags; + + if (netdev->type != ARPHRD_6LOWPAN) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_UNREGISTER: + write_lock_irqsave(&devices_lock, flags); + list_for_each_entry_safe(entry, tmp, &bt_6lowpan_devices, + list) { + if (entry->netdev == netdev) { + list_del(&entry->list); + kfree(entry); + break; + } + } + write_unlock_irqrestore(&devices_lock, flags); + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block bt_6lowpan_dev_notifier = { + .notifier_call = device_event, +}; + +int bt_6lowpan_init(void) +{ + return register_netdevice_notifier(&bt_6lowpan_dev_notifier); +} + +void bt_6lowpan_cleanup(void) +{ + unregister_netdevice_notifier(&bt_6lowpan_dev_notifier); +} diff --git a/net/bluetooth/6lowpan.h b/net/bluetooth/6lowpan.h new file mode 100644 index 00000000000..5d281f1eaf5 --- /dev/null +++ b/net/bluetooth/6lowpan.h @@ -0,0 +1,47 @@ +/* + Copyright (c) 2013 Intel Corp. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 and + only version 2 as published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. +*/ + +#ifndef __6LOWPAN_H +#define __6LOWPAN_H + +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <net/bluetooth/l2cap.h> + +#if IS_ENABLED(CONFIG_BT_6LOWPAN) +int bt_6lowpan_recv(struct l2cap_conn *conn, struct sk_buff *skb); +int bt_6lowpan_add_conn(struct l2cap_conn *conn); +int bt_6lowpan_del_conn(struct l2cap_conn *conn); +int bt_6lowpan_init(void); +void bt_6lowpan_cleanup(void); +#else +static int bt_6lowpan_recv(struct l2cap_conn *conn, struct sk_buff *skb) +{ + return -EOPNOTSUPP; +} +static int bt_6lowpan_add_conn(struct l2cap_conn *conn) +{ + return -EOPNOTSUPP; +} +int bt_6lowpan_del_conn(struct l2cap_conn *conn) +{ + return -EOPNOTSUPP; +} +static int bt_6lowpan_init(void) +{ + return -EOPNOTSUPP; +} +static void bt_6lowpan_cleanup(void) { } +#endif + +#endif /* __6LOWPAN_H */ diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index d3f3f7b1d32..06ec14499ca 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -6,6 +6,7 @@ menuconfig BT tristate "Bluetooth subsystem support" depends on NET && !S390 depends on RFKILL || !RFKILL + select 6LOWPAN_IPHC if BT_6LOWPAN select CRC16 select CRYPTO select CRYPTO_BLKCIPHER @@ -39,6 +40,12 @@ menuconfig BT to Bluetooth kernel modules are provided in the BlueZ packages. For more information, see <http://www.bluez.org/>. +config BT_6LOWPAN + bool "Bluetooth 6LoWPAN support" + depends on BT && IPV6 + help + IPv6 compression over Bluetooth. 
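A minimal build-configuration sketch for the new option (symbol names are taken from the Kconfig and Makefile hunks around this one; whether BT itself is built-in or modular is up to the rest of the config):

  CONFIG_BT=y
  CONFIG_IPV6=y
  CONFIG_BT_6LOWPAN=y    # adds 6lowpan.o to the bluetooth objects and selects 6LOWPAN_IPHC

At runtime the feature is toggled per controller through the "6lowpan" debugfs entry added in hci_core.c further down.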
+ source "net/bluetooth/rfcomm/Kconfig" source "net/bluetooth/bnep/Kconfig" diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index 6a791e73e39..ca51246b101 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -11,5 +11,6 @@ obj-$(CONFIG_BT_HIDP) += hidp/ bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \ hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o sco.o lib.o \ a2mp.o amp.o +bluetooth-$(CONFIG_BT_6LOWPAN) += 6lowpan.o subdir-ccflags-y += -D__CHECK_ENDIAN__ diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c index efcd108822c..9514cc9e850 100644 --- a/net/bluetooth/a2mp.c +++ b/net/bluetooth/a2mp.c @@ -162,7 +162,7 @@ static int a2mp_discover_req(struct amp_mgr *mgr, struct sk_buff *skb, return -ENOMEM; } - rsp->mtu = __constant_cpu_to_le16(L2CAP_A2MP_DEFAULT_MTU); + rsp->mtu = cpu_to_le16(L2CAP_A2MP_DEFAULT_MTU); rsp->ext_feat = 0; __a2mp_add_cl(mgr, rsp->cl); @@ -235,7 +235,7 @@ static int a2mp_discover_rsp(struct amp_mgr *mgr, struct sk_buff *skb, BT_DBG("chan %p state %s", chan, state_to_string(chan->state)); - if (chan->chan_type == L2CAP_CHAN_CONN_FIX_A2MP) + if (chan->scid == L2CAP_CID_A2MP) continue; l2cap_chan_lock(chan); @@ -649,7 +649,7 @@ static int a2mp_chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) if (err) { struct a2mp_cmd_rej rej; - rej.reason = __constant_cpu_to_le16(0); + rej.reason = cpu_to_le16(0); hdr = (void *) skb->data; BT_DBG("Send A2MP Rej: cmd 0x%2.2x err %d", hdr->code, err); @@ -695,7 +695,13 @@ static void a2mp_chan_state_change_cb(struct l2cap_chan *chan, int state, static struct sk_buff *a2mp_chan_alloc_skb_cb(struct l2cap_chan *chan, unsigned long len, int nb) { - return bt_skb_alloc(len, GFP_KERNEL); + struct sk_buff *skb; + + skb = bt_skb_alloc(len, GFP_KERNEL); + if (!skb) + return ERR_PTR(-ENOMEM); + + return skb; } static struct l2cap_ops a2mp_chan_ops = { @@ -726,7 +732,11 @@ static struct l2cap_chan *a2mp_chan_open(struct l2cap_conn *conn, bool locked) BT_DBG("chan %p", chan); - chan->chan_type = L2CAP_CHAN_CONN_FIX_A2MP; + chan->chan_type = L2CAP_CHAN_FIXED; + chan->scid = L2CAP_CID_A2MP; + chan->dcid = L2CAP_CID_A2MP; + chan->omtu = L2CAP_A2MP_DEFAULT_MTU; + chan->imtu = L2CAP_A2MP_DEFAULT_MTU; chan->flush_to = L2CAP_DEFAULT_FLUSH_TO; chan->ops = &a2mp_chan_ops; diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 56ca494621c..2021c481cdb 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -31,7 +31,7 @@ #include <net/bluetooth/bluetooth.h> #include <linux/proc_fs.h> -#define VERSION "2.17" +#define VERSION "2.19" /* Bluetooth sockets */ #define BT_MAX_PROTO 8 diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index ba5366c320d..a7a27bc2c0b 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -28,6 +28,7 @@ #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> +#include <net/bluetooth/l2cap.h> #include "smp.h" #include "a2mp.h" @@ -82,7 +83,7 @@ static void hci_acl_create_connection(struct hci_conn *conn) cp.pscan_rep_mode = ie->data.pscan_rep_mode; cp.pscan_mode = ie->data.pscan_mode; cp.clock_offset = ie->data.clock_offset | - __constant_cpu_to_le16(0x8000); + cpu_to_le16(0x8000); } memcpy(conn->dev_class, ie->data.dev_class, 3); @@ -182,8 +183,8 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle) cp.handle = cpu_to_le16(handle); - cp.tx_bandwidth = __constant_cpu_to_le32(0x00001f40); - cp.rx_bandwidth = __constant_cpu_to_le32(0x00001f40); + cp.tx_bandwidth = 
cpu_to_le32(0x00001f40); + cp.rx_bandwidth = cpu_to_le32(0x00001f40); cp.voice_setting = cpu_to_le16(conn->setting); switch (conn->setting & SCO_AIRMODE_MASK) { @@ -225,13 +226,13 @@ void hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, cp.conn_interval_max = cpu_to_le16(max); cp.conn_latency = cpu_to_le16(latency); cp.supervision_timeout = cpu_to_le16(to_multiplier); - cp.min_ce_len = __constant_cpu_to_le16(0x0001); - cp.max_ce_len = __constant_cpu_to_le16(0x0001); + cp.min_ce_len = cpu_to_le16(0x0000); + cp.max_ce_len = cpu_to_le16(0x0000); hci_send_cmd(hdev, HCI_OP_LE_CONN_UPDATE, sizeof(cp), &cp); } -void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __u8 rand[8], +void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand, __u8 ltk[16]) { struct hci_dev *hdev = conn->hdev; @@ -242,9 +243,9 @@ void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __u8 rand[8], memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(conn->handle); - memcpy(cp.ltk, ltk, sizeof(cp.ltk)); + cp.rand = rand; cp.ediv = ediv; - memcpy(cp.rand, rand, sizeof(cp.rand)); + memcpy(cp.ltk, ltk, sizeof(cp.ltk)); hci_send_cmd(hdev, HCI_OP_LE_START_ENC, sizeof(cp), &cp); } @@ -288,10 +289,20 @@ static void hci_conn_timeout(struct work_struct *work) { struct hci_conn *conn = container_of(work, struct hci_conn, disc_work.work); + int refcnt = atomic_read(&conn->refcnt); BT_DBG("hcon %p state %s", conn, state_to_string(conn->state)); - if (atomic_read(&conn->refcnt)) + WARN_ON(refcnt < 0); + + /* FIXME: It was observed that in pairing failed scenario, refcnt + * drops below 0. Probably this is because l2cap_conn_del calls + * l2cap_chan_del for each channel, and inside l2cap_chan_del conn is + * dropped. After that loop hci_chan_del is called which also drops + * conn. For now make sure that ACL is alive if refcnt is higher then 0, + * otherwise drop it. + */ + if (refcnt > 0) return; switch (conn->state) { @@ -337,9 +348,9 @@ static void hci_conn_idle(struct work_struct *work) if (lmp_sniffsubr_capable(hdev) && lmp_sniffsubr_capable(conn)) { struct hci_cp_sniff_subrate cp; cp.handle = cpu_to_le16(conn->handle); - cp.max_latency = __constant_cpu_to_le16(0); - cp.min_remote_timeout = __constant_cpu_to_le16(0); - cp.min_local_timeout = __constant_cpu_to_le16(0); + cp.max_latency = cpu_to_le16(0); + cp.min_remote_timeout = cpu_to_le16(0); + cp.min_local_timeout = cpu_to_le16(0); hci_send_cmd(hdev, HCI_OP_SNIFF_SUBRATE, sizeof(cp), &cp); } @@ -348,8 +359,8 @@ static void hci_conn_idle(struct work_struct *work) cp.handle = cpu_to_le16(conn->handle); cp.max_interval = cpu_to_le16(hdev->sniff_max_interval); cp.min_interval = cpu_to_le16(hdev->sniff_min_interval); - cp.attempt = __constant_cpu_to_le16(4); - cp.timeout = __constant_cpu_to_le16(1); + cp.attempt = cpu_to_le16(4); + cp.timeout = cpu_to_le16(1); hci_send_cmd(hdev, HCI_OP_SNIFF_MODE, sizeof(cp), &cp); } } @@ -363,6 +374,30 @@ static void hci_conn_auto_accept(struct work_struct *work) &conn->dst); } +static void le_conn_timeout(struct work_struct *work) +{ + struct hci_conn *conn = container_of(work, struct hci_conn, + le_conn_timeout.work); + struct hci_dev *hdev = conn->hdev; + + BT_DBG(""); + + /* We could end up here due to having done directed advertising, + * so clean up the state if necessary. This should however only + * happen with broken hardware or if low duty cycle was used + * (which doesn't have a timeout of its own). 
+ */ + if (test_bit(HCI_ADVERTISING, &hdev->dev_flags)) { + u8 enable = 0x00; + hci_send_cmd(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), + &enable); + hci_le_conn_failed(conn, HCI_ERROR_ADVERTISING_TIMEOUT); + return; + } + + hci_le_create_connection_cancel(conn); +} + struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst) { struct hci_conn *conn; @@ -383,6 +418,8 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst) conn->io_capability = hdev->io_capability; conn->remote_auth = 0xff; conn->key_type = 0xff; + conn->tx_power = HCI_TX_POWER_INVALID; + conn->max_tx_power = HCI_TX_POWER_INVALID; set_bit(HCI_CONN_POWER_SAVE, &conn->flags); conn->disc_timeout = HCI_DISCONN_TIMEOUT; @@ -391,6 +428,10 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst) case ACL_LINK: conn->pkt_type = hdev->pkt_type & ACL_PTYPE_MASK; break; + case LE_LINK: + /* conn->src should reflect the local identity address */ + hci_copy_identity_address(hdev, &conn->src, &conn->src_type); + break; case SCO_LINK: if (lmp_esco_capable(hdev)) conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) | @@ -410,6 +451,7 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst) INIT_DELAYED_WORK(&conn->disc_work, hci_conn_timeout); INIT_DELAYED_WORK(&conn->auto_accept_work, hci_conn_auto_accept); INIT_DELAYED_WORK(&conn->idle_work, hci_conn_idle); + INIT_DELAYED_WORK(&conn->le_conn_timeout, le_conn_timeout); atomic_set(&conn->refcnt, 0); @@ -442,6 +484,8 @@ int hci_conn_del(struct hci_conn *conn) /* Unacked frames */ hdev->acl_cnt += conn->sent; } else if (conn->type == LE_LINK) { + cancel_delayed_work_sync(&conn->le_conn_timeout); + if (hdev->le_pkts) hdev->le_cnt += conn->sent; else @@ -514,6 +558,31 @@ struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src) } EXPORT_SYMBOL(hci_get_route); +/* This function requires the caller holds hdev->lock */ +void hci_le_conn_failed(struct hci_conn *conn, u8 status) +{ + struct hci_dev *hdev = conn->hdev; + + conn->state = BT_CLOSED; + + mgmt_connect_failed(hdev, &conn->dst, conn->type, conn->dst_type, + status); + + hci_proto_connect_cfm(conn, status); + + hci_conn_del(conn); + + /* Since we may have temporarily stopped the background scanning in + * favor of connection establishment, we should restart it. + */ + hci_update_background_scan(hdev); + + /* Re-enable advertising in case this was a failed connection + * attempt as a peripheral. + */ + mgmt_reenable_advertising(hdev); +} + static void create_le_conn_complete(struct hci_dev *hdev, u8 status) { struct hci_conn *conn; @@ -530,60 +599,91 @@ static void create_le_conn_complete(struct hci_dev *hdev, u8 status) if (!conn) goto done; - conn->state = BT_CLOSED; - - mgmt_connect_failed(hdev, &conn->dst, conn->type, conn->dst_type, - status); - - hci_proto_connect_cfm(conn, status); - - hci_conn_del(conn); + hci_le_conn_failed(conn, status); done: hci_dev_unlock(hdev); } -static int hci_create_le_conn(struct hci_conn *conn) +static void hci_req_add_le_create_conn(struct hci_request *req, + struct hci_conn *conn) { - struct hci_dev *hdev = conn->hdev; struct hci_cp_le_create_conn cp; - struct hci_request req; - int err; - - hci_req_init(&req, hdev); + struct hci_dev *hdev = conn->hdev; + u8 own_addr_type; memset(&cp, 0, sizeof(cp)); + + /* Update random address, but set require_privacy to false so + * that we never connect with an unresolvable address. 
+ */ + if (hci_update_random_address(req, false, &own_addr_type)) + return; + cp.scan_interval = cpu_to_le16(hdev->le_scan_interval); cp.scan_window = cpu_to_le16(hdev->le_scan_window); bacpy(&cp.peer_addr, &conn->dst); cp.peer_addr_type = conn->dst_type; - cp.own_address_type = conn->src_type; - cp.conn_interval_min = cpu_to_le16(hdev->le_conn_min_interval); - cp.conn_interval_max = cpu_to_le16(hdev->le_conn_max_interval); - cp.supervision_timeout = __constant_cpu_to_le16(0x002a); - cp.min_ce_len = __constant_cpu_to_le16(0x0000); - cp.max_ce_len = __constant_cpu_to_le16(0x0000); + cp.own_address_type = own_addr_type; + cp.conn_interval_min = cpu_to_le16(conn->le_conn_min_interval); + cp.conn_interval_max = cpu_to_le16(conn->le_conn_max_interval); + cp.supervision_timeout = cpu_to_le16(0x002a); + cp.min_ce_len = cpu_to_le16(0x0000); + cp.max_ce_len = cpu_to_le16(0x0000); - hci_req_add(&req, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp); + hci_req_add(req, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp); - err = hci_req_run(&req, create_le_conn_complete); - if (err) { - hci_conn_del(conn); - return err; - } + conn->state = BT_CONNECT; +} - return 0; +static void hci_req_directed_advertising(struct hci_request *req, + struct hci_conn *conn) +{ + struct hci_dev *hdev = req->hdev; + struct hci_cp_le_set_adv_param cp; + u8 own_addr_type; + u8 enable; + + enable = 0x00; + hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); + + /* Clear the HCI_ADVERTISING bit temporarily so that the + * hci_update_random_address knows that it's safe to go ahead + * and write a new random address. The flag will be set back on + * as soon as the SET_ADV_ENABLE HCI command completes. + */ + clear_bit(HCI_ADVERTISING, &hdev->dev_flags); + + /* Set require_privacy to false so that the remote device has a + * chance of identifying us. + */ + if (hci_update_random_address(req, false, &own_addr_type) < 0) + return; + + memset(&cp, 0, sizeof(cp)); + cp.type = LE_ADV_DIRECT_IND; + cp.own_address_type = own_addr_type; + cp.direct_addr_type = conn->dst_type; + bacpy(&cp.direct_addr, &conn->dst); + cp.channel_map = hdev->le_adv_channel_map; + + hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp); + + enable = 0x01; + hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); + + conn->state = BT_CONNECT; } -static struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, - u8 dst_type, u8 sec_level, u8 auth_type) +struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, + u8 dst_type, u8 sec_level, u8 auth_type) { + struct hci_conn_params *params; struct hci_conn *conn; + struct smp_irk *irk; + struct hci_request req; int err; - if (test_bit(HCI_ADVERTISING, &hdev->flags)) - return ERR_PTR(-ENOTSUPP); - /* Some devices send ATT messages as soon as the physical link is * established. To be able to handle these ATT messages, the user- * space first establishes the connection and then starts the pairing @@ -607,35 +707,80 @@ static struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, if (conn) return ERR_PTR(-EBUSY); + /* When given an identity address with existing identity + * resolving key, the connection needs to be established + * to a resolvable random address. + * + * This uses the cached random resolvable address from + * a previous scan. When no cached address is available, + * try connecting to the identity address instead. + * + * Storing the resolvable random address is required here + * to handle connection failures. 
The address will later + * be resolved back into the original identity address + * from the connect request. + */ + irk = hci_find_irk_by_addr(hdev, dst, dst_type); + if (irk && bacmp(&irk->rpa, BDADDR_ANY)) { + dst = &irk->rpa; + dst_type = ADDR_LE_DEV_RANDOM; + } + conn = hci_conn_add(hdev, LE_LINK, dst); if (!conn) return ERR_PTR(-ENOMEM); - if (dst_type == BDADDR_LE_PUBLIC) - conn->dst_type = ADDR_LE_DEV_PUBLIC; - else - conn->dst_type = ADDR_LE_DEV_RANDOM; + conn->dst_type = dst_type; + conn->sec_level = BT_SECURITY_LOW; + conn->pending_sec_level = sec_level; + conn->auth_type = auth_type; - conn->src_type = hdev->own_addr_type; + hci_req_init(&req, hdev); + + if (test_bit(HCI_ADVERTISING, &hdev->dev_flags)) { + hci_req_directed_advertising(&req, conn); + goto create_conn; + } - conn->state = BT_CONNECT; conn->out = true; conn->link_mode |= HCI_LM_MASTER; - conn->sec_level = BT_SECURITY_LOW; - conn->pending_sec_level = sec_level; - conn->auth_type = auth_type; - err = hci_create_le_conn(conn); - if (err) + params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); + if (params) { + conn->le_conn_min_interval = params->conn_min_interval; + conn->le_conn_max_interval = params->conn_max_interval; + } else { + conn->le_conn_min_interval = hdev->le_conn_min_interval; + conn->le_conn_max_interval = hdev->le_conn_max_interval; + } + + /* If controller is scanning, we stop it since some controllers are + * not able to scan and connect at the same time. Also set the + * HCI_LE_SCAN_INTERRUPTED flag so that the command complete + * handler for scan disabling knows to set the correct discovery + * state. + */ + if (test_bit(HCI_LE_SCAN, &hdev->dev_flags)) { + hci_req_add_le_scan_disable(&req); + set_bit(HCI_LE_SCAN_INTERRUPTED, &hdev->dev_flags); + } + + hci_req_add_le_create_conn(&req, conn); + +create_conn: + err = hci_req_run(&req, create_le_conn_complete); + if (err) { + hci_conn_del(conn); return ERR_PTR(err); + } done: hci_conn_hold(conn); return conn; } -static struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, - u8 sec_level, u8 auth_type) +struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, + u8 sec_level, u8 auth_type) { struct hci_conn *acl; @@ -704,27 +849,22 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, return sco; } -/* Create SCO, ACL or LE connection. */ -struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, - __u8 dst_type, __u8 sec_level, __u8 auth_type) -{ - BT_DBG("%s dst %pMR type 0x%x", hdev->name, dst, type); - - switch (type) { - case LE_LINK: - return hci_connect_le(hdev, dst, dst_type, sec_level, auth_type); - case ACL_LINK: - return hci_connect_acl(hdev, dst, sec_level, auth_type); - } - - return ERR_PTR(-EINVAL); -} - /* Check link security requirement */ int hci_conn_check_link_mode(struct hci_conn *conn) { BT_DBG("hcon %p", conn); + /* In Secure Connections Only mode, it is required that Secure + * Connections is used and the link is encrypted with AES-CCM + * using a P-256 authenticated combination key. 
+ */ + if (test_bit(HCI_SC_ONLY, &conn->hdev->flags)) { + if (!hci_conn_sc_enabled(conn) || + !test_bit(HCI_CONN_AES_CCM, &conn->flags) || + conn->key_type != HCI_LK_AUTH_COMBINATION_P256) + return 0; + } + if (hci_conn_ssp_enabled(conn) && !(conn->link_mode & HCI_LM_ENCRYPT)) return 0; @@ -752,14 +892,17 @@ static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->flags)) { struct hci_cp_auth_requested cp; - /* encrypt must be pending if auth is also pending */ - set_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); - cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED, sizeof(cp), &cp); - if (conn->key_type != 0xff) + + /* If we're already encrypted set the REAUTH_PEND flag, + * otherwise set the ENCRYPT_PEND. + */ + if (conn->link_mode & HCI_LM_ENCRYPT) set_bit(HCI_CONN_REAUTH_PEND, &conn->flags); + else + set_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); } return 0; @@ -800,14 +943,23 @@ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) if (!(conn->link_mode & HCI_LM_AUTH)) goto auth; - /* An authenticated combination key has sufficient security for any - security level. */ - if (conn->key_type == HCI_LK_AUTH_COMBINATION) + /* An authenticated FIPS approved combination key has sufficient + * security for security level 4. */ + if (conn->key_type == HCI_LK_AUTH_COMBINATION_P256 && + sec_level == BT_SECURITY_FIPS) + goto encrypt; + + /* An authenticated combination key has sufficient security for + security level 3. */ + if ((conn->key_type == HCI_LK_AUTH_COMBINATION_P192 || + conn->key_type == HCI_LK_AUTH_COMBINATION_P256) && + sec_level == BT_SECURITY_HIGH) goto encrypt; /* An unauthenticated combination key has sufficient security for security level 1 and 2. */ - if (conn->key_type == HCI_LK_UNAUTH_COMBINATION && + if ((conn->key_type == HCI_LK_UNAUTH_COMBINATION_P192 || + conn->key_type == HCI_LK_UNAUTH_COMBINATION_P256) && (sec_level == BT_SECURITY_MEDIUM || sec_level == BT_SECURITY_LOW)) goto encrypt; @@ -816,7 +968,8 @@ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) is generated using maximum PIN code length (16). For pre 2.1 units. 
*/ if (conn->key_type == HCI_LK_COMBINATION && - (sec_level != BT_SECURITY_HIGH || conn->pin_length == 16)) + (sec_level == BT_SECURITY_MEDIUM || sec_level == BT_SECURITY_LOW || + conn->pin_length == 16)) goto encrypt; auth: @@ -840,13 +993,17 @@ int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level) { BT_DBG("hcon %p", conn); - if (sec_level != BT_SECURITY_HIGH) - return 1; /* Accept if non-secure is required */ + /* Accept if non-secure or higher security level is required */ + if (sec_level != BT_SECURITY_HIGH && sec_level != BT_SECURITY_FIPS) + return 1; - if (conn->sec_level == BT_SECURITY_HIGH) + /* Accept if secure or higher security level is already present */ + if (conn->sec_level == BT_SECURITY_HIGH || + conn->sec_level == BT_SECURITY_FIPS) return 1; - return 0; /* Reject not secure link */ + /* Reject not secure link */ + return 0; } EXPORT_SYMBOL(hci_conn_check_secure); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 6ccc4eb9e55..0a43cce9a91 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -29,10 +29,14 @@ #include <linux/idr.h> #include <linux/rfkill.h> #include <linux/debugfs.h> +#include <linux/crypto.h> #include <asm/unaligned.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> +#include <net/bluetooth/l2cap.h> + +#include "smp.h" static void hci_rx_work(struct work_struct *work); static void hci_cmd_work(struct work_struct *work); @@ -285,24 +289,6 @@ static const struct file_operations link_keys_fops = { .release = single_release, }; -static ssize_t use_debug_keys_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct hci_dev *hdev = file->private_data; - char buf[3]; - - buf[0] = test_bit(HCI_DEBUG_KEYS, &hdev->dev_flags) ? 'Y': 'N'; - buf[1] = '\n'; - buf[2] = '\0'; - return simple_read_from_buffer(user_buf, count, ppos, buf, 2); -} - -static const struct file_operations use_debug_keys_fops = { - .open = simple_open, - .read = use_debug_keys_read, - .llseek = default_llseek, -}; - static int dev_class_show(struct seq_file *f, void *ptr) { struct hci_dev *hdev = f->private; @@ -415,6 +401,70 @@ static int ssp_debug_mode_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(ssp_debug_mode_fops, ssp_debug_mode_get, ssp_debug_mode_set, "%llu\n"); +static ssize_t force_sc_support_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[3]; + + buf[0] = test_bit(HCI_FORCE_SC, &hdev->dev_flags) ? 
'Y': 'N'; + buf[1] = '\n'; + buf[2] = '\0'; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t force_sc_support_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[32]; + size_t buf_size = min(count, (sizeof(buf)-1)); + bool enable; + + if (test_bit(HCI_UP, &hdev->flags)) + return -EBUSY; + + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + if (strtobool(buf, &enable)) + return -EINVAL; + + if (enable == test_bit(HCI_FORCE_SC, &hdev->dev_flags)) + return -EALREADY; + + change_bit(HCI_FORCE_SC, &hdev->dev_flags); + + return count; +} + +static const struct file_operations force_sc_support_fops = { + .open = simple_open, + .read = force_sc_support_read, + .write = force_sc_support_write, + .llseek = default_llseek, +}; + +static ssize_t sc_only_mode_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[3]; + + buf[0] = test_bit(HCI_SC_ONLY, &hdev->dev_flags) ? 'Y': 'N'; + buf[1] = '\n'; + buf[2] = '\0'; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static const struct file_operations sc_only_mode_fops = { + .open = simple_open, + .read = sc_only_mode_read, + .llseek = default_llseek, +}; + static int idle_timeout_set(void *data, u64 val) { struct hci_dev *hdev = data; @@ -443,6 +493,37 @@ static int idle_timeout_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(idle_timeout_fops, idle_timeout_get, idle_timeout_set, "%llu\n"); +static int rpa_timeout_set(void *data, u64 val) +{ + struct hci_dev *hdev = data; + + /* Require the RPA timeout to be at least 30 seconds and at most + * 24 hours. 
+ */ + if (val < 30 || val > (60 * 60 * 24)) + return -EINVAL; + + hci_dev_lock(hdev); + hdev->rpa_timeout = val; + hci_dev_unlock(hdev); + + return 0; +} + +static int rpa_timeout_get(void *data, u64 *val) +{ + struct hci_dev *hdev = data; + + hci_dev_lock(hdev); + *val = hdev->rpa_timeout; + hci_dev_unlock(hdev); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(rpa_timeout_fops, rpa_timeout_get, + rpa_timeout_set, "%llu\n"); + static int sniff_min_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; @@ -499,6 +580,115 @@ static int sniff_max_interval_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(sniff_max_interval_fops, sniff_max_interval_get, sniff_max_interval_set, "%llu\n"); +static int conn_info_min_age_set(void *data, u64 val) +{ + struct hci_dev *hdev = data; + + if (val == 0 || val > hdev->conn_info_max_age) + return -EINVAL; + + hci_dev_lock(hdev); + hdev->conn_info_min_age = val; + hci_dev_unlock(hdev); + + return 0; +} + +static int conn_info_min_age_get(void *data, u64 *val) +{ + struct hci_dev *hdev = data; + + hci_dev_lock(hdev); + *val = hdev->conn_info_min_age; + hci_dev_unlock(hdev); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(conn_info_min_age_fops, conn_info_min_age_get, + conn_info_min_age_set, "%llu\n"); + +static int conn_info_max_age_set(void *data, u64 val) +{ + struct hci_dev *hdev = data; + + if (val == 0 || val < hdev->conn_info_min_age) + return -EINVAL; + + hci_dev_lock(hdev); + hdev->conn_info_max_age = val; + hci_dev_unlock(hdev); + + return 0; +} + +static int conn_info_max_age_get(void *data, u64 *val) +{ + struct hci_dev *hdev = data; + + hci_dev_lock(hdev); + *val = hdev->conn_info_max_age; + hci_dev_unlock(hdev); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(conn_info_max_age_fops, conn_info_max_age_get, + conn_info_max_age_set, "%llu\n"); + +static int identity_show(struct seq_file *f, void *p) +{ + struct hci_dev *hdev = f->private; + bdaddr_t addr; + u8 addr_type; + + hci_dev_lock(hdev); + + hci_copy_identity_address(hdev, &addr, &addr_type); + + seq_printf(f, "%pMR (type %u) %*phN %pMR\n", &addr, addr_type, + 16, hdev->irk, &hdev->rpa); + + hci_dev_unlock(hdev); + + return 0; +} + +static int identity_open(struct inode *inode, struct file *file) +{ + return single_open(file, identity_show, inode->i_private); +} + +static const struct file_operations identity_fops = { + .open = identity_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int random_address_show(struct seq_file *f, void *p) +{ + struct hci_dev *hdev = f->private; + + hci_dev_lock(hdev); + seq_printf(f, "%pMR\n", &hdev->random_addr); + hci_dev_unlock(hdev); + + return 0; +} + +static int random_address_open(struct inode *inode, struct file *file) +{ + return single_open(file, random_address_show, inode->i_private); +} + +static const struct file_operations random_address_fops = { + .open = random_address_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int static_address_show(struct seq_file *f, void *p) { struct hci_dev *hdev = f->private; @@ -522,33 +712,107 @@ static const struct file_operations static_address_fops = { .release = single_release, }; -static int own_address_type_set(void *data, u64 val) +static ssize_t force_static_address_read(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) { - struct hci_dev *hdev = data; + struct hci_dev *hdev = file->private_data; + char buf[3]; - if (val != 0 && val != 1) + buf[0] = test_bit(HCI_FORCE_STATIC_ADDR, &hdev->dev_flags) 
? 'Y': 'N'; + buf[1] = '\n'; + buf[2] = '\0'; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t force_static_address_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[32]; + size_t buf_size = min(count, (sizeof(buf)-1)); + bool enable; + + if (test_bit(HCI_UP, &hdev->flags)) + return -EBUSY; + + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + if (strtobool(buf, &enable)) return -EINVAL; + if (enable == test_bit(HCI_FORCE_STATIC_ADDR, &hdev->dev_flags)) + return -EALREADY; + + change_bit(HCI_FORCE_STATIC_ADDR, &hdev->dev_flags); + + return count; +} + +static const struct file_operations force_static_address_fops = { + .open = simple_open, + .read = force_static_address_read, + .write = force_static_address_write, + .llseek = default_llseek, +}; + +static int white_list_show(struct seq_file *f, void *ptr) +{ + struct hci_dev *hdev = f->private; + struct bdaddr_list *b; + hci_dev_lock(hdev); - hdev->own_addr_type = val; + list_for_each_entry(b, &hdev->le_white_list, list) + seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type); hci_dev_unlock(hdev); return 0; } -static int own_address_type_get(void *data, u64 *val) +static int white_list_open(struct inode *inode, struct file *file) { - struct hci_dev *hdev = data; + return single_open(file, white_list_show, inode->i_private); +} + +static const struct file_operations white_list_fops = { + .open = white_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int identity_resolving_keys_show(struct seq_file *f, void *ptr) +{ + struct hci_dev *hdev = f->private; + struct list_head *p, *n; hci_dev_lock(hdev); - *val = hdev->own_addr_type; + list_for_each_safe(p, n, &hdev->identity_resolving_keys) { + struct smp_irk *irk = list_entry(p, struct smp_irk, list); + seq_printf(f, "%pMR (type %u) %*phN %pMR\n", + &irk->bdaddr, irk->addr_type, + 16, irk->val, &irk->rpa); + } hci_dev_unlock(hdev); return 0; } -DEFINE_SIMPLE_ATTRIBUTE(own_address_type_fops, own_address_type_get, - own_address_type_set, "%llu\n"); +static int identity_resolving_keys_open(struct inode *inode, struct file *file) +{ + return single_open(file, identity_resolving_keys_show, + inode->i_private); +} + +static const struct file_operations identity_resolving_keys_fops = { + .open = identity_resolving_keys_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; static int long_term_keys_show(struct seq_file *f, void *ptr) { @@ -556,12 +820,12 @@ static int long_term_keys_show(struct seq_file *f, void *ptr) struct list_head *p, *n; hci_dev_lock(hdev); - list_for_each_safe(p, n, &hdev->link_keys) { + list_for_each_safe(p, n, &hdev->long_term_keys) { struct smp_ltk *ltk = list_entry(p, struct smp_ltk, list); - seq_printf(f, "%pMR (type %u) %u %u %u %.4x %*phN %*phN\\n", + seq_printf(f, "%pMR (type %u) %u 0x%02x %u %.4x %.16llx %*phN\n", <k->bdaddr, ltk->bdaddr_type, ltk->authenticated, ltk->type, ltk->enc_size, __le16_to_cpu(ltk->ediv), - 8, ltk->rand, 16, ltk->val); + __le64_to_cpu(ltk->rand), 16, ltk->val); } hci_dev_unlock(hdev); @@ -636,6 +900,181 @@ static int conn_max_interval_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(conn_max_interval_fops, conn_max_interval_get, conn_max_interval_set, "%llu\n"); +static int adv_channel_map_set(void *data, u64 val) +{ + struct hci_dev *hdev = data; + + if (val < 0x01 || val > 0x07) + return 
-EINVAL; + + hci_dev_lock(hdev); + hdev->le_adv_channel_map = val; + hci_dev_unlock(hdev); + + return 0; +} + +static int adv_channel_map_get(void *data, u64 *val) +{ + struct hci_dev *hdev = data; + + hci_dev_lock(hdev); + *val = hdev->le_adv_channel_map; + hci_dev_unlock(hdev); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(adv_channel_map_fops, adv_channel_map_get, + adv_channel_map_set, "%llu\n"); + +static ssize_t lowpan_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[3]; + + buf[0] = test_bit(HCI_6LOWPAN_ENABLED, &hdev->dev_flags) ? 'Y' : 'N'; + buf[1] = '\n'; + buf[2] = '\0'; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t lowpan_write(struct file *fp, const char __user *user_buffer, + size_t count, loff_t *position) +{ + struct hci_dev *hdev = fp->private_data; + bool enable; + char buf[32]; + size_t buf_size = min(count, (sizeof(buf)-1)); + + if (copy_from_user(buf, user_buffer, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + + if (strtobool(buf, &enable) < 0) + return -EINVAL; + + if (enable == test_bit(HCI_6LOWPAN_ENABLED, &hdev->dev_flags)) + return -EALREADY; + + change_bit(HCI_6LOWPAN_ENABLED, &hdev->dev_flags); + + return count; +} + +static const struct file_operations lowpan_debugfs_fops = { + .open = simple_open, + .read = lowpan_read, + .write = lowpan_write, + .llseek = default_llseek, +}; + +static int le_auto_conn_show(struct seq_file *sf, void *ptr) +{ + struct hci_dev *hdev = sf->private; + struct hci_conn_params *p; + + hci_dev_lock(hdev); + + list_for_each_entry(p, &hdev->le_conn_params, list) { + seq_printf(sf, "%pMR %u %u\n", &p->addr, p->addr_type, + p->auto_connect); + } + + hci_dev_unlock(hdev); + + return 0; +} + +static int le_auto_conn_open(struct inode *inode, struct file *file) +{ + return single_open(file, le_auto_conn_show, inode->i_private); +} + +static ssize_t le_auto_conn_write(struct file *file, const char __user *data, + size_t count, loff_t *offset) +{ + struct seq_file *sf = file->private_data; + struct hci_dev *hdev = sf->private; + u8 auto_connect = 0; + bdaddr_t addr; + u8 addr_type; + char *buf; + int err = 0; + int n; + + /* Don't allow partial write */ + if (*offset != 0) + return -EINVAL; + + if (count < 3) + return -EINVAL; + + buf = memdup_user(data, count); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + if (memcmp(buf, "add", 3) == 0) { + n = sscanf(&buf[4], "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx %hhu %hhu", + &addr.b[5], &addr.b[4], &addr.b[3], &addr.b[2], + &addr.b[1], &addr.b[0], &addr_type, + &auto_connect); + + if (n < 7) { + err = -EINVAL; + goto done; + } + + hci_dev_lock(hdev); + err = hci_conn_params_add(hdev, &addr, addr_type, auto_connect, + hdev->le_conn_min_interval, + hdev->le_conn_max_interval); + hci_dev_unlock(hdev); + + if (err) + goto done; + } else if (memcmp(buf, "del", 3) == 0) { + n = sscanf(&buf[4], "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx %hhu", + &addr.b[5], &addr.b[4], &addr.b[3], &addr.b[2], + &addr.b[1], &addr.b[0], &addr_type); + + if (n < 7) { + err = -EINVAL; + goto done; + } + + hci_dev_lock(hdev); + hci_conn_params_del(hdev, &addr, addr_type); + hci_dev_unlock(hdev); + } else if (memcmp(buf, "clr", 3) == 0) { + hci_dev_lock(hdev); + hci_conn_params_clear(hdev); + hci_pend_le_conns_clear(hdev); + hci_update_background_scan(hdev); + hci_dev_unlock(hdev); + } else { + err = -EINVAL; + } + +done: + kfree(buf); + + if (err) + return err; + else + return count; +} + +static const struct 
file_operations le_auto_conn_fops = { + .open = le_auto_conn_open, + .read = seq_read, + .write = le_auto_conn_write, + .llseek = seq_lseek, + .release = single_release, +}; + /* ---- HCI requests ---- */ static void hci_req_sync_complete(struct hci_dev *hdev, u8 result) @@ -962,7 +1401,7 @@ static void bredr_setup(struct hci_request *req) hci_req_add(req, HCI_OP_SET_EVENT_FLT, 1, &flt_type); /* Connection accept timeout ~20 secs */ - param = __constant_cpu_to_le16(0x7d00); + param = cpu_to_le16(0x7d00); hci_req_add(req, HCI_OP_WRITE_CA_TIMEOUT, 2, ¶m); /* AVM Berlin (31), aka "BlueFRITZ!", reports version 1.2, @@ -984,14 +1423,17 @@ static void le_setup(struct hci_request *req) /* Read LE Local Supported Features */ hci_req_add(req, HCI_OP_LE_READ_LOCAL_FEATURES, 0, NULL); + /* Read LE Supported States */ + hci_req_add(req, HCI_OP_LE_READ_SUPPORTED_STATES, 0, NULL); + /* Read LE Advertising Channel TX Power */ hci_req_add(req, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL); /* Read LE White List Size */ hci_req_add(req, HCI_OP_LE_READ_WHITE_LIST_SIZE, 0, NULL); - /* Read LE Supported States */ - hci_req_add(req, HCI_OP_LE_READ_SUPPORTED_STATES, 0, NULL); + /* Clear LE White List */ + hci_req_add(req, HCI_OP_LE_CLEAR_WHITE_LIST, 0, NULL); /* LE-only controllers have LE implicitly enabled */ if (!lmp_bredr_capable(hdev)) @@ -1228,7 +1670,7 @@ static void hci_set_event_mask_page_2(struct hci_request *req) /* If Connectionless Slave Broadcast master role is supported * enable all necessary events for it. */ - if (hdev->features[2][0] & 0x01) { + if (lmp_csb_master_capable(hdev)) { events[1] |= 0x40; /* Triggered Clock Capture */ events[1] |= 0x80; /* Synchronization Train Complete */ events[2] |= 0x10; /* Slave Page Response Timeout */ @@ -1238,13 +1680,17 @@ static void hci_set_event_mask_page_2(struct hci_request *req) /* If Connectionless Slave Broadcast slave role is supported * enable all necessary events for it. */ - if (hdev->features[2][0] & 0x02) { + if (lmp_csb_slave_capable(hdev)) { events[2] |= 0x01; /* Synchronization Train Received */ events[2] |= 0x02; /* CSB Receive */ events[2] |= 0x04; /* CSB Timeout */ events[2] |= 0x08; /* Truncated Page Complete */ } + /* Enable Authenticated Payload Timeout Expired event if supported */ + if (lmp_ping_capable(hdev)) + events[2] |= 0x80; + hci_req_add(req, HCI_OP_SET_EVENT_MASK_PAGE_2, sizeof(events), events); } @@ -1261,8 +1707,13 @@ static void hci_init3_req(struct hci_request *req, unsigned long opt) * as supported send it. If not supported assume that the controller * does not have actual support for stored link keys which makes this * command redundant anyway. + * + * Some controllers indicate that they support handling deleting + * stored link keys, but they don't. The quirk lets a driver + * just disable this command. */ - if (hdev->commands[6] & 0x80) { + if (hdev->commands[6] & 0x80 && + !test_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks)) { struct hci_cp_delete_stored_link_key cp; bacpy(&cp.bdaddr, BDADDR_ANY); @@ -1274,19 +1725,8 @@ static void hci_init3_req(struct hci_request *req, unsigned long opt) if (hdev->commands[5] & 0x10) hci_setup_link_policy(req); - if (lmp_le_capable(hdev)) { - /* If the controller has a public BD_ADDR, then by - * default use that one. If this is a LE only - * controller without one, default to the random - * address. 
- */ - if (bacmp(&hdev->bdaddr, BDADDR_ANY)) - hdev->own_addr_type = ADDR_LE_DEV_PUBLIC; - else - hdev->own_addr_type = ADDR_LE_DEV_RANDOM; - + if (lmp_le_capable(hdev)) hci_set_le_support(req); - } /* Read features beyond page 1 if available */ for (p = 2; p < HCI_MAX_PAGES && p <= hdev->max_page; p++) { @@ -1307,8 +1747,17 @@ static void hci_init4_req(struct hci_request *req, unsigned long opt) hci_set_event_mask_page_2(req); /* Check for Synchronization Train support */ - if (hdev->features[2][0] & 0x04) + if (lmp_sync_train_capable(hdev)) hci_req_add(req, HCI_OP_READ_SYNC_TRAIN_PARAMS, 0, NULL); + + /* Enable Secure Connections if supported and configured */ + if ((lmp_sc_capable(hdev) || + test_bit(HCI_FORCE_SC, &hdev->dev_flags)) && + test_bit(HCI_SC_ENABLED, &hdev->dev_flags)) { + u8 support = 0x01; + hci_req_add(req, HCI_OP_WRITE_SC_SUPPORT, + sizeof(support), &support); + } } static int __hci_init(struct hci_dev *hdev) @@ -1362,13 +1811,16 @@ static int __hci_init(struct hci_dev *hdev) &blacklist_fops); debugfs_create_file("uuids", 0444, hdev->debugfs, hdev, &uuids_fops); + debugfs_create_file("conn_info_min_age", 0644, hdev->debugfs, hdev, + &conn_info_min_age_fops); + debugfs_create_file("conn_info_max_age", 0644, hdev->debugfs, hdev, + &conn_info_max_age_fops); + if (lmp_bredr_capable(hdev)) { debugfs_create_file("inquiry_cache", 0444, hdev->debugfs, hdev, &inquiry_cache_fops); debugfs_create_file("link_keys", 0400, hdev->debugfs, hdev, &link_keys_fops); - debugfs_create_file("use_debug_keys", 0444, hdev->debugfs, - hdev, &use_debug_keys_fops); debugfs_create_file("dev_class", 0444, hdev->debugfs, hdev, &dev_class_fops); debugfs_create_file("voice_setting", 0444, hdev->debugfs, @@ -1380,6 +1832,10 @@ static int __hci_init(struct hci_dev *hdev) hdev, &auto_accept_delay_fops); debugfs_create_file("ssp_debug_mode", 0644, hdev->debugfs, hdev, &ssp_debug_mode_fops); + debugfs_create_file("force_sc_support", 0644, hdev->debugfs, + hdev, &force_sc_support_fops); + debugfs_create_file("sc_only_mode", 0444, hdev->debugfs, + hdev, &sc_only_mode_fops); } if (lmp_sniff_capable(hdev)) { @@ -1392,18 +1848,46 @@ static int __hci_init(struct hci_dev *hdev) } if (lmp_le_capable(hdev)) { + debugfs_create_file("identity", 0400, hdev->debugfs, + hdev, &identity_fops); + debugfs_create_file("rpa_timeout", 0644, hdev->debugfs, + hdev, &rpa_timeout_fops); + debugfs_create_file("random_address", 0444, hdev->debugfs, + hdev, &random_address_fops); + debugfs_create_file("static_address", 0444, hdev->debugfs, + hdev, &static_address_fops); + + /* For controllers with a public address, provide a debug + * option to force the usage of the configured static + * address. By default the public address is used. 
+ */ + if (bacmp(&hdev->bdaddr, BDADDR_ANY)) + debugfs_create_file("force_static_address", 0644, + hdev->debugfs, hdev, + &force_static_address_fops); + debugfs_create_u8("white_list_size", 0444, hdev->debugfs, &hdev->le_white_list_size); - debugfs_create_file("static_address", 0444, hdev->debugfs, - hdev, &static_address_fops); - debugfs_create_file("own_address_type", 0644, hdev->debugfs, - hdev, &own_address_type_fops); + debugfs_create_file("white_list", 0444, hdev->debugfs, hdev, + &white_list_fops); + debugfs_create_file("identity_resolving_keys", 0400, + hdev->debugfs, hdev, + &identity_resolving_keys_fops); debugfs_create_file("long_term_keys", 0400, hdev->debugfs, hdev, &long_term_keys_fops); debugfs_create_file("conn_min_interval", 0644, hdev->debugfs, hdev, &conn_min_interval_fops); debugfs_create_file("conn_max_interval", 0644, hdev->debugfs, hdev, &conn_max_interval_fops); + debugfs_create_file("adv_channel_map", 0644, hdev->debugfs, + hdev, &adv_channel_map_fops); + debugfs_create_file("6lowpan", 0644, hdev->debugfs, hdev, + &lowpan_debugfs_fops); + debugfs_create_file("le_auto_conn", 0644, hdev->debugfs, hdev, + &le_auto_conn_fops); + debugfs_create_u16("discov_interleaved_timeout", 0644, + hdev->debugfs, + &hdev->discov_interleaved_timeout); } return 0; @@ -1496,6 +1980,8 @@ void hci_discovery_set_state(struct hci_dev *hdev, int state) switch (state) { case DISCOVERY_STOPPED: + hci_update_background_scan(hdev); + if (hdev->discovery.state != DISCOVERY_STARTING) mgmt_discovering(hdev, 0); break; @@ -1607,12 +2093,11 @@ bool hci_inquiry_cache_update(struct hci_dev *hdev, struct inquiry_data *data, hci_remove_remote_oob_data(hdev, &data->bdaddr); - if (ssp) - *ssp = data->ssp_mode; + *ssp = data->ssp_mode; ie = hci_inquiry_cache_lookup(hdev, &data->bdaddr); if (ie) { - if (ie->data.ssp_mode && ssp) + if (ie->data.ssp_mode) *ssp = true; if (ie->name_state == NAME_NEEDED && @@ -1824,10 +2309,15 @@ static int hci_dev_do_open(struct hci_dev *hdev) * be able to determine if there is a public address * or not. * + * In case of user channel usage, it is not important + * if a public address or static random address is + * available. + * * This check is only valid for BR/EDR controllers * since AMP controllers do not have an address. 
*/ - if (hdev->dev_type == HCI_BREDR && + if (!test_bit(HCI_USER_CHANNEL, &hdev->dev_flags) && + hdev->dev_type == HCI_BREDR && !bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY)) { ret = -EADDRNOTAVAIL; @@ -1864,6 +2354,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) if (!ret) { hci_dev_hold(hdev); + set_bit(HCI_RPA_EXPIRED, &hdev->dev_flags); set_bit(HCI_UP, &hdev->flags); hci_notify(hdev, HCI_DEV_UP); if (!test_bit(HCI_SETUP, &hdev->dev_flags) && @@ -1962,9 +2453,13 @@ static int hci_dev_do_close(struct hci_dev *hdev) cancel_delayed_work_sync(&hdev->le_scan_disable); + if (test_bit(HCI_MGMT, &hdev->dev_flags)) + cancel_delayed_work_sync(&hdev->rpa_expired); + hci_dev_lock(hdev); hci_inquiry_cache_flush(hdev); hci_conn_hash_flush(hdev); + hci_pend_le_conns_clear(hdev); hci_dev_unlock(hdev); hci_notify(hdev, HCI_DEV_DOWN); @@ -2022,6 +2517,7 @@ static int hci_dev_do_close(struct hci_dev *hdev) memset(hdev->eir, 0, sizeof(hdev->eir)); memset(hdev->dev_class, 0, sizeof(hdev->dev_class)); + bacpy(&hdev->random_addr, BDADDR_ANY); hci_req_unlock(hdev); @@ -2385,7 +2881,7 @@ static void hci_discov_off(struct work_struct *work) mgmt_discoverable_timeout(hdev); } -int hci_uuids_clear(struct hci_dev *hdev) +void hci_uuids_clear(struct hci_dev *hdev) { struct bt_uuid *uuid, *tmp; @@ -2393,11 +2889,9 @@ int hci_uuids_clear(struct hci_dev *hdev) list_del(&uuid->list); kfree(uuid); } - - return 0; } -int hci_link_keys_clear(struct hci_dev *hdev) +void hci_link_keys_clear(struct hci_dev *hdev) { struct list_head *p, *n; @@ -2409,11 +2903,9 @@ int hci_link_keys_clear(struct hci_dev *hdev) list_del(p); kfree(key); } - - return 0; } -int hci_smp_ltks_clear(struct hci_dev *hdev) +void hci_smp_ltks_clear(struct hci_dev *hdev) { struct smp_ltk *k, *tmp; @@ -2421,8 +2913,16 @@ int hci_smp_ltks_clear(struct hci_dev *hdev) list_del(&k->list); kfree(k); } +} - return 0; +void hci_smp_irks_clear(struct hci_dev *hdev) +{ + struct smp_irk *k, *tmp; + + list_for_each_entry_safe(k, tmp, &hdev->identity_resolving_keys, list) { + list_del(&k->list); + kfree(k); + } } struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr) @@ -2472,13 +2972,24 @@ static bool hci_persistent_key(struct hci_dev *hdev, struct hci_conn *conn, return false; } -struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, __le16 ediv, u8 rand[8]) +static bool ltk_type_master(u8 type) +{ + if (type == HCI_SMP_STK || type == HCI_SMP_LTK) + return true; + + return false; +} + +struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, __le16 ediv, __le64 rand, + bool master) { struct smp_ltk *k; list_for_each_entry(k, &hdev->long_term_keys, list) { - if (k->ediv != ediv || - memcmp(rand, k->rand, sizeof(k->rand))) + if (k->ediv != ediv || k->rand != rand) + continue; + + if (ltk_type_master(k->type) != master) continue; return k; @@ -2488,18 +2999,56 @@ struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, __le16 ediv, u8 rand[8]) } struct smp_ltk *hci_find_ltk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr, - u8 addr_type) + u8 addr_type, bool master) { struct smp_ltk *k; list_for_each_entry(k, &hdev->long_term_keys, list) if (addr_type == k->bdaddr_type && - bacmp(bdaddr, &k->bdaddr) == 0) + bacmp(bdaddr, &k->bdaddr) == 0 && + ltk_type_master(k->type) == master) return k; return NULL; } +struct smp_irk *hci_find_irk_by_rpa(struct hci_dev *hdev, bdaddr_t *rpa) +{ + struct smp_irk *irk; + + list_for_each_entry(irk, &hdev->identity_resolving_keys, list) { + if (!bacmp(&irk->rpa, rpa)) + return irk; + } + + 
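+	/* (Editorial note, not part of the patch: the loop above only matches
+	 * RPAs that were previously resolved and cached in irk->rpa; the loop
+	 * below re-runs the AES-based resolution via smp_irk_matches() and,
+	 * on a match, caches the RPA so the fast path hits next time.)
+	 */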
list_for_each_entry(irk, &hdev->identity_resolving_keys, list) { + if (smp_irk_matches(hdev->tfm_aes, irk->val, rpa)) { + bacpy(&irk->rpa, rpa); + return irk; + } + } + + return NULL; +} + +struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr, + u8 addr_type) +{ + struct smp_irk *irk; + + /* Identity Address must be public or static random */ + if (addr_type == ADDR_LE_DEV_RANDOM && (bdaddr->b[5] & 0xc0) != 0xc0) + return NULL; + + list_for_each_entry(irk, &hdev->identity_resolving_keys, list) { + if (addr_type == irk->addr_type && + bacmp(bdaddr, &irk->bdaddr) == 0) + return irk; + } + + return NULL; +} + int hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, int new_key, bdaddr_t *bdaddr, u8 *val, u8 type, u8 pin_len) { @@ -2513,7 +3062,7 @@ int hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, int new_key, key = old_key; } else { old_key_type = conn ? conn->key_type : 0xff; - key = kzalloc(sizeof(*key), GFP_ATOMIC); + key = kzalloc(sizeof(*key), GFP_KERNEL); if (!key) return -ENOMEM; list_add(&key->list, &hdev->link_keys); @@ -2553,22 +3102,20 @@ int hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, int new_key, return 0; } -int hci_add_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 type, - int new_key, u8 authenticated, u8 tk[16], u8 enc_size, __le16 - ediv, u8 rand[8]) +struct smp_ltk *hci_add_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, + u8 addr_type, u8 type, u8 authenticated, + u8 tk[16], u8 enc_size, __le16 ediv, __le64 rand) { struct smp_ltk *key, *old_key; + bool master = ltk_type_master(type); - if (!(type & HCI_SMP_STK) && !(type & HCI_SMP_LTK)) - return 0; - - old_key = hci_find_ltk_by_addr(hdev, bdaddr, addr_type); + old_key = hci_find_ltk_by_addr(hdev, bdaddr, addr_type, master); if (old_key) key = old_key; else { - key = kzalloc(sizeof(*key), GFP_ATOMIC); + key = kzalloc(sizeof(*key), GFP_KERNEL); if (!key) - return -ENOMEM; + return NULL; list_add(&key->list, &hdev->long_term_keys); } @@ -2577,17 +3124,34 @@ int hci_add_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 type, memcpy(key->val, tk, sizeof(key->val)); key->authenticated = authenticated; key->ediv = ediv; + key->rand = rand; key->enc_size = enc_size; key->type = type; - memcpy(key->rand, rand, sizeof(key->rand)); - if (!new_key) - return 0; + return key; +} - if (type & HCI_SMP_LTK) - mgmt_new_ltk(hdev, key, 1); +struct smp_irk *hci_add_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, + u8 addr_type, u8 val[16], bdaddr_t *rpa) +{ + struct smp_irk *irk; - return 0; + irk = hci_find_irk_by_addr(hdev, bdaddr, addr_type); + if (!irk) { + irk = kzalloc(sizeof(*irk), GFP_KERNEL); + if (!irk) + return NULL; + + bacpy(&irk->bdaddr, bdaddr); + irk->addr_type = addr_type; + + list_add(&irk->list, &hdev->identity_resolving_keys); + } + + memcpy(irk->val, val, 16); + bacpy(&irk->rpa, rpa); + + return irk; } int hci_remove_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr) @@ -2606,21 +3170,38 @@ int hci_remove_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr) return 0; } -int hci_remove_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr) +int hci_remove_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct smp_ltk *k, *tmp; + int removed = 0; list_for_each_entry_safe(k, tmp, &hdev->long_term_keys, list) { - if (bacmp(bdaddr, &k->bdaddr)) + if (bacmp(bdaddr, &k->bdaddr) || k->bdaddr_type != bdaddr_type) continue; BT_DBG("%s removing %pMR", hdev->name, bdaddr); list_del(&k->list); kfree(k); + removed++; } - return 0; + return removed ? 
0 : -ENOENT; +} + +void hci_remove_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type) +{ + struct smp_irk *k, *tmp; + + list_for_each_entry_safe(k, tmp, &hdev->identity_resolving_keys, list) { + if (bacmp(bdaddr, &k->bdaddr) || k->addr_type != addr_type) + continue; + + BT_DBG("%s removing %pMR", hdev->name, bdaddr); + + list_del(&k->list); + kfree(k); + } } /* HCI command timer function */ @@ -2669,7 +3250,7 @@ int hci_remove_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr) return 0; } -int hci_remote_oob_data_clear(struct hci_dev *hdev) +void hci_remote_oob_data_clear(struct hci_dev *hdev) { struct oob_data *data, *n; @@ -2677,19 +3258,43 @@ int hci_remote_oob_data_clear(struct hci_dev *hdev) list_del(&data->list); kfree(data); } +} + +int hci_add_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, + u8 *hash, u8 *randomizer) +{ + struct oob_data *data; + + data = hci_find_remote_oob_data(hdev, bdaddr); + if (!data) { + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + bacpy(&data->bdaddr, bdaddr); + list_add(&data->list, &hdev->remote_oob_data); + } + + memcpy(data->hash192, hash, sizeof(data->hash192)); + memcpy(data->randomizer192, randomizer, sizeof(data->randomizer192)); + + memset(data->hash256, 0, sizeof(data->hash256)); + memset(data->randomizer256, 0, sizeof(data->randomizer256)); + + BT_DBG("%s for %pMR", hdev->name, bdaddr); return 0; } -int hci_add_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 *hash, - u8 *randomizer) +int hci_add_remote_oob_ext_data(struct hci_dev *hdev, bdaddr_t *bdaddr, + u8 *hash192, u8 *randomizer192, + u8 *hash256, u8 *randomizer256) { struct oob_data *data; data = hci_find_remote_oob_data(hdev, bdaddr); - if (!data) { - data = kmalloc(sizeof(*data), GFP_ATOMIC); + data = kmalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; @@ -2697,8 +3302,11 @@ int hci_add_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 *hash, list_add(&data->list, &hdev->remote_oob_data); } - memcpy(data->hash, hash, sizeof(data->hash)); - memcpy(data->randomizer, randomizer, sizeof(data->randomizer)); + memcpy(data->hash192, hash192, sizeof(data->hash192)); + memcpy(data->randomizer192, randomizer192, sizeof(data->randomizer192)); + + memcpy(data->hash256, hash256, sizeof(data->hash256)); + memcpy(data->randomizer256, randomizer256, sizeof(data->randomizer256)); BT_DBG("%s for %pMR", hdev->name, bdaddr); @@ -2718,7 +3326,7 @@ struct bdaddr_list *hci_blacklist_lookup(struct hci_dev *hdev, return NULL; } -int hci_blacklist_clear(struct hci_dev *hdev) +static void hci_blacklist_clear(struct hci_dev *hdev) { struct list_head *p, *n; @@ -2728,8 +3336,6 @@ int hci_blacklist_clear(struct hci_dev *hdev) list_del(p); kfree(b); } - - return 0; } int hci_blacklist_add(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) @@ -2758,8 +3364,10 @@ int hci_blacklist_del(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *entry; - if (!bacmp(bdaddr, BDADDR_ANY)) - return hci_blacklist_clear(hdev); + if (!bacmp(bdaddr, BDADDR_ANY)) { + hci_blacklist_clear(hdev); + return 0; + } entry = hci_blacklist_lookup(hdev, bdaddr, type); if (!entry) @@ -2771,6 +3379,262 @@ int hci_blacklist_del(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) return mgmt_device_unblocked(hdev, bdaddr, type); } +struct bdaddr_list *hci_white_list_lookup(struct hci_dev *hdev, + bdaddr_t *bdaddr, u8 type) +{ + struct bdaddr_list *b; + + list_for_each_entry(b, &hdev->le_white_list, list) { + if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type 
== type) + return b; + } + + return NULL; +} + +void hci_white_list_clear(struct hci_dev *hdev) +{ + struct list_head *p, *n; + + list_for_each_safe(p, n, &hdev->le_white_list) { + struct bdaddr_list *b = list_entry(p, struct bdaddr_list, list); + + list_del(p); + kfree(b); + } +} + +int hci_white_list_add(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) +{ + struct bdaddr_list *entry; + + if (!bacmp(bdaddr, BDADDR_ANY)) + return -EBADF; + + entry = kzalloc(sizeof(struct bdaddr_list), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + bacpy(&entry->bdaddr, bdaddr); + entry->bdaddr_type = type; + + list_add(&entry->list, &hdev->le_white_list); + + return 0; +} + +int hci_white_list_del(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) +{ + struct bdaddr_list *entry; + + if (!bacmp(bdaddr, BDADDR_ANY)) + return -EBADF; + + entry = hci_white_list_lookup(hdev, bdaddr, type); + if (!entry) + return -ENOENT; + + list_del(&entry->list); + kfree(entry); + + return 0; +} + +/* This function requires the caller holds hdev->lock */ +struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, + bdaddr_t *addr, u8 addr_type) +{ + struct hci_conn_params *params; + + list_for_each_entry(params, &hdev->le_conn_params, list) { + if (bacmp(¶ms->addr, addr) == 0 && + params->addr_type == addr_type) { + return params; + } + } + + return NULL; +} + +static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type) +{ + struct hci_conn *conn; + + conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, addr); + if (!conn) + return false; + + if (conn->dst_type != type) + return false; + + if (conn->state != BT_CONNECTED) + return false; + + return true; +} + +static bool is_identity_address(bdaddr_t *addr, u8 addr_type) +{ + if (addr_type == ADDR_LE_DEV_PUBLIC) + return true; + + /* Check for Random Static address type */ + if ((addr->b[5] & 0xc0) == 0xc0) + return true; + + return false; +} + +/* This function requires the caller holds hdev->lock */ +int hci_conn_params_add(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type, + u8 auto_connect, u16 conn_min_interval, + u16 conn_max_interval) +{ + struct hci_conn_params *params; + + if (!is_identity_address(addr, addr_type)) + return -EINVAL; + + params = hci_conn_params_lookup(hdev, addr, addr_type); + if (params) + goto update; + + params = kzalloc(sizeof(*params), GFP_KERNEL); + if (!params) { + BT_ERR("Out of memory"); + return -ENOMEM; + } + + bacpy(¶ms->addr, addr); + params->addr_type = addr_type; + + list_add(¶ms->list, &hdev->le_conn_params); + +update: + params->conn_min_interval = conn_min_interval; + params->conn_max_interval = conn_max_interval; + params->auto_connect = auto_connect; + + switch (auto_connect) { + case HCI_AUTO_CONN_DISABLED: + case HCI_AUTO_CONN_LINK_LOSS: + hci_pend_le_conn_del(hdev, addr, addr_type); + break; + case HCI_AUTO_CONN_ALWAYS: + if (!is_connected(hdev, addr, addr_type)) + hci_pend_le_conn_add(hdev, addr, addr_type); + break; + } + + BT_DBG("addr %pMR (type %u) auto_connect %u conn_min_interval 0x%.4x " + "conn_max_interval 0x%.4x", addr, addr_type, auto_connect, + conn_min_interval, conn_max_interval); + + return 0; +} + +/* This function requires the caller holds hdev->lock */ +void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) +{ + struct hci_conn_params *params; + + params = hci_conn_params_lookup(hdev, addr, addr_type); + if (!params) + return; + + hci_pend_le_conn_del(hdev, addr, addr_type); + + list_del(¶ms->list); + kfree(params); + + BT_DBG("addr %pMR (type %u)", addr, addr_type); +} + 
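Usage sketch (not part of the patch): the only caller of the new connection parameter helpers at this point is the le_auto_conn debugfs write handler above, which accepts "add <bdaddr> <addr_type> [auto_connect]", "del <bdaddr> <addr_type>" and "clr". A minimal kernel-side caller following the same locking and argument pattern could look like the sketch below; the function name pin_peer_for_auto_conn is hypothetical and only illustrates how the helpers introduced here are meant to be driven.

/* Illustrative sketch only -- not part of the patch. Assumes the usual
 * net/bluetooth/hci_core.h context, a registered 'hdev' and a 'peer'
 * holding a public identity address.
 */
static int pin_peer_for_auto_conn(struct hci_dev *hdev, bdaddr_t *peer)
{
	int err;

	/* hci_conn_params_add() must be called with hdev->lock held; with
	 * HCI_AUTO_CONN_ALWAYS it also queues the peer on pend_le_conns and
	 * kicks the background scan if the device is not yet connected.
	 */
	hci_dev_lock(hdev);
	err = hci_conn_params_add(hdev, peer, ADDR_LE_DEV_PUBLIC,
				  HCI_AUTO_CONN_ALWAYS,
				  hdev->le_conn_min_interval,
				  hdev->le_conn_max_interval);
	hci_dev_unlock(hdev);

	return err;
}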
+/* This function requires the caller holds hdev->lock */ +void hci_conn_params_clear(struct hci_dev *hdev) +{ + struct hci_conn_params *params, *tmp; + + list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) { + list_del(¶ms->list); + kfree(params); + } + + BT_DBG("All LE connection parameters were removed"); +} + +/* This function requires the caller holds hdev->lock */ +struct bdaddr_list *hci_pend_le_conn_lookup(struct hci_dev *hdev, + bdaddr_t *addr, u8 addr_type) +{ + struct bdaddr_list *entry; + + list_for_each_entry(entry, &hdev->pend_le_conns, list) { + if (bacmp(&entry->bdaddr, addr) == 0 && + entry->bdaddr_type == addr_type) + return entry; + } + + return NULL; +} + +/* This function requires the caller holds hdev->lock */ +void hci_pend_le_conn_add(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) +{ + struct bdaddr_list *entry; + + entry = hci_pend_le_conn_lookup(hdev, addr, addr_type); + if (entry) + goto done; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + BT_ERR("Out of memory"); + return; + } + + bacpy(&entry->bdaddr, addr); + entry->bdaddr_type = addr_type; + + list_add(&entry->list, &hdev->pend_le_conns); + + BT_DBG("addr %pMR (type %u)", addr, addr_type); + +done: + hci_update_background_scan(hdev); +} + +/* This function requires the caller holds hdev->lock */ +void hci_pend_le_conn_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) +{ + struct bdaddr_list *entry; + + entry = hci_pend_le_conn_lookup(hdev, addr, addr_type); + if (!entry) + goto done; + + list_del(&entry->list); + kfree(entry); + + BT_DBG("addr %pMR (type %u)", addr, addr_type); + +done: + hci_update_background_scan(hdev); +} + +/* This function requires the caller holds hdev->lock */ +void hci_pend_le_conns_clear(struct hci_dev *hdev) +{ + struct bdaddr_list *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, &hdev->pend_le_conns, list) { + list_del(&entry->list); + kfree(entry); + } + + BT_DBG("All LE pending connections cleared"); +} + static void inquiry_complete(struct hci_dev *hdev, u8 status) { if (status) { @@ -2830,7 +3694,6 @@ static void le_scan_disable_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, le_scan_disable.work); - struct hci_cp_le_set_scan_enable cp; struct hci_request req; int err; @@ -2838,15 +3701,128 @@ static void le_scan_disable_work(struct work_struct *work) hci_req_init(&req, hdev); - memset(&cp, 0, sizeof(cp)); - cp.enable = LE_SCAN_DISABLE; - hci_req_add(&req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp); + hci_req_add_le_scan_disable(&req); err = hci_req_run(&req, le_scan_disable_work_complete); if (err) BT_ERR("Disable LE scanning request failed: err %d", err); } +static void set_random_addr(struct hci_request *req, bdaddr_t *rpa) +{ + struct hci_dev *hdev = req->hdev; + + /* If we're advertising or initiating an LE connection we can't + * go ahead and change the random address at this time. This is + * because the eventual initiator address used for the + * subsequently created connection will be undefined (some + * controllers use the new address and others the one we had + * when the operation started). + * + * In this kind of scenario skip the update and let the random + * address be updated at the next cycle. 
+ */ + if (test_bit(HCI_ADVERTISING, &hdev->dev_flags) || + hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT)) { + BT_DBG("Deferring random address update"); + return; + } + + hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6, rpa); +} + +int hci_update_random_address(struct hci_request *req, bool require_privacy, + u8 *own_addr_type) +{ + struct hci_dev *hdev = req->hdev; + int err; + + /* If privacy is enabled use a resolvable private address. If + * current RPA has expired or there is something else than + * the current RPA in use, then generate a new one. + */ + if (test_bit(HCI_PRIVACY, &hdev->dev_flags)) { + int to; + + *own_addr_type = ADDR_LE_DEV_RANDOM; + + if (!test_and_clear_bit(HCI_RPA_EXPIRED, &hdev->dev_flags) && + !bacmp(&hdev->random_addr, &hdev->rpa)) + return 0; + + err = smp_generate_rpa(hdev->tfm_aes, hdev->irk, &hdev->rpa); + if (err < 0) { + BT_ERR("%s failed to generate new RPA", hdev->name); + return err; + } + + set_random_addr(req, &hdev->rpa); + + to = msecs_to_jiffies(hdev->rpa_timeout * 1000); + queue_delayed_work(hdev->workqueue, &hdev->rpa_expired, to); + + return 0; + } + + /* In case of required privacy without resolvable private address, + * use an unresolvable private address. This is useful for active + * scanning and non-connectable advertising. + */ + if (require_privacy) { + bdaddr_t urpa; + + get_random_bytes(&urpa, 6); + urpa.b[5] &= 0x3f; /* Clear two most significant bits */ + + *own_addr_type = ADDR_LE_DEV_RANDOM; + set_random_addr(req, &urpa); + return 0; + } + + /* If forcing static address is in use or there is no public + * address use the static address as random address (but skip + * the HCI command if the current random address is already the + * static one. + */ + if (test_bit(HCI_FORCE_STATIC_ADDR, &hdev->dev_flags) || + !bacmp(&hdev->bdaddr, BDADDR_ANY)) { + *own_addr_type = ADDR_LE_DEV_RANDOM; + if (bacmp(&hdev->static_addr, &hdev->random_addr)) + hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6, + &hdev->static_addr); + return 0; + } + + /* Neither privacy nor static address is being used so use a + * public address. + */ + *own_addr_type = ADDR_LE_DEV_PUBLIC; + + return 0; +} + +/* Copy the Identity Address of the controller. + * + * If the controller has a public BD_ADDR, then by default use that one. + * If this is a LE only controller without a public address, default to + * the static random address. + * + * For debugging purposes it is possible to force controllers with a + * public address to use the static random address instead. 
+ */ +void hci_copy_identity_address(struct hci_dev *hdev, bdaddr_t *bdaddr, + u8 *bdaddr_type) +{ + if (test_bit(HCI_FORCE_STATIC_ADDR, &hdev->dev_flags) || + !bacmp(&hdev->bdaddr, BDADDR_ANY)) { + bacpy(bdaddr, &hdev->static_addr); + *bdaddr_type = ADDR_LE_DEV_RANDOM; + } else { + bacpy(bdaddr, &hdev->bdaddr); + *bdaddr_type = ADDR_LE_DEV_PUBLIC; + } +} + /* Alloc HCI device */ struct hci_dev *hci_alloc_dev(void) { @@ -2867,11 +3843,17 @@ struct hci_dev *hci_alloc_dev(void) hdev->sniff_max_interval = 800; hdev->sniff_min_interval = 80; + hdev->le_adv_channel_map = 0x07; hdev->le_scan_interval = 0x0060; hdev->le_scan_window = 0x0030; hdev->le_conn_min_interval = 0x0028; hdev->le_conn_max_interval = 0x0038; + hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT; + hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT; + hdev->conn_info_min_age = DEFAULT_CONN_INFO_MIN_AGE; + hdev->conn_info_max_age = DEFAULT_CONN_INFO_MAX_AGE; + mutex_init(&hdev->lock); mutex_init(&hdev->req_lock); @@ -2880,7 +3862,11 @@ struct hci_dev *hci_alloc_dev(void) INIT_LIST_HEAD(&hdev->uuids); INIT_LIST_HEAD(&hdev->link_keys); INIT_LIST_HEAD(&hdev->long_term_keys); + INIT_LIST_HEAD(&hdev->identity_resolving_keys); INIT_LIST_HEAD(&hdev->remote_oob_data); + INIT_LIST_HEAD(&hdev->le_white_list); + INIT_LIST_HEAD(&hdev->le_conn_params); + INIT_LIST_HEAD(&hdev->pend_le_conns); INIT_LIST_HEAD(&hdev->conn_hash.list); INIT_WORK(&hdev->rx_work, hci_rx_work); @@ -2965,9 +3951,18 @@ int hci_register_dev(struct hci_dev *hdev) dev_set_name(&hdev->dev, "%s", hdev->name); + hdev->tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(hdev->tfm_aes)) { + BT_ERR("Unable to create crypto context"); + error = PTR_ERR(hdev->tfm_aes); + hdev->tfm_aes = NULL; + goto err_wqueue; + } + error = device_add(&hdev->dev); if (error < 0) - goto err_wqueue; + goto err_tfm; hdev->rfkill = rfkill_alloc(hdev->name, &hdev->dev, RFKILL_TYPE_BLUETOOTH, &hci_rfkill_ops, @@ -3003,6 +3998,8 @@ int hci_register_dev(struct hci_dev *hdev) return id; +err_tfm: + crypto_free_blkcipher(hdev->tfm_aes); err_wqueue: destroy_workqueue(hdev->workqueue); destroy_workqueue(hdev->req_workqueue); @@ -3053,6 +4050,9 @@ void hci_unregister_dev(struct hci_dev *hdev) rfkill_destroy(hdev->rfkill); } + if (hdev->tfm_aes) + crypto_free_blkcipher(hdev->tfm_aes); + device_del(&hdev->dev); debugfs_remove_recursive(hdev->debugfs); @@ -3065,7 +4065,11 @@ void hci_unregister_dev(struct hci_dev *hdev) hci_uuids_clear(hdev); hci_link_keys_clear(hdev); hci_smp_ltks_clear(hdev); + hci_smp_irks_clear(hdev); hci_remote_oob_data_clear(hdev); + hci_white_list_clear(hdev); + hci_conn_params_clear(hdev); + hci_pend_le_conns_clear(hdev); hci_dev_unlock(hdev); hci_dev_put(hdev); @@ -4293,3 +5297,104 @@ static void hci_cmd_work(struct work_struct *work) } } } + +void hci_req_add_le_scan_disable(struct hci_request *req) +{ + struct hci_cp_le_set_scan_enable cp; + + memset(&cp, 0, sizeof(cp)); + cp.enable = LE_SCAN_DISABLE; + hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp); +} + +void hci_req_add_le_passive_scan(struct hci_request *req) +{ + struct hci_cp_le_set_scan_param param_cp; + struct hci_cp_le_set_scan_enable enable_cp; + struct hci_dev *hdev = req->hdev; + u8 own_addr_type; + + /* Set require_privacy to true to avoid identification from + * unknown peer devices. Since this is passive scanning, no + * SCAN_REQ using the local identity should be sent. Mandating + * privacy is just an extra precaution. 
+ */ + if (hci_update_random_address(req, true, &own_addr_type)) + return; + + memset(¶m_cp, 0, sizeof(param_cp)); + param_cp.type = LE_SCAN_PASSIVE; + param_cp.interval = cpu_to_le16(hdev->le_scan_interval); + param_cp.window = cpu_to_le16(hdev->le_scan_window); + param_cp.own_address_type = own_addr_type; + hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp), + ¶m_cp); + + memset(&enable_cp, 0, sizeof(enable_cp)); + enable_cp.enable = LE_SCAN_ENABLE; + enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; + hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp), + &enable_cp); +} + +static void update_background_scan_complete(struct hci_dev *hdev, u8 status) +{ + if (status) + BT_DBG("HCI request failed to update background scanning: " + "status 0x%2.2x", status); +} + +/* This function controls the background scanning based on hdev->pend_le_conns + * list. If there are pending LE connection we start the background scanning, + * otherwise we stop it. + * + * This function requires the caller holds hdev->lock. + */ +void hci_update_background_scan(struct hci_dev *hdev) +{ + struct hci_request req; + struct hci_conn *conn; + int err; + + hci_req_init(&req, hdev); + + if (list_empty(&hdev->pend_le_conns)) { + /* If there is no pending LE connections, we should stop + * the background scanning. + */ + + /* If controller is not scanning we are done. */ + if (!test_bit(HCI_LE_SCAN, &hdev->dev_flags)) + return; + + hci_req_add_le_scan_disable(&req); + + BT_DBG("%s stopping background scanning", hdev->name); + } else { + /* If there is at least one pending LE connection, we should + * keep the background scan running. + */ + + /* If controller is connecting, we should not start scanning + * since some controllers are not able to scan and connect at + * the same time. + */ + conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT); + if (conn) + return; + + /* If controller is currently scanning, we stop it to ensure we + * don't miss any advertising (due to duplicates filter). 
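+	 * (Editorial note: the restart matters because the controller's
+	 * duplicate filter suppresses reports from advertisers it has already
+	 * seen; disabling and re-enabling scanning resets that filter.)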
+ */ + if (test_bit(HCI_LE_SCAN, &hdev->dev_flags)) + hci_req_add_le_scan_disable(&req); + + hci_req_add_le_passive_scan(&req); + + BT_DBG("%s starting background scanning", hdev->name); + } + + err = hci_req_run(&req, update_background_scan_complete); + if (err) + BT_ERR("Failed to run HCI request: err %d", err); +} diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 5935f748c0f..640c54ec1bd 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -45,9 +45,13 @@ static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb) return; clear_bit(HCI_INQUIRY, &hdev->flags); - smp_mb__after_clear_bit(); /* wake_up_bit advises about this barrier */ + smp_mb__after_atomic(); /* wake_up_bit advises about this barrier */ wake_up_bit(&hdev->flags, HCI_INQUIRY); + hci_dev_lock(hdev); + hci_discovery_set_state(hdev, DISCOVERY_STOPPED); + hci_dev_unlock(hdev); + hci_conn_check_pending(hdev); } @@ -199,6 +203,8 @@ static void hci_cc_reset(struct hci_dev *hdev, struct sk_buff *skb) memset(hdev->scan_rsp_data, 0, sizeof(hdev->scan_rsp_data)); hdev->scan_rsp_data_len = 0; + hdev->le_scan_type = LE_SCAN_PASSIVE; + hdev->ssp_debug_mode = 0; } @@ -461,6 +467,34 @@ static void hci_cc_write_ssp_mode(struct hci_dev *hdev, struct sk_buff *skb) } } +static void hci_cc_write_sc_support(struct hci_dev *hdev, struct sk_buff *skb) +{ + u8 status = *((u8 *) skb->data); + struct hci_cp_write_sc_support *sent; + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_SC_SUPPORT); + if (!sent) + return; + + if (!status) { + if (sent->support) + hdev->features[1][0] |= LMP_HOST_SC; + else + hdev->features[1][0] &= ~LMP_HOST_SC; + } + + if (test_bit(HCI_MGMT, &hdev->dev_flags)) + mgmt_sc_enable_complete(hdev, sent->support, status); + else if (!status) { + if (sent->support) + set_bit(HCI_SC_ENABLED, &hdev->dev_flags); + else + clear_bit(HCI_SC_ENABLED, &hdev->dev_flags); + } +} + static void hci_cc_read_local_version(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_rp_read_local_version *rp = (void *) skb->data; @@ -486,7 +520,10 @@ static void hci_cc_read_local_commands(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); - if (!rp->status) + if (rp->status) + return; + + if (test_bit(HCI_SETUP, &hdev->dev_flags)) memcpy(hdev->commands, rp->commands, sizeof(hdev->commands)); } @@ -538,12 +575,6 @@ static void hci_cc_read_local_features(struct hci_dev *hdev, if (hdev->features[0][5] & LMP_EDR_3S_ESCO) hdev->esco_type |= (ESCO_2EV5 | ESCO_3EV5); - - BT_DBG("%s features 0x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x", hdev->name, - hdev->features[0][0], hdev->features[0][1], - hdev->features[0][2], hdev->features[0][3], - hdev->features[0][4], hdev->features[0][5], - hdev->features[0][6], hdev->features[0][7]); } static void hci_cc_read_local_ext_features(struct hci_dev *hdev, @@ -907,16 +938,50 @@ static void hci_cc_user_passkey_neg_reply(struct hci_dev *hdev, hci_dev_unlock(hdev); } -static void hci_cc_read_local_oob_data_reply(struct hci_dev *hdev, - struct sk_buff *skb) +static void hci_cc_read_local_oob_data(struct hci_dev *hdev, + struct sk_buff *skb) { struct hci_rp_read_local_oob_data *rp = (void *) skb->data; BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); hci_dev_lock(hdev); - mgmt_read_local_oob_data_reply_complete(hdev, rp->hash, - rp->randomizer, rp->status); + mgmt_read_local_oob_data_complete(hdev, rp->hash, rp->randomizer, + NULL, NULL, rp->status); + hci_dev_unlock(hdev); +} + +static void 
hci_cc_read_local_oob_ext_data(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_rp_read_local_oob_ext_data *rp = (void *) skb->data; + + BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); + + hci_dev_lock(hdev); + mgmt_read_local_oob_data_complete(hdev, rp->hash192, rp->randomizer192, + rp->hash256, rp->randomizer256, + rp->status); + hci_dev_unlock(hdev); +} + + +static void hci_cc_le_set_random_addr(struct hci_dev *hdev, struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + bdaddr_t *sent; + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + sent = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_RANDOM_ADDR); + if (!sent) + return; + + hci_dev_lock(hdev); + + if (!status) + bacpy(&hdev->random_addr, sent); + hci_dev_unlock(hdev); } @@ -930,18 +995,75 @@ static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb) if (!sent) return; + if (status) + return; + hci_dev_lock(hdev); - if (!status) { - if (*sent) - set_bit(HCI_ADVERTISING, &hdev->dev_flags); - else - clear_bit(HCI_ADVERTISING, &hdev->dev_flags); + /* If we're doing connection initation as peripheral. Set a + * timeout in case something goes wrong. + */ + if (*sent) { + struct hci_conn *conn; + + conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT); + if (conn) + queue_delayed_work(hdev->workqueue, + &conn->le_conn_timeout, + HCI_LE_CONN_TIMEOUT); } + mgmt_advertising(hdev, *sent); + hci_dev_unlock(hdev); } +static void hci_cc_le_set_scan_param(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_cp_le_set_scan_param *cp; + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_SCAN_PARAM); + if (!cp) + return; + + hci_dev_lock(hdev); + + if (!status) + hdev->le_scan_type = cp->type; + + hci_dev_unlock(hdev); +} + +static bool has_pending_adv_report(struct hci_dev *hdev) +{ + struct discovery_state *d = &hdev->discovery; + + return bacmp(&d->last_adv_addr, BDADDR_ANY); +} + +static void clear_pending_adv_report(struct hci_dev *hdev) +{ + struct discovery_state *d = &hdev->discovery; + + bacpy(&d->last_adv_addr, BDADDR_ANY); + d->last_adv_data_len = 0; +} + +static void store_pending_adv_report(struct hci_dev *hdev, bdaddr_t *bdaddr, + u8 bdaddr_type, s8 rssi, u8 *data, u8 len) +{ + struct discovery_state *d = &hdev->discovery; + + bacpy(&d->last_adv_addr, bdaddr); + d->last_adv_addr_type = bdaddr_type; + d->last_adv_rssi = rssi; + memcpy(d->last_adv_data, data, len); + d->last_adv_data_len = len; +} + static void hci_cc_le_set_scan_enable(struct hci_dev *hdev, struct sk_buff *skb) { @@ -960,10 +1082,38 @@ static void hci_cc_le_set_scan_enable(struct hci_dev *hdev, switch (cp->enable) { case LE_SCAN_ENABLE: set_bit(HCI_LE_SCAN, &hdev->dev_flags); + if (hdev->le_scan_type == LE_SCAN_ACTIVE) + clear_pending_adv_report(hdev); break; case LE_SCAN_DISABLE: + /* We do this here instead of when setting DISCOVERY_STOPPED + * since the latter would potentially require waiting for + * inquiry to stop too. + */ + if (has_pending_adv_report(hdev)) { + struct discovery_state *d = &hdev->discovery; + + mgmt_device_found(hdev, &d->last_adv_addr, LE_LINK, + d->last_adv_addr_type, NULL, + d->last_adv_rssi, 0, 1, + d->last_adv_data, + d->last_adv_data_len, NULL, 0); + } + + /* Cancel this timer so that we don't try to disable scanning + * when it's already disabled. 
+ */ + cancel_delayed_work(&hdev->le_scan_disable); + clear_bit(HCI_LE_SCAN, &hdev->dev_flags); + /* The HCI_LE_SCAN_INTERRUPTED flag indicates that we + * interrupted scanning due to a connect request. Mark + * therefore discovery as stopped. + */ + if (test_and_clear_bit(HCI_LE_SCAN_INTERRUPTED, + &hdev->dev_flags)) + hci_discovery_set_state(hdev, DISCOVERY_STOPPED); break; default: @@ -983,6 +1133,49 @@ static void hci_cc_le_read_white_list_size(struct hci_dev *hdev, hdev->le_white_list_size = rp->size; } +static void hci_cc_le_clear_white_list(struct hci_dev *hdev, + struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (!status) + hci_white_list_clear(hdev); +} + +static void hci_cc_le_add_to_white_list(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_cp_le_add_to_white_list *sent; + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_WHITE_LIST); + if (!sent) + return; + + if (!status) + hci_white_list_add(hdev, &sent->bdaddr, sent->bdaddr_type); +} + +static void hci_cc_le_del_from_white_list(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_cp_le_del_from_white_list *sent; + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + sent = hci_sent_cmd_data(hdev, HCI_OP_LE_DEL_FROM_WHITE_LIST); + if (!sent) + return; + + if (!status) + hci_white_list_del(hdev, &sent->bdaddr, sent->bdaddr_type); +} + static void hci_cc_le_read_supported_states(struct hci_dev *hdev, struct sk_buff *skb) { @@ -1023,6 +1216,25 @@ static void hci_cc_write_le_host_supported(struct hci_dev *hdev, } } +static void hci_cc_set_adv_param(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_cp_le_set_adv_param *cp; + u8 status = *((u8 *) skb->data); + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADV_PARAM); + if (!cp) + return; + + hci_dev_lock(hdev); + hdev->adv_addr_type = cp->own_address_type; + hci_dev_unlock(hdev); +} + static void hci_cc_write_remote_amp_assoc(struct hci_dev *hdev, struct sk_buff *skb) { @@ -1037,6 +1249,59 @@ static void hci_cc_write_remote_amp_assoc(struct hci_dev *hdev, amp_write_rem_assoc_continue(hdev, rp->phy_handle); } +static void hci_cc_read_rssi(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_read_rssi *rp = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); + + if (rp->status) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle)); + if (conn) + conn->rssi = rp->rssi; + + hci_dev_unlock(hdev); +} + +static void hci_cc_read_tx_power(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_cp_read_tx_power *sent; + struct hci_rp_read_tx_power *rp = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); + + if (rp->status) + return; + + sent = hci_sent_cmd_data(hdev, HCI_OP_READ_TX_POWER); + if (!sent) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle)); + if (!conn) + goto unlock; + + switch (sent->type) { + case 0x00: + conn->tx_power = rp->tx_power; + break; + case 0x01: + conn->max_tx_power = rp->tx_power; + break; + } + +unlock: + hci_dev_unlock(hdev); +} + static void hci_cs_inquiry(struct hci_dev *hdev, __u8 status) { BT_DBG("%s status 0x%2.2x", hdev->name, 
status); @@ -1188,9 +1453,13 @@ static int hci_outgoing_auth_needed(struct hci_dev *hdev, return 0; /* Only request authentication for SSP connections or non-SSP - * devices with sec_level HIGH or if MITM protection is requested */ + * devices with sec_level MEDIUM or HIGH or if MITM protection + * is requested. + */ if (!hci_conn_ssp_enabled(conn) && !(conn->auth_type & 0x01) && - conn->pending_sec_level != BT_SECURITY_HIGH) + conn->pending_sec_level != BT_SECURITY_FIPS && + conn->pending_sec_level != BT_SECURITY_HIGH && + conn->pending_sec_level != BT_SECURITY_MEDIUM) return 0; return 1; @@ -1521,6 +1790,87 @@ static void hci_cs_accept_phylink(struct hci_dev *hdev, u8 status) amp_write_remote_assoc(hdev, cp->phy_handle); } +static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status) +{ + struct hci_cp_le_create_conn *cp; + struct hci_conn *conn; + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + /* All connection failure handling is taken care of by the + * hci_le_conn_failed function which is triggered by the HCI + * request completion callbacks used for connecting. + */ + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_CREATE_CONN); + if (!cp) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &cp->peer_addr); + if (!conn) + goto unlock; + + /* Store the initiator and responder address information which + * is needed for SMP. These values will not change during the + * lifetime of the connection. + */ + conn->init_addr_type = cp->own_address_type; + if (cp->own_address_type == ADDR_LE_DEV_RANDOM) + bacpy(&conn->init_addr, &hdev->random_addr); + else + bacpy(&conn->init_addr, &hdev->bdaddr); + + conn->resp_addr_type = cp->peer_addr_type; + bacpy(&conn->resp_addr, &cp->peer_addr); + + /* We don't want the connection attempt to stick around + * indefinitely since LE doesn't have a page timeout concept + * like BR/EDR. Set a timer for any connection that doesn't use + * the white list for connecting. 
+ */ + if (cp->filter_policy == HCI_LE_USE_PEER_ADDR) + queue_delayed_work(conn->hdev->workqueue, + &conn->le_conn_timeout, + HCI_LE_CONN_TIMEOUT); + +unlock: + hci_dev_unlock(hdev); +} + +static void hci_cs_le_start_enc(struct hci_dev *hdev, u8 status) +{ + struct hci_cp_le_start_enc *cp; + struct hci_conn *conn; + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (!status) + return; + + hci_dev_lock(hdev); + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_START_ENC); + if (!cp) + goto unlock; + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); + if (!conn) + goto unlock; + + if (conn->state != BT_CONNECTED) + goto unlock; + + hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE); + hci_conn_drop(conn); + +unlock: + hci_dev_unlock(hdev); +} + static void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { __u8 status = *((__u8 *) skb->data); @@ -1534,7 +1884,7 @@ static void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) if (!test_and_clear_bit(HCI_INQUIRY, &hdev->flags)) return; - smp_mb__after_clear_bit(); /* wake_up_bit advises about this barrier */ + smp_mb__after_atomic(); /* wake_up_bit advises about this barrier */ wake_up_bit(&hdev->flags, HCI_INQUIRY); if (!test_bit(HCI_MGMT, &hdev->dev_flags)) @@ -1593,7 +1943,7 @@ static void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb) name_known = hci_inquiry_cache_update(hdev, &data, false, &ssp); mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00, info->dev_class, 0, !name_known, ssp, NULL, - 0); + 0, NULL, 0); } hci_dev_unlock(hdev); @@ -1662,7 +2012,7 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) } else { conn->state = BT_CLOSED; if (conn->type == ACL_LINK) - mgmt_connect_failed(hdev, &ev->bdaddr, conn->type, + mgmt_connect_failed(hdev, &conn->dst, conn->type, conn->dst_type, ev->status); } @@ -1741,9 +2091,9 @@ static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb) bacpy(&cp.bdaddr, &ev->bdaddr); cp.pkt_type = cpu_to_le16(conn->pkt_type); - cp.tx_bandwidth = __constant_cpu_to_le32(0x00001f40); - cp.rx_bandwidth = __constant_cpu_to_le32(0x00001f40); - cp.max_latency = __constant_cpu_to_le16(0xffff); + cp.tx_bandwidth = cpu_to_le32(0x00001f40); + cp.rx_bandwidth = cpu_to_le32(0x00001f40); + cp.max_latency = cpu_to_le16(0xffff); cp.content_format = cpu_to_le16(hdev->voice_setting); cp.retrans_effort = 0xff; @@ -1782,7 +2132,11 @@ static u8 hci_to_mgmt_reason(u8 err) static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_disconn_complete *ev = (void *) skb->data; + u8 reason = hci_to_mgmt_reason(ev->reason); + struct hci_conn_params *params; struct hci_conn *conn; + bool mgmt_connected; + u8 type; BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); @@ -1792,43 +2146,55 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) if (!conn) goto unlock; - if (ev->status == 0) - conn->state = BT_CLOSED; + if (ev->status) { + mgmt_disconnect_failed(hdev, &conn->dst, conn->type, + conn->dst_type, ev->status); + goto unlock; + } - if (test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags) && - (conn->type == ACL_LINK || conn->type == LE_LINK)) { - if (ev->status) { - mgmt_disconnect_failed(hdev, &conn->dst, conn->type, - conn->dst_type, ev->status); - } else { - u8 reason = hci_to_mgmt_reason(ev->reason); + conn->state = BT_CLOSED; + + mgmt_connected = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags); + mgmt_device_disconnected(hdev, 
&conn->dst, conn->type, conn->dst_type, + reason, mgmt_connected); + + if (conn->type == ACL_LINK && conn->flush_key) + hci_remove_link_key(hdev, &conn->dst); + + params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); + if (params) { + switch (params->auto_connect) { + case HCI_AUTO_CONN_LINK_LOSS: + if (ev->reason != HCI_ERROR_CONNECTION_TIMEOUT) + break; + /* Fall through */ - mgmt_device_disconnected(hdev, &conn->dst, conn->type, - conn->dst_type, reason); + case HCI_AUTO_CONN_ALWAYS: + hci_pend_le_conn_add(hdev, &conn->dst, conn->dst_type); + break; + + default: + break; } } - if (ev->status == 0) { - u8 type = conn->type; + type = conn->type; - if (type == ACL_LINK && conn->flush_key) - hci_remove_link_key(hdev, &conn->dst); - hci_proto_disconn_cfm(conn, ev->reason); - hci_conn_del(conn); + hci_proto_disconn_cfm(conn, ev->reason); + hci_conn_del(conn); - /* Re-enable advertising if necessary, since it might - * have been disabled by the connection. From the - * HCI_LE_Set_Advertise_Enable command description in - * the core specification (v4.0): - * "The Controller shall continue advertising until the Host - * issues an LE_Set_Advertise_Enable command with - * Advertising_Enable set to 0x00 (Advertising is disabled) - * or until a connection is created or until the Advertising - * is timed out due to Directed Advertising." - */ - if (type == LE_LINK) - mgmt_reenable_advertising(hdev); - } + /* Re-enable advertising if necessary, since it might + * have been disabled by the connection. From the + * HCI_LE_Set_Advertise_Enable command description in + * the core specification (v4.0): + * "The Controller shall continue advertising until the Host + * issues an LE_Set_Advertise_Enable command with + * Advertising_Enable set to 0x00 (Advertising is disabled) + * or until a connection is created or until the Advertising + * is timed out due to Directed Advertising." 
+ */ + if (type == LE_LINK) + mgmt_reenable_advertising(hdev); unlock: hci_dev_unlock(hdev); @@ -1949,34 +2315,57 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_lock(hdev); conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); - if (conn) { - if (!ev->status) { - if (ev->encrypt) { - /* Encryption implies authentication */ - conn->link_mode |= HCI_LM_AUTH; - conn->link_mode |= HCI_LM_ENCRYPT; - conn->sec_level = conn->pending_sec_level; - } else - conn->link_mode &= ~HCI_LM_ENCRYPT; + if (!conn) + goto unlock; + + if (!ev->status) { + if (ev->encrypt) { + /* Encryption implies authentication */ + conn->link_mode |= HCI_LM_AUTH; + conn->link_mode |= HCI_LM_ENCRYPT; + conn->sec_level = conn->pending_sec_level; + + /* P-256 authentication key implies FIPS */ + if (conn->key_type == HCI_LK_AUTH_COMBINATION_P256) + conn->link_mode |= HCI_LM_FIPS; + + if ((conn->type == ACL_LINK && ev->encrypt == 0x02) || + conn->type == LE_LINK) + set_bit(HCI_CONN_AES_CCM, &conn->flags); + } else { + conn->link_mode &= ~HCI_LM_ENCRYPT; + clear_bit(HCI_CONN_AES_CCM, &conn->flags); } + } - clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); + clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); - if (ev->status && conn->state == BT_CONNECTED) { - hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE); + if (ev->status && conn->state == BT_CONNECTED) { + hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE); + hci_conn_drop(conn); + goto unlock; + } + + if (conn->state == BT_CONFIG) { + if (!ev->status) + conn->state = BT_CONNECTED; + + /* In Secure Connections Only mode, do not allow any + * connections that are not encrypted with AES-CCM + * using a P-256 authenticated combination key. + */ + if (test_bit(HCI_SC_ONLY, &hdev->dev_flags) && + (!test_bit(HCI_CONN_AES_CCM, &conn->flags) || + conn->key_type != HCI_LK_AUTH_COMBINATION_P256)) { + hci_proto_connect_cfm(conn, HCI_ERROR_AUTH_FAILURE); hci_conn_drop(conn); goto unlock; } - if (conn->state == BT_CONFIG) { - if (!ev->status) - conn->state = BT_CONNECTED; - - hci_proto_connect_cfm(conn, ev->status); - hci_conn_drop(conn); - } else - hci_encrypt_cfm(conn, ev->status, ev->encrypt); - } + hci_proto_connect_cfm(conn, ev->status); + hci_conn_drop(conn); + } else + hci_encrypt_cfm(conn, ev->status, ev->encrypt); unlock: hci_dev_unlock(hdev); @@ -2150,6 +2539,10 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_cc_write_ssp_mode(hdev, skb); break; + case HCI_OP_WRITE_SC_SUPPORT: + hci_cc_write_sc_support(hdev, skb); + break; + case HCI_OP_READ_LOCAL_VERSION: hci_cc_read_local_version(hdev, skb); break; @@ -2219,7 +2612,11 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) break; case HCI_OP_READ_LOCAL_OOB_DATA: - hci_cc_read_local_oob_data_reply(hdev, skb); + hci_cc_read_local_oob_data(hdev, skb); + break; + + case HCI_OP_READ_LOCAL_OOB_EXT_DATA: + hci_cc_read_local_oob_ext_data(hdev, skb); break; case HCI_OP_LE_READ_BUFFER_SIZE: @@ -2250,10 +2647,18 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_cc_user_passkey_neg_reply(hdev, skb); break; + case HCI_OP_LE_SET_RANDOM_ADDR: + hci_cc_le_set_random_addr(hdev, skb); + break; + case HCI_OP_LE_SET_ADV_ENABLE: hci_cc_le_set_adv_enable(hdev, skb); break; + case HCI_OP_LE_SET_SCAN_PARAM: + hci_cc_le_set_scan_param(hdev, skb); + break; + case HCI_OP_LE_SET_SCAN_ENABLE: hci_cc_le_set_scan_enable(hdev, skb); break; @@ -2262,6 +2667,18 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct 
sk_buff *skb) hci_cc_le_read_white_list_size(hdev, skb); break; + case HCI_OP_LE_CLEAR_WHITE_LIST: + hci_cc_le_clear_white_list(hdev, skb); + break; + + case HCI_OP_LE_ADD_TO_WHITE_LIST: + hci_cc_le_add_to_white_list(hdev, skb); + break; + + case HCI_OP_LE_DEL_FROM_WHITE_LIST: + hci_cc_le_del_from_white_list(hdev, skb); + break; + case HCI_OP_LE_READ_SUPPORTED_STATES: hci_cc_le_read_supported_states(hdev, skb); break; @@ -2270,10 +2687,22 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_cc_write_le_host_supported(hdev, skb); break; + case HCI_OP_LE_SET_ADV_PARAM: + hci_cc_set_adv_param(hdev, skb); + break; + case HCI_OP_WRITE_REMOTE_AMP_ASSOC: hci_cc_write_remote_amp_assoc(hdev, skb); break; + case HCI_OP_READ_RSSI: + hci_cc_read_rssi(hdev, skb); + break; + + case HCI_OP_READ_TX_POWER: + hci_cc_read_tx_power(hdev, skb); + break; + default: BT_DBG("%s opcode 0x%4.4x", hdev->name, opcode); break; @@ -2357,6 +2786,14 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_cs_accept_phylink(hdev, ev->status); break; + case HCI_OP_LE_CREATE_CONN: + hci_cs_le_create_conn(hdev, ev->status); + break; + + case HCI_OP_LE_START_ENC: + hci_cs_le_start_enc(hdev, ev->status); + break; + default: BT_DBG("%s opcode 0x%4.4x", hdev->name, opcode); break; @@ -2636,14 +3073,16 @@ static void hci_link_key_request_evt(struct hci_dev *hdev, struct sk_buff *skb) conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); if (conn) { - if (key->type == HCI_LK_UNAUTH_COMBINATION && + if ((key->type == HCI_LK_UNAUTH_COMBINATION_P192 || + key->type == HCI_LK_UNAUTH_COMBINATION_P256) && conn->auth_type != 0xff && (conn->auth_type & 0x01)) { BT_DBG("%s ignoring unauthenticated key", hdev->name); goto not_found; } if (key->type == HCI_LK_COMBINATION && key->pin_len < 16 && - conn->pending_sec_level == BT_SECURITY_HIGH) { + (conn->pending_sec_level == BT_SECURITY_HIGH || + conn->pending_sec_level == BT_SECURITY_FIPS)) { BT_DBG("%s ignoring key unauthenticated for high security", hdev->name); goto not_found; @@ -2788,7 +3227,7 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, false, &ssp); mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00, info->dev_class, info->rssi, - !name_known, ssp, NULL, 0); + !name_known, ssp, NULL, 0, NULL, 0); } } else { struct inquiry_info_with_rssi *info = (void *) (skb->data + 1); @@ -2806,7 +3245,7 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, false, &ssp); mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00, info->dev_class, info->rssi, - !name_known, ssp, NULL, 0); + !name_known, ssp, NULL, 0, NULL, 0); } } @@ -2850,6 +3289,9 @@ static void hci_remote_ext_features_evt(struct hci_dev *hdev, * features do not indicate SSP support */ clear_bit(HCI_CONN_SSP_ENABLED, &conn->flags); } + + if (ev->features[0] & LMP_HOST_SC) + set_bit(HCI_CONN_SC_ENABLED, &conn->flags); } if (conn->state != BT_CONFIG) @@ -2911,6 +3353,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, case 0x1c: /* SCO interval rejected */ case 0x1a: /* Unsupported Remote Feature */ case 0x1f: /* Unspecified error */ + case 0x20: /* Unsupported LMP Parameter value */ if (conn->out) { conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) | (hdev->esco_type & EDR_ESCO_MASK); @@ -2991,7 +3434,7 @@ static void hci_extended_inquiry_result_evt(struct hci_dev *hdev, eir_len = eir_get_length(info->data, sizeof(info->data)); mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00, info->dev_class, info->rssi, !name_known, 
- ssp, info->data, eir_len); + ssp, info->data, eir_len, NULL, 0); } hci_dev_unlock(hdev); @@ -3012,6 +3455,12 @@ static void hci_key_refresh_complete_evt(struct hci_dev *hdev, if (!conn) goto unlock; + /* For BR/EDR the necessary steps are taken through the + * auth_complete event. + */ + if (conn->type != LE_LINK) + goto unlock; + if (!ev->status) conn->sec_level = conn->pending_sec_level; @@ -3043,24 +3492,20 @@ unlock: static u8 hci_get_auth_req(struct hci_conn *conn) { - /* If remote requests dedicated bonding follow that lead */ - if (conn->remote_auth == HCI_AT_DEDICATED_BONDING || - conn->remote_auth == HCI_AT_DEDICATED_BONDING_MITM) { - /* If both remote and local IO capabilities allow MITM - * protection then require it, otherwise don't */ - if (conn->remote_cap == HCI_IO_NO_INPUT_OUTPUT || - conn->io_capability == HCI_IO_NO_INPUT_OUTPUT) - return HCI_AT_DEDICATED_BONDING; - else - return HCI_AT_DEDICATED_BONDING_MITM; - } - /* If remote requests no-bonding follow that lead */ if (conn->remote_auth == HCI_AT_NO_BONDING || conn->remote_auth == HCI_AT_NO_BONDING_MITM) return conn->remote_auth | (conn->auth_type & 0x01); - return conn->auth_type; + /* If both remote and local have enough IO capabilities, require + * MITM protection + */ + if (conn->remote_cap != HCI_IO_NO_INPUT_OUTPUT && + conn->io_capability != HCI_IO_NO_INPUT_OUTPUT) + return conn->remote_auth | 0x01; + + /* No MITM protection possible so ignore remote requirement */ + return (conn->remote_auth & ~0x01) | (conn->auth_type & 0x01); } static void hci_io_capa_request_evt(struct hci_dev *hdev, struct sk_buff *skb) @@ -3090,8 +3535,25 @@ static void hci_io_capa_request_evt(struct hci_dev *hdev, struct sk_buff *skb) * to DisplayYesNo as it is not supported by BT spec. */ cp.capability = (conn->io_capability == 0x04) ? HCI_IO_DISPLAY_YESNO : conn->io_capability; - conn->auth_type = hci_get_auth_req(conn); - cp.authentication = conn->auth_type; + + /* If we are initiators, there is no remote information yet */ + if (conn->remote_auth == 0xff) { + cp.authentication = conn->auth_type; + + /* Request MITM protection if our IO caps allow it + * except for the no-bonding case. + * conn->auth_type is not updated here since + * that might cause the user confirmation to be + * rejected in case the remote doesn't have the + * IO capabilities for MITM. + */ + if (conn->io_capability != HCI_IO_NO_INPUT_OUTPUT && + cp.authentication != HCI_AT_NO_BONDING) + cp.authentication |= 0x01; + } else { + conn->auth_type = hci_get_auth_req(conn); + cp.authentication = conn->auth_type; + } if (hci_find_remote_oob_data(hdev, &conn->dst) && (conn->out || test_bit(HCI_CONN_REMOTE_OOB, &conn->flags))) @@ -3159,12 +3621,9 @@ static void hci_user_confirm_request_evt(struct hci_dev *hdev, rem_mitm = (conn->remote_auth & 0x01); /* If we require MITM but the remote device can't provide that - * (it has NoInputNoOutput) then reject the confirmation - * request. The only exception is when we're dedicated bonding - * initiators (connect_cfm_cb set) since then we always have the MITM - * bit set. 
*/ - if (!conn->connect_cfm_cb && loc_mitm && - conn->remote_cap == HCI_IO_NO_INPUT_OUTPUT) { + * (it has NoInputNoOutput) then reject the confirmation request + */ + if (loc_mitm && conn->remote_cap == HCI_IO_NO_INPUT_OUTPUT) { BT_DBG("Rejecting request: remote device can't provide MITM"); hci_send_cmd(hdev, HCI_OP_USER_CONFIRM_NEG_REPLY, sizeof(ev->bdaddr), &ev->bdaddr); @@ -3177,8 +3636,11 @@ static void hci_user_confirm_request_evt(struct hci_dev *hdev, /* If we're not the initiators request authorization to * proceed from user space (mgmt_user_confirm with - * confirm_hint set to 1). */ - if (!test_bit(HCI_CONN_AUTH_PEND, &conn->flags)) { + * confirm_hint set to 1). The exception is if neither + * side had MITM in which case we do auto-accept. + */ + if (!test_bit(HCI_CONN_AUTH_PEND, &conn->flags) && + (loc_mitm || rem_mitm)) { BT_DBG("Confirming auto-accept as acceptor"); confirm_hint = 1; goto confirm; @@ -3200,8 +3662,8 @@ static void hci_user_confirm_request_evt(struct hci_dev *hdev, } confirm: - mgmt_user_confirm_request(hdev, &ev->bdaddr, ACL_LINK, 0, ev->passkey, - confirm_hint); + mgmt_user_confirm_request(hdev, &ev->bdaddr, ACL_LINK, 0, + le32_to_cpu(ev->passkey), confirm_hint); unlock: hci_dev_unlock(hdev); @@ -3343,20 +3805,36 @@ static void hci_remote_oob_data_request_evt(struct hci_dev *hdev, data = hci_find_remote_oob_data(hdev, &ev->bdaddr); if (data) { - struct hci_cp_remote_oob_data_reply cp; + if (test_bit(HCI_SC_ENABLED, &hdev->dev_flags)) { + struct hci_cp_remote_oob_ext_data_reply cp; - bacpy(&cp.bdaddr, &ev->bdaddr); - memcpy(cp.hash, data->hash, sizeof(cp.hash)); - memcpy(cp.randomizer, data->randomizer, sizeof(cp.randomizer)); + bacpy(&cp.bdaddr, &ev->bdaddr); + memcpy(cp.hash192, data->hash192, sizeof(cp.hash192)); + memcpy(cp.randomizer192, data->randomizer192, + sizeof(cp.randomizer192)); + memcpy(cp.hash256, data->hash256, sizeof(cp.hash256)); + memcpy(cp.randomizer256, data->randomizer256, + sizeof(cp.randomizer256)); + + hci_send_cmd(hdev, HCI_OP_REMOTE_OOB_EXT_DATA_REPLY, + sizeof(cp), &cp); + } else { + struct hci_cp_remote_oob_data_reply cp; + + bacpy(&cp.bdaddr, &ev->bdaddr); + memcpy(cp.hash, data->hash192, sizeof(cp.hash)); + memcpy(cp.randomizer, data->randomizer192, + sizeof(cp.randomizer)); - hci_send_cmd(hdev, HCI_OP_REMOTE_OOB_DATA_REPLY, sizeof(cp), - &cp); + hci_send_cmd(hdev, HCI_OP_REMOTE_OOB_DATA_REPLY, + sizeof(cp), &cp); + } } else { struct hci_cp_remote_oob_data_neg_reply cp; bacpy(&cp.bdaddr, &ev->bdaddr); - hci_send_cmd(hdev, HCI_OP_REMOTE_OOB_DATA_NEG_REPLY, sizeof(cp), - &cp); + hci_send_cmd(hdev, HCI_OP_REMOTE_OOB_DATA_NEG_REPLY, + sizeof(cp), &cp); } unlock: @@ -3490,6 +3968,7 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_le_conn_complete *ev = (void *) skb->data; struct hci_conn *conn; + struct smp_irk *irk; BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); @@ -3505,63 +3984,223 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) conn->dst_type = ev->bdaddr_type; - /* The advertising parameters for own address type - * define which source address and source address - * type this connections has. 
- */ - if (bacmp(&conn->src, BDADDR_ANY)) { - conn->src_type = ADDR_LE_DEV_PUBLIC; - } else { - bacpy(&conn->src, &hdev->static_addr); - conn->src_type = ADDR_LE_DEV_RANDOM; - } - if (ev->role == LE_CONN_ROLE_MASTER) { conn->out = true; conn->link_mode |= HCI_LM_MASTER; } + + /* If we didn't have a hci_conn object previously + * but we're in master role this must be something + * initiated using a white list. Since white list based + * connections are not "first class citizens" we don't + * have full tracking of them. Therefore, we go ahead + * with a "best effort" approach of determining the + * initiator address based on the HCI_PRIVACY flag. + */ + if (conn->out) { + conn->resp_addr_type = ev->bdaddr_type; + bacpy(&conn->resp_addr, &ev->bdaddr); + if (test_bit(HCI_PRIVACY, &hdev->dev_flags)) { + conn->init_addr_type = ADDR_LE_DEV_RANDOM; + bacpy(&conn->init_addr, &hdev->rpa); + } else { + hci_copy_identity_address(hdev, + &conn->init_addr, + &conn->init_addr_type); + } + } + } else { + cancel_delayed_work(&conn->le_conn_timeout); + } + + if (!conn->out) { + /* Set the responder (our side) address type based on + * the advertising address type. + */ + conn->resp_addr_type = hdev->adv_addr_type; + if (hdev->adv_addr_type == ADDR_LE_DEV_RANDOM) + bacpy(&conn->resp_addr, &hdev->random_addr); + else + bacpy(&conn->resp_addr, &hdev->bdaddr); + + conn->init_addr_type = ev->bdaddr_type; + bacpy(&conn->init_addr, &ev->bdaddr); + } + + /* Lookup the identity address from the stored connection + * address and address type. + * + * When establishing connections to an identity address, the + * connection procedure will store the resolvable random + * address first. Now if it can be converted back into the + * identity address, start using the identity address from + * now on. + */ + irk = hci_get_irk(hdev, &conn->dst, conn->dst_type); + if (irk) { + bacpy(&conn->dst, &irk->bdaddr); + conn->dst_type = irk->addr_type; } if (ev->status) { - mgmt_connect_failed(hdev, &conn->dst, conn->type, - conn->dst_type, ev->status); - hci_proto_connect_cfm(conn, ev->status); - conn->state = BT_CLOSED; - hci_conn_del(conn); + hci_le_conn_failed(conn, ev->status); goto unlock; } if (!test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) - mgmt_device_connected(hdev, &ev->bdaddr, conn->type, + mgmt_device_connected(hdev, &conn->dst, conn->type, conn->dst_type, 0, NULL, 0, NULL); conn->sec_level = BT_SECURITY_LOW; conn->handle = __le16_to_cpu(ev->handle); conn->state = BT_CONNECTED; + if (test_bit(HCI_6LOWPAN_ENABLED, &hdev->dev_flags)) + set_bit(HCI_CONN_6LOWPAN, &conn->flags); + hci_conn_add_sysfs(conn); hci_proto_connect_cfm(conn, ev->status); + hci_pend_le_conn_del(hdev, &conn->dst, conn->dst_type); + unlock: hci_dev_unlock(hdev); } +/* This function requires the caller holds hdev->lock */ +static void check_pending_le_conn(struct hci_dev *hdev, bdaddr_t *addr, + u8 addr_type) +{ + struct hci_conn *conn; + struct smp_irk *irk; + + /* If this is a resolvable address, we should resolve it and then + * update address and address type variables. + */ + irk = hci_get_irk(hdev, addr, addr_type); + if (irk) { + addr = &irk->bdaddr; + addr_type = irk->addr_type; + } + + if (!hci_pend_le_conn_lookup(hdev, addr, addr_type)) + return; + + conn = hci_connect_le(hdev, addr, addr_type, BT_SECURITY_LOW, + HCI_AT_NO_BONDING); + if (!IS_ERR(conn)) + return; + + switch (PTR_ERR(conn)) { + case -EBUSY: + /* If hci_connect() returns -EBUSY it means there is already + * an LE connection attempt going on. 
Since controllers don't + * support more than one connection attempt at the time, we + * don't consider this an error case. + */ + break; + default: + BT_DBG("Failed to connect: err %ld", PTR_ERR(conn)); + } +} + +static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, + u8 bdaddr_type, s8 rssi, u8 *data, u8 len) +{ + struct discovery_state *d = &hdev->discovery; + bool match; + + /* Passive scanning shouldn't trigger any device found events */ + if (hdev->le_scan_type == LE_SCAN_PASSIVE) { + if (type == LE_ADV_IND || type == LE_ADV_DIRECT_IND) + check_pending_le_conn(hdev, bdaddr, bdaddr_type); + return; + } + + /* If there's nothing pending either store the data from this + * event or send an immediate device found event if the data + * should not be stored for later. + */ + if (!has_pending_adv_report(hdev)) { + /* If the report will trigger a SCAN_REQ store it for + * later merging. + */ + if (type == LE_ADV_IND || type == LE_ADV_SCAN_IND) { + store_pending_adv_report(hdev, bdaddr, bdaddr_type, + rssi, data, len); + return; + } + + mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL, + rssi, 0, 1, data, len, NULL, 0); + return; + } + + /* Check if the pending report is for the same device as the new one */ + match = (!bacmp(bdaddr, &d->last_adv_addr) && + bdaddr_type == d->last_adv_addr_type); + + /* If the pending data doesn't match this report or this isn't a + * scan response (e.g. we got a duplicate ADV_IND) then force + * sending of the pending data. + */ + if (type != LE_ADV_SCAN_RSP || !match) { + /* Send out whatever is in the cache, but skip duplicates */ + if (!match) + mgmt_device_found(hdev, &d->last_adv_addr, LE_LINK, + d->last_adv_addr_type, NULL, + d->last_adv_rssi, 0, 1, + d->last_adv_data, + d->last_adv_data_len, NULL, 0); + + /* If the new report will trigger a SCAN_REQ store it for + * later merging. + */ + if (type == LE_ADV_IND || type == LE_ADV_SCAN_IND) { + store_pending_adv_report(hdev, bdaddr, bdaddr_type, + rssi, data, len); + return; + } + + /* The advertising reports cannot be merged, so clear + * the pending report and send out a device found event. + */ + clear_pending_adv_report(hdev); + mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL, + rssi, 0, 1, data, len, NULL, 0); + return; + } + + /* If we get here we've got a pending ADV_IND or ADV_SCAN_IND and + * the new event is a SCAN_RSP. We can therefore proceed with + * sending a merged device found event. 
+ */ + mgmt_device_found(hdev, &d->last_adv_addr, LE_LINK, + d->last_adv_addr_type, NULL, rssi, 0, 1, data, len, + d->last_adv_data, d->last_adv_data_len); + clear_pending_adv_report(hdev); +} + static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) { u8 num_reports = skb->data[0]; void *ptr = &skb->data[1]; - s8 rssi; + + hci_dev_lock(hdev); while (num_reports--) { struct hci_ev_le_advertising_info *ev = ptr; + s8 rssi; rssi = ev->data[ev->length]; - mgmt_device_found(hdev, &ev->bdaddr, LE_LINK, ev->bdaddr_type, - NULL, rssi, 0, 1, ev->data, ev->length); + process_adv_report(hdev, ev->evt_type, &ev->bdaddr, + ev->bdaddr_type, rssi, ev->data, ev->length); ptr += sizeof(*ev) + ev->length + 1; } + + hci_dev_unlock(hdev); } static void hci_le_ltk_request_evt(struct hci_dev *hdev, struct sk_buff *skb) @@ -3580,7 +4219,7 @@ static void hci_le_ltk_request_evt(struct hci_dev *hdev, struct sk_buff *skb) if (conn == NULL) goto not_found; - ltk = hci_find_ltk(hdev, ev->ediv, ev->random); + ltk = hci_find_ltk(hdev, ev->ediv, ev->rand, conn->out); if (ltk == NULL) goto not_found; @@ -3596,7 +4235,13 @@ static void hci_le_ltk_request_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_send_cmd(hdev, HCI_OP_LE_LTK_REPLY, sizeof(cp), &cp); - if (ltk->type & HCI_SMP_STK) { + /* Ref. Bluetooth Core SPEC pages 1975 and 2004. STK is a + * temporary key used to encrypt a connection following + * pairing. It is used during the Encrypted Session Setup to + * distribute the keys. Later, security can be re-established + * using a distributed LTK. + */ + if (ltk->type == HCI_SMP_STK_SLAVE) { list_del(&ltk->list); kfree(ltk); } diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 6a6c8bb4fd7..80d25c150a6 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -143,7 +143,7 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) if (!skb_copy) { /* Create a private copy with headroom */ - skb_copy = __pskb_copy(skb, 1, GFP_ATOMIC); + skb_copy = __pskb_copy_fclone(skb, 1, GFP_ATOMIC, true); if (!skb_copy) continue; @@ -211,22 +211,22 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) switch (bt_cb(skb)->pkt_type) { case HCI_COMMAND_PKT: - opcode = __constant_cpu_to_le16(HCI_MON_COMMAND_PKT); + opcode = cpu_to_le16(HCI_MON_COMMAND_PKT); break; case HCI_EVENT_PKT: - opcode = __constant_cpu_to_le16(HCI_MON_EVENT_PKT); + opcode = cpu_to_le16(HCI_MON_EVENT_PKT); break; case HCI_ACLDATA_PKT: if (bt_cb(skb)->incoming) - opcode = __constant_cpu_to_le16(HCI_MON_ACL_RX_PKT); + opcode = cpu_to_le16(HCI_MON_ACL_RX_PKT); else - opcode = __constant_cpu_to_le16(HCI_MON_ACL_TX_PKT); + opcode = cpu_to_le16(HCI_MON_ACL_TX_PKT); break; case HCI_SCODATA_PKT: if (bt_cb(skb)->incoming) - opcode = __constant_cpu_to_le16(HCI_MON_SCO_RX_PKT); + opcode = cpu_to_le16(HCI_MON_SCO_RX_PKT); else - opcode = __constant_cpu_to_le16(HCI_MON_SCO_TX_PKT); + opcode = cpu_to_le16(HCI_MON_SCO_TX_PKT); break; default: return; @@ -247,8 +247,8 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) struct hci_mon_hdr *hdr; /* Create a private copy with headroom */ - skb_copy = __pskb_copy(skb, HCI_MON_HDR_SIZE, - GFP_ATOMIC); + skb_copy = __pskb_copy_fclone(skb, HCI_MON_HDR_SIZE, + GFP_ATOMIC, true); if (!skb_copy) continue; @@ -319,7 +319,7 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) bacpy(&ni->bdaddr, &hdev->bdaddr); memcpy(ni->name, hdev->name, 8); - opcode = __constant_cpu_to_le16(HCI_MON_NEW_INDEX); + opcode = 
cpu_to_le16(HCI_MON_NEW_INDEX); break; case HCI_DEV_UNREG: @@ -327,7 +327,7 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) if (!skb) return NULL; - opcode = __constant_cpu_to_le16(HCI_MON_DEL_INDEX); + opcode = cpu_to_le16(HCI_MON_DEL_INDEX); break; default: @@ -524,16 +524,7 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, case HCISETRAW: if (!capable(CAP_NET_ADMIN)) return -EPERM; - - if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) - return -EPERM; - - if (arg) - set_bit(HCI_RAW, &hdev->flags); - else - clear_bit(HCI_RAW, &hdev->flags); - - return 0; + return -EOPNOTSUPP; case HCIGETCONNINFO: return hci_get_conn_info(hdev, (void __user *) arg); @@ -716,6 +707,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, err = hci_dev_open(hdev->id); if (err) { clear_bit(HCI_USER_CHANNEL, &hdev->dev_flags); + mgmt_index_added(hdev); hci_dev_put(hdev); goto done; } @@ -940,8 +932,22 @@ static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock, bt_cb(skb)->pkt_type = *((unsigned char *) skb->data); skb_pull(skb, 1); - if (hci_pi(sk)->channel == HCI_CHANNEL_RAW && - bt_cb(skb)->pkt_type == HCI_COMMAND_PKT) { + if (hci_pi(sk)->channel == HCI_CHANNEL_USER) { + /* No permission check is needed for user channel + * since that gets enforced when binding the socket. + * + * However check that the packet type is valid. + */ + if (bt_cb(skb)->pkt_type != HCI_COMMAND_PKT && + bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT && + bt_cb(skb)->pkt_type != HCI_SCODATA_PKT) { + err = -EINVAL; + goto drop; + } + + skb_queue_tail(&hdev->raw_q, skb); + queue_work(hdev->workqueue, &hdev->tx_work); + } else if (bt_cb(skb)->pkt_type == HCI_COMMAND_PKT) { u16 opcode = get_unaligned_le16(skb->data); u16 ogf = hci_opcode_ogf(opcode); u16 ocf = hci_opcode_ocf(opcode); @@ -972,14 +978,6 @@ static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock, goto drop; } - if (hci_pi(sk)->channel == HCI_CHANNEL_USER && - bt_cb(skb)->pkt_type != HCI_COMMAND_PKT && - bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT && - bt_cb(skb)->pkt_type != HCI_SCODATA_PKT) { - err = -EINVAL; - goto drop; - } - skb_queue_tail(&hdev->raw_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); } diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 0b61250cfdf..555982a78a5 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -49,14 +49,7 @@ static struct attribute *bt_link_attrs[] = { NULL }; -static struct attribute_group bt_link_group = { - .attrs = bt_link_attrs, -}; - -static const struct attribute_group *bt_link_groups[] = { - &bt_link_group, - NULL -}; +ATTRIBUTE_GROUPS(bt_link); static void bt_link_release(struct device *dev) { @@ -182,14 +175,7 @@ static struct attribute *bt_host_attrs[] = { NULL }; -static struct attribute_group bt_host_group = { - .attrs = bt_host_attrs, -}; - -static const struct attribute_group *bt_host_groups[] = { - &bt_host_group, - NULL -}; +ATTRIBUTE_GROUPS(bt_host); static void bt_host_release(struct device *dev) { diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 292e619db89..8181ea4bc2f 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -223,51 +223,6 @@ static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb) input_sync(dev); } -static int hidp_send_report(struct hidp_session *session, struct hid_report *report) -{ - unsigned char hdr; - u8 *buf; - int rsize, ret; - - buf = hid_alloc_report_buf(report, GFP_ATOMIC); - if (!buf) - 
return -EIO; - - hid_output_report(report, buf); - hdr = HIDP_TRANS_DATA | HIDP_DATA_RTYPE_OUPUT; - - rsize = ((report->size - 1) >> 3) + 1 + (report->id > 0); - ret = hidp_send_intr_message(session, hdr, buf, rsize); - - kfree(buf); - return ret; -} - -static int hidp_hidinput_event(struct input_dev *dev, unsigned int type, - unsigned int code, int value) -{ - struct hid_device *hid = input_get_drvdata(dev); - struct hidp_session *session = hid->driver_data; - struct hid_field *field; - int offset; - - BT_DBG("session %p type %d code %d value %d", - session, type, code, value); - - if (type != EV_LED) - return -1; - - offset = hidinput_find_field(hid, type, code, &field); - if (offset == -1) { - hid_warn(dev, "event field not found\n"); - return -1; - } - - hid_set_field(field, offset, value); - - return hidp_send_report(session, field->report); -} - static int hidp_get_raw_report(struct hid_device *hid, unsigned char report_number, unsigned char *data, size_t count, @@ -353,17 +308,24 @@ err: return ret; } -static int hidp_output_raw_report(struct hid_device *hid, unsigned char *data, size_t count, - unsigned char report_type) +static int hidp_set_raw_report(struct hid_device *hid, unsigned char reportnum, + unsigned char *data, size_t count, + unsigned char report_type) { struct hidp_session *session = hid->driver_data; int ret; - if (report_type == HID_OUTPUT_REPORT) { - report_type = HIDP_TRANS_DATA | HIDP_DATA_RTYPE_OUPUT; - return hidp_send_intr_message(session, report_type, - data, count); - } else if (report_type != HID_FEATURE_REPORT) { + switch (report_type) { + case HID_FEATURE_REPORT: + report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_FEATURE; + break; + case HID_INPUT_REPORT: + report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_INPUT; + break; + case HID_OUTPUT_REPORT: + report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_OUPUT; + break; + default: return -EINVAL; } @@ -371,8 +333,8 @@ static int hidp_output_raw_report(struct hid_device *hid, unsigned char *data, s return -ERESTARTSYS; /* Set up our wait, and send the report request to the device. 
*/ + data[0] = reportnum; set_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags); - report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_FEATURE; ret = hidp_send_ctrl_message(session, report_type, data, count); if (ret) goto err; @@ -411,6 +373,29 @@ err: return ret; } +static int hidp_output_report(struct hid_device *hid, __u8 *data, size_t count) +{ + struct hidp_session *session = hid->driver_data; + + return hidp_send_intr_message(session, + HIDP_TRANS_DATA | HIDP_DATA_RTYPE_OUPUT, + data, count); +} + +static int hidp_raw_request(struct hid_device *hid, unsigned char reportnum, + __u8 *buf, size_t len, unsigned char rtype, + int reqtype) +{ + switch (reqtype) { + case HID_REQ_GET_REPORT: + return hidp_get_raw_report(hid, reportnum, buf, len, rtype); + case HID_REQ_SET_REPORT: + return hidp_set_raw_report(hid, reportnum, buf, len, rtype); + default: + return -EIO; + } +} + static void hidp_idle_timeout(unsigned long arg) { struct hidp_session *session = (struct hidp_session *) arg; @@ -430,6 +415,16 @@ static void hidp_del_timer(struct hidp_session *session) del_timer(&session->timer); } +static void hidp_process_report(struct hidp_session *session, + int type, const u8 *data, int len, int intr) +{ + if (len > HID_MAX_BUFFER_SIZE) + len = HID_MAX_BUFFER_SIZE; + + memcpy(session->input_buf, data, len); + hid_input_report(session->hid, type, session->input_buf, len, intr); +} + static void hidp_process_handshake(struct hidp_session *session, unsigned char param) { @@ -502,7 +497,8 @@ static int hidp_process_data(struct hidp_session *session, struct sk_buff *skb, hidp_input_report(session, skb); if (session->hid) - hid_input_report(session->hid, HID_INPUT_REPORT, skb->data, skb->len, 0); + hidp_process_report(session, HID_INPUT_REPORT, + skb->data, skb->len, 0); break; case HIDP_DATA_RTYPE_OTHER: @@ -584,7 +580,8 @@ static void hidp_recv_intr_frame(struct hidp_session *session, hidp_input_report(session, skb); if (session->hid) { - hid_input_report(session->hid, HID_INPUT_REPORT, skb->data, skb->len, 1); + hidp_process_report(session, HID_INPUT_REPORT, + skb->data, skb->len, 1); BT_DBG("report len %d", skb->len); } } else { @@ -727,7 +724,8 @@ static struct hid_ll_driver hidp_hid_driver = { .stop = hidp_stop, .open = hidp_open, .close = hidp_close, - .hidinput_input_event = hidp_hidinput_event, + .raw_request = hidp_raw_request, + .output_report = hidp_output_report, }; /* This function sets up the hid device. It does not add it @@ -769,15 +767,15 @@ static int hidp_setup_hid(struct hidp_session *session, snprintf(hid->phys, sizeof(hid->phys), "%pMR", &l2cap_pi(session->ctrl_sock->sk)->chan->src); + /* NOTE: Some device modules depend on the dst address being stored in + * uniq. Please be aware of this before making changes to this behavior. 
+ */ snprintf(hid->uniq, sizeof(hid->uniq), "%pMR", &l2cap_pi(session->ctrl_sock->sk)->chan->dst); hid->dev.parent = &session->conn->hcon->dev; hid->ll_driver = &hidp_hid_driver; - hid->hid_get_raw_report = hidp_get_raw_report; - hid->hid_output_raw_report = hidp_output_raw_report; - /* True if device is blacklisted in drivers/hid/hid-core.c */ if (hid_ignore(hid)) { hid_destroy_device(session->hid); diff --git a/net/bluetooth/hidp/hidp.h b/net/bluetooth/hidp/hidp.h index ab5241400cf..8798492a6e9 100644 --- a/net/bluetooth/hidp/hidp.h +++ b/net/bluetooth/hidp/hidp.h @@ -24,6 +24,7 @@ #define __HIDP_H #include <linux/types.h> +#include <linux/hid.h> #include <linux/kref.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/l2cap.h> @@ -179,6 +180,9 @@ struct hidp_session { /* Used in hidp_output_raw_report() */ int output_report_success; /* boolean */ + + /* temporary input buffer */ + u8 input_buf[HID_MAX_BUFFER_SIZE]; }; /* HIDP init defines */ diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 4af3821df88..323f23cd2c3 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -40,6 +40,9 @@ #include "smp.h" #include "a2mp.h" #include "amp.h" +#include "6lowpan.h" + +#define LE_FLOWCTL_MAX_CREDITS 65535 bool disable_ertm; @@ -49,6 +52,9 @@ static u8 l2cap_fixed_chan[8] = { L2CAP_FC_L2CAP | L2CAP_FC_CONNLESS, }; static LIST_HEAD(chan_list); static DEFINE_RWLOCK(chan_list_lock); +static u16 le_max_credits = L2CAP_LE_MAX_CREDITS; +static u16 le_default_mps = L2CAP_LE_DEFAULT_MPS; + static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn, u8 code, u8 ident, u16 dlen, void *data); static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, @@ -213,9 +219,14 @@ int l2cap_add_scid(struct l2cap_chan *chan, __u16 scid) static u16 l2cap_alloc_cid(struct l2cap_conn *conn) { - u16 cid = L2CAP_CID_DYN_START; + u16 cid, dyn_end; + + if (conn->hcon->type == LE_LINK) + dyn_end = L2CAP_CID_LE_DYN_END; + else + dyn_end = L2CAP_CID_DYN_END; - for (; cid < L2CAP_CID_DYN_END; cid++) { + for (cid = L2CAP_CID_DYN_START; cid < dyn_end; cid++) { if (!__l2cap_get_chan_by_scid(conn, cid)) return cid; } @@ -321,44 +332,20 @@ static inline bool l2cap_seq_list_contains(struct l2cap_seq_list *seq_list, return seq_list->list[seq & seq_list->mask] != L2CAP_SEQ_LIST_CLEAR; } -static u16 l2cap_seq_list_remove(struct l2cap_seq_list *seq_list, u16 seq) +static inline u16 l2cap_seq_list_pop(struct l2cap_seq_list *seq_list) { + u16 seq = seq_list->head; u16 mask = seq_list->mask; - if (seq_list->head == L2CAP_SEQ_LIST_CLEAR) { - /* In case someone tries to pop the head of an empty list */ - return L2CAP_SEQ_LIST_CLEAR; - } else if (seq_list->head == seq) { - /* Head can be removed in constant time */ - seq_list->head = seq_list->list[seq & mask]; - seq_list->list[seq & mask] = L2CAP_SEQ_LIST_CLEAR; - - if (seq_list->head == L2CAP_SEQ_LIST_TAIL) { - seq_list->head = L2CAP_SEQ_LIST_CLEAR; - seq_list->tail = L2CAP_SEQ_LIST_CLEAR; - } - } else { - /* Walk the list to find the sequence number */ - u16 prev = seq_list->head; - while (seq_list->list[prev & mask] != seq) { - prev = seq_list->list[prev & mask]; - if (prev == L2CAP_SEQ_LIST_TAIL) - return L2CAP_SEQ_LIST_CLEAR; - } + seq_list->head = seq_list->list[seq & mask]; + seq_list->list[seq & mask] = L2CAP_SEQ_LIST_CLEAR; - /* Unlink the number from the list and clear it */ - seq_list->list[prev & mask] = seq_list->list[seq & mask]; - seq_list->list[seq & mask] = L2CAP_SEQ_LIST_CLEAR; - if 
(seq_list->tail == seq) - seq_list->tail = prev; + if (seq_list->head == L2CAP_SEQ_LIST_TAIL) { + seq_list->head = L2CAP_SEQ_LIST_CLEAR; + seq_list->tail = L2CAP_SEQ_LIST_CLEAR; } - return seq; -} -static inline u16 l2cap_seq_list_pop(struct l2cap_seq_list *seq_list) -{ - /* Remove the head in constant time */ - return l2cap_seq_list_remove(seq_list, seq_list->head); + return seq; } static void l2cap_seq_list_clear(struct l2cap_seq_list *seq_list) @@ -484,12 +471,30 @@ void l2cap_chan_set_defaults(struct l2cap_chan *chan) chan->max_tx = L2CAP_DEFAULT_MAX_TX; chan->tx_win = L2CAP_DEFAULT_TX_WINDOW; chan->tx_win_max = L2CAP_DEFAULT_TX_WINDOW; + chan->remote_max_tx = chan->max_tx; + chan->remote_tx_win = chan->tx_win; chan->ack_win = L2CAP_DEFAULT_TX_WINDOW; chan->sec_level = BT_SECURITY_LOW; + chan->flush_to = L2CAP_DEFAULT_FLUSH_TO; + chan->retrans_timeout = L2CAP_DEFAULT_RETRANS_TO; + chan->monitor_timeout = L2CAP_DEFAULT_MONITOR_TO; + chan->conf_state = 0; set_bit(FLAG_FORCE_ACTIVE, &chan->flags); } +static void l2cap_le_flowctl_init(struct l2cap_chan *chan) +{ + chan->sdu = NULL; + chan->sdu_last_frag = NULL; + chan->sdu_len = 0; + chan->tx_credits = 0; + chan->rx_credits = le_max_credits; + chan->mps = min_t(u16, chan->imtu, le_default_mps); + + skb_queue_head_init(&chan->tx_q); +} + void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) { BT_DBG("conn %p, psm 0x%2.2x, dcid 0x%4.4x", conn, @@ -501,18 +506,10 @@ void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) switch (chan->chan_type) { case L2CAP_CHAN_CONN_ORIENTED: - if (conn->hcon->type == LE_LINK) { - /* LE connection */ - chan->omtu = L2CAP_DEFAULT_MTU; - if (chan->dcid == L2CAP_CID_ATT) - chan->scid = L2CAP_CID_ATT; - else - chan->scid = l2cap_alloc_cid(conn); - } else { - /* Alloc CID for connection-oriented socket */ - chan->scid = l2cap_alloc_cid(conn); + /* Alloc CID for connection-oriented socket */ + chan->scid = l2cap_alloc_cid(conn); + if (conn->hcon->type == ACL_LINK) chan->omtu = L2CAP_DEFAULT_MTU; - } break; case L2CAP_CHAN_CONN_LESS: @@ -522,11 +519,8 @@ void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) chan->omtu = L2CAP_DEFAULT_MTU; break; - case L2CAP_CHAN_CONN_FIX_A2MP: - chan->scid = L2CAP_CID_A2MP; - chan->dcid = L2CAP_CID_A2MP; - chan->omtu = L2CAP_A2MP_DEFAULT_MTU; - chan->imtu = L2CAP_A2MP_DEFAULT_MTU; + case L2CAP_CHAN_FIXED: + /* Caller will set CID and CID specific MTU values */ break; default: @@ -574,7 +568,7 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) chan->conn = NULL; - if (chan->chan_type != L2CAP_CHAN_CONN_FIX_A2MP) + if (chan->scid != L2CAP_CID_A2MP) hci_conn_drop(conn->hcon); if (mgr && mgr->bredr_chan == chan) @@ -597,6 +591,10 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) case L2CAP_MODE_BASIC: break; + case L2CAP_MODE_LE_FLOWCTL: + skb_queue_purge(&chan->tx_q); + break; + case L2CAP_MODE_ERTM: __clear_retrans_timer(chan); __clear_monitor_timer(chan); @@ -617,6 +615,67 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) return; } +void l2cap_conn_update_id_addr(struct hci_conn *hcon) +{ + struct l2cap_conn *conn = hcon->l2cap_data; + struct l2cap_chan *chan; + + mutex_lock(&conn->chan_lock); + + list_for_each_entry(chan, &conn->chan_l, list) { + l2cap_chan_lock(chan); + bacpy(&chan->dst, &hcon->dst); + chan->dst_type = bdaddr_type(hcon, hcon->dst_type); + l2cap_chan_unlock(chan); + } + + mutex_unlock(&conn->chan_lock); +} + +static void l2cap_chan_le_connect_reject(struct l2cap_chan *chan) +{ + struct 
l2cap_conn *conn = chan->conn; + struct l2cap_le_conn_rsp rsp; + u16 result; + + if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) + result = L2CAP_CR_AUTHORIZATION; + else + result = L2CAP_CR_BAD_PSM; + + l2cap_state_change(chan, BT_DISCONN); + + rsp.dcid = cpu_to_le16(chan->scid); + rsp.mtu = cpu_to_le16(chan->imtu); + rsp.mps = cpu_to_le16(chan->mps); + rsp.credits = cpu_to_le16(chan->rx_credits); + rsp.result = cpu_to_le16(result); + + l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CONN_RSP, sizeof(rsp), + &rsp); +} + +static void l2cap_chan_connect_reject(struct l2cap_chan *chan) +{ + struct l2cap_conn *conn = chan->conn; + struct l2cap_conn_rsp rsp; + u16 result; + + if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) + result = L2CAP_CR_SEC_BLOCK; + else + result = L2CAP_CR_BAD_PSM; + + l2cap_state_change(chan, BT_DISCONN); + + rsp.scid = cpu_to_le16(chan->dcid); + rsp.dcid = cpu_to_le16(chan->scid); + rsp.result = cpu_to_le16(result); + rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); + + l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_RSP, sizeof(rsp), &rsp); +} + void l2cap_chan_close(struct l2cap_chan *chan, int reason) { struct l2cap_conn *conn = chan->conn; @@ -630,8 +689,7 @@ void l2cap_chan_close(struct l2cap_chan *chan, int reason) case BT_CONNECTED: case BT_CONFIG: - if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED && - conn->hcon->type == ACL_LINK) { + if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED) { __set_chan_timer(chan, chan->ops->get_sndtimeo(chan)); l2cap_send_disconn_req(chan, reason); } else @@ -639,24 +697,11 @@ void l2cap_chan_close(struct l2cap_chan *chan, int reason) break; case BT_CONNECT2: - if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED && - conn->hcon->type == ACL_LINK) { - struct l2cap_conn_rsp rsp; - __u16 result; - - if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) - result = L2CAP_CR_SEC_BLOCK; - else - result = L2CAP_CR_BAD_PSM; - - l2cap_state_change(chan, BT_DISCONN); - - rsp.scid = cpu_to_le16(chan->dcid); - rsp.dcid = cpu_to_le16(chan->scid); - rsp.result = cpu_to_le16(result); - rsp.status = __constant_cpu_to_le16(L2CAP_CS_NO_INFO); - l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_RSP, - sizeof(rsp), &rsp); + if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED) { + if (conn->hcon->type == ACL_LINK) + l2cap_chan_connect_reject(chan); + else if (conn->hcon->type == LE_LINK) + l2cap_chan_le_connect_reject(chan); } l2cap_chan_del(chan, reason); @@ -679,6 +724,7 @@ static inline u8 l2cap_get_auth_type(struct l2cap_chan *chan) case L2CAP_CHAN_RAW: switch (chan->sec_level) { case BT_SECURITY_HIGH: + case BT_SECURITY_FIPS: return HCI_AT_DEDICATED_BONDING_MITM; case BT_SECURITY_MEDIUM: return HCI_AT_DEDICATED_BONDING; @@ -687,21 +733,23 @@ static inline u8 l2cap_get_auth_type(struct l2cap_chan *chan) } break; case L2CAP_CHAN_CONN_LESS: - if (chan->psm == __constant_cpu_to_le16(L2CAP_PSM_3DSP)) { + if (chan->psm == cpu_to_le16(L2CAP_PSM_3DSP)) { if (chan->sec_level == BT_SECURITY_LOW) chan->sec_level = BT_SECURITY_SDP; } - if (chan->sec_level == BT_SECURITY_HIGH) + if (chan->sec_level == BT_SECURITY_HIGH || + chan->sec_level == BT_SECURITY_FIPS) return HCI_AT_NO_BONDING_MITM; else return HCI_AT_NO_BONDING; break; case L2CAP_CHAN_CONN_ORIENTED: - if (chan->psm == __constant_cpu_to_le16(L2CAP_PSM_SDP)) { + if (chan->psm == cpu_to_le16(L2CAP_PSM_SDP)) { if (chan->sec_level == BT_SECURITY_LOW) chan->sec_level = BT_SECURITY_SDP; - if (chan->sec_level == BT_SECURITY_HIGH) + if (chan->sec_level == BT_SECURITY_HIGH || + chan->sec_level == BT_SECURITY_FIPS) return HCI_AT_NO_BONDING_MITM; else 
return HCI_AT_NO_BONDING; @@ -710,6 +758,7 @@ static inline u8 l2cap_get_auth_type(struct l2cap_chan *chan) default: switch (chan->sec_level) { case BT_SECURITY_HIGH: + case BT_SECURITY_FIPS: return HCI_AT_GENERAL_BONDING_MITM; case BT_SECURITY_MEDIUM: return HCI_AT_GENERAL_BONDING; @@ -726,6 +775,9 @@ int l2cap_chan_check_security(struct l2cap_chan *chan) struct l2cap_conn *conn = chan->conn; __u8 auth_type; + if (conn->hcon->type == LE_LINK) + return smp_conn_security(conn->hcon, chan->sec_level); + auth_type = l2cap_get_auth_type(chan); return hci_conn_security(conn->hcon, chan->sec_level, auth_type); @@ -1152,16 +1204,57 @@ static void l2cap_chan_ready(struct l2cap_chan *chan) chan->conf_state = 0; __clear_chan_timer(chan); + if (chan->mode == L2CAP_MODE_LE_FLOWCTL && !chan->tx_credits) + chan->ops->suspend(chan); + chan->state = BT_CONNECTED; chan->ops->ready(chan); } +static void l2cap_le_connect(struct l2cap_chan *chan) +{ + struct l2cap_conn *conn = chan->conn; + struct l2cap_le_conn_req req; + + if (test_and_set_bit(FLAG_LE_CONN_REQ_SENT, &chan->flags)) + return; + + req.psm = chan->psm; + req.scid = cpu_to_le16(chan->scid); + req.mtu = cpu_to_le16(chan->imtu); + req.mps = cpu_to_le16(chan->mps); + req.credits = cpu_to_le16(chan->rx_credits); + + chan->ident = l2cap_get_ident(conn); + + l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CONN_REQ, + sizeof(req), &req); +} + +static void l2cap_le_start(struct l2cap_chan *chan) +{ + struct l2cap_conn *conn = chan->conn; + + if (!smp_conn_security(conn->hcon, chan->sec_level)) + return; + + if (!chan->psm) { + l2cap_chan_ready(chan); + return; + } + + if (chan->state == BT_CONNECT) + l2cap_le_connect(chan); +} + static void l2cap_start_connection(struct l2cap_chan *chan) { if (__amp_capable(chan)) { BT_DBG("chan %p AMP capable: discover AMPs", chan); a2mp_discover_amp(chan); + } else if (chan->conn->hcon->type == LE_LINK) { + l2cap_le_start(chan); } else { l2cap_send_conn_req(chan); } @@ -1172,7 +1265,7 @@ static void l2cap_do_start(struct l2cap_chan *chan) struct l2cap_conn *conn = chan->conn; if (conn->hcon->type == LE_LINK) { - l2cap_chan_ready(chan); + l2cap_le_start(chan); return; } @@ -1186,7 +1279,7 @@ static void l2cap_do_start(struct l2cap_chan *chan) } } else { struct l2cap_info_req req; - req.type = __constant_cpu_to_le16(L2CAP_IT_FEAT_MASK); + req.type = cpu_to_le16(L2CAP_IT_FEAT_MASK); conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_SENT; conn->info_ident = l2cap_get_ident(conn); @@ -1228,7 +1321,7 @@ static void l2cap_send_disconn_req(struct l2cap_chan *chan, int err) __clear_ack_timer(chan); } - if (chan->chan_type == L2CAP_CHAN_CONN_FIX_A2MP) { + if (chan->scid == L2CAP_CID_A2MP) { l2cap_state_change(chan, BT_DISCONN); return; } @@ -1283,18 +1376,18 @@ static void l2cap_conn_start(struct l2cap_conn *conn) if (l2cap_chan_check_security(chan)) { if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) { - rsp.result = __constant_cpu_to_le16(L2CAP_CR_PEND); - rsp.status = __constant_cpu_to_le16(L2CAP_CS_AUTHOR_PEND); + rsp.result = cpu_to_le16(L2CAP_CR_PEND); + rsp.status = cpu_to_le16(L2CAP_CS_AUTHOR_PEND); chan->ops->defer(chan); } else { l2cap_state_change(chan, BT_CONFIG); - rsp.result = __constant_cpu_to_le16(L2CAP_CR_SUCCESS); - rsp.status = __constant_cpu_to_le16(L2CAP_CS_NO_INFO); + rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS); + rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); } } else { - rsp.result = __constant_cpu_to_le16(L2CAP_CR_PEND); - rsp.status = __constant_cpu_to_le16(L2CAP_CS_AUTHEN_PEND); + rsp.result = 
cpu_to_le16(L2CAP_CR_PEND); + rsp.status = cpu_to_le16(L2CAP_CS_AUTHEN_PEND); } l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_RSP, @@ -1367,6 +1460,8 @@ static void l2cap_le_conn_ready(struct l2cap_conn *conn) BT_DBG(""); + bt_6lowpan_add_conn(conn); + /* Check if we have socket listening on cid */ pchan = l2cap_global_chan_by_scid(BT_LISTEN, L2CAP_CID_ATT, &hcon->src, &hcon->dst); @@ -1389,8 +1484,6 @@ static void l2cap_le_conn_ready(struct l2cap_conn *conn) if (!chan) goto clean; - chan->dcid = L2CAP_CID_ATT; - bacpy(&chan->src, &hcon->src); bacpy(&chan->dst, &hcon->dst); chan->src_type = bdaddr_type(hcon, hcon->src_type); @@ -1424,15 +1517,13 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) l2cap_chan_lock(chan); - if (chan->chan_type == L2CAP_CHAN_CONN_FIX_A2MP) { + if (chan->scid == L2CAP_CID_A2MP) { l2cap_chan_unlock(chan); continue; } if (hcon->type == LE_LINK) { - if (smp_conn_security(hcon, chan->sec_level)) - l2cap_chan_ready(chan); - + l2cap_le_start(chan); } else if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { l2cap_chan_ready(chan); @@ -1444,6 +1535,8 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) } mutex_unlock(&conn->chan_lock); + + queue_work(hcon->hdev->workqueue, &conn->pending_rx_work); } /* Notify sockets that we cannot guaranty reliability anymore */ @@ -1569,6 +1662,15 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) kfree_skb(conn->rx_skb); + skb_queue_purge(&conn->pending_rx); + + /* We can not call flush_work(&conn->pending_rx_work) here since we + * might block if we are running on a worker from the same workqueue + * pending_rx_work is waiting on. + */ + if (work_pending(&conn->pending_rx_work)) + cancel_work_sync(&conn->pending_rx_work); + l2cap_unregister_all_users(conn); mutex_lock(&conn->chan_lock); @@ -1616,66 +1718,6 @@ static void security_timeout(struct work_struct *work) } } -static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) -{ - struct l2cap_conn *conn = hcon->l2cap_data; - struct hci_chan *hchan; - - if (conn) - return conn; - - hchan = hci_chan_create(hcon); - if (!hchan) - return NULL; - - conn = kzalloc(sizeof(struct l2cap_conn), GFP_KERNEL); - if (!conn) { - hci_chan_del(hchan); - return NULL; - } - - kref_init(&conn->ref); - hcon->l2cap_data = conn; - conn->hcon = hcon; - hci_conn_get(conn->hcon); - conn->hchan = hchan; - - BT_DBG("hcon %p conn %p hchan %p", hcon, conn, hchan); - - switch (hcon->type) { - case LE_LINK: - if (hcon->hdev->le_mtu) { - conn->mtu = hcon->hdev->le_mtu; - break; - } - /* fall through */ - default: - conn->mtu = hcon->hdev->acl_mtu; - break; - } - - conn->feat_mask = 0; - - if (hcon->type == ACL_LINK) - conn->hs_enabled = test_bit(HCI_HS_ENABLED, - &hcon->hdev->dev_flags); - - spin_lock_init(&conn->lock); - mutex_init(&conn->chan_lock); - - INIT_LIST_HEAD(&conn->chan_l); - INIT_LIST_HEAD(&conn->users); - - if (hcon->type == LE_LINK) - INIT_DELAYED_WORK(&conn->security_timer, security_timeout); - else - INIT_DELAYED_WORK(&conn->info_timer, l2cap_info_timeout); - - conn->disc_reason = HCI_ERROR_REMOTE_USER_TERM; - - return conn; -} - static void l2cap_conn_free(struct kref *ref) { struct l2cap_conn *conn = container_of(ref, struct l2cap_conn, ref); @@ -1703,7 +1745,8 @@ EXPORT_SYMBOL(l2cap_conn_put); */ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, bdaddr_t *src, - bdaddr_t *dst) + bdaddr_t *dst, + u8 link_type) { struct l2cap_chan *c, *c1 = NULL; @@ -1713,6 +1756,12 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, if 
(state && c->state != state) continue; + if (link_type == ACL_LINK && c->src_type != BDADDR_BREDR) + continue; + + if (link_type == LE_LINK && c->src_type == BDADDR_BREDR) + continue; + if (c->psm == psm) { int src_match, dst_match; int src_any, dst_any; @@ -1739,140 +1788,6 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, return c1; } -int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, - bdaddr_t *dst, u8 dst_type) -{ - struct l2cap_conn *conn; - struct hci_conn *hcon; - struct hci_dev *hdev; - __u8 auth_type; - int err; - - BT_DBG("%pMR -> %pMR (type %u) psm 0x%2.2x", &chan->src, dst, - dst_type, __le16_to_cpu(psm)); - - hdev = hci_get_route(dst, &chan->src); - if (!hdev) - return -EHOSTUNREACH; - - hci_dev_lock(hdev); - - l2cap_chan_lock(chan); - - /* PSM must be odd and lsb of upper byte must be 0 */ - if ((__le16_to_cpu(psm) & 0x0101) != 0x0001 && !cid && - chan->chan_type != L2CAP_CHAN_RAW) { - err = -EINVAL; - goto done; - } - - if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED && !(psm || cid)) { - err = -EINVAL; - goto done; - } - - switch (chan->mode) { - case L2CAP_MODE_BASIC: - break; - case L2CAP_MODE_ERTM: - case L2CAP_MODE_STREAMING: - if (!disable_ertm) - break; - /* fall through */ - default: - err = -ENOTSUPP; - goto done; - } - - switch (chan->state) { - case BT_CONNECT: - case BT_CONNECT2: - case BT_CONFIG: - /* Already connecting */ - err = 0; - goto done; - - case BT_CONNECTED: - /* Already connected */ - err = -EISCONN; - goto done; - - case BT_OPEN: - case BT_BOUND: - /* Can connect */ - break; - - default: - err = -EBADFD; - goto done; - } - - /* Set destination address and psm */ - bacpy(&chan->dst, dst); - chan->dst_type = dst_type; - - chan->psm = psm; - chan->dcid = cid; - - auth_type = l2cap_get_auth_type(chan); - - if (bdaddr_type_is_le(dst_type)) - hcon = hci_connect(hdev, LE_LINK, dst, dst_type, - chan->sec_level, auth_type); - else - hcon = hci_connect(hdev, ACL_LINK, dst, dst_type, - chan->sec_level, auth_type); - - if (IS_ERR(hcon)) { - err = PTR_ERR(hcon); - goto done; - } - - conn = l2cap_conn_add(hcon); - if (!conn) { - hci_conn_drop(hcon); - err = -ENOMEM; - goto done; - } - - if (cid && __l2cap_get_chan_by_dcid(conn, cid)) { - hci_conn_drop(hcon); - err = -EBUSY; - goto done; - } - - /* Update source addr of the socket */ - bacpy(&chan->src, &hcon->src); - chan->src_type = bdaddr_type(hcon, hcon->src_type); - - l2cap_chan_unlock(chan); - l2cap_chan_add(conn, chan); - l2cap_chan_lock(chan); - - /* l2cap_chan_add takes its own ref so we can drop this one */ - hci_conn_drop(hcon); - - l2cap_state_change(chan, BT_CONNECT); - __set_chan_timer(chan, chan->ops->get_sndtimeo(chan)); - - if (hcon->state == BT_CONNECTED) { - if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { - __clear_chan_timer(chan); - if (l2cap_chan_check_security(chan)) - l2cap_state_change(chan, BT_CONNECTED); - } else - l2cap_do_start(chan); - } - - err = 0; - -done: - l2cap_chan_unlock(chan); - hci_dev_unlock(hdev); - hci_dev_put(hdev); - return err; -} - static void l2cap_monitor_timeout(struct work_struct *work) { struct l2cap_chan *chan = container_of(work, struct l2cap_chan, @@ -2432,6 +2347,89 @@ static int l2cap_segment_sdu(struct l2cap_chan *chan, return 0; } +static struct sk_buff *l2cap_create_le_flowctl_pdu(struct l2cap_chan *chan, + struct msghdr *msg, + size_t len, u16 sdulen) +{ + struct l2cap_conn *conn = chan->conn; + struct sk_buff *skb; + int err, count, hlen; + struct l2cap_hdr *lh; + + BT_DBG("chan %p len %zu", chan, len); 
+ + if (!conn) + return ERR_PTR(-ENOTCONN); + + hlen = L2CAP_HDR_SIZE; + + if (sdulen) + hlen += L2CAP_SDULEN_SIZE; + + count = min_t(unsigned int, (conn->mtu - hlen), len); + + skb = chan->ops->alloc_skb(chan, count + hlen, + msg->msg_flags & MSG_DONTWAIT); + if (IS_ERR(skb)) + return skb; + + /* Create L2CAP header */ + lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); + lh->cid = cpu_to_le16(chan->dcid); + lh->len = cpu_to_le16(len + (hlen - L2CAP_HDR_SIZE)); + + if (sdulen) + put_unaligned_le16(sdulen, skb_put(skb, L2CAP_SDULEN_SIZE)); + + err = l2cap_skbuff_fromiovec(chan, msg, len, count, skb); + if (unlikely(err < 0)) { + kfree_skb(skb); + return ERR_PTR(err); + } + + return skb; +} + +static int l2cap_segment_le_sdu(struct l2cap_chan *chan, + struct sk_buff_head *seg_queue, + struct msghdr *msg, size_t len) +{ + struct sk_buff *skb; + size_t pdu_len; + u16 sdu_len; + + BT_DBG("chan %p, msg %p, len %zu", chan, msg, len); + + pdu_len = chan->conn->mtu - L2CAP_HDR_SIZE; + + pdu_len = min_t(size_t, pdu_len, chan->remote_mps); + + sdu_len = len; + pdu_len -= L2CAP_SDULEN_SIZE; + + while (len > 0) { + if (len <= pdu_len) + pdu_len = len; + + skb = l2cap_create_le_flowctl_pdu(chan, msg, pdu_len, sdu_len); + if (IS_ERR(skb)) { + __skb_queue_purge(seg_queue); + return PTR_ERR(skb); + } + + __skb_queue_tail(seg_queue, skb); + + len -= pdu_len; + + if (sdu_len) { + sdu_len = 0; + pdu_len += L2CAP_SDULEN_SIZE; + } + } + + return 0; +} + int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len, u32 priority) { @@ -2448,11 +2446,53 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len, if (IS_ERR(skb)) return PTR_ERR(skb); + /* Channel lock is released before requesting new skb and then + * reacquired thus we need to recheck channel state. + */ + if (chan->state != BT_CONNECTED) { + kfree_skb(skb); + return -ENOTCONN; + } + l2cap_do_send(chan, skb); return len; } switch (chan->mode) { + case L2CAP_MODE_LE_FLOWCTL: + /* Check outgoing MTU */ + if (len > chan->omtu) + return -EMSGSIZE; + + if (!chan->tx_credits) + return -EAGAIN; + + __skb_queue_head_init(&seg_queue); + + err = l2cap_segment_le_sdu(chan, &seg_queue, msg, len); + + if (chan->state != BT_CONNECTED) { + __skb_queue_purge(&seg_queue); + err = -ENOTCONN; + } + + if (err) + return err; + + skb_queue_splice_tail_init(&seg_queue, &chan->tx_q); + + while (chan->tx_credits && !skb_queue_empty(&chan->tx_q)) { + l2cap_do_send(chan, skb_dequeue(&chan->tx_q)); + chan->tx_credits--; + } + + if (!chan->tx_credits) + chan->ops->suspend(chan); + + err = len; + + break; + case L2CAP_MODE_BASIC: /* Check outgoing MTU */ if (len > chan->omtu) @@ -2463,6 +2503,14 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len, if (IS_ERR(skb)) return PTR_ERR(skb); + /* Channel lock is released before requesting new skb and then + * reacquired thus we need to recheck channel state. 
+ */ + if (chan->state != BT_CONNECTED) { + kfree_skb(skb); + return -ENOTCONN; + } + l2cap_do_send(chan, skb); err = len; break; @@ -2859,9 +2907,9 @@ static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn, u8 code, lh->len = cpu_to_le16(L2CAP_CMD_HDR_SIZE + dlen); if (conn->hcon->type == LE_LINK) - lh->cid = __constant_cpu_to_le16(L2CAP_CID_LE_SIGNALING); + lh->cid = cpu_to_le16(L2CAP_CID_LE_SIGNALING); else - lh->cid = __constant_cpu_to_le16(L2CAP_CID_SIGNALING); + lh->cid = cpu_to_le16(L2CAP_CID_SIGNALING); cmd = (struct l2cap_cmd_hdr *) skb_put(skb, L2CAP_CMD_HDR_SIZE); cmd->code = code; @@ -2974,8 +3022,8 @@ static void l2cap_add_opt_efs(void **ptr, struct l2cap_chan *chan) efs.stype = chan->local_stype; efs.msdu = cpu_to_le16(chan->local_msdu); efs.sdu_itime = cpu_to_le32(chan->local_sdu_itime); - efs.acc_lat = __constant_cpu_to_le32(L2CAP_DEFAULT_ACC_LAT); - efs.flush_to = __constant_cpu_to_le32(L2CAP_EFS_DEFAULT_FLUSH_TO); + efs.acc_lat = cpu_to_le32(L2CAP_DEFAULT_ACC_LAT); + efs.flush_to = cpu_to_le32(L2CAP_EFS_DEFAULT_FLUSH_TO); break; case L2CAP_MODE_STREAMING: @@ -3116,8 +3164,8 @@ static void __l2cap_set_ertm_timeouts(struct l2cap_chan *chan, rfc->retrans_timeout = cpu_to_le16((u16) ertm_to); rfc->monitor_timeout = rfc->retrans_timeout; } else { - rfc->retrans_timeout = __constant_cpu_to_le16(L2CAP_DEFAULT_RETRANS_TO); - rfc->monitor_timeout = __constant_cpu_to_le16(L2CAP_DEFAULT_MONITOR_TO); + rfc->retrans_timeout = cpu_to_le16(L2CAP_DEFAULT_RETRANS_TO); + rfc->monitor_timeout = cpu_to_le16(L2CAP_DEFAULT_MONITOR_TO); } } @@ -3249,7 +3297,7 @@ done: } req->dcid = cpu_to_le16(chan->dcid); - req->flags = __constant_cpu_to_le16(0); + req->flags = cpu_to_le16(0); return ptr - data; } @@ -3463,7 +3511,7 @@ done: } rsp->scid = cpu_to_le16(chan->dcid); rsp->result = cpu_to_le16(result); - rsp->flags = __constant_cpu_to_le16(0); + rsp->flags = cpu_to_le16(0); return ptr - data; } @@ -3572,7 +3620,7 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, } req->dcid = cpu_to_le16(chan->dcid); - req->flags = __constant_cpu_to_le16(0); + req->flags = cpu_to_le16(0); return ptr - data; } @@ -3592,6 +3640,23 @@ static int l2cap_build_conf_rsp(struct l2cap_chan *chan, void *data, return ptr - data; } +void __l2cap_le_connect_rsp_defer(struct l2cap_chan *chan) +{ + struct l2cap_le_conn_rsp rsp; + struct l2cap_conn *conn = chan->conn; + + BT_DBG("chan %p", chan); + + rsp.dcid = cpu_to_le16(chan->scid); + rsp.mtu = cpu_to_le16(chan->imtu); + rsp.mps = cpu_to_le16(chan->mps); + rsp.credits = cpu_to_le16(chan->rx_credits); + rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS); + + l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CONN_RSP, sizeof(rsp), + &rsp); +} + void __l2cap_connect_rsp_defer(struct l2cap_chan *chan) { struct l2cap_conn_rsp rsp; @@ -3601,8 +3666,8 @@ void __l2cap_connect_rsp_defer(struct l2cap_chan *chan) rsp.scid = cpu_to_le16(chan->dcid); rsp.dcid = cpu_to_le16(chan->scid); - rsp.result = __constant_cpu_to_le16(L2CAP_CR_SUCCESS); - rsp.status = __constant_cpu_to_le16(L2CAP_CS_NO_INFO); + rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS); + rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); if (chan->hs_hcon) rsp_code = L2CAP_CREATE_CHAN_RSP; @@ -3631,8 +3696,8 @@ static void l2cap_conf_rfc_get(struct l2cap_chan *chan, void *rsp, int len) u16 txwin_ext = chan->ack_win; struct l2cap_conf_rfc rfc = { .mode = chan->mode, - .retrans_timeout = __constant_cpu_to_le16(L2CAP_DEFAULT_RETRANS_TO), - .monitor_timeout = __constant_cpu_to_le16(L2CAP_DEFAULT_MONITOR_TO), + 
.retrans_timeout = cpu_to_le16(L2CAP_DEFAULT_RETRANS_TO), + .monitor_timeout = cpu_to_le16(L2CAP_DEFAULT_MONITOR_TO), .max_pdu_size = cpu_to_le16(chan->imtu), .txwin_size = min_t(u16, chan->ack_win, L2CAP_DEFAULT_TX_WINDOW), }; @@ -3713,7 +3778,7 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn, /* Check if we have socket listening on psm */ pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src, - &conn->hcon->dst); + &conn->hcon->dst, ACL_LINK); if (!pchan) { result = L2CAP_CR_BAD_PSM; goto sendresp; @@ -3723,7 +3788,7 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn, l2cap_chan_lock(pchan); /* Check if the ACL is secure enough (if not SDP) */ - if (psm != __constant_cpu_to_le16(L2CAP_PSM_SDP) && + if (psm != cpu_to_le16(L2CAP_PSM_SDP) && !hci_conn_check_link_mode(conn->hcon)) { conn->disc_reason = HCI_ERROR_AUTH_FAILURE; result = L2CAP_CR_SEC_BLOCK; @@ -3808,7 +3873,7 @@ sendresp: if (result == L2CAP_CR_PEND && status == L2CAP_CS_NO_INFO) { struct l2cap_info_req info; - info.type = __constant_cpu_to_le16(L2CAP_IT_FEAT_MASK); + info.type = cpu_to_le16(L2CAP_IT_FEAT_MASK); conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_SENT; conn->info_ident = l2cap_get_ident(conn); @@ -3957,7 +4022,7 @@ static void cmd_reject_invalid_cid(struct l2cap_conn *conn, u8 ident, { struct l2cap_cmd_rej_cid rej; - rej.reason = __constant_cpu_to_le16(L2CAP_REJ_INVALID_CID); + rej.reason = cpu_to_le16(L2CAP_REJ_INVALID_CID); rej.scid = __cpu_to_le16(scid); rej.dcid = __cpu_to_le16(dcid); @@ -4289,8 +4354,8 @@ static inline int l2cap_information_req(struct l2cap_conn *conn, u8 buf[8]; u32 feat_mask = l2cap_feat_mask; struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) buf; - rsp->type = __constant_cpu_to_le16(L2CAP_IT_FEAT_MASK); - rsp->result = __constant_cpu_to_le16(L2CAP_IR_SUCCESS); + rsp->type = cpu_to_le16(L2CAP_IT_FEAT_MASK); + rsp->result = cpu_to_le16(L2CAP_IR_SUCCESS); if (!disable_ertm) feat_mask |= L2CAP_FEAT_ERTM | L2CAP_FEAT_STREAMING | L2CAP_FEAT_FCS; @@ -4310,15 +4375,15 @@ static inline int l2cap_information_req(struct l2cap_conn *conn, else l2cap_fixed_chan[0] &= ~L2CAP_FC_A2MP; - rsp->type = __constant_cpu_to_le16(L2CAP_IT_FIXED_CHAN); - rsp->result = __constant_cpu_to_le16(L2CAP_IR_SUCCESS); + rsp->type = cpu_to_le16(L2CAP_IT_FIXED_CHAN); + rsp->result = cpu_to_le16(L2CAP_IR_SUCCESS); memcpy(rsp->data, l2cap_fixed_chan, sizeof(l2cap_fixed_chan)); l2cap_send_cmd(conn, cmd->ident, L2CAP_INFO_RSP, sizeof(buf), buf); } else { struct l2cap_info_rsp rsp; rsp.type = cpu_to_le16(type); - rsp.result = __constant_cpu_to_le16(L2CAP_IR_NOTSUPP); + rsp.result = cpu_to_le16(L2CAP_IR_NOTSUPP); l2cap_send_cmd(conn, cmd->ident, L2CAP_INFO_RSP, sizeof(rsp), &rsp); } @@ -4363,7 +4428,7 @@ static inline int l2cap_information_rsp(struct l2cap_conn *conn, if (conn->feat_mask & L2CAP_FEAT_FIXED_CHAN) { struct l2cap_info_req req; - req.type = __constant_cpu_to_le16(L2CAP_IT_FIXED_CHAN); + req.type = cpu_to_le16(L2CAP_IT_FIXED_CHAN); conn->info_ident = l2cap_get_ident(conn); @@ -4457,8 +4522,8 @@ static int l2cap_create_channel_req(struct l2cap_conn *conn, error: rsp.dcid = 0; rsp.scid = cpu_to_le16(scid); - rsp.result = __constant_cpu_to_le16(L2CAP_CR_BAD_AMP); - rsp.status = __constant_cpu_to_le16(L2CAP_CS_NO_INFO); + rsp.result = cpu_to_le16(L2CAP_CR_BAD_AMP); + rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); l2cap_send_cmd(conn, cmd->ident, L2CAP_CREATE_CHAN_RSP, sizeof(rsp), &rsp); @@ -4522,7 +4587,7 @@ static void l2cap_send_move_chan_cfm_icid(struct l2cap_conn *conn, u16 
icid) BT_DBG("conn %p, icid 0x%4.4x", conn, icid); cfm.icid = cpu_to_le16(icid); - cfm.result = __constant_cpu_to_le16(L2CAP_MC_UNCONFIRMED); + cfm.result = cpu_to_le16(L2CAP_MC_UNCONFIRMED); l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_MOVE_CHAN_CFM, sizeof(cfm), &cfm); @@ -4705,12 +4770,12 @@ static void l2cap_do_create(struct l2cap_chan *chan, int result, if (result == L2CAP_CR_SUCCESS) { /* Send successful response */ - rsp.result = __constant_cpu_to_le16(L2CAP_CR_SUCCESS); - rsp.status = __constant_cpu_to_le16(L2CAP_CS_NO_INFO); + rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS); + rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); } else { /* Send negative response */ - rsp.result = __constant_cpu_to_le16(L2CAP_CR_NO_MEM); - rsp.status = __constant_cpu_to_le16(L2CAP_CS_NO_INFO); + rsp.result = cpu_to_le16(L2CAP_CR_NO_MEM); + rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); } l2cap_send_cmd(chan->conn, chan->ident, L2CAP_CREATE_CHAN_RSP, @@ -4838,7 +4903,7 @@ static inline int l2cap_move_channel_req(struct l2cap_conn *conn, chan = l2cap_get_chan_by_dcid(conn, icid); if (!chan) { rsp.icid = cpu_to_le16(icid); - rsp.result = __constant_cpu_to_le16(L2CAP_MR_NOT_ALLOWED); + rsp.result = cpu_to_le16(L2CAP_MR_NOT_ALLOWED); l2cap_send_cmd(conn, cmd->ident, L2CAP_MOVE_CHAN_RSP, sizeof(rsp), &rsp); return 0; @@ -5155,18 +5220,17 @@ static inline int l2cap_check_conn_param(u16 min, u16 max, u16 latency, static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, - u8 *data) + u16 cmd_len, u8 *data) { struct hci_conn *hcon = conn->hcon; struct l2cap_conn_param_update_req *req; struct l2cap_conn_param_update_rsp rsp; - u16 min, max, latency, to_multiplier, cmd_len; + u16 min, max, latency, to_multiplier; int err; if (!(hcon->link_mode & HCI_LM_MASTER)) return -EINVAL; - cmd_len = __le16_to_cpu(cmd->len); if (cmd_len != sizeof(struct l2cap_conn_param_update_req)) return -EPROTO; @@ -5183,9 +5247,9 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, err = l2cap_check_conn_param(min, max, latency, to_multiplier); if (err) - rsp.result = __constant_cpu_to_le16(L2CAP_CONN_PARAM_REJECTED); + rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_REJECTED); else - rsp.result = __constant_cpu_to_le16(L2CAP_CONN_PARAM_ACCEPTED); + rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_ACCEPTED); l2cap_send_cmd(conn, cmd->ident, L2CAP_CONN_PARAM_UPDATE_RSP, sizeof(rsp), &rsp); @@ -5196,6 +5260,65 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, return 0; } +static int l2cap_le_connect_rsp(struct l2cap_conn *conn, + struct l2cap_cmd_hdr *cmd, u16 cmd_len, + u8 *data) +{ + struct l2cap_le_conn_rsp *rsp = (struct l2cap_le_conn_rsp *) data; + u16 dcid, mtu, mps, credits, result; + struct l2cap_chan *chan; + int err; + + if (cmd_len < sizeof(*rsp)) + return -EPROTO; + + dcid = __le16_to_cpu(rsp->dcid); + mtu = __le16_to_cpu(rsp->mtu); + mps = __le16_to_cpu(rsp->mps); + credits = __le16_to_cpu(rsp->credits); + result = __le16_to_cpu(rsp->result); + + if (result == L2CAP_CR_SUCCESS && (mtu < 23 || mps < 23)) + return -EPROTO; + + BT_DBG("dcid 0x%4.4x mtu %u mps %u credits %u result 0x%2.2x", + dcid, mtu, mps, credits, result); + + mutex_lock(&conn->chan_lock); + + chan = __l2cap_get_chan_by_ident(conn, cmd->ident); + if (!chan) { + err = -EBADSLT; + goto unlock; + } + + err = 0; + + l2cap_chan_lock(chan); + + switch (result) { + case L2CAP_CR_SUCCESS: + chan->ident = 0; + chan->dcid = dcid; + chan->omtu = mtu; + chan->remote_mps = mps; + chan->tx_credits = credits; + 
l2cap_chan_ready(chan); + break; + + default: + l2cap_chan_del(chan, ECONNREFUSED); + break; + } + + l2cap_chan_unlock(chan); + +unlock: + mutex_unlock(&conn->chan_lock); + + return err; +} + static inline int l2cap_bredr_sig_cmd(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data) @@ -5276,23 +5399,235 @@ static inline int l2cap_bredr_sig_cmd(struct l2cap_conn *conn, return err; } +static int l2cap_le_connect_req(struct l2cap_conn *conn, + struct l2cap_cmd_hdr *cmd, u16 cmd_len, + u8 *data) +{ + struct l2cap_le_conn_req *req = (struct l2cap_le_conn_req *) data; + struct l2cap_le_conn_rsp rsp; + struct l2cap_chan *chan, *pchan; + u16 dcid, scid, credits, mtu, mps; + __le16 psm; + u8 result; + + if (cmd_len != sizeof(*req)) + return -EPROTO; + + scid = __le16_to_cpu(req->scid); + mtu = __le16_to_cpu(req->mtu); + mps = __le16_to_cpu(req->mps); + psm = req->psm; + dcid = 0; + credits = 0; + + if (mtu < 23 || mps < 23) + return -EPROTO; + + BT_DBG("psm 0x%2.2x scid 0x%4.4x mtu %u mps %u", __le16_to_cpu(psm), + scid, mtu, mps); + + /* Check if we have socket listening on psm */ + pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src, + &conn->hcon->dst, LE_LINK); + if (!pchan) { + result = L2CAP_CR_BAD_PSM; + chan = NULL; + goto response; + } + + mutex_lock(&conn->chan_lock); + l2cap_chan_lock(pchan); + + if (!smp_sufficient_security(conn->hcon, pchan->sec_level)) { + result = L2CAP_CR_AUTHENTICATION; + chan = NULL; + goto response_unlock; + } + + /* Check if we already have channel with that dcid */ + if (__l2cap_get_chan_by_dcid(conn, scid)) { + result = L2CAP_CR_NO_MEM; + chan = NULL; + goto response_unlock; + } + + chan = pchan->ops->new_connection(pchan); + if (!chan) { + result = L2CAP_CR_NO_MEM; + goto response_unlock; + } + + l2cap_le_flowctl_init(chan); + + bacpy(&chan->src, &conn->hcon->src); + bacpy(&chan->dst, &conn->hcon->dst); + chan->src_type = bdaddr_type(conn->hcon, conn->hcon->src_type); + chan->dst_type = bdaddr_type(conn->hcon, conn->hcon->dst_type); + chan->psm = psm; + chan->dcid = scid; + chan->omtu = mtu; + chan->remote_mps = mps; + chan->tx_credits = __le16_to_cpu(req->credits); + + __l2cap_chan_add(conn, chan); + dcid = chan->scid; + credits = chan->rx_credits; + + __set_chan_timer(chan, chan->ops->get_sndtimeo(chan)); + + chan->ident = cmd->ident; + + if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) { + l2cap_state_change(chan, BT_CONNECT2); + result = L2CAP_CR_PEND; + chan->ops->defer(chan); + } else { + l2cap_chan_ready(chan); + result = L2CAP_CR_SUCCESS; + } + +response_unlock: + l2cap_chan_unlock(pchan); + mutex_unlock(&conn->chan_lock); + + if (result == L2CAP_CR_PEND) + return 0; + +response: + if (chan) { + rsp.mtu = cpu_to_le16(chan->imtu); + rsp.mps = cpu_to_le16(chan->mps); + } else { + rsp.mtu = 0; + rsp.mps = 0; + } + + rsp.dcid = cpu_to_le16(dcid); + rsp.credits = cpu_to_le16(credits); + rsp.result = cpu_to_le16(result); + + l2cap_send_cmd(conn, cmd->ident, L2CAP_LE_CONN_RSP, sizeof(rsp), &rsp); + + return 0; +} + +static inline int l2cap_le_credits(struct l2cap_conn *conn, + struct l2cap_cmd_hdr *cmd, u16 cmd_len, + u8 *data) +{ + struct l2cap_le_credits *pkt; + struct l2cap_chan *chan; + u16 cid, credits, max_credits; + + if (cmd_len != sizeof(*pkt)) + return -EPROTO; + + pkt = (struct l2cap_le_credits *) data; + cid = __le16_to_cpu(pkt->cid); + credits = __le16_to_cpu(pkt->credits); + + BT_DBG("cid 0x%4.4x credits 0x%4.4x", cid, credits); + + chan = l2cap_get_chan_by_dcid(conn, cid); + if (!chan) + return -EBADSLT; + + 
max_credits = LE_FLOWCTL_MAX_CREDITS - chan->tx_credits; + if (credits > max_credits) { + BT_ERR("LE credits overflow"); + l2cap_send_disconn_req(chan, ECONNRESET); + + /* Return 0 so that we don't trigger an unnecessary + * command reject packet. + */ + return 0; + } + + chan->tx_credits += credits; + + while (chan->tx_credits && !skb_queue_empty(&chan->tx_q)) { + l2cap_do_send(chan, skb_dequeue(&chan->tx_q)); + chan->tx_credits--; + } + + if (chan->tx_credits) + chan->ops->resume(chan); + + l2cap_chan_unlock(chan); + + return 0; +} + +static inline int l2cap_le_command_rej(struct l2cap_conn *conn, + struct l2cap_cmd_hdr *cmd, u16 cmd_len, + u8 *data) +{ + struct l2cap_cmd_rej_unk *rej = (struct l2cap_cmd_rej_unk *) data; + struct l2cap_chan *chan; + + if (cmd_len < sizeof(*rej)) + return -EPROTO; + + mutex_lock(&conn->chan_lock); + + chan = __l2cap_get_chan_by_ident(conn, cmd->ident); + if (!chan) + goto done; + + l2cap_chan_lock(chan); + l2cap_chan_del(chan, ECONNREFUSED); + l2cap_chan_unlock(chan); + +done: + mutex_unlock(&conn->chan_lock); + return 0; +} + static inline int l2cap_le_sig_cmd(struct l2cap_conn *conn, - struct l2cap_cmd_hdr *cmd, u8 *data) + struct l2cap_cmd_hdr *cmd, u16 cmd_len, + u8 *data) { + int err = 0; + switch (cmd->code) { case L2CAP_COMMAND_REJ: - return 0; + l2cap_le_command_rej(conn, cmd, cmd_len, data); + break; case L2CAP_CONN_PARAM_UPDATE_REQ: - return l2cap_conn_param_update_req(conn, cmd, data); + err = l2cap_conn_param_update_req(conn, cmd, cmd_len, data); + break; case L2CAP_CONN_PARAM_UPDATE_RSP: - return 0; + break; + + case L2CAP_LE_CONN_RSP: + l2cap_le_connect_rsp(conn, cmd, cmd_len, data); + break; + + case L2CAP_LE_CONN_REQ: + err = l2cap_le_connect_req(conn, cmd, cmd_len, data); + break; + + case L2CAP_LE_CREDITS: + err = l2cap_le_credits(conn, cmd, cmd_len, data); + break; + + case L2CAP_DISCONN_REQ: + err = l2cap_disconnect_req(conn, cmd, cmd_len, data); + break; + + case L2CAP_DISCONN_RSP: + l2cap_disconnect_rsp(conn, cmd, cmd_len, data); + break; default: BT_ERR("Unknown LE signaling command 0x%2.2x", cmd->code); - return -EINVAL; + err = -EINVAL; + break; } + + return err; } static inline void l2cap_le_sig_channel(struct l2cap_conn *conn, @@ -5321,13 +5656,13 @@ static inline void l2cap_le_sig_channel(struct l2cap_conn *conn, goto drop; } - err = l2cap_le_sig_cmd(conn, cmd, skb->data); + err = l2cap_le_sig_cmd(conn, cmd, len, skb->data); if (err) { struct l2cap_cmd_rej_unk rej; BT_ERR("Wrong link type (%d)", err); - rej.reason = __constant_cpu_to_le16(L2CAP_REJ_NOT_UNDERSTOOD); + rej.reason = cpu_to_le16(L2CAP_REJ_NOT_UNDERSTOOD); l2cap_send_cmd(conn, cmd->ident, L2CAP_COMMAND_REJ, sizeof(rej), &rej); } @@ -5372,7 +5707,7 @@ static inline void l2cap_sig_channel(struct l2cap_conn *conn, BT_ERR("Wrong link type (%d)", err); - rej.reason = __constant_cpu_to_le16(L2CAP_REJ_NOT_UNDERSTOOD); + rej.reason = cpu_to_le16(L2CAP_REJ_NOT_UNDERSTOOD); l2cap_send_cmd(conn, cmd.ident, L2CAP_COMMAND_REJ, sizeof(rej), &rej); } @@ -6312,6 +6647,122 @@ drop: return 0; } +static void l2cap_chan_le_send_credits(struct l2cap_chan *chan) +{ + struct l2cap_conn *conn = chan->conn; + struct l2cap_le_credits pkt; + u16 return_credits; + + /* We return more credits to the sender only after the amount of + * credits falls below half of the initial amount. 
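A small worked example of the return policy described just above, using a hypothetical helper and assuming le_max_credits is 10 (the value is tunable via the l2cap_le_max_credits debugfs entry added later in this patch): while rx_credits stays at 5 or more nothing is sent, and once it drops to, say, 4 the channel returns 6 credits, topping the remote back up to the full 10.

	/* Minimal sketch of the threshold, not part of the patch */
	static u16 credits_to_return(u16 le_max_credits, u16 rx_credits)
	{
		if (rx_credits >= (le_max_credits + 1) / 2)
			return 0;			/* still above half, stay quiet */

		return le_max_credits - rx_credits;	/* refill to the initial amount */
	}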
+ */ + if (chan->rx_credits >= (le_max_credits + 1) / 2) + return; + + return_credits = le_max_credits - chan->rx_credits; + + BT_DBG("chan %p returning %u credits to sender", chan, return_credits); + + chan->rx_credits += return_credits; + + pkt.cid = cpu_to_le16(chan->scid); + pkt.credits = cpu_to_le16(return_credits); + + chan->ident = l2cap_get_ident(conn); + + l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CREDITS, sizeof(pkt), &pkt); +} + +static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) +{ + int err; + + if (!chan->rx_credits) { + BT_ERR("No credits to receive LE L2CAP data"); + l2cap_send_disconn_req(chan, ECONNRESET); + return -ENOBUFS; + } + + if (chan->imtu < skb->len) { + BT_ERR("Too big LE L2CAP PDU"); + return -ENOBUFS; + } + + chan->rx_credits--; + BT_DBG("rx_credits %u -> %u", chan->rx_credits + 1, chan->rx_credits); + + l2cap_chan_le_send_credits(chan); + + err = 0; + + if (!chan->sdu) { + u16 sdu_len; + + sdu_len = get_unaligned_le16(skb->data); + skb_pull(skb, L2CAP_SDULEN_SIZE); + + BT_DBG("Start of new SDU. sdu_len %u skb->len %u imtu %u", + sdu_len, skb->len, chan->imtu); + + if (sdu_len > chan->imtu) { + BT_ERR("Too big LE L2CAP SDU length received"); + err = -EMSGSIZE; + goto failed; + } + + if (skb->len > sdu_len) { + BT_ERR("Too much LE L2CAP data received"); + err = -EINVAL; + goto failed; + } + + if (skb->len == sdu_len) + return chan->ops->recv(chan, skb); + + chan->sdu = skb; + chan->sdu_len = sdu_len; + chan->sdu_last_frag = skb; + + return 0; + } + + BT_DBG("SDU fragment. chan->sdu->len %u skb->len %u chan->sdu_len %u", + chan->sdu->len, skb->len, chan->sdu_len); + + if (chan->sdu->len + skb->len > chan->sdu_len) { + BT_ERR("Too much LE L2CAP data received"); + err = -EINVAL; + goto failed; + } + + append_skb_frag(chan->sdu, skb, &chan->sdu_last_frag); + skb = NULL; + + if (chan->sdu->len == chan->sdu_len) { + err = chan->ops->recv(chan, chan->sdu); + if (!err) { + chan->sdu = NULL; + chan->sdu_last_frag = NULL; + chan->sdu_len = 0; + } + } + +failed: + if (err) { + kfree_skb(skb); + kfree_skb(chan->sdu); + chan->sdu = NULL; + chan->sdu_last_frag = NULL; + chan->sdu_len = 0; + } + + /* We can't return an error here since we took care of the skb + * freeing internally. An error return would cause the caller to + * do a double-free of the skb. + */ + return 0; +} + static void l2cap_data_channel(struct l2cap_conn *conn, u16 cid, struct sk_buff *skb) { @@ -6341,14 +6792,22 @@ static void l2cap_data_channel(struct l2cap_conn *conn, u16 cid, goto drop; switch (chan->mode) { + case L2CAP_MODE_LE_FLOWCTL: + if (l2cap_le_data_rcv(chan, skb) < 0) + goto drop; + + goto done; + case L2CAP_MODE_BASIC: /* If socket recv buffers overflows we drop data here * which is *bad* because L2CAP has to be reliable. * But we don't have any other choice. L2CAP doesn't * provide flow control mechanism. 
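Returning to the LE_FLOWCTL receive path added above (l2cap_le_data_rcv), a purely illustrative example with an MPS of 27 and an imtu of 100: a 100-byte SDU arrives as four K-frames and consumes four receive credits. The first frame carries the 2-byte SDU length plus 25 bytes of data, the remaining frames carry 27, 27 and 21 bytes, and reassembly finishes when chan->sdu->len reaches chan->sdu_len.

	/* Illustrative arithmetic only (MPS 27, SDU 100 bytes):
	 *   frame 1: 2 (SDU length field) + 25 data  -> sdu_len = 100, sdu->len = 25
	 *   frame 2: 27 data                          -> sdu->len = 52
	 *   frame 3: 27 data                          -> sdu->len = 79
	 *   frame 4: 21 data                          -> sdu->len = 100 == sdu_len, ops->recv()
	 */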
*/ - if (chan->imtu < skb->len) + if (chan->imtu < skb->len) { + BT_ERR("Dropping L2CAP data: receive buffer overflow"); goto drop; + } if (!chan->ops->recv(chan, skb)) goto done; @@ -6380,7 +6839,8 @@ static void l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm, if (hcon->type != ACL_LINK) goto drop; - chan = l2cap_global_chan_by_psm(0, psm, &hcon->src, &hcon->dst); + chan = l2cap_global_chan_by_psm(0, psm, &hcon->src, &hcon->dst, + ACL_LINK); if (!chan) goto drop; @@ -6435,9 +6895,16 @@ drop: static void l2cap_recv_frame(struct l2cap_conn *conn, struct sk_buff *skb) { struct l2cap_hdr *lh = (void *) skb->data; + struct hci_conn *hcon = conn->hcon; u16 cid, len; __le16 psm; + if (hcon->state != BT_CONNECTED) { + BT_DBG("queueing pending rx skb"); + skb_queue_tail(&conn->pending_rx, skb); + return; + } + skb_pull(skb, L2CAP_HDR_SIZE); cid = __le16_to_cpu(lh->cid); len = __le16_to_cpu(lh->len); @@ -6473,12 +6940,257 @@ static void l2cap_recv_frame(struct l2cap_conn *conn, struct sk_buff *skb) l2cap_conn_del(conn->hcon, EACCES); break; + case L2CAP_FC_6LOWPAN: + bt_6lowpan_recv(conn, skb); + break; + default: l2cap_data_channel(conn, cid, skb); break; } } +static void process_pending_rx(struct work_struct *work) +{ + struct l2cap_conn *conn = container_of(work, struct l2cap_conn, + pending_rx_work); + struct sk_buff *skb; + + BT_DBG(""); + + while ((skb = skb_dequeue(&conn->pending_rx))) + l2cap_recv_frame(conn, skb); +} + +static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) +{ + struct l2cap_conn *conn = hcon->l2cap_data; + struct hci_chan *hchan; + + if (conn) + return conn; + + hchan = hci_chan_create(hcon); + if (!hchan) + return NULL; + + conn = kzalloc(sizeof(struct l2cap_conn), GFP_KERNEL); + if (!conn) { + hci_chan_del(hchan); + return NULL; + } + + kref_init(&conn->ref); + hcon->l2cap_data = conn; + conn->hcon = hcon; + hci_conn_get(conn->hcon); + conn->hchan = hchan; + + BT_DBG("hcon %p conn %p hchan %p", hcon, conn, hchan); + + switch (hcon->type) { + case LE_LINK: + if (hcon->hdev->le_mtu) { + conn->mtu = hcon->hdev->le_mtu; + break; + } + /* fall through */ + default: + conn->mtu = hcon->hdev->acl_mtu; + break; + } + + conn->feat_mask = 0; + + if (hcon->type == ACL_LINK) + conn->hs_enabled = test_bit(HCI_HS_ENABLED, + &hcon->hdev->dev_flags); + + spin_lock_init(&conn->lock); + mutex_init(&conn->chan_lock); + + INIT_LIST_HEAD(&conn->chan_l); + INIT_LIST_HEAD(&conn->users); + + if (hcon->type == LE_LINK) + INIT_DELAYED_WORK(&conn->security_timer, security_timeout); + else + INIT_DELAYED_WORK(&conn->info_timer, l2cap_info_timeout); + + skb_queue_head_init(&conn->pending_rx); + INIT_WORK(&conn->pending_rx_work, process_pending_rx); + + conn->disc_reason = HCI_ERROR_REMOTE_USER_TERM; + + return conn; +} + +static bool is_valid_psm(u16 psm, u8 dst_type) { + if (!psm) + return false; + + if (bdaddr_type_is_le(dst_type)) + return (psm <= 0x00ff); + + /* PSM must be odd and lsb of upper byte must be 0 */ + return ((psm & 0x0101) == 0x0001); +} + +int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, + bdaddr_t *dst, u8 dst_type) +{ + struct l2cap_conn *conn; + struct hci_conn *hcon; + struct hci_dev *hdev; + __u8 auth_type; + int err; + + BT_DBG("%pMR -> %pMR (type %u) psm 0x%2.2x", &chan->src, dst, + dst_type, __le16_to_cpu(psm)); + + hdev = hci_get_route(dst, &chan->src); + if (!hdev) + return -EHOSTUNREACH; + + hci_dev_lock(hdev); + + l2cap_chan_lock(chan); + + if (!is_valid_psm(__le16_to_cpu(psm), dst_type) && !cid && + chan->chan_type != 
L2CAP_CHAN_RAW) { + err = -EINVAL; + goto done; + } + + if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED && !psm) { + err = -EINVAL; + goto done; + } + + if (chan->chan_type == L2CAP_CHAN_FIXED && !cid) { + err = -EINVAL; + goto done; + } + + switch (chan->mode) { + case L2CAP_MODE_BASIC: + break; + case L2CAP_MODE_LE_FLOWCTL: + l2cap_le_flowctl_init(chan); + break; + case L2CAP_MODE_ERTM: + case L2CAP_MODE_STREAMING: + if (!disable_ertm) + break; + /* fall through */ + default: + err = -ENOTSUPP; + goto done; + } + + switch (chan->state) { + case BT_CONNECT: + case BT_CONNECT2: + case BT_CONFIG: + /* Already connecting */ + err = 0; + goto done; + + case BT_CONNECTED: + /* Already connected */ + err = -EISCONN; + goto done; + + case BT_OPEN: + case BT_BOUND: + /* Can connect */ + break; + + default: + err = -EBADFD; + goto done; + } + + /* Set destination address and psm */ + bacpy(&chan->dst, dst); + chan->dst_type = dst_type; + + chan->psm = psm; + chan->dcid = cid; + + auth_type = l2cap_get_auth_type(chan); + + if (bdaddr_type_is_le(dst_type)) { + /* Convert from L2CAP channel address type to HCI address type + */ + if (dst_type == BDADDR_LE_PUBLIC) + dst_type = ADDR_LE_DEV_PUBLIC; + else + dst_type = ADDR_LE_DEV_RANDOM; + + hcon = hci_connect_le(hdev, dst, dst_type, chan->sec_level, + auth_type); + } else { + hcon = hci_connect_acl(hdev, dst, chan->sec_level, auth_type); + } + + if (IS_ERR(hcon)) { + err = PTR_ERR(hcon); + goto done; + } + + conn = l2cap_conn_add(hcon); + if (!conn) { + hci_conn_drop(hcon); + err = -ENOMEM; + goto done; + } + + if (cid && __l2cap_get_chan_by_dcid(conn, cid)) { + hci_conn_drop(hcon); + err = -EBUSY; + goto done; + } + + /* Update source addr of the socket */ + bacpy(&chan->src, &hcon->src); + chan->src_type = bdaddr_type(hcon, hcon->src_type); + + l2cap_chan_unlock(chan); + l2cap_chan_add(conn, chan); + l2cap_chan_lock(chan); + + /* l2cap_chan_add takes its own ref so we can drop this one */ + hci_conn_drop(hcon); + + l2cap_state_change(chan, BT_CONNECT); + __set_chan_timer(chan, chan->ops->get_sndtimeo(chan)); + + /* Release chan->sport so that it can be reused by other + * sockets (as it's only used for listening sockets). 
+ */ + write_lock(&chan_list_lock); + chan->sport = 0; + write_unlock(&chan_list_lock); + + if (hcon->state == BT_CONNECTED) { + if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { + __clear_chan_timer(chan); + if (l2cap_chan_check_security(chan)) + l2cap_state_change(chan, BT_CONNECTED); + } else + l2cap_do_start(chan); + } + + err = 0; + +done: + l2cap_chan_unlock(chan); + hci_dev_unlock(hdev); + hci_dev_put(hdev); + return err; +} + /* ---- L2CAP interface with lower layer (HCI) ---- */ int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr) @@ -6540,6 +7252,8 @@ void l2cap_disconn_cfm(struct hci_conn *hcon, u8 reason) { BT_DBG("hcon %p reason %d", hcon, reason); + bt_6lowpan_del_conn(hcon->l2cap_data); + l2cap_conn_del(hcon, bt_to_errno(reason)); } @@ -6551,7 +7265,8 @@ static inline void l2cap_check_encryption(struct l2cap_chan *chan, u8 encrypt) if (encrypt == 0x00) { if (chan->sec_level == BT_SECURITY_MEDIUM) { __set_chan_timer(chan, L2CAP_ENC_TIMEOUT); - } else if (chan->sec_level == BT_SECURITY_HIGH) + } else if (chan->sec_level == BT_SECURITY_HIGH || + chan->sec_level == BT_SECURITY_FIPS) l2cap_chan_close(chan, ECONNREFUSED); } else { if (chan->sec_level == BT_SECURITY_MEDIUM) @@ -6571,7 +7286,7 @@ int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) if (hcon->type == LE_LINK) { if (!status && encrypt) - smp_distribute_keys(conn, 0); + smp_distribute_keys(conn); cancel_delayed_work(&conn->security_timer); } @@ -6583,7 +7298,7 @@ int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) BT_DBG("chan %p scid 0x%4.4x state %s", chan, chan->scid, state_to_string(chan->state)); - if (chan->chan_type == L2CAP_CHAN_CONN_FIX_A2MP) { + if (chan->scid == L2CAP_CID_A2MP) { l2cap_chan_unlock(chan); continue; } @@ -6612,11 +7327,10 @@ int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) } if (chan->state == BT_CONNECT) { - if (!status) { + if (!status) l2cap_start_connection(chan); - } else { + else __set_chan_timer(chan, L2CAP_DISC_TIMEOUT); - } } else if (chan->state == BT_CONNECT2) { struct l2cap_conn_rsp rsp; __u16 res, stat; @@ -6817,11 +7531,19 @@ int __init l2cap_init(void) l2cap_debugfs = debugfs_create_file("l2cap", 0444, bt_debugfs, NULL, &l2cap_debugfs_fops); + debugfs_create_u16("l2cap_le_max_credits", 0644, bt_debugfs, + &le_max_credits); + debugfs_create_u16("l2cap_le_default_mps", 0644, bt_debugfs, + &le_default_mps); + + bt_6lowpan_init(); + return 0; } void l2cap_exit(void) { + bt_6lowpan_cleanup(); debugfs_remove(l2cap_debugfs); l2cap_cleanup_sockets(); } diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 7cc24d263ca..e1378693cc9 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -27,6 +27,7 @@ /* Bluetooth L2CAP sockets. 
*/ +#include <linux/module.h> #include <linux/export.h> #include <net/bluetooth/bluetooth.h> @@ -50,6 +51,32 @@ bool l2cap_is_socket(struct socket *sock) } EXPORT_SYMBOL(l2cap_is_socket); +static int l2cap_validate_bredr_psm(u16 psm) +{ + /* PSM must be odd and lsb of upper byte must be 0 */ + if ((psm & 0x0101) != 0x0001) + return -EINVAL; + + /* Restrict usage of well-known PSMs */ + if (psm < 0x1001 && !capable(CAP_NET_BIND_SERVICE)) + return -EACCES; + + return 0; +} + +static int l2cap_validate_le_psm(u16 psm) +{ + /* Valid LE_PSM ranges are defined only until 0x00ff */ + if (psm > 0x00ff) + return -EINVAL; + + /* Restrict fixed, SIG assigned PSM values to CAP_NET_BIND_SERVICE */ + if (psm <= 0x007f && !capable(CAP_NET_BIND_SERVICE)) + return -EACCES; + + return 0; +} + static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) { struct sock *sk = sock->sk; @@ -72,12 +99,19 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) if (!bdaddr_type_is_valid(la.l2_bdaddr_type)) return -EINVAL; + if (la.l2_cid) { + /* When the socket gets created it defaults to + * CHAN_CONN_ORIENTED, so we need to overwrite the + * default here. + */ + chan->chan_type = L2CAP_CHAN_FIXED; + chan->omtu = L2CAP_DEFAULT_MTU; + } + if (bdaddr_type_is_le(la.l2_bdaddr_type)) { - /* Connection oriented channels are not supported on LE */ - if (la.l2_psm) - return -EINVAL; /* We only allow ATT user space socket */ - if (la.l2_cid != __constant_cpu_to_le16(L2CAP_CID_ATT)) + if (la.l2_cid && + la.l2_cid != cpu_to_le16(L2CAP_CID_ATT)) return -EINVAL; } @@ -91,17 +125,13 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) if (la.l2_psm) { __u16 psm = __le16_to_cpu(la.l2_psm); - /* PSM must be odd and lsb of upper byte must be 0 */ - if ((psm & 0x0101) != 0x0001) { - err = -EINVAL; - goto done; - } + if (la.l2_bdaddr_type == BDADDR_BREDR) + err = l2cap_validate_bredr_psm(psm); + else + err = l2cap_validate_le_psm(psm); - /* Restrict usage of well-known PSMs */ - if (psm < 0x1001 && !capable(CAP_NET_BIND_SERVICE)) { - err = -EACCES; + if (err) goto done; - } } if (la.l2_cid) @@ -122,11 +152,17 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) __le16_to_cpu(la.l2_psm) == L2CAP_PSM_RFCOMM) chan->sec_level = BT_SECURITY_SDP; break; + case L2CAP_CHAN_RAW: + chan->sec_level = BT_SECURITY_SDP; + break; } bacpy(&chan->src, &la.l2_bdaddr); chan->src_type = la.l2_bdaddr_type; + if (chan->psm && bdaddr_type_is_le(chan->src_type)) + chan->mode = L2CAP_MODE_LE_FLOWCTL; + chan->state = BT_BOUND; sk->sk_state = BT_BOUND; @@ -173,7 +209,7 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, * ATT. Anything else is an invalid combination. 
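To illustrate the only combination these checks accept for LE, a hypothetical user-space snippet (not from this patch; BlueZ's libbluetooth headers are assumed and error handling is omitted) binds and connects an ATT socket with both CIDs fixed to 0x0004:

	#include <string.h>
	#include <sys/socket.h>
	#include <bluetooth/bluetooth.h>
	#include <bluetooth/l2cap.h>

	#ifndef L2CAP_CID_ATT
	#define L2CAP_CID_ATT 0x0004	/* older headers may not define it */
	#endif

	static int open_att_socket(const bdaddr_t *src, const bdaddr_t *dst)
	{
		struct sockaddr_l2 addr;
		int fd = socket(PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP);

		memset(&addr, 0, sizeof(addr));
		addr.l2_family = AF_BLUETOOTH;
		addr.l2_cid = htobs(L2CAP_CID_ATT);
		addr.l2_bdaddr_type = BDADDR_LE_PUBLIC;
		bacpy(&addr.l2_bdaddr, src);
		bind(fd, (struct sockaddr *)&addr, sizeof(addr));

		bacpy(&addr.l2_bdaddr, dst);
		connect(fd, (struct sockaddr *)&addr, sizeof(addr));

		return fd;
	}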
*/ if (chan->scid != L2CAP_CID_ATT || - la.l2_cid != __constant_cpu_to_le16(L2CAP_CID_ATT)) + la.l2_cid != cpu_to_le16(L2CAP_CID_ATT)) return -EINVAL; /* We don't have the hdev available here to make a @@ -189,14 +225,15 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, return -EINVAL; if (bdaddr_type_is_le(la.l2_bdaddr_type)) { - /* Connection oriented channels are not supported on LE */ - if (la.l2_psm) - return -EINVAL; /* We only allow ATT user space socket */ - if (la.l2_cid != __constant_cpu_to_le16(L2CAP_CID_ATT)) + if (la.l2_cid && + la.l2_cid != cpu_to_le16(L2CAP_CID_ATT)) return -EINVAL; } + if (chan->psm && bdaddr_type_is_le(chan->src_type)) + chan->mode = L2CAP_MODE_LE_FLOWCTL; + err = l2cap_chan_connect(chan, la.l2_psm, __le16_to_cpu(la.l2_cid), &la.l2_bdaddr, la.l2_bdaddr_type); if (err) @@ -234,6 +271,7 @@ static int l2cap_sock_listen(struct socket *sock, int backlog) switch (chan->mode) { case L2CAP_MODE_BASIC: + case L2CAP_MODE_LE_FLOWCTL: break; case L2CAP_MODE_ERTM: case L2CAP_MODE_STREAMING: @@ -322,17 +360,21 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, BT_DBG("sock %p, sk %p", sock, sk); + if (peer && sk->sk_state != BT_CONNECTED && + sk->sk_state != BT_CONNECT && sk->sk_state != BT_CONNECT2) + return -ENOTCONN; + memset(la, 0, sizeof(struct sockaddr_l2)); addr->sa_family = AF_BLUETOOTH; *len = sizeof(struct sockaddr_l2); + la->l2_psm = chan->psm; + if (peer) { - la->l2_psm = chan->psm; bacpy(&la->l2_bdaddr, &chan->dst); la->l2_cid = cpu_to_le16(chan->dcid); la->l2_bdaddr_type = chan->dst_type; } else { - la->l2_psm = chan->sport; bacpy(&la->l2_bdaddr, &chan->src); la->l2_cid = cpu_to_le16(chan->scid); la->l2_bdaddr_type = chan->src_type; @@ -360,6 +402,16 @@ static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, switch (optname) { case L2CAP_OPTIONS: + /* LE sockets should use BT_SNDMTU/BT_RCVMTU, but since + * legacy ATT code depends on getsockopt for + * L2CAP_OPTIONS we need to let this pass. 
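As a usage note for the BT_SNDMTU/BT_RCVMTU options referred to above, a hypothetical user-space sequence (not part of the patch; the 512-byte value is arbitrary) would set the receive MTU before connect() and read the send MTU once the channel is connected. Per the handlers below, the set path copies in a u32 while the get paths copy out a u16.

	#include <stdint.h>
	#include <sys/socket.h>
	#include <bluetooth/bluetooth.h>	/* SOL_BLUETOOTH, BT_SNDMTU, BT_RCVMTU
						 * (define as 12/13 on older headers) */

	static void tune_le_coc_mtu(int fd)
	{
		uint32_t rcv_mtu = 512;		/* setsockopt(BT_RCVMTU) reads a u32 */
		uint16_t snd_mtu;
		socklen_t len = sizeof(snd_mtu);

		/* Only allowed while unconnected; afterwards the kernel returns EISCONN */
		setsockopt(fd, SOL_BLUETOOTH, BT_RCVMTU, &rcv_mtu, sizeof(rcv_mtu));

		/* ... connect() the socket; BT_SNDMTU is only valid once BT_CONNECTED ... */
		getsockopt(fd, SOL_BLUETOOTH, BT_SNDMTU, &snd_mtu, &len);
	}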
+ */ + if (bdaddr_type_is_le(chan->src_type) && + chan->scid != L2CAP_CID_ATT) { + err = -EINVAL; + break; + } + memset(&opts, 0, sizeof(opts)); opts.imtu = chan->imtu; opts.omtu = chan->omtu; @@ -387,6 +439,10 @@ static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT | L2CAP_LM_SECURE; break; + case BT_SECURITY_FIPS: + opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT | + L2CAP_LM_SECURE | L2CAP_LM_FIPS; + break; default: opt = 0; break; @@ -400,6 +456,7 @@ static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, if (put_user(opt, (u32 __user *) optval)) err = -EFAULT; + break; case L2CAP_CONNINFO: @@ -454,6 +511,7 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, switch (optname) { case BT_SECURITY: if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED && + chan->chan_type != L2CAP_CHAN_FIXED && chan->chan_type != L2CAP_CHAN_RAW) { err = -EINVAL; break; @@ -514,6 +572,31 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, err = -EFAULT; break; + case BT_SNDMTU: + if (!bdaddr_type_is_le(chan->src_type)) { + err = -EINVAL; + break; + } + + if (sk->sk_state != BT_CONNECTED) { + err = -ENOTCONN; + break; + } + + if (put_user(chan->omtu, (u16 __user *) optval)) + err = -EFAULT; + break; + + case BT_RCVMTU: + if (!bdaddr_type_is_le(chan->src_type)) { + err = -EINVAL; + break; + } + + if (put_user(chan->imtu, (u16 __user *) optval)) + err = -EFAULT; + break; + default: err = -ENOPROTOOPT; break; @@ -554,6 +637,11 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, switch (optname) { case L2CAP_OPTIONS: + if (bdaddr_type_is_le(chan->src_type)) { + err = -EINVAL; + break; + } + if (sk->sk_state == BT_CONNECTED) { err = -EINVAL; break; @@ -585,6 +673,8 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, chan->mode = opts.mode; switch (chan->mode) { + case L2CAP_MODE_LE_FLOWCTL: + break; case L2CAP_MODE_BASIC: clear_bit(CONF_STATE2_DEVICE, &chan->conf_state); break; @@ -612,6 +702,11 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, break; } + if (opt & L2CAP_LM_FIPS) { + err = -EINVAL; + break; + } + if (opt & L2CAP_LM_AUTH) chan->sec_level = BT_SECURITY_LOW; if (opt & L2CAP_LM_ENCRYPT) @@ -663,6 +758,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, switch (optname) { case BT_SECURITY: if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED && + chan->chan_type != L2CAP_CHAN_FIXED && chan->chan_type != L2CAP_CHAN_RAW) { err = -EINVAL; break; @@ -691,11 +787,6 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, /*change security for LE channels */ if (chan->scid == L2CAP_CID_ATT) { - if (!conn->hcon->out) { - err = -EINVAL; - break; - } - if (smp_conn_security(conn->hcon, sec.level)) break; sk->sk_state = BT_CONFIG; @@ -807,6 +898,37 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; + case BT_SNDMTU: + if (!bdaddr_type_is_le(chan->src_type)) { + err = -EINVAL; + break; + } + + /* Setting is not supported as it's the remote side that + * decides this. 
+ */ + err = -EPERM; + break; + + case BT_RCVMTU: + if (!bdaddr_type_is_le(chan->src_type)) { + err = -EINVAL; + break; + } + + if (sk->sk_state == BT_CONNECTED) { + err = -EISCONN; + break; + } + + if (get_user(opt, (u32 __user *) optval)) { + err = -EFAULT; + break; + } + + chan->imtu = opt; + break; + default: err = -ENOPROTOOPT; break; @@ -859,10 +981,16 @@ static int l2cap_sock_recvmsg(struct kiocb *iocb, struct socket *sock, if (sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { - sk->sk_state = BT_CONFIG; - pi->chan->state = BT_CONFIG; + if (bdaddr_type_is_le(pi->chan->src_type)) { + sk->sk_state = BT_CONNECTED; + pi->chan->state = BT_CONNECTED; + __l2cap_le_connect_rsp_defer(pi->chan); + } else { + sk->sk_state = BT_CONFIG; + pi->chan->state = BT_CONFIG; + __l2cap_connect_rsp_defer(pi->chan); + } - __l2cap_connect_rsp_defer(pi->chan); err = 0; goto done; } @@ -1047,13 +1175,16 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan) /* Check for backlog size */ if (sk_acceptq_is_full(parent)) { BT_DBG("backlog full %d", parent->sk_ack_backlog); + release_sock(parent); return NULL; } sk = l2cap_sock_alloc(sock_net(parent), NULL, BTPROTO_L2CAP, GFP_ATOMIC); - if (!sk) + if (!sk) { + release_sock(parent); return NULL; + } bt_sock_reclassify_lock(sk, BTPROTO_L2CAP); @@ -1138,7 +1269,7 @@ static void l2cap_sock_teardown_cb(struct l2cap_chan *chan, int err) if (parent) { bt_accept_unlink(sk); - parent->sk_data_ready(parent, 0); + parent->sk_data_ready(parent); } else { sk->sk_state_change(sk); } @@ -1194,7 +1325,7 @@ static void l2cap_sock_ready_cb(struct l2cap_chan *chan) sk->sk_state_change(sk); if (parent) - parent->sk_data_ready(parent, 0); + parent->sk_data_ready(parent); release_sock(sk); } @@ -1207,7 +1338,7 @@ static void l2cap_sock_defer_cb(struct l2cap_chan *chan) parent = bt_sk(sk)->parent; if (parent) - parent->sk_data_ready(parent, 0); + parent->sk_data_ready(parent); release_sock(sk); } @@ -1236,6 +1367,14 @@ static long l2cap_sock_get_sndtimeo_cb(struct l2cap_chan *chan) return sk->sk_sndtimeo; } +static void l2cap_sock_suspend_cb(struct l2cap_chan *chan) +{ + struct sock *sk = chan->data; + + set_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags); + sk->sk_state_change(sk); +} + static struct l2cap_ops l2cap_chan_ops = { .name = "L2CAP Socket Interface", .new_connection = l2cap_sock_new_connection_cb, @@ -1246,6 +1385,7 @@ static struct l2cap_ops l2cap_chan_ops = { .ready = l2cap_sock_ready_cb, .defer = l2cap_sock_defer_cb, .resume = l2cap_sock_resume_cb, + .suspend = l2cap_sock_suspend_cb, .set_shutdown = l2cap_sock_set_shutdown_cb, .get_sndtimeo = l2cap_sock_get_sndtimeo_cb, .alloc_skb = l2cap_sock_alloc_skb_cb, @@ -1270,7 +1410,7 @@ static void l2cap_sock_destruct(struct sock *sk) static void l2cap_skb_msg_name(struct sk_buff *skb, void *msg_name, int *msg_namelen) { - struct sockaddr_l2 *la = (struct sockaddr_l2 *) msg_name; + DECLARE_SOCKADDR(struct sockaddr_l2 *, la, msg_name); memset(la, 0, sizeof(struct sockaddr_l2)); la->l2_family = AF_BLUETOOTH; @@ -1303,6 +1443,13 @@ static void l2cap_sock_init(struct sock *sk, struct sock *parent) chan->tx_win_max = pchan->tx_win_max; chan->sec_level = pchan->sec_level; chan->flags = pchan->flags; + chan->tx_credits = pchan->tx_credits; + chan->rx_credits = pchan->rx_credits; + + if (chan->chan_type == L2CAP_CHAN_FIXED) { + chan->scid = pchan->scid; + chan->dcid = pchan->scid; + } security_sk_clone(parent, sk); } else { diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c index 
b3fbc73516c..941ad7530ed 100644 --- a/net/bluetooth/lib.c +++ b/net/bluetooth/lib.c @@ -58,6 +58,7 @@ int bt_to_errno(__u16 code) return EIO; case 0x04: + case 0x3c: return EHOSTDOWN; case 0x05: diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 074d83690a4..af8e0a6243b 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -29,12 +29,13 @@ #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> +#include <net/bluetooth/l2cap.h> #include <net/bluetooth/mgmt.h> #include "smp.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 4 +#define MGMT_REVISION 6 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, @@ -79,6 +80,11 @@ static const u16 mgmt_commands[] = { MGMT_OP_SET_BREDR, MGMT_OP_SET_STATIC_ADDRESS, MGMT_OP_SET_SCAN_PARAMS, + MGMT_OP_SET_SECURE_CONN, + MGMT_OP_SET_DEBUG_KEYS, + MGMT_OP_SET_PRIVACY, + MGMT_OP_LOAD_IRKS, + MGMT_OP_GET_CONN_INFO, }; static const u16 mgmt_events[] = { @@ -103,6 +109,8 @@ static const u16 mgmt_events[] = { MGMT_EV_DEVICE_UNBLOCKED, MGMT_EV_DEVICE_UNPAIRED, MGMT_EV_PASSKEY_NOTIFY, + MGMT_EV_NEW_IRK, + MGMT_EV_NEW_CSRK, }; #define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000) @@ -127,7 +135,7 @@ static u8 mgmt_status_table[] = { MGMT_STATUS_FAILED, /* Hardware Failure */ MGMT_STATUS_CONNECT_FAILED, /* Page Timeout */ MGMT_STATUS_AUTH_FAILED, /* Authentication Failed */ - MGMT_STATUS_NOT_PAIRED, /* PIN or Key Missing */ + MGMT_STATUS_AUTH_FAILED, /* PIN or Key Missing */ MGMT_STATUS_NO_RESOURCES, /* Memory Full */ MGMT_STATUS_TIMEOUT, /* Connection Timeout */ MGMT_STATUS_NO_RESOURCES, /* Max Number of Connections */ @@ -207,7 +215,7 @@ static int cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status) hdr = (void *) skb_put(skb, sizeof(*hdr)); - hdr->opcode = __constant_cpu_to_le16(MGMT_EV_CMD_STATUS); + hdr->opcode = cpu_to_le16(MGMT_EV_CMD_STATUS); hdr->index = cpu_to_le16(index); hdr->len = cpu_to_le16(sizeof(*ev)); @@ -238,7 +246,7 @@ static int cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, hdr = (void *) skb_put(skb, sizeof(*hdr)); - hdr->opcode = __constant_cpu_to_le16(MGMT_EV_CMD_COMPLETE); + hdr->opcode = cpu_to_le16(MGMT_EV_CMD_COMPLETE); hdr->index = cpu_to_le16(index); hdr->len = cpu_to_le16(sizeof(*ev) + rp_len); @@ -264,7 +272,7 @@ static int read_version(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG("sock %p", sk); rp.version = MGMT_VERSION; - rp.revision = __constant_cpu_to_le16(MGMT_REVISION); + rp.revision = cpu_to_le16(MGMT_REVISION); return cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_VERSION, 0, &rp, sizeof(rp)); @@ -288,8 +296,8 @@ static int read_commands(struct sock *sk, struct hci_dev *hdev, void *data, if (!rp) return -ENOMEM; - rp->num_commands = __constant_cpu_to_le16(num_commands); - rp->num_events = __constant_cpu_to_le16(num_events); + rp->num_commands = cpu_to_le16(num_commands); + rp->num_events = cpu_to_le16(num_events); for (i = 0, opcode = rp->opcodes; i < num_commands; i++, opcode++) put_unaligned_le16(mgmt_commands[i], opcode); @@ -363,6 +371,7 @@ static u32 get_supported_settings(struct hci_dev *hdev) settings |= MGMT_SETTING_POWERED; settings |= MGMT_SETTING_PAIRABLE; + settings |= MGMT_SETTING_DEBUG_KEYS; if (lmp_bredr_capable(hdev)) { settings |= MGMT_SETTING_CONNECTABLE; @@ -376,11 +385,16 @@ static u32 get_supported_settings(struct hci_dev *hdev) settings |= MGMT_SETTING_SSP; settings |= MGMT_SETTING_HS; } + + if (lmp_sc_capable(hdev) || + test_bit(HCI_FORCE_SC, &hdev->dev_flags)) + settings |= MGMT_SETTING_SECURE_CONN; } if (lmp_le_capable(hdev)) 
{ settings |= MGMT_SETTING_LE; settings |= MGMT_SETTING_ADVERTISING; + settings |= MGMT_SETTING_PRIVACY; } return settings; @@ -423,6 +437,15 @@ static u32 get_current_settings(struct hci_dev *hdev) if (test_bit(HCI_ADVERTISING, &hdev->dev_flags)) settings |= MGMT_SETTING_ADVERTISING; + if (test_bit(HCI_SC_ENABLED, &hdev->dev_flags)) + settings |= MGMT_SETTING_SECURE_CONN; + + if (test_bit(HCI_DEBUG_KEYS, &hdev->dev_flags)) + settings |= MGMT_SETTING_DEBUG_KEYS; + + if (test_bit(HCI_PRIVACY, &hdev->dev_flags)) + settings |= MGMT_SETTING_PRIVACY; + return settings; } @@ -629,14 +652,8 @@ static u8 create_adv_data(struct hci_dev *hdev, u8 *ptr) flags |= get_adv_discov_flags(hdev); - if (test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) { - if (lmp_le_br_capable(hdev)) - flags |= LE_AD_SIM_LE_BREDR_CTRL; - if (lmp_host_le_br_capable(hdev)) - flags |= LE_AD_SIM_LE_BREDR_HOST; - } else { + if (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) flags |= LE_AD_NO_BREDR; - } if (flags) { BT_DBG("adv flags 0x%02x", flags); @@ -803,6 +820,64 @@ static void update_class(struct hci_request *req) hci_req_add(req, HCI_OP_WRITE_CLASS_OF_DEV, sizeof(cod), cod); } +static bool get_connectable(struct hci_dev *hdev) +{ + struct pending_cmd *cmd; + + /* If there's a pending mgmt command the flag will not yet have + * it's final value, so check for this first. + */ + cmd = mgmt_pending_find(MGMT_OP_SET_CONNECTABLE, hdev); + if (cmd) { + struct mgmt_mode *cp = cmd->param; + return cp->val; + } + + return test_bit(HCI_CONNECTABLE, &hdev->dev_flags); +} + +static void enable_advertising(struct hci_request *req) +{ + struct hci_dev *hdev = req->hdev; + struct hci_cp_le_set_adv_param cp; + u8 own_addr_type, enable = 0x01; + bool connectable; + + /* Clear the HCI_ADVERTISING bit temporarily so that the + * hci_update_random_address knows that it's safe to go ahead + * and write a new random address. The flag will be set back on + * as soon as the SET_ADV_ENABLE HCI command completes. + */ + clear_bit(HCI_ADVERTISING, &hdev->dev_flags); + + connectable = get_connectable(hdev); + + /* Set require_privacy to true only when non-connectable + * advertising is used. In that case it is fine to use a + * non-resolvable private address. + */ + if (hci_update_random_address(req, !connectable, &own_addr_type) < 0) + return; + + memset(&cp, 0, sizeof(cp)); + cp.min_interval = cpu_to_le16(0x0800); + cp.max_interval = cpu_to_le16(0x0800); + cp.type = connectable ? LE_ADV_IND : LE_ADV_NONCONN_IND; + cp.own_address_type = own_addr_type; + cp.channel_map = hdev->le_adv_channel_map; + + hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp); + + hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); +} + +static void disable_advertising(struct hci_request *req) +{ + u8 enable = 0x00; + + hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); +} + static void service_cache_off(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, @@ -824,12 +899,39 @@ static void service_cache_off(struct work_struct *work) hci_req_run(&req, NULL); } +static void rpa_expired(struct work_struct *work) +{ + struct hci_dev *hdev = container_of(work, struct hci_dev, + rpa_expired.work); + struct hci_request req; + + BT_DBG(""); + + set_bit(HCI_RPA_EXPIRED, &hdev->dev_flags); + + if (!test_bit(HCI_ADVERTISING, &hdev->dev_flags) || + hci_conn_num(hdev, LE_LINK) > 0) + return; + + /* The generation of a new RPA and programming it into the + * controller happens in the enable_advertising() function. 
+ */ + + hci_req_init(&req, hdev); + + disable_advertising(&req); + enable_advertising(&req); + + hci_req_run(&req, NULL); +} + static void mgmt_init_hdev(struct sock *sk, struct hci_dev *hdev) { if (test_and_set_bit(HCI_MGMT, &hdev->dev_flags)) return; INIT_DELAYED_WORK(&hdev->service_cache, service_cache_off); + INIT_DELAYED_WORK(&hdev->rpa_expired, rpa_expired); /* Non-mgmt controlled devices get this bit set * implicitly so that pairing works for them, however @@ -935,6 +1037,106 @@ static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) sizeof(settings)); } +static void clean_up_hci_complete(struct hci_dev *hdev, u8 status) +{ + BT_DBG("%s status 0x%02x", hdev->name, status); + + if (hci_conn_count(hdev) == 0) { + cancel_delayed_work(&hdev->power_off); + queue_work(hdev->req_workqueue, &hdev->power_off.work); + } +} + +static void hci_stop_discovery(struct hci_request *req) +{ + struct hci_dev *hdev = req->hdev; + struct hci_cp_remote_name_req_cancel cp; + struct inquiry_entry *e; + + switch (hdev->discovery.state) { + case DISCOVERY_FINDING: + if (test_bit(HCI_INQUIRY, &hdev->flags)) { + hci_req_add(req, HCI_OP_INQUIRY_CANCEL, 0, NULL); + } else { + cancel_delayed_work(&hdev->le_scan_disable); + hci_req_add_le_scan_disable(req); + } + + break; + + case DISCOVERY_RESOLVING: + e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, + NAME_PENDING); + if (!e) + return; + + bacpy(&cp.bdaddr, &e->data.bdaddr); + hci_req_add(req, HCI_OP_REMOTE_NAME_REQ_CANCEL, sizeof(cp), + &cp); + + break; + + default: + /* Passive scanning */ + if (test_bit(HCI_LE_SCAN, &hdev->dev_flags)) + hci_req_add_le_scan_disable(req); + break; + } +} + +static int clean_up_hci_state(struct hci_dev *hdev) +{ + struct hci_request req; + struct hci_conn *conn; + + hci_req_init(&req, hdev); + + if (test_bit(HCI_ISCAN, &hdev->flags) || + test_bit(HCI_PSCAN, &hdev->flags)) { + u8 scan = 0x00; + hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); + } + + if (test_bit(HCI_ADVERTISING, &hdev->dev_flags)) + disable_advertising(&req); + + hci_stop_discovery(&req); + + list_for_each_entry(conn, &hdev->conn_hash.list, list) { + struct hci_cp_disconnect dc; + struct hci_cp_reject_conn_req rej; + + switch (conn->state) { + case BT_CONNECTED: + case BT_CONFIG: + dc.handle = cpu_to_le16(conn->handle); + dc.reason = 0x15; /* Terminated due to Power Off */ + hci_req_add(&req, HCI_OP_DISCONNECT, sizeof(dc), &dc); + break; + case BT_CONNECT: + if (conn->type == LE_LINK) + hci_req_add(&req, HCI_OP_LE_CREATE_CONN_CANCEL, + 0, NULL); + else if (conn->type == ACL_LINK) + hci_req_add(&req, HCI_OP_CREATE_CONN_CANCEL, + 6, &conn->dst); + break; + case BT_CONNECT2: + bacpy(&rej.bdaddr, &conn->dst); + rej.reason = 0x15; /* Terminated due to Power Off */ + if (conn->type == ACL_LINK) + hci_req_add(&req, HCI_OP_REJECT_CONN_REQ, + sizeof(rej), &rej); + else if (conn->type == SCO_LINK) + hci_req_add(&req, HCI_OP_REJECT_SYNC_CONN_REQ, + sizeof(rej), &rej); + break; + } + } + + return hci_req_run(&req, clean_up_hci_complete); +} + static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { @@ -978,12 +1180,23 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - if (cp->val) + if (cp->val) { queue_work(hdev->req_workqueue, &hdev->power_on); - else - queue_work(hdev->req_workqueue, &hdev->power_off.work); - - err = 0; + err = 0; + } else { + /* Disconnect connections, stop scans, etc */ + err = clean_up_hci_state(hdev); + if (!err) + 
queue_delayed_work(hdev->req_workqueue, &hdev->power_off, + HCI_POWER_OFF_TIMEOUT); + + /* ENODATA means there were no HCI commands queued */ + if (err == -ENODATA) { + cancel_delayed_work(&hdev->power_off); + queue_work(hdev->req_workqueue, &hdev->power_off.work); + err = 0; + } + } failed: hci_dev_unlock(hdev); @@ -1005,7 +1218,7 @@ static int mgmt_event(u16 event, struct hci_dev *hdev, void *data, u16 data_len, if (hdev) hdr->index = cpu_to_le16(hdev->id); else - hdr->index = __constant_cpu_to_le16(MGMT_INDEX_NONE); + hdr->index = cpu_to_le16(MGMT_INDEX_NONE); hdr->len = cpu_to_le16(data_len); if (data) @@ -1264,7 +1477,7 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, if (cp->val == 0x02) { /* Limited discoverable mode */ - hci_cp.num_iac = 2; + hci_cp.num_iac = min_t(u8, hdev->num_iac, 2); hci_cp.iac_lap[0] = 0x00; /* LIAC */ hci_cp.iac_lap[1] = 0x8b; hci_cp.iac_lap[2] = 0x9e; @@ -1317,15 +1530,15 @@ static void write_fast_connectable(struct hci_request *req, bool enable) type = PAGE_SCAN_TYPE_INTERLACED; /* 160 msec page scan interval */ - acp.interval = __constant_cpu_to_le16(0x0100); + acp.interval = cpu_to_le16(0x0100); } else { type = PAGE_SCAN_TYPE_STANDARD; /* default */ /* default 1.28 sec page scan */ - acp.interval = __constant_cpu_to_le16(0x0800); + acp.interval = cpu_to_le16(0x0800); } - acp.window = __constant_cpu_to_le16(0x0012); + acp.window = cpu_to_le16(0x0012); if (__cpu_to_le16(hdev->page_scan_interval) != acp.interval || __cpu_to_le16(hdev->page_scan_window) != acp.window) @@ -1336,50 +1549,6 @@ static void write_fast_connectable(struct hci_request *req, bool enable) hci_req_add(req, HCI_OP_WRITE_PAGE_SCAN_TYPE, 1, &type); } -static u8 get_adv_type(struct hci_dev *hdev) -{ - struct pending_cmd *cmd; - bool connectable; - - /* If there's a pending mgmt command the flag will not yet have - * it's final value, so check for this first. - */ - cmd = mgmt_pending_find(MGMT_OP_SET_CONNECTABLE, hdev); - if (cmd) { - struct mgmt_mode *cp = cmd->param; - connectable = !!cp->val; - } else { - connectable = test_bit(HCI_CONNECTABLE, &hdev->dev_flags); - } - - return connectable ? 
LE_ADV_IND : LE_ADV_NONCONN_IND; -} - -static void enable_advertising(struct hci_request *req) -{ - struct hci_dev *hdev = req->hdev; - struct hci_cp_le_set_adv_param cp; - u8 enable = 0x01; - - memset(&cp, 0, sizeof(cp)); - cp.min_interval = __constant_cpu_to_le16(0x0800); - cp.max_interval = __constant_cpu_to_le16(0x0800); - cp.type = get_adv_type(hdev); - cp.own_address_type = hdev->own_addr_type; - cp.channel_map = 0x07; - - hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp); - - hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); -} - -static void disable_advertising(struct hci_request *req) -{ - u8 enable = 0x00; - - hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); -} - static void set_connectable_complete(struct hci_dev *hdev, u8 status) { struct pending_cmd *cmd; @@ -2065,7 +2234,7 @@ static int remove_uuid(struct sock *sk, struct hci_dev *hdev, void *data, } if (memcmp(cp->uuid, bt_uuid_any, 16) == 0) { - err = hci_uuids_clear(hdev); + hci_uuids_clear(hdev); if (enable_service_cache(hdev)) { err = cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_UUID, @@ -2205,6 +2374,7 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_cp_load_link_keys *cp = data; u16 key_count, expected_len; + bool changed; int i; BT_DBG("request for %s", hdev->name); @@ -2219,7 +2389,7 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, sizeof(struct mgmt_link_key_info); if (expected_len != len) { BT_ERR("load_link_keys: expected %u bytes, got %u bytes", - len, expected_len); + expected_len, len); return cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, MGMT_STATUS_INVALID_PARAMS); } @@ -2234,7 +2404,7 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, for (i = 0; i < key_count; i++) { struct mgmt_link_key_info *key = &cp->keys[i]; - if (key->addr.type != BDADDR_BREDR) + if (key->addr.type != BDADDR_BREDR || key->type > 0x08) return cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, MGMT_STATUS_INVALID_PARAMS); } @@ -2244,9 +2414,12 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, hci_link_keys_clear(hdev); if (cp->debug_keys) - set_bit(HCI_DEBUG_KEYS, &hdev->dev_flags); + changed = !test_and_set_bit(HCI_DEBUG_KEYS, &hdev->dev_flags); else - clear_bit(HCI_DEBUG_KEYS, &hdev->dev_flags); + changed = test_and_clear_bit(HCI_DEBUG_KEYS, &hdev->dev_flags); + + if (changed) + new_settings(hdev, NULL); for (i = 0; i < key_count; i++) { struct mgmt_link_key_info *key = &cp->keys[i]; @@ -2306,10 +2479,22 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data, goto unlock; } - if (cp->addr.type == BDADDR_BREDR) + if (cp->addr.type == BDADDR_BREDR) { err = hci_remove_link_key(hdev, &cp->addr.bdaddr); - else - err = hci_remove_ltk(hdev, &cp->addr.bdaddr); + } else { + u8 addr_type; + + if (cp->addr.type == BDADDR_LE_PUBLIC) + addr_type = ADDR_LE_DEV_PUBLIC; + else + addr_type = ADDR_LE_DEV_RANDOM; + + hci_remove_irk(hdev, &cp->addr.bdaddr, addr_type); + + hci_conn_params_del(hdev, &cp->addr.bdaddr, addr_type); + + err = hci_remove_ltk(hdev, &cp->addr.bdaddr, addr_type); + } if (err < 0) { err = cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, @@ -2633,6 +2818,16 @@ static void pairing_complete(struct pending_cmd *cmd, u8 status) mgmt_pending_remove(cmd); } +void mgmt_smp_complete(struct hci_conn *conn, bool complete) +{ + u8 status = complete ? 
MGMT_STATUS_SUCCESS : MGMT_STATUS_FAILED; + struct pending_cmd *cmd; + + cmd = find_pairing(conn); + if (cmd) + pairing_complete(cmd, status); +} + static void pairing_complete_cb(struct hci_conn *conn, u8 status) { struct pending_cmd *cmd; @@ -2646,7 +2841,7 @@ static void pairing_complete_cb(struct hci_conn *conn, u8 status) pairing_complete(cmd, mgmt_status(status)); } -static void le_connect_complete_cb(struct hci_conn *conn, u8 status) +static void le_pairing_complete_cb(struct hci_conn *conn, u8 status) { struct pending_cmd *cmd; @@ -2692,17 +2887,24 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, } sec_level = BT_SECURITY_MEDIUM; - if (cp->io_cap == 0x03) - auth_type = HCI_AT_DEDICATED_BONDING; - else - auth_type = HCI_AT_DEDICATED_BONDING_MITM; + auth_type = HCI_AT_DEDICATED_BONDING; - if (cp->addr.type == BDADDR_BREDR) - conn = hci_connect(hdev, ACL_LINK, &cp->addr.bdaddr, - cp->addr.type, sec_level, auth_type); - else - conn = hci_connect(hdev, LE_LINK, &cp->addr.bdaddr, - cp->addr.type, sec_level, auth_type); + if (cp->addr.type == BDADDR_BREDR) { + conn = hci_connect_acl(hdev, &cp->addr.bdaddr, sec_level, + auth_type); + } else { + u8 addr_type; + + /* Convert from L2CAP channel address type to HCI address type + */ + if (cp->addr.type == BDADDR_LE_PUBLIC) + addr_type = ADDR_LE_DEV_PUBLIC; + else + addr_type = ADDR_LE_DEV_RANDOM; + + conn = hci_connect_le(hdev, &cp->addr.bdaddr, addr_type, + sec_level, auth_type); + } if (IS_ERR(conn)) { int status; @@ -2733,13 +2935,16 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, } /* For LE, just connecting isn't a proof that the pairing finished */ - if (cp->addr.type == BDADDR_BREDR) + if (cp->addr.type == BDADDR_BREDR) { conn->connect_cfm_cb = pairing_complete_cb; - else - conn->connect_cfm_cb = le_connect_complete_cb; + conn->security_cfm_cb = pairing_complete_cb; + conn->disconn_cfm_cb = pairing_complete_cb; + } else { + conn->connect_cfm_cb = le_pairing_complete_cb; + conn->security_cfm_cb = le_pairing_complete_cb; + conn->disconn_cfm_cb = le_pairing_complete_cb; + } - conn->security_cfm_cb = pairing_complete_cb; - conn->disconn_cfm_cb = pairing_complete_cb; conn->io_capability = cp->io_cap; cmd->user_data = conn; @@ -2826,8 +3031,13 @@ static int user_pairing_resp(struct sock *sk, struct hci_dev *hdev, } if (addr->type == BDADDR_LE_PUBLIC || addr->type == BDADDR_LE_RANDOM) { - /* Continue with pairing via SMP */ + /* Continue with pairing via SMP. The hdev lock must be + * released as SMP may try to recquire it for crypto + * purposes. 
+ */ + hci_dev_unlock(hdev); err = smp_user_confirm_reply(conn, mgmt_op, passkey); + hci_dev_lock(hdev); if (!err) err = cmd_complete(sk, hdev->id, mgmt_op, @@ -3071,7 +3281,12 @@ static int read_local_oob_data(struct sock *sk, struct hci_dev *hdev, goto unlock; } - err = hci_send_cmd(hdev, HCI_OP_READ_LOCAL_OOB_DATA, 0, NULL); + if (test_bit(HCI_SC_ENABLED, &hdev->dev_flags)) + err = hci_send_cmd(hdev, HCI_OP_READ_LOCAL_OOB_EXT_DATA, + 0, NULL); + else + err = hci_send_cmd(hdev, HCI_OP_READ_LOCAL_OOB_DATA, 0, NULL); + if (err < 0) mgmt_pending_remove(cmd); @@ -3083,23 +3298,46 @@ unlock: static int add_remote_oob_data(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { - struct mgmt_cp_add_remote_oob_data *cp = data; - u8 status; int err; BT_DBG("%s ", hdev->name); hci_dev_lock(hdev); - err = hci_add_remote_oob_data(hdev, &cp->addr.bdaddr, cp->hash, - cp->randomizer); - if (err < 0) - status = MGMT_STATUS_FAILED; - else - status = MGMT_STATUS_SUCCESS; + if (len == MGMT_ADD_REMOTE_OOB_DATA_SIZE) { + struct mgmt_cp_add_remote_oob_data *cp = data; + u8 status; - err = cmd_complete(sk, hdev->id, MGMT_OP_ADD_REMOTE_OOB_DATA, status, - &cp->addr, sizeof(cp->addr)); + err = hci_add_remote_oob_data(hdev, &cp->addr.bdaddr, + cp->hash, cp->randomizer); + if (err < 0) + status = MGMT_STATUS_FAILED; + else + status = MGMT_STATUS_SUCCESS; + + err = cmd_complete(sk, hdev->id, MGMT_OP_ADD_REMOTE_OOB_DATA, + status, &cp->addr, sizeof(cp->addr)); + } else if (len == MGMT_ADD_REMOTE_OOB_EXT_DATA_SIZE) { + struct mgmt_cp_add_remote_oob_ext_data *cp = data; + u8 status; + + err = hci_add_remote_oob_ext_data(hdev, &cp->addr.bdaddr, + cp->hash192, + cp->randomizer192, + cp->hash256, + cp->randomizer256); + if (err < 0) + status = MGMT_STATUS_FAILED; + else + status = MGMT_STATUS_SUCCESS; + + err = cmd_complete(sk, hdev->id, MGMT_OP_ADD_REMOTE_OOB_DATA, + status, &cp->addr, sizeof(cp->addr)); + } else { + BT_ERR("add_remote_oob_data: invalid length of %u bytes", len); + err = cmd_status(sk, hdev->id, MGMT_OP_ADD_REMOTE_OOB_DATA, + MGMT_STATUS_INVALID_PARAMS); + } hci_dev_unlock(hdev); return err; @@ -3152,6 +3390,8 @@ static int mgmt_start_discovery_failed(struct hci_dev *hdev, u8 status) static void start_discovery_complete(struct hci_dev *hdev, u8 status) { + unsigned long timeout = 0; + BT_DBG("status %d", status); if (status) { @@ -3167,13 +3407,11 @@ static void start_discovery_complete(struct hci_dev *hdev, u8 status) switch (hdev->discovery.type) { case DISCOV_TYPE_LE: - queue_delayed_work(hdev->workqueue, &hdev->le_scan_disable, - DISCOV_LE_TIMEOUT); + timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT); break; case DISCOV_TYPE_INTERLEAVED: - queue_delayed_work(hdev->workqueue, &hdev->le_scan_disable, - DISCOV_INTERLEAVED_TIMEOUT); + timeout = msecs_to_jiffies(hdev->discov_interleaved_timeout); break; case DISCOV_TYPE_BREDR: @@ -3182,6 +3420,11 @@ static void start_discovery_complete(struct hci_dev *hdev, u8 status) default: BT_ERR("Invalid discovery type %d", hdev->discovery.type); } + + if (!timeout) + return; + + queue_delayed_work(hdev->workqueue, &hdev->le_scan_disable, timeout); } static int start_discovery(struct sock *sk, struct hci_dev *hdev, @@ -3195,7 +3438,7 @@ static int start_discovery(struct sock *sk, struct hci_dev *hdev, struct hci_request req; /* General inquiry access code (GIAC) */ u8 lap[3] = { 0x33, 0x8b, 0x9e }; - u8 status; + u8 status, own_addr_type; int err; BT_DBG("%s", hdev->name); @@ -3280,18 +3523,31 @@ static int start_discovery(struct sock *sk, struct hci_dev *hdev, 
goto failed; } - if (test_bit(HCI_LE_SCAN, &hdev->dev_flags)) { + /* If controller is scanning, it means the background scanning + * is running. Thus, we should temporarily stop it in order to + * set the discovery scanning parameters. + */ + if (test_bit(HCI_LE_SCAN, &hdev->dev_flags)) + hci_req_add_le_scan_disable(&req); + + memset(¶m_cp, 0, sizeof(param_cp)); + + /* All active scans will be done with either a resolvable + * private address (when privacy feature has been enabled) + * or unresolvable private address. + */ + err = hci_update_random_address(&req, true, &own_addr_type); + if (err < 0) { err = cmd_status(sk, hdev->id, MGMT_OP_START_DISCOVERY, - MGMT_STATUS_BUSY); + MGMT_STATUS_FAILED); mgmt_pending_remove(cmd); goto failed; } - memset(¶m_cp, 0, sizeof(param_cp)); param_cp.type = LE_SCAN_ACTIVE; param_cp.interval = cpu_to_le16(DISCOV_LE_SCAN_INT); param_cp.window = cpu_to_le16(DISCOV_LE_SCAN_WIN); - param_cp.own_address_type = hdev->own_addr_type; + param_cp.own_address_type = own_addr_type; hci_req_add(&req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp), ¶m_cp); @@ -3358,10 +3614,7 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_cp_stop_discovery *mgmt_cp = data; struct pending_cmd *cmd; - struct hci_cp_remote_name_req_cancel cp; - struct inquiry_entry *e; struct hci_request req; - struct hci_cp_le_set_scan_enable enable_cp; int err; BT_DBG("%s", hdev->name); @@ -3390,55 +3643,22 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, hci_req_init(&req, hdev); - switch (hdev->discovery.state) { - case DISCOVERY_FINDING: - if (test_bit(HCI_INQUIRY, &hdev->flags)) { - hci_req_add(&req, HCI_OP_INQUIRY_CANCEL, 0, NULL); - } else { - cancel_delayed_work(&hdev->le_scan_disable); - - memset(&enable_cp, 0, sizeof(enable_cp)); - enable_cp.enable = LE_SCAN_DISABLE; - hci_req_add(&req, HCI_OP_LE_SET_SCAN_ENABLE, - sizeof(enable_cp), &enable_cp); - } - - break; - - case DISCOVERY_RESOLVING: - e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, - NAME_PENDING); - if (!e) { - mgmt_pending_remove(cmd); - err = cmd_complete(sk, hdev->id, - MGMT_OP_STOP_DISCOVERY, 0, - &mgmt_cp->type, - sizeof(mgmt_cp->type)); - hci_discovery_set_state(hdev, DISCOVERY_STOPPED); - goto unlock; - } - - bacpy(&cp.bdaddr, &e->data.bdaddr); - hci_req_add(&req, HCI_OP_REMOTE_NAME_REQ_CANCEL, sizeof(cp), - &cp); - - break; + hci_stop_discovery(&req); - default: - BT_DBG("unknown discovery state %u", hdev->discovery.state); - - mgmt_pending_remove(cmd); - err = cmd_complete(sk, hdev->id, MGMT_OP_STOP_DISCOVERY, - MGMT_STATUS_FAILED, &mgmt_cp->type, - sizeof(mgmt_cp->type)); + err = hci_req_run(&req, stop_discovery_complete); + if (!err) { + hci_discovery_set_state(hdev, DISCOVERY_STOPPING); goto unlock; } - err = hci_req_run(&req, stop_discovery_complete); - if (err < 0) - mgmt_pending_remove(cmd); - else - hci_discovery_set_state(hdev, DISCOVERY_STOPPING); + mgmt_pending_remove(cmd); + + /* If no HCI commands were sent we're done */ + if (err == -ENODATA) { + err = cmd_complete(sk, hdev->id, MGMT_OP_STOP_DISCOVERY, 0, + &mgmt_cp->type, sizeof(mgmt_cp->type)); + hci_discovery_set_state(hdev, DISCOVERY_STOPPED); + } unlock: hci_dev_unlock(hdev); @@ -3457,15 +3677,17 @@ static int confirm_name(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_lock(hdev); if (!hci_discovery_active(hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_CONFIRM_NAME, - MGMT_STATUS_FAILED); + err = cmd_complete(sk, hdev->id, MGMT_OP_CONFIRM_NAME, + 
MGMT_STATUS_FAILED, &cp->addr, + sizeof(cp->addr)); goto failed; } e = hci_inquiry_cache_lookup_unknown(hdev, &cp->addr.bdaddr); if (!e) { - err = cmd_status(sk, hdev->id, MGMT_OP_CONFIRM_NAME, - MGMT_STATUS_INVALID_PARAMS); + err = cmd_complete(sk, hdev->id, MGMT_OP_CONFIRM_NAME, + MGMT_STATUS_INVALID_PARAMS, &cp->addr, + sizeof(cp->addr)); goto failed; } @@ -3754,6 +3976,21 @@ static int set_scan_params(struct sock *sk, struct hci_dev *hdev, err = cmd_complete(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS, 0, NULL, 0); + /* If background scan is running, restart it so new parameters are + * loaded. + */ + if (test_bit(HCI_LE_SCAN, &hdev->dev_flags) && + hdev->discovery.state == DISCOVERY_STOPPED) { + struct hci_request req; + + hci_req_init(&req, hdev); + + hci_req_add_le_scan_disable(&req); + hci_req_add_le_passive_scan(&req); + + hci_req_run(&req, NULL); + } + hci_dev_unlock(hdev); return err; @@ -3999,15 +4236,269 @@ unlock: return err; } +static int set_secure_conn(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_mode *cp = data; + struct pending_cmd *cmd; + u8 val, status; + int err; + + BT_DBG("request for %s", hdev->name); + + status = mgmt_bredr_support(hdev); + if (status) + return cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN, + status); + + if (!lmp_sc_capable(hdev) && + !test_bit(HCI_FORCE_SC, &hdev->dev_flags)) + return cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN, + MGMT_STATUS_NOT_SUPPORTED); + + if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02) + return cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN, + MGMT_STATUS_INVALID_PARAMS); + + hci_dev_lock(hdev); + + if (!hdev_is_powered(hdev)) { + bool changed; + + if (cp->val) { + changed = !test_and_set_bit(HCI_SC_ENABLED, + &hdev->dev_flags); + if (cp->val == 0x02) + set_bit(HCI_SC_ONLY, &hdev->dev_flags); + else + clear_bit(HCI_SC_ONLY, &hdev->dev_flags); + } else { + changed = test_and_clear_bit(HCI_SC_ENABLED, + &hdev->dev_flags); + clear_bit(HCI_SC_ONLY, &hdev->dev_flags); + } + + err = send_settings_rsp(sk, MGMT_OP_SET_SECURE_CONN, hdev); + if (err < 0) + goto failed; + + if (changed) + err = new_settings(hdev, sk); + + goto failed; + } + + if (mgmt_pending_find(MGMT_OP_SET_SECURE_CONN, hdev)) { + err = cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN, + MGMT_STATUS_BUSY); + goto failed; + } + + val = !!cp->val; + + if (val == test_bit(HCI_SC_ENABLED, &hdev->dev_flags) && + (cp->val == 0x02) == test_bit(HCI_SC_ONLY, &hdev->dev_flags)) { + err = send_settings_rsp(sk, MGMT_OP_SET_SECURE_CONN, hdev); + goto failed; + } + + cmd = mgmt_pending_add(sk, MGMT_OP_SET_SECURE_CONN, hdev, data, len); + if (!cmd) { + err = -ENOMEM; + goto failed; + } + + err = hci_send_cmd(hdev, HCI_OP_WRITE_SC_SUPPORT, 1, &val); + if (err < 0) { + mgmt_pending_remove(cmd); + goto failed; + } + + if (cp->val == 0x02) + set_bit(HCI_SC_ONLY, &hdev->dev_flags); + else + clear_bit(HCI_SC_ONLY, &hdev->dev_flags); + +failed: + hci_dev_unlock(hdev); + return err; +} + +static int set_debug_keys(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_mode *cp = data; + bool changed; + int err; + + BT_DBG("request for %s", hdev->name); + + if (cp->val != 0x00 && cp->val != 0x01) + return cmd_status(sk, hdev->id, MGMT_OP_SET_DEBUG_KEYS, + MGMT_STATUS_INVALID_PARAMS); + + hci_dev_lock(hdev); + + if (cp->val) + changed = !test_and_set_bit(HCI_DEBUG_KEYS, &hdev->dev_flags); + else + changed = test_and_clear_bit(HCI_DEBUG_KEYS, &hdev->dev_flags); + + err = send_settings_rsp(sk, MGMT_OP_SET_DEBUG_KEYS, 
hdev); + if (err < 0) + goto unlock; + + if (changed) + err = new_settings(hdev, sk); + +unlock: + hci_dev_unlock(hdev); + return err; +} + +static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data, + u16 len) +{ + struct mgmt_cp_set_privacy *cp = cp_data; + bool changed; + int err; + + BT_DBG("request for %s", hdev->name); + + if (!lmp_le_capable(hdev)) + return cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY, + MGMT_STATUS_NOT_SUPPORTED); + + if (cp->privacy != 0x00 && cp->privacy != 0x01) + return cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY, + MGMT_STATUS_INVALID_PARAMS); + + if (hdev_is_powered(hdev)) + return cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY, + MGMT_STATUS_REJECTED); + + hci_dev_lock(hdev); + + /* If user space supports this command it is also expected to + * handle IRKs. Therefore, set the HCI_RPA_RESOLVING flag. + */ + set_bit(HCI_RPA_RESOLVING, &hdev->dev_flags); + + if (cp->privacy) { + changed = !test_and_set_bit(HCI_PRIVACY, &hdev->dev_flags); + memcpy(hdev->irk, cp->irk, sizeof(hdev->irk)); + set_bit(HCI_RPA_EXPIRED, &hdev->dev_flags); + } else { + changed = test_and_clear_bit(HCI_PRIVACY, &hdev->dev_flags); + memset(hdev->irk, 0, sizeof(hdev->irk)); + clear_bit(HCI_RPA_EXPIRED, &hdev->dev_flags); + } + + err = send_settings_rsp(sk, MGMT_OP_SET_PRIVACY, hdev); + if (err < 0) + goto unlock; + + if (changed) + err = new_settings(hdev, sk); + +unlock: + hci_dev_unlock(hdev); + return err; +} + +static bool irk_is_valid(struct mgmt_irk_info *irk) +{ + switch (irk->addr.type) { + case BDADDR_LE_PUBLIC: + return true; + + case BDADDR_LE_RANDOM: + /* Two most significant bits shall be set */ + if ((irk->addr.bdaddr.b[5] & 0xc0) != 0xc0) + return false; + return true; + } + + return false; +} + +static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, + u16 len) +{ + struct mgmt_cp_load_irks *cp = cp_data; + u16 irk_count, expected_len; + int i, err; + + BT_DBG("request for %s", hdev->name); + + if (!lmp_le_capable(hdev)) + return cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS, + MGMT_STATUS_NOT_SUPPORTED); + + irk_count = __le16_to_cpu(cp->irk_count); + + expected_len = sizeof(*cp) + irk_count * sizeof(struct mgmt_irk_info); + if (expected_len != len) { + BT_ERR("load_irks: expected %u bytes, got %u bytes", + expected_len, len); + return cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS, + MGMT_STATUS_INVALID_PARAMS); + } + + BT_DBG("%s irk_count %u", hdev->name, irk_count); + + for (i = 0; i < irk_count; i++) { + struct mgmt_irk_info *key = &cp->irks[i]; + + if (!irk_is_valid(key)) + return cmd_status(sk, hdev->id, + MGMT_OP_LOAD_IRKS, + MGMT_STATUS_INVALID_PARAMS); + } + + hci_dev_lock(hdev); + + hci_smp_irks_clear(hdev); + + for (i = 0; i < irk_count; i++) { + struct mgmt_irk_info *irk = &cp->irks[i]; + u8 addr_type; + + if (irk->addr.type == BDADDR_LE_PUBLIC) + addr_type = ADDR_LE_DEV_PUBLIC; + else + addr_type = ADDR_LE_DEV_RANDOM; + + hci_add_irk(hdev, &irk->addr.bdaddr, addr_type, irk->val, + BDADDR_ANY); + } + + set_bit(HCI_RPA_RESOLVING, &hdev->dev_flags); + + err = cmd_complete(sk, hdev->id, MGMT_OP_LOAD_IRKS, 0, NULL, 0); + + hci_dev_unlock(hdev); + + return err; +} + static bool ltk_is_valid(struct mgmt_ltk_info *key) { - if (key->authenticated != 0x00 && key->authenticated != 0x01) - return false; if (key->master != 0x00 && key->master != 0x01) return false; - if (!bdaddr_type_is_le(key->addr.type)) - return false; - return true; + + switch (key->addr.type) { + case BDADDR_LE_PUBLIC: + return true; + + case BDADDR_LE_RANDOM: + /* Two 
most significant bits shall be set */ + if ((key->addr.bdaddr.b[5] & 0xc0) != 0xc0) + return false; + return true; + } + + return false; } static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, @@ -4029,7 +4520,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, sizeof(struct mgmt_ltk_info); if (expected_len != len) { BT_ERR("load_keys: expected %u bytes, got %u bytes", - len, expected_len); + expected_len, len); return cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, MGMT_STATUS_INVALID_PARAMS); } @@ -4051,7 +4542,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, for (i = 0; i < key_count; i++) { struct mgmt_ltk_info *key = &cp->keys[i]; - u8 type, addr_type; + u8 type, addr_type, authenticated; if (key->addr.type == BDADDR_LE_PUBLIC) addr_type = ADDR_LE_DEV_PUBLIC; @@ -4063,9 +4554,20 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, else type = HCI_SMP_LTK_SLAVE; - hci_add_ltk(hdev, &key->addr.bdaddr, addr_type, - type, 0, key->authenticated, key->val, - key->enc_size, key->ediv, key->rand); + switch (key->type) { + case MGMT_LTK_UNAUTHENTICATED: + authenticated = 0x00; + break; + case MGMT_LTK_AUTHENTICATED: + authenticated = 0x01; + break; + default: + continue; + } + + hci_add_ltk(hdev, &key->addr.bdaddr, addr_type, type, + authenticated, key->val, key->enc_size, key->ediv, + key->rand); } err = cmd_complete(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, 0, @@ -4076,6 +4578,218 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, return err; } +struct cmd_conn_lookup { + struct hci_conn *conn; + bool valid_tx_power; + u8 mgmt_status; +}; + +static void get_conn_info_complete(struct pending_cmd *cmd, void *data) +{ + struct cmd_conn_lookup *match = data; + struct mgmt_cp_get_conn_info *cp; + struct mgmt_rp_get_conn_info rp; + struct hci_conn *conn = cmd->user_data; + + if (conn != match->conn) + return; + + cp = (struct mgmt_cp_get_conn_info *) cmd->param; + + memset(&rp, 0, sizeof(rp)); + bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); + rp.addr.type = cp->addr.type; + + if (!match->mgmt_status) { + rp.rssi = conn->rssi; + + if (match->valid_tx_power) { + rp.tx_power = conn->tx_power; + rp.max_tx_power = conn->max_tx_power; + } else { + rp.tx_power = HCI_TX_POWER_INVALID; + rp.max_tx_power = HCI_TX_POWER_INVALID; + } + } + + cmd_complete(cmd->sk, cmd->index, MGMT_OP_GET_CONN_INFO, + match->mgmt_status, &rp, sizeof(rp)); + + hci_conn_drop(conn); + + mgmt_pending_remove(cmd); +} + +static void conn_info_refresh_complete(struct hci_dev *hdev, u8 status) +{ + struct hci_cp_read_rssi *cp; + struct hci_conn *conn; + struct cmd_conn_lookup match; + u16 handle; + + BT_DBG("status 0x%02x", status); + + hci_dev_lock(hdev); + + /* TX power data is valid in case request completed successfully, + * otherwise we assume it's not valid. At the moment we assume that + * either both or none of current and max values are valid to keep code + * simple. + */ + match.valid_tx_power = !status; + + /* Commands sent in request are either Read RSSI or Read Transmit Power + * Level so we check which one was last sent to retrieve connection + * handle. Both commands have handle as first parameter so it's safe to + * cast data on the same command struct. + * + * First command sent is always Read RSSI and we fail only if it fails. + * In other case we simply override error to indicate success as we + * already remembered if TX power value is actually valid. 
+ */ + cp = hci_sent_cmd_data(hdev, HCI_OP_READ_RSSI); + if (!cp) { + cp = hci_sent_cmd_data(hdev, HCI_OP_READ_TX_POWER); + status = 0; + } + + if (!cp) { + BT_ERR("invalid sent_cmd in response"); + goto unlock; + } + + handle = __le16_to_cpu(cp->handle); + conn = hci_conn_hash_lookup_handle(hdev, handle); + if (!conn) { + BT_ERR("unknown handle (%d) in response", handle); + goto unlock; + } + + match.conn = conn; + match.mgmt_status = mgmt_status(status); + + /* Cache refresh is complete, now reply for mgmt request for given + * connection only. + */ + mgmt_pending_foreach(MGMT_OP_GET_CONN_INFO, hdev, + get_conn_info_complete, &match); + +unlock: + hci_dev_unlock(hdev); +} + +static int get_conn_info(struct sock *sk, struct hci_dev *hdev, void *data, + u16 len) +{ + struct mgmt_cp_get_conn_info *cp = data; + struct mgmt_rp_get_conn_info rp; + struct hci_conn *conn; + unsigned long conn_info_age; + int err = 0; + + BT_DBG("%s", hdev->name); + + memset(&rp, 0, sizeof(rp)); + bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); + rp.addr.type = cp->addr.type; + + if (!bdaddr_type_is_valid(cp->addr.type)) + return cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO, + MGMT_STATUS_INVALID_PARAMS, + &rp, sizeof(rp)); + + hci_dev_lock(hdev); + + if (!hdev_is_powered(hdev)) { + err = cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO, + MGMT_STATUS_NOT_POWERED, &rp, sizeof(rp)); + goto unlock; + } + + if (cp->addr.type == BDADDR_BREDR) + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, + &cp->addr.bdaddr); + else + conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &cp->addr.bdaddr); + + if (!conn || conn->state != BT_CONNECTED) { + err = cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO, + MGMT_STATUS_NOT_CONNECTED, &rp, sizeof(rp)); + goto unlock; + } + + /* To avoid client trying to guess when to poll again for information we + * calculate conn info age as random value between min/max set in hdev. + */ + conn_info_age = hdev->conn_info_min_age + + prandom_u32_max(hdev->conn_info_max_age - + hdev->conn_info_min_age); + + /* Query controller to refresh cached values if they are too old or were + * never read. + */ + if (time_after(jiffies, conn->conn_info_timestamp + + msecs_to_jiffies(conn_info_age)) || + !conn->conn_info_timestamp) { + struct hci_request req; + struct hci_cp_read_tx_power req_txp_cp; + struct hci_cp_read_rssi req_rssi_cp; + struct pending_cmd *cmd; + + hci_req_init(&req, hdev); + req_rssi_cp.handle = cpu_to_le16(conn->handle); + hci_req_add(&req, HCI_OP_READ_RSSI, sizeof(req_rssi_cp), + &req_rssi_cp); + + /* For LE links TX power does not change thus we don't need to + * query for it once value is known. 
+ */ + if (!bdaddr_type_is_le(cp->addr.type) || + conn->tx_power == HCI_TX_POWER_INVALID) { + req_txp_cp.handle = cpu_to_le16(conn->handle); + req_txp_cp.type = 0x00; + hci_req_add(&req, HCI_OP_READ_TX_POWER, + sizeof(req_txp_cp), &req_txp_cp); + } + + /* Max TX power needs to be read only once per connection */ + if (conn->max_tx_power == HCI_TX_POWER_INVALID) { + req_txp_cp.handle = cpu_to_le16(conn->handle); + req_txp_cp.type = 0x01; + hci_req_add(&req, HCI_OP_READ_TX_POWER, + sizeof(req_txp_cp), &req_txp_cp); + } + + err = hci_req_run(&req, conn_info_refresh_complete); + if (err < 0) + goto unlock; + + cmd = mgmt_pending_add(sk, MGMT_OP_GET_CONN_INFO, hdev, + data, len); + if (!cmd) { + err = -ENOMEM; + goto unlock; + } + + hci_conn_hold(conn); + cmd->user_data = conn; + + conn->conn_info_timestamp = jiffies; + } else { + /* Cache is valid, just reply with values cached in hci_conn */ + rp.rssi = conn->rssi; + rp.tx_power = conn->tx_power; + rp.max_tx_power = conn->max_tx_power; + + err = cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO, + MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); + } + +unlock: + hci_dev_unlock(hdev); + return err; +} + static const struct mgmt_handler { int (*func) (struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len); @@ -4115,7 +4829,7 @@ static const struct mgmt_handler { { user_passkey_reply, false, MGMT_USER_PASSKEY_REPLY_SIZE }, { user_passkey_neg_reply, false, MGMT_USER_PASSKEY_NEG_REPLY_SIZE }, { read_local_oob_data, false, MGMT_READ_LOCAL_OOB_DATA_SIZE }, - { add_remote_oob_data, false, MGMT_ADD_REMOTE_OOB_DATA_SIZE }, + { add_remote_oob_data, true, MGMT_ADD_REMOTE_OOB_DATA_SIZE }, { remove_remote_oob_data, false, MGMT_REMOVE_REMOTE_OOB_DATA_SIZE }, { start_discovery, false, MGMT_START_DISCOVERY_SIZE }, { stop_discovery, false, MGMT_STOP_DISCOVERY_SIZE }, @@ -4127,6 +4841,11 @@ static const struct mgmt_handler { { set_bredr, false, MGMT_SETTING_SIZE }, { set_static_address, false, MGMT_SET_STATIC_ADDRESS_SIZE }, { set_scan_params, false, MGMT_SET_SCAN_PARAMS_SIZE }, + { set_secure_conn, false, MGMT_SETTING_SIZE }, + { set_debug_keys, false, MGMT_SETTING_SIZE }, + { set_privacy, false, MGMT_SET_PRIVACY_SIZE }, + { load_irks, true, MGMT_LOAD_IRKS_SIZE }, + { get_conn_info, false, MGMT_GET_CONN_INFO_SIZE }, }; @@ -4243,6 +4962,17 @@ void mgmt_index_removed(struct hci_dev *hdev) mgmt_event(MGMT_EV_INDEX_REMOVED, hdev, NULL, 0, NULL); } +/* This function requires the caller holds hdev->lock */ +static void restart_le_auto_conns(struct hci_dev *hdev) +{ + struct hci_conn_params *p; + + list_for_each_entry(p, &hdev->le_conn_params, list) { + if (p->auto_connect == HCI_AUTO_CONN_ALWAYS) + hci_pend_le_conn_add(hdev, &p->addr, p->addr_type); + } +} + static void powered_complete(struct hci_dev *hdev, u8 status) { struct cmd_lookup match = { NULL, hdev }; @@ -4251,6 +4981,8 @@ static void powered_complete(struct hci_dev *hdev, u8 status) hci_dev_lock(hdev); + restart_le_auto_conns(hdev); + mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, &match); new_settings(hdev, match.sk); @@ -4292,11 +5024,6 @@ static int powered_update_hci(struct hci_dev *hdev) } if (lmp_le_capable(hdev)) { - /* Set random address to static address if configured */ - if (bacmp(&hdev->static_addr, BDADDR_ANY)) - hci_req_add(&req, HCI_OP_LE_SET_RANDOM_ADDR, 6, - &hdev->static_addr); - /* Make sure the controller has a good default for * advertising data. This also applies to the case * where BR/EDR was toggled during the AUTO_OFF phase. 
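[Editorial aside, not part of the patch: the get_conn_info() hunk above caches RSSI/TX power per connection and only re-queries the controller when the cached values are older than a randomly chosen age between hdev->conn_info_min_age and hdev->conn_info_max_age. The following is a minimal userspace C sketch of that freshness rule; conn_info_min_age/max_age and the prandom_u32_max() idea come from the hunk, while struct conn_info_cache, pick_conn_info_age(), needs_refresh() and the demo main() are hypothetical names introduced purely for illustration.]

/* Sketch of the conn-info cache refresh decision (userspace, illustrative only). */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

struct conn_info_cache {
	time_t timestamp;   /* 0 means "never read from the controller" */
	int rssi;
	int tx_power;
};

/* Random age in seconds in [min_age, max_age), mirroring
 * conn_info_min_age + prandom_u32_max(max - min) in the hunk above. */
static time_t pick_conn_info_age(time_t min_age, time_t max_age)
{
	return min_age + rand() % (max_age - min_age);
}

static bool needs_refresh(const struct conn_info_cache *c,
			  time_t now, time_t min_age, time_t max_age)
{
	time_t age = pick_conn_info_age(min_age, max_age);

	/* Refresh when the entry was never filled in or is older than
	 * the randomly chosen age. */
	return c->timestamp == 0 || now > c->timestamp + age;
}

int main(void)
{
	struct conn_info_cache cache = { .timestamp = 0, .rssi = 0, .tx_power = 0 };
	time_t now = time(NULL);

	srand((unsigned int)now);

	if (needs_refresh(&cache, now, 1, 3)) {
		/* In the kernel this is where Read RSSI / Read Transmit Power
		 * Level commands are queued and conn_info_timestamp updated. */
		cache.rssi = -60;
		cache.tx_power = 4;
		cache.timestamp = now;
		printf("refreshed: rssi=%d tx_power=%d\n", cache.rssi, cache.tx_power);
	} else {
		printf("cached: rssi=%d tx_power=%d\n", cache.rssi, cache.tx_power);
	}

	return 0;
}

[Design note, still an aside: randomizing the refresh window between the min and max ages means userspace cannot time its polling to force a controller round-trip on every request, which is exactly the rationale stated in the comment inside get_conn_info().]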
@@ -4422,6 +5149,10 @@ void mgmt_discoverable(struct hci_dev *hdev, u8 discoverable) if (mgmt_pending_find(MGMT_OP_SET_DISCOVERABLE, hdev)) return; + /* Powering off may clear the scan mode - don't let that interfere */ + if (!discoverable && mgmt_pending_find(MGMT_OP_SET_POWERED, hdev)) + return; + if (discoverable) { changed = !test_and_set_bit(HCI_DISCOVERABLE, &hdev->dev_flags); } else { @@ -4455,6 +5186,10 @@ void mgmt_connectable(struct hci_dev *hdev, u8 connectable) if (mgmt_pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) return; + /* Powering off may clear the scan mode - don't let that interfere */ + if (!connectable && mgmt_pending_find(MGMT_OP_SET_POWERED, hdev)) + return; + if (connectable) changed = !test_and_set_bit(HCI_CONNECTABLE, &hdev->dev_flags); else @@ -4464,6 +5199,18 @@ void mgmt_connectable(struct hci_dev *hdev, u8 connectable) new_settings(hdev, NULL); } +void mgmt_advertising(struct hci_dev *hdev, u8 advertising) +{ + /* Powering off may stop advertising - don't let that interfere */ + if (!advertising && mgmt_pending_find(MGMT_OP_SET_POWERED, hdev)) + return; + + if (advertising) + set_bit(HCI_ADVERTISING, &hdev->dev_flags); + else + clear_bit(HCI_ADVERTISING, &hdev->dev_flags); +} + void mgmt_write_scan_failed(struct hci_dev *hdev, u8 scan, u8 status) { u8 mgmt_err = mgmt_status(status); @@ -4494,28 +5241,112 @@ void mgmt_new_link_key(struct hci_dev *hdev, struct link_key *key, mgmt_event(MGMT_EV_NEW_LINK_KEY, hdev, &ev, sizeof(ev), NULL); } -void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, u8 persistent) +static u8 mgmt_ltk_type(struct smp_ltk *ltk) +{ + if (ltk->authenticated) + return MGMT_LTK_AUTHENTICATED; + + return MGMT_LTK_UNAUTHENTICATED; +} + +void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent) { struct mgmt_ev_new_long_term_key ev; memset(&ev, 0, sizeof(ev)); - ev.store_hint = persistent; + /* Devices using resolvable or non-resolvable random addresses + * without providing an indentity resolving key don't require + * to store long term keys. Their addresses will change the + * next time around. + * + * Only when a remote device provides an identity address + * make sure the long term key is stored. If the remote + * identity is known, the long term keys are internally + * mapped to the identity address. So allow static random + * and public addresses here. + */ + if (key->bdaddr_type == ADDR_LE_DEV_RANDOM && + (key->bdaddr.b[5] & 0xc0) != 0xc0) + ev.store_hint = 0x00; + else + ev.store_hint = persistent; + bacpy(&ev.key.addr.bdaddr, &key->bdaddr); ev.key.addr.type = link_to_bdaddr(LE_LINK, key->bdaddr_type); - ev.key.authenticated = key->authenticated; + ev.key.type = mgmt_ltk_type(key); ev.key.enc_size = key->enc_size; ev.key.ediv = key->ediv; + ev.key.rand = key->rand; if (key->type == HCI_SMP_LTK) ev.key.master = 1; - memcpy(ev.key.rand, key->rand, sizeof(key->rand)); memcpy(ev.key.val, key->val, sizeof(key->val)); mgmt_event(MGMT_EV_NEW_LONG_TERM_KEY, hdev, &ev, sizeof(ev), NULL); } +void mgmt_new_irk(struct hci_dev *hdev, struct smp_irk *irk) +{ + struct mgmt_ev_new_irk ev; + + memset(&ev, 0, sizeof(ev)); + + /* For identity resolving keys from devices that are already + * using a public address or static random address, do not + * ask for storing this key. The identity resolving key really + * is only mandatory for devices using resovlable random + * addresses. + * + * Storing all identity resolving keys has the downside that + * they will be also loaded on next boot of they system. 
More + * identity resolving keys, means more time during scanning is + * needed to actually resolve these addresses. + */ + if (bacmp(&irk->rpa, BDADDR_ANY)) + ev.store_hint = 0x01; + else + ev.store_hint = 0x00; + + bacpy(&ev.rpa, &irk->rpa); + bacpy(&ev.irk.addr.bdaddr, &irk->bdaddr); + ev.irk.addr.type = link_to_bdaddr(LE_LINK, irk->addr_type); + memcpy(ev.irk.val, irk->val, sizeof(irk->val)); + + mgmt_event(MGMT_EV_NEW_IRK, hdev, &ev, sizeof(ev), NULL); +} + +void mgmt_new_csrk(struct hci_dev *hdev, struct smp_csrk *csrk, + bool persistent) +{ + struct mgmt_ev_new_csrk ev; + + memset(&ev, 0, sizeof(ev)); + + /* Devices using resolvable or non-resolvable random addresses + * without providing an indentity resolving key don't require + * to store signature resolving keys. Their addresses will change + * the next time around. + * + * Only when a remote device provides an identity address + * make sure the signature resolving key is stored. So allow + * static random and public addresses here. + */ + if (csrk->bdaddr_type == ADDR_LE_DEV_RANDOM && + (csrk->bdaddr.b[5] & 0xc0) != 0xc0) + ev.store_hint = 0x00; + else + ev.store_hint = persistent; + + bacpy(&ev.key.addr.bdaddr, &csrk->bdaddr); + ev.key.addr.type = link_to_bdaddr(LE_LINK, csrk->bdaddr_type); + ev.key.master = csrk->master; + memcpy(ev.key.val, csrk->val, sizeof(csrk->val)); + + mgmt_event(MGMT_EV_NEW_CSRK, hdev, &ev, sizeof(ev), NULL); +} + static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data, u8 data_len) { @@ -4590,11 +5421,32 @@ static void unpair_device_rsp(struct pending_cmd *cmd, void *data) } void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr, - u8 link_type, u8 addr_type, u8 reason) + u8 link_type, u8 addr_type, u8 reason, + bool mgmt_connected) { struct mgmt_ev_device_disconnected ev; + struct pending_cmd *power_off; struct sock *sk = NULL; + power_off = mgmt_pending_find(MGMT_OP_SET_POWERED, hdev); + if (power_off) { + struct mgmt_mode *cp = power_off->param; + + /* The connection is still in hci_conn_hash so test for 1 + * instead of 0 to know if this is the last one. 
+ */ + if (!cp->val && hci_conn_count(hdev) == 1) { + cancel_delayed_work(&hdev->power_off); + queue_work(hdev->req_workqueue, &hdev->power_off.work); + } + } + + if (!mgmt_connected) + return; + + if (link_type != ACL_LINK && link_type != LE_LINK) + return; + mgmt_pending_foreach(MGMT_OP_DISCONNECT, hdev, disconnect_rsp, &sk); bacpy(&ev.addr.bdaddr, bdaddr); @@ -4613,6 +5465,8 @@ void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr, void mgmt_disconnect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 status) { + u8 bdaddr_type = link_to_bdaddr(link_type, addr_type); + struct mgmt_cp_disconnect *cp; struct mgmt_rp_disconnect rp; struct pending_cmd *cmd; @@ -4623,8 +5477,16 @@ void mgmt_disconnect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, if (!cmd) return; + cp = cmd->param; + + if (bacmp(bdaddr, &cp->addr.bdaddr)) + return; + + if (cp->addr.type != bdaddr_type) + return; + bacpy(&rp.addr.bdaddr, bdaddr); - rp.addr.type = link_to_bdaddr(link_type, addr_type); + rp.addr.type = bdaddr_type; cmd_complete(cmd->sk, cmd->index, MGMT_OP_DISCONNECT, mgmt_status(status), &rp, sizeof(rp)); @@ -4636,6 +5498,20 @@ void mgmt_connect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 status) { struct mgmt_ev_connect_failed ev; + struct pending_cmd *power_off; + + power_off = mgmt_pending_find(MGMT_OP_SET_POWERED, hdev); + if (power_off) { + struct mgmt_mode *cp = power_off->param; + + /* The connection is still in hci_conn_hash so test for 1 + * instead of 0 to know if this is the last one. + */ + if (!cp->val && hci_conn_count(hdev) == 1) { + cancel_delayed_work(&hdev->power_off); + queue_work(hdev->req_workqueue, &hdev->power_off.work); + } + } bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = link_to_bdaddr(link_type, addr_type); @@ -4694,7 +5570,7 @@ void mgmt_pin_code_neg_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, } int mgmt_user_confirm_request(struct hci_dev *hdev, bdaddr_t *bdaddr, - u8 link_type, u8 addr_type, __le32 value, + u8 link_type, u8 addr_type, u32 value, u8 confirm_hint) { struct mgmt_ev_user_confirm_request ev; @@ -4704,7 +5580,7 @@ int mgmt_user_confirm_request(struct hci_dev *hdev, bdaddr_t *bdaddr, bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = link_to_bdaddr(link_type, addr_type); ev.confirm_hint = confirm_hint; - ev.value = value; + ev.value = cpu_to_le32(value); return mgmt_event(MGMT_EV_USER_CONFIRM_REQUEST, hdev, &ev, sizeof(ev), NULL); @@ -4897,6 +5773,43 @@ void mgmt_ssp_enable_complete(struct hci_dev *hdev, u8 enable, u8 status) hci_req_run(&req, NULL); } +void mgmt_sc_enable_complete(struct hci_dev *hdev, u8 enable, u8 status) +{ + struct cmd_lookup match = { NULL, hdev }; + bool changed = false; + + if (status) { + u8 mgmt_err = mgmt_status(status); + + if (enable) { + if (test_and_clear_bit(HCI_SC_ENABLED, + &hdev->dev_flags)) + new_settings(hdev, NULL); + clear_bit(HCI_SC_ONLY, &hdev->dev_flags); + } + + mgmt_pending_foreach(MGMT_OP_SET_SECURE_CONN, hdev, + cmd_status_rsp, &mgmt_err); + return; + } + + if (enable) { + changed = !test_and_set_bit(HCI_SC_ENABLED, &hdev->dev_flags); + } else { + changed = test_and_clear_bit(HCI_SC_ENABLED, &hdev->dev_flags); + clear_bit(HCI_SC_ONLY, &hdev->dev_flags); + } + + mgmt_pending_foreach(MGMT_OP_SET_SECURE_CONN, hdev, + settings_rsp, &match); + + if (changed) + new_settings(hdev, match.sk); + + if (match.sk) + sock_put(match.sk); +} + static void sk_lookup(struct pending_cmd *cmd, void *data) { struct cmd_lookup *match = data; @@ -4951,8 +5864,9 
@@ void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status) cmd ? cmd->sk : NULL); } -void mgmt_read_local_oob_data_reply_complete(struct hci_dev *hdev, u8 *hash, - u8 *randomizer, u8 status) +void mgmt_read_local_oob_data_complete(struct hci_dev *hdev, u8 *hash192, + u8 *randomizer192, u8 *hash256, + u8 *randomizer256, u8 status) { struct pending_cmd *cmd; @@ -4966,42 +5880,72 @@ void mgmt_read_local_oob_data_reply_complete(struct hci_dev *hdev, u8 *hash, cmd_status(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, mgmt_status(status)); } else { - struct mgmt_rp_read_local_oob_data rp; + if (test_bit(HCI_SC_ENABLED, &hdev->dev_flags) && + hash256 && randomizer256) { + struct mgmt_rp_read_local_oob_ext_data rp; + + memcpy(rp.hash192, hash192, sizeof(rp.hash192)); + memcpy(rp.randomizer192, randomizer192, + sizeof(rp.randomizer192)); - memcpy(rp.hash, hash, sizeof(rp.hash)); - memcpy(rp.randomizer, randomizer, sizeof(rp.randomizer)); + memcpy(rp.hash256, hash256, sizeof(rp.hash256)); + memcpy(rp.randomizer256, randomizer256, + sizeof(rp.randomizer256)); + + cmd_complete(cmd->sk, hdev->id, + MGMT_OP_READ_LOCAL_OOB_DATA, 0, + &rp, sizeof(rp)); + } else { + struct mgmt_rp_read_local_oob_data rp; - cmd_complete(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, - 0, &rp, sizeof(rp)); + memcpy(rp.hash, hash192, sizeof(rp.hash)); + memcpy(rp.randomizer, randomizer192, + sizeof(rp.randomizer)); + + cmd_complete(cmd->sk, hdev->id, + MGMT_OP_READ_LOCAL_OOB_DATA, 0, + &rp, sizeof(rp)); + } } mgmt_pending_remove(cmd); } void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, - u8 addr_type, u8 *dev_class, s8 rssi, u8 cfm_name, u8 - ssp, u8 *eir, u16 eir_len) + u8 addr_type, u8 *dev_class, s8 rssi, u8 cfm_name, + u8 ssp, u8 *eir, u16 eir_len, u8 *scan_rsp, + u8 scan_rsp_len) { char buf[512]; struct mgmt_ev_device_found *ev = (void *) buf; + struct smp_irk *irk; size_t ev_size; if (!hci_discovery_active(hdev)) return; - /* Leave 5 bytes for a potential CoD field */ - if (sizeof(*ev) + eir_len + 5 > sizeof(buf)) + /* Make sure that the buffer is big enough. The 5 extra bytes + * are for the potential CoD field. 
+ */ + if (sizeof(*ev) + eir_len + scan_rsp_len + 5 > sizeof(buf)) return; memset(buf, 0, sizeof(buf)); - bacpy(&ev->addr.bdaddr, bdaddr); - ev->addr.type = link_to_bdaddr(link_type, addr_type); + irk = hci_get_irk(hdev, bdaddr, addr_type); + if (irk) { + bacpy(&ev->addr.bdaddr, &irk->bdaddr); + ev->addr.type = link_to_bdaddr(link_type, irk->addr_type); + } else { + bacpy(&ev->addr.bdaddr, bdaddr); + ev->addr.type = link_to_bdaddr(link_type, addr_type); + } + ev->rssi = rssi; if (cfm_name) - ev->flags |= __constant_cpu_to_le32(MGMT_DEV_FOUND_CONFIRM_NAME); + ev->flags |= cpu_to_le32(MGMT_DEV_FOUND_CONFIRM_NAME); if (!ssp) - ev->flags |= __constant_cpu_to_le32(MGMT_DEV_FOUND_LEGACY_PAIRING); + ev->flags |= cpu_to_le32(MGMT_DEV_FOUND_LEGACY_PAIRING); if (eir_len > 0) memcpy(ev->eir, eir, eir_len); @@ -5010,8 +5954,11 @@ void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, eir_len = eir_append_data(ev->eir, eir_len, EIR_CLASS_OF_DEV, dev_class, 3); - ev->eir_len = cpu_to_le16(eir_len); - ev_size = sizeof(*ev) + eir_len; + if (scan_rsp_len > 0) + memcpy(ev->eir + eir_len, scan_rsp, scan_rsp_len); + + ev->eir_len = cpu_to_le16(eir_len + scan_rsp_len); + ev_size = sizeof(*ev) + eir_len + scan_rsp_len; mgmt_event(MGMT_EV_DEVICE_FOUND, hdev, ev, ev_size, NULL); } diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index facd8a79c03..754b6fe4f74 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -186,9 +186,9 @@ static void rfcomm_l2state_change(struct sock *sk) rfcomm_schedule(); } -static void rfcomm_l2data_ready(struct sock *sk, int bytes) +static void rfcomm_l2data_ready(struct sock *sk) { - BT_DBG("%p bytes %d", sk, bytes); + BT_DBG("%p", sk); rfcomm_schedule(); } @@ -216,6 +216,7 @@ static int rfcomm_check_security(struct rfcomm_dlc *d) switch (d->sec_level) { case BT_SECURITY_HIGH: + case BT_SECURITY_FIPS: auth_type = HCI_AT_GENERAL_BONDING_MITM; break; case BT_SECURITY_MEDIUM: @@ -306,7 +307,7 @@ struct rfcomm_dlc *rfcomm_dlc_alloc(gfp_t prio) setup_timer(&d->timer, rfcomm_dlc_timeout, (unsigned long)d); skb_queue_head_init(&d->tx_queue); - spin_lock_init(&d->lock); + mutex_init(&d->lock); atomic_set(&d->refcnt, 1); rfcomm_dlc_clear_state(d); @@ -359,6 +360,11 @@ static struct rfcomm_dlc *rfcomm_dlc_get(struct rfcomm_session *s, u8 dlci) return NULL; } +static int rfcomm_check_channel(u8 channel) +{ + return channel < 1 || channel > 30; +} + static int __rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, u8 channel) { struct rfcomm_session *s; @@ -368,7 +374,7 @@ static int __rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, BT_DBG("dlc %p state %ld %pMR -> %pMR channel %d", d, d->state, src, dst, channel); - if (channel < 1 || channel > 30) + if (rfcomm_check_channel(channel)) return -EINVAL; if (d->state != BT_OPEN && d->state != BT_CLOSED) @@ -425,6 +431,20 @@ int rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, u8 chann return r; } +static void __rfcomm_dlc_disconn(struct rfcomm_dlc *d) +{ + struct rfcomm_session *s = d->session; + + d->state = BT_DISCONN; + if (skb_queue_empty(&d->tx_queue)) { + rfcomm_send_disc(s, d->dlci); + rfcomm_dlc_set_timer(d, RFCOMM_DISC_TIMEOUT); + } else { + rfcomm_queue_disc(d); + rfcomm_dlc_set_timer(d, RFCOMM_DISC_TIMEOUT * 2); + } +} + static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err) { struct rfcomm_session *s = d->session; @@ -437,32 +457,29 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err) switch 
(d->state) { case BT_CONNECT: case BT_CONFIG: + case BT_OPEN: + case BT_CONNECT2: if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) { set_bit(RFCOMM_AUTH_REJECT, &d->flags); rfcomm_schedule(); - break; + return 0; } - /* Fall through */ + } + switch (d->state) { + case BT_CONNECT: case BT_CONNECTED: - d->state = BT_DISCONN; - if (skb_queue_empty(&d->tx_queue)) { - rfcomm_send_disc(s, d->dlci); - rfcomm_dlc_set_timer(d, RFCOMM_DISC_TIMEOUT); - } else { - rfcomm_queue_disc(d); - rfcomm_dlc_set_timer(d, RFCOMM_DISC_TIMEOUT * 2); - } + __rfcomm_dlc_disconn(d); break; - case BT_OPEN: - case BT_CONNECT2: - if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) { - set_bit(RFCOMM_AUTH_REJECT, &d->flags); - rfcomm_schedule(); + case BT_CONFIG: + if (s->state != BT_BOUND) { + __rfcomm_dlc_disconn(d); break; } - /* Fall through */ + /* if closing a dlc in a session that hasn't been started, + * just close and unlink the dlc + */ default: rfcomm_dlc_clear_timer(d); @@ -513,6 +530,25 @@ no_session: return r; } +struct rfcomm_dlc *rfcomm_dlc_exists(bdaddr_t *src, bdaddr_t *dst, u8 channel) +{ + struct rfcomm_session *s; + struct rfcomm_dlc *dlc = NULL; + u8 dlci; + + if (rfcomm_check_channel(channel)) + return ERR_PTR(-EINVAL); + + rfcomm_lock(); + s = rfcomm_session_get(src, dst); + if (s) { + dlci = __dlci(!s->initiator, channel); + dlc = rfcomm_dlc_get(s, dlci); + } + rfcomm_unlock(); + return dlc; +} + int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb) { int len = skb->len; @@ -533,6 +569,20 @@ int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb) return len; } +void rfcomm_dlc_send_noerror(struct rfcomm_dlc *d, struct sk_buff *skb) +{ + int len = skb->len; + + BT_DBG("dlc %p mtu %d len %d", d, d->mtu, len); + + rfcomm_make_uih(skb, d->addr); + skb_queue_tail(&d->tx_queue, skb); + + if (d->state == BT_CONNECTED && + !test_bit(RFCOMM_TX_THROTTLED, &d->flags)) + rfcomm_schedule(); +} + void __rfcomm_dlc_throttle(struct rfcomm_dlc *d) { BT_DBG("dlc %p state %ld", d, d->state); @@ -718,7 +768,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bacpy(&addr.l2_bdaddr, dst); addr.l2_family = AF_BLUETOOTH; - addr.l2_psm = __constant_cpu_to_le16(RFCOMM_PSM); + addr.l2_psm = cpu_to_le16(RFCOMM_PSM); addr.l2_cid = 0; addr.l2_bdaddr_type = BDADDR_BREDR; *err = kernel_connect(sock, (struct sockaddr *) &addr, sizeof(addr), O_NONBLOCK); @@ -1943,12 +1993,11 @@ static void rfcomm_process_sessions(void) continue; } - if (s->state == BT_LISTEN) { + switch (s->state) { + case BT_LISTEN: rfcomm_accept_connection(s); continue; - } - switch (s->state) { case BT_BOUND: s = rfcomm_check_connection(s); break; @@ -1983,7 +2032,7 @@ static int rfcomm_add_listener(bdaddr_t *ba) /* Bind socket */ bacpy(&addr.l2_bdaddr, ba); addr.l2_family = AF_BLUETOOTH; - addr.l2_psm = __constant_cpu_to_le16(RFCOMM_PSM); + addr.l2_psm = cpu_to_le16(RFCOMM_PSM); addr.l2_cid = 0; addr.l2_bdaddr_type = BDADDR_BREDR; err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr)); @@ -2085,7 +2134,8 @@ static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt) set_bit(RFCOMM_SEC_PENDING, &d->flags); rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); continue; - } else if (d->sec_level == BT_SECURITY_HIGH) { + } else if (d->sec_level == BT_SECURITY_HIGH || + d->sec_level == BT_SECURITY_FIPS) { set_bit(RFCOMM_ENC_DROP, &d->flags); continue; } diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 3c2d3e4aa2f..c603a5eb472 100644 --- a/net/bluetooth/rfcomm/sock.c +++ 
b/net/bluetooth/rfcomm/sock.c @@ -54,7 +54,7 @@ static void rfcomm_sk_data_ready(struct rfcomm_dlc *d, struct sk_buff *skb) atomic_add(skb->len, &sk->sk_rmem_alloc); skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) rfcomm_dlc_throttle(d); @@ -84,7 +84,7 @@ static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) sock_set_flag(sk, SOCK_ZAPPED); bt_accept_unlink(sk); } - parent->sk_data_ready(parent, 0); + parent->sk_data_ready(parent); } else { if (d->state == BT_CONNECTED) rfcomm_session_getaddr(d->session, @@ -105,13 +105,18 @@ static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) } /* ---- Socket functions ---- */ -static struct sock *__rfcomm_get_sock_by_addr(u8 channel, bdaddr_t *src) +static struct sock *__rfcomm_get_listen_sock_by_addr(u8 channel, bdaddr_t *src) { struct sock *sk = NULL; sk_for_each(sk, &rfcomm_sk_list.head) { - if (rfcomm_pi(sk)->channel == channel && - !bacmp(&rfcomm_pi(sk)->src, src)) + if (rfcomm_pi(sk)->channel != channel) + continue; + + if (bacmp(&rfcomm_pi(sk)->src, src)) + continue; + + if (sk->sk_state == BT_BOUND || sk->sk_state == BT_LISTEN) break; } @@ -331,6 +336,7 @@ static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr { struct sockaddr_rc *sa = (struct sockaddr_rc *) addr; struct sock *sk = sock->sk; + int chan = sa->rc_channel; int err = 0; BT_DBG("sk %p %pMR", sk, &sa->rc_bdaddr); @@ -352,12 +358,12 @@ static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr write_lock(&rfcomm_sk_list.lock); - if (sa->rc_channel && __rfcomm_get_sock_by_addr(sa->rc_channel, &sa->rc_bdaddr)) { + if (chan && __rfcomm_get_listen_sock_by_addr(chan, &sa->rc_bdaddr)) { err = -EADDRINUSE; } else { /* Save source address */ bacpy(&rfcomm_pi(sk)->src, &sa->rc_bdaddr); - rfcomm_pi(sk)->channel = sa->rc_channel; + rfcomm_pi(sk)->channel = chan; sk->sk_state = BT_BOUND; } @@ -439,7 +445,7 @@ static int rfcomm_sock_listen(struct socket *sock, int backlog) write_lock(&rfcomm_sk_list.lock); for (channel = 1; channel < 31; channel++) - if (!__rfcomm_get_sock_by_addr(channel, src)) { + if (!__rfcomm_get_listen_sock_by_addr(channel, src)) { rfcomm_pi(sk)->channel = channel; err = 0; break; @@ -528,6 +534,10 @@ static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int * BT_DBG("sock %p, sk %p", sock, sk); + if (peer && sk->sk_state != BT_CONNECTED && + sk->sk_state != BT_CONNECT && sk->sk_state != BT_CONNECT2) + return -ENOTCONN; + memset(sa, 0, sizeof(*sa)); sa->rc_family = AF_BLUETOOTH; sa->rc_channel = rfcomm_pi(sk)->channel; @@ -648,6 +658,11 @@ static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __u break; } + if (opt & RFCOMM_LM_FIPS) { + err = -EINVAL; + break; + } + if (opt & RFCOMM_LM_AUTH) rfcomm_pi(sk)->sec_level = BT_SECURITY_LOW; if (opt & RFCOMM_LM_ENCRYPT) @@ -762,7 +777,11 @@ static int rfcomm_sock_getsockopt_old(struct socket *sock, int optname, char __u break; case BT_SECURITY_HIGH: opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT | - RFCOMM_LM_SECURE; + RFCOMM_LM_SECURE; + break; + case BT_SECURITY_FIPS: + opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT | + RFCOMM_LM_SECURE | RFCOMM_LM_FIPS; break; default: opt = 0; @@ -774,6 +793,7 @@ static int rfcomm_sock_getsockopt_old(struct socket *sock, int optname, char __u if (put_user(opt, (u32 __user *) optval)) err = -EFAULT; + break; case RFCOMM_CONNINFO: diff --git a/net/bluetooth/rfcomm/tty.c 
b/net/bluetooth/rfcomm/tty.c index 84fcf9fff3e..8e385a0ae60 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -40,6 +40,7 @@ #define RFCOMM_TTY_MAJOR 216 /* device node major id of the usb/bluetooth.c driver */ #define RFCOMM_TTY_MINOR 0 +static DEFINE_MUTEX(rfcomm_ioctl_mutex); static struct tty_driver *rfcomm_tty_driver; struct rfcomm_dev { @@ -51,6 +52,8 @@ struct rfcomm_dev { unsigned long flags; int err; + unsigned long status; /* don't export to userspace */ + bdaddr_t src; bdaddr_t dst; u8 channel; @@ -67,7 +70,7 @@ struct rfcomm_dev { }; static LIST_HEAD(rfcomm_dev_list); -static DEFINE_SPINLOCK(rfcomm_dev_lock); +static DEFINE_MUTEX(rfcomm_dev_lock); static void rfcomm_dev_data_ready(struct rfcomm_dlc *dlc, struct sk_buff *skb); static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err); @@ -82,10 +85,6 @@ static void rfcomm_dev_destruct(struct tty_port *port) BT_DBG("dev %p dlc %p", dev, dlc); - spin_lock(&rfcomm_dev_lock); - list_del(&dev->list); - spin_unlock(&rfcomm_dev_lock); - rfcomm_dlc_lock(dlc); /* Detach DLC if it's owned by this dev */ if (dlc->owner == dev) @@ -94,7 +93,12 @@ static void rfcomm_dev_destruct(struct tty_port *port) rfcomm_dlc_put(dlc); - tty_unregister_device(rfcomm_tty_driver, dev->id); + if (dev->tty_dev) + tty_unregister_device(rfcomm_tty_driver, dev->id); + + mutex_lock(&rfcomm_dev_lock); + list_del(&dev->list); + mutex_unlock(&rfcomm_dev_lock); kfree(dev); @@ -107,8 +111,12 @@ static void rfcomm_dev_destruct(struct tty_port *port) static int rfcomm_dev_activate(struct tty_port *port, struct tty_struct *tty) { struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port); + int err; - return rfcomm_dlc_open(dev->dlc, &dev->src, &dev->dst, dev->channel); + err = rfcomm_dlc_open(dev->dlc, &dev->src, &dev->dst, dev->channel); + if (err) + set_bit(TTY_IO_ERROR, &tty->flags); + return err; } /* we block the open until the dlc->state becomes BT_CONNECTED */ @@ -138,7 +146,7 @@ static const struct tty_port_operations rfcomm_port_ops = { .carrier_raised = rfcomm_dev_carrier_raised, }; -static struct rfcomm_dev *__rfcomm_dev_get(int id) +static struct rfcomm_dev *__rfcomm_dev_lookup(int id) { struct rfcomm_dev *dev; @@ -153,36 +161,41 @@ static struct rfcomm_dev *rfcomm_dev_get(int id) { struct rfcomm_dev *dev; - spin_lock(&rfcomm_dev_lock); + mutex_lock(&rfcomm_dev_lock); - dev = __rfcomm_dev_get(id); + dev = __rfcomm_dev_lookup(id); - if (dev) { - if (test_bit(RFCOMM_TTY_RELEASED, &dev->flags)) - dev = NULL; - else - tty_port_get(&dev->port); - } + if (dev && !tty_port_get(&dev->port)) + dev = NULL; - spin_unlock(&rfcomm_dev_lock); + mutex_unlock(&rfcomm_dev_lock); return dev; } -static struct device *rfcomm_get_device(struct rfcomm_dev *dev) +static void rfcomm_reparent_device(struct rfcomm_dev *dev) { struct hci_dev *hdev; struct hci_conn *conn; hdev = hci_get_route(&dev->dst, &dev->src); if (!hdev) - return NULL; + return; + /* The lookup results are unsafe to access without the + * hci device lock (FIXME: why is this not documented?) + */ + hci_dev_lock(hdev); conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &dev->dst); - hci_dev_put(hdev); + /* Just because the acl link is in the hash table is no + * guarantee the sysfs device has been added ... + */ + if (conn && device_is_registered(&conn->dev)) + device_move(dev->tty_dev, &conn->dev, DPM_ORDER_DEV_AFTER_PARENT); - return conn ? 
&conn->dev : NULL; + hci_dev_unlock(hdev); + hci_dev_put(hdev); } static ssize_t show_address(struct device *tty_dev, struct device_attribute *attr, char *buf) @@ -200,19 +213,18 @@ static ssize_t show_channel(struct device *tty_dev, struct device_attribute *att static DEVICE_ATTR(address, S_IRUGO, show_address, NULL); static DEVICE_ATTR(channel, S_IRUGO, show_channel, NULL); -static int rfcomm_dev_add(struct rfcomm_dev_req *req, struct rfcomm_dlc *dlc) +static struct rfcomm_dev *__rfcomm_dev_add(struct rfcomm_dev_req *req, + struct rfcomm_dlc *dlc) { struct rfcomm_dev *dev, *entry; struct list_head *head = &rfcomm_dev_list; int err = 0; - BT_DBG("id %d channel %d", req->dev_id, req->channel); - dev = kzalloc(sizeof(struct rfcomm_dev), GFP_KERNEL); if (!dev) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - spin_lock(&rfcomm_dev_lock); + mutex_lock(&rfcomm_dev_lock); if (req->dev_id < 0) { dev->id = 0; @@ -293,22 +305,37 @@ static int rfcomm_dev_add(struct rfcomm_dev_req *req, struct rfcomm_dlc *dlc) holds reference to this module. */ __module_get(THIS_MODULE); + mutex_unlock(&rfcomm_dev_lock); + return dev; + out: - spin_unlock(&rfcomm_dev_lock); + mutex_unlock(&rfcomm_dev_lock); + kfree(dev); + return ERR_PTR(err); +} + +static int rfcomm_dev_add(struct rfcomm_dev_req *req, struct rfcomm_dlc *dlc) +{ + struct rfcomm_dev *dev; + struct device *tty; + + BT_DBG("id %d channel %d", req->dev_id, req->channel); - if (err < 0) - goto free; + dev = __rfcomm_dev_add(req, dlc); + if (IS_ERR(dev)) { + rfcomm_dlc_put(dlc); + return PTR_ERR(dev); + } - dev->tty_dev = tty_port_register_device(&dev->port, rfcomm_tty_driver, + tty = tty_port_register_device(&dev->port, rfcomm_tty_driver, dev->id, NULL); - if (IS_ERR(dev->tty_dev)) { - err = PTR_ERR(dev->tty_dev); - spin_lock(&rfcomm_dev_lock); - list_del(&dev->list); - spin_unlock(&rfcomm_dev_lock); - goto free; + if (IS_ERR(tty)) { + tty_port_put(&dev->port); + return PTR_ERR(tty); } + dev->tty_dev = tty; + rfcomm_reparent_device(dev); dev_set_drvdata(dev->tty_dev, dev); if (device_create_file(dev->tty_dev, &dev_attr_address) < 0) @@ -318,24 +345,23 @@ out: BT_ERR("Failed to create channel attribute"); return dev->id; - -free: - kfree(dev); - return err; } /* ---- Send buffer ---- */ -static inline unsigned int rfcomm_room(struct rfcomm_dlc *dlc) +static inline unsigned int rfcomm_room(struct rfcomm_dev *dev) { - /* We can't let it be zero, because we don't get a callback - when tx_credits becomes nonzero, hence we'd never wake up */ - return dlc->mtu * (dlc->tx_credits?:1); + struct rfcomm_dlc *dlc = dev->dlc; + + /* Limit the outstanding number of packets not yet sent to 40 */ + int pending = 40 - atomic_read(&dev->wmem_alloc); + + return max(0, pending) * dlc->mtu; } static void rfcomm_wfree(struct sk_buff *skb) { struct rfcomm_dev *dev = (void *) skb->sk; - atomic_sub(skb->truesize, &dev->wmem_alloc); + atomic_dec(&dev->wmem_alloc); if (test_bit(RFCOMM_TTY_ATTACHED, &dev->flags)) tty_port_tty_wakeup(&dev->port); tty_port_put(&dev->port); @@ -344,28 +370,24 @@ static void rfcomm_wfree(struct sk_buff *skb) static void rfcomm_set_owner_w(struct sk_buff *skb, struct rfcomm_dev *dev) { tty_port_get(&dev->port); - atomic_add(skb->truesize, &dev->wmem_alloc); + atomic_inc(&dev->wmem_alloc); skb->sk = (void *) dev; skb->destructor = rfcomm_wfree; } static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, gfp_t priority) { - if (atomic_read(&dev->wmem_alloc) < rfcomm_room(dev->dlc)) { - struct sk_buff *skb = alloc_skb(size, priority); 
- if (skb) { - rfcomm_set_owner_w(skb, dev); - return skb; - } - } - return NULL; + struct sk_buff *skb = alloc_skb(size, priority); + if (skb) + rfcomm_set_owner_w(skb, dev); + return skb; } /* ---- Device IOCTLs ---- */ #define NOCAP_FLAGS ((1 << RFCOMM_REUSE_DLC) | (1 << RFCOMM_RELEASE_ONHUP)) -static int rfcomm_create_dev(struct sock *sk, void __user *arg) +static int __rfcomm_create_dev(struct sock *sk, void __user *arg) { struct rfcomm_dev_req req; struct rfcomm_dlc *dlc; @@ -387,16 +409,22 @@ static int rfcomm_create_dev(struct sock *sk, void __user *arg) dlc = rfcomm_pi(sk)->dlc; rfcomm_dlc_hold(dlc); } else { + /* Validate the channel is unused */ + dlc = rfcomm_dlc_exists(&req.src, &req.dst, req.channel); + if (IS_ERR(dlc)) + return PTR_ERR(dlc); + else if (dlc) { + rfcomm_dlc_put(dlc); + return -EBUSY; + } dlc = rfcomm_dlc_alloc(GFP_KERNEL); if (!dlc) return -ENOMEM; } id = rfcomm_dev_add(&req, dlc); - if (id < 0) { - rfcomm_dlc_put(dlc); + if (id < 0) return id; - } if (req.flags & (1 << RFCOMM_REUSE_DLC)) { /* DLC is now used by device. @@ -407,7 +435,7 @@ static int rfcomm_create_dev(struct sock *sk, void __user *arg) return id; } -static int rfcomm_release_dev(void __user *arg) +static int __rfcomm_release_dev(void __user *arg) { struct rfcomm_dev_req req; struct rfcomm_dev *dev; @@ -427,6 +455,12 @@ static int rfcomm_release_dev(void __user *arg) return -EPERM; } + /* only release once */ + if (test_and_set_bit(RFCOMM_DEV_RELEASED, &dev->status)) { + tty_port_put(&dev->port); + return -EALREADY; + } + if (req.flags & (1 << RFCOMM_HANGUP_NOW)) rfcomm_dlc_close(dev->dlc, 0); @@ -437,13 +471,35 @@ static int rfcomm_release_dev(void __user *arg) tty_kref_put(tty); } - if (!test_and_set_bit(RFCOMM_TTY_RELEASED, &dev->flags)) + if (!test_bit(RFCOMM_TTY_OWNED, &dev->status)) tty_port_put(&dev->port); tty_port_put(&dev->port); return 0; } +static int rfcomm_create_dev(struct sock *sk, void __user *arg) +{ + int ret; + + mutex_lock(&rfcomm_ioctl_mutex); + ret = __rfcomm_create_dev(sk, arg); + mutex_unlock(&rfcomm_ioctl_mutex); + + return ret; +} + +static int rfcomm_release_dev(void __user *arg) +{ + int ret; + + mutex_lock(&rfcomm_ioctl_mutex); + ret = __rfcomm_release_dev(arg); + mutex_unlock(&rfcomm_ioctl_mutex); + + return ret; +} + static int rfcomm_get_dev_list(void __user *arg) { struct rfcomm_dev *dev; @@ -468,10 +524,10 @@ static int rfcomm_get_dev_list(void __user *arg) di = dl->dev_info; - spin_lock(&rfcomm_dev_lock); + mutex_lock(&rfcomm_dev_lock); list_for_each_entry(dev, &rfcomm_dev_list, list) { - if (test_bit(RFCOMM_TTY_RELEASED, &dev->flags)) + if (!tty_port_get(&dev->port)) continue; (di + n)->id = dev->id; (di + n)->flags = dev->flags; @@ -479,11 +535,12 @@ static int rfcomm_get_dev_list(void __user *arg) (di + n)->channel = dev->channel; bacpy(&(di + n)->src, &dev->src); bacpy(&(di + n)->dst, &dev->dst); + tty_port_put(&dev->port); if (++n >= dev_num) break; } - spin_unlock(&rfcomm_dev_lock); + mutex_unlock(&rfcomm_dev_lock); dl->dev_num = n; size = sizeof(*dl) + n * sizeof(*di); @@ -576,8 +633,7 @@ static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err) dev->err = err; if (dlc->state == BT_CONNECTED) { - device_move(dev->tty_dev, rfcomm_get_device(dev), - DPM_ORDER_DEV_AFTER_PARENT); + rfcomm_reparent_device(dev); wake_up_interruptible(&dev->port.open_wait); } else if (dlc->state == BT_CLOSED) @@ -670,10 +726,22 @@ static int rfcomm_tty_install(struct tty_driver *driver, struct tty_struct *tty) /* install the tty_port */ err = 
tty_port_install(&dev->port, driver, tty); - if (err) + if (err) { rfcomm_tty_cleanup(tty); + return err; + } - return err; + /* take over the tty_port reference if the port was created with the + * flag RFCOMM_RELEASE_ONHUP. This will force the release of the port + * when the last process closes the tty. The behaviour is expected by + * userspace. + */ + if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags)) { + set_bit(RFCOMM_TTY_OWNED, &dev->status); + tty_port_put(&dev->port); + } + + return 0; } static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp) @@ -717,7 +785,7 @@ static int rfcomm_tty_write(struct tty_struct *tty, const unsigned char *buf, in struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; struct rfcomm_dlc *dlc = dev->dlc; struct sk_buff *skb; - int err = 0, sent = 0, size; + int sent = 0, size; BT_DBG("tty %p count %d", tty, count); @@ -725,7 +793,6 @@ static int rfcomm_tty_write(struct tty_struct *tty, const unsigned char *buf, in size = min_t(uint, count, dlc->mtu); skb = rfcomm_wmalloc(dev, size + RFCOMM_SKB_RESERVE, GFP_ATOMIC); - if (!skb) break; @@ -733,32 +800,24 @@ static int rfcomm_tty_write(struct tty_struct *tty, const unsigned char *buf, in memcpy(skb_put(skb, size), buf + sent, size); - err = rfcomm_dlc_send(dlc, skb); - if (err < 0) { - kfree_skb(skb); - break; - } + rfcomm_dlc_send_noerror(dlc, skb); sent += size; count -= size; } - return sent ? sent : err; + return sent; } static int rfcomm_tty_write_room(struct tty_struct *tty) { struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; - int room; + int room = 0; - BT_DBG("tty %p", tty); + if (dev && dev->dlc) + room = rfcomm_room(dev); - if (!dev || !dev->dlc) - return 0; - - room = rfcomm_room(dev->dlc) - atomic_read(&dev->wmem_alloc); - if (room < 0) - room = 0; + BT_DBG("tty %p room %d", tty, room); return room; } @@ -1010,10 +1069,6 @@ static void rfcomm_tty_hangup(struct tty_struct *tty) BT_DBG("tty %p dev %p", tty, dev); tty_port_hangup(&dev->port); - - if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags) && - !test_and_set_bit(RFCOMM_TTY_RELEASED, &dev->flags)) - tty_port_put(&dev->port); } static int rfcomm_tty_tiocmget(struct tty_struct *tty) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 24fa3964b3c..c06dbd3938e 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -676,20 +676,20 @@ static void sco_conn_defer_accept(struct hci_conn *conn, u16 setting) bacpy(&cp.bdaddr, &conn->dst); cp.pkt_type = cpu_to_le16(conn->pkt_type); - cp.tx_bandwidth = __constant_cpu_to_le32(0x00001f40); - cp.rx_bandwidth = __constant_cpu_to_le32(0x00001f40); + cp.tx_bandwidth = cpu_to_le32(0x00001f40); + cp.rx_bandwidth = cpu_to_le32(0x00001f40); cp.content_format = cpu_to_le16(setting); switch (setting & SCO_AIRMODE_MASK) { case SCO_AIRMODE_TRANSP: if (conn->pkt_type & ESCO_2EV3) - cp.max_latency = __constant_cpu_to_le16(0x0008); + cp.max_latency = cpu_to_le16(0x0008); else - cp.max_latency = __constant_cpu_to_le16(0x000D); + cp.max_latency = cpu_to_le16(0x000D); cp.retrans_effort = 0x02; break; case SCO_AIRMODE_CVSD: - cp.max_latency = __constant_cpu_to_le16(0xffff); + cp.max_latency = cpu_to_le16(0xffff); cp.retrans_effort = 0xff; break; } @@ -1024,7 +1024,7 @@ static void sco_conn_ready(struct sco_conn *conn) sk->sk_state = BT_CONNECTED; /* Wake up parent */ - parent->sk_data_ready(parent, 1); + parent->sk_data_ready(parent); bh_unlock_sock(parent); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 4b07acb8293..e33a982161c 100644 --- a/net/bluetooth/smp.c 
+++ b/net/bluetooth/smp.c @@ -35,14 +35,41 @@ #define AUTH_REQ_MASK 0x07 -static inline void swap128(u8 src[16], u8 dst[16]) +#define SMP_FLAG_TK_VALID 1 +#define SMP_FLAG_CFM_PENDING 2 +#define SMP_FLAG_MITM_AUTH 3 +#define SMP_FLAG_COMPLETE 4 +#define SMP_FLAG_INITIATOR 5 + +struct smp_chan { + struct l2cap_conn *conn; + u8 preq[7]; /* SMP Pairing Request */ + u8 prsp[7]; /* SMP Pairing Response */ + u8 prnd[16]; /* SMP Pairing Random (local) */ + u8 rrnd[16]; /* SMP Pairing Random (remote) */ + u8 pcnf[16]; /* SMP Pairing Confirm */ + u8 tk[16]; /* SMP Temporary Key */ + u8 enc_key_size; + u8 remote_key_dist; + bdaddr_t id_addr; + u8 id_addr_type; + u8 irk[16]; + struct smp_csrk *csrk; + struct smp_csrk *slave_csrk; + struct smp_ltk *ltk; + struct smp_ltk *slave_ltk; + struct smp_irk *remote_irk; + unsigned long flags; +}; + +static inline void swap128(const u8 src[16], u8 dst[16]) { int i; for (i = 0; i < 16; i++) dst[15 - i] = src[i]; } -static inline void swap56(u8 src[7], u8 dst[7]) +static inline void swap56(const u8 src[7], u8 dst[7]) { int i; for (i = 0; i < 7; i++) @@ -53,8 +80,8 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r) { struct blkcipher_desc desc; struct scatterlist sg; - int err, iv_len; - unsigned char iv[128]; + uint8_t tmp[16], data[16]; + int err; if (tfm == NULL) { BT_ERR("tfm %p", tfm); @@ -64,27 +91,89 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r) desc.tfm = tfm; desc.flags = 0; - err = crypto_blkcipher_setkey(tfm, k, 16); + /* The most significant octet of key corresponds to k[0] */ + swap128(k, tmp); + + err = crypto_blkcipher_setkey(tfm, tmp, 16); if (err) { BT_ERR("cipher setkey failed: %d", err); return err; } - sg_init_one(&sg, r, 16); + /* Most significant octet of plaintextData corresponds to data[0] */ + swap128(r, data); - iv_len = crypto_blkcipher_ivsize(tfm); - if (iv_len) { - memset(&iv, 0xff, iv_len); - crypto_blkcipher_set_iv(tfm, iv, iv_len); - } + sg_init_one(&sg, data, 16); err = crypto_blkcipher_encrypt(&desc, &sg, &sg, 16); if (err) BT_ERR("Encrypt data error %d", err); + /* Most significant octet of encryptedData corresponds to data[0] */ + swap128(data, r); + return err; } +static int smp_ah(struct crypto_blkcipher *tfm, u8 irk[16], u8 r[3], u8 res[3]) +{ + u8 _res[16]; + int err; + + /* r' = padding || r */ + memcpy(_res, r, 3); + memset(_res + 3, 0, 13); + + err = smp_e(tfm, irk, _res); + if (err) { + BT_ERR("Encrypt error"); + return err; + } + + /* The output of the random address function ah is: + * ah(h, r) = e(k, r') mod 2^24 + * The output of the security function e is then truncated to 24 bits + * by taking the least significant 24 bits of the output of e as the + * result of ah. 
+ */ + memcpy(res, _res, 3); + + return 0; +} + +bool smp_irk_matches(struct crypto_blkcipher *tfm, u8 irk[16], + bdaddr_t *bdaddr) +{ + u8 hash[3]; + int err; + + BT_DBG("RPA %pMR IRK %*phN", bdaddr, 16, irk); + + err = smp_ah(tfm, irk, &bdaddr->b[3], hash); + if (err) + return false; + + return !memcmp(bdaddr->b, hash, 3); +} + +int smp_generate_rpa(struct crypto_blkcipher *tfm, u8 irk[16], bdaddr_t *rpa) +{ + int err; + + get_random_bytes(&rpa->b[3], 3); + + rpa->b[5] &= 0x3f; /* Clear two most significant bits */ + rpa->b[5] |= 0x40; /* Set second most significant bit */ + + err = smp_ah(tfm, irk, &rpa->b[3], rpa->b); + if (err < 0) + return err; + + BT_DBG("RPA %pMR", rpa); + + return 0; +} + static int smp_c1(struct crypto_blkcipher *tfm, u8 k[16], u8 r[16], u8 preq[7], u8 pres[7], u8 _iat, bdaddr_t *ia, u8 _rat, bdaddr_t *ra, u8 res[16]) @@ -95,16 +184,15 @@ static int smp_c1(struct crypto_blkcipher *tfm, u8 k[16], u8 r[16], memset(p1, 0, 16); /* p1 = pres || preq || _rat || _iat */ - swap56(pres, p1); - swap56(preq, p1 + 7); - p1[14] = _rat; - p1[15] = _iat; - - memset(p2, 0, 16); + p1[0] = _iat; + p1[1] = _rat; + memcpy(p1 + 2, preq, 7); + memcpy(p1 + 9, pres, 7); /* p2 = padding || ia || ra */ - baswap((bdaddr_t *) (p2 + 4), ia); - baswap((bdaddr_t *) (p2 + 10), ra); + memcpy(p2, ra, 6); + memcpy(p2 + 6, ia, 6); + memset(p2 + 12, 0, 4); /* res = r XOR p1 */ u128_xor((u128 *) res, (u128 *) r, (u128 *) p1); @@ -133,8 +221,8 @@ static int smp_s1(struct crypto_blkcipher *tfm, u8 k[16], u8 r1[16], int err; /* Just least significant octets from r1 and r2 are considered */ - memcpy(_r, r1 + 8, 8); - memcpy(_r + 8, r2 + 8, 8); + memcpy(_r, r2, 8); + memcpy(_r + 8, r1, 8); err = smp_e(tfm, k, _r); if (err) @@ -143,13 +231,6 @@ static int smp_s1(struct crypto_blkcipher *tfm, u8 k[16], u8 r1[16], return err; } -static int smp_rand(u8 *buf) -{ - get_random_bytes(buf, 16); - - return 0; -} - static struct sk_buff *smp_build_cmd(struct l2cap_conn *conn, u8 code, u16 dlen, void *data) { @@ -168,7 +249,7 @@ static struct sk_buff *smp_build_cmd(struct l2cap_conn *conn, u8 code, lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); lh->len = cpu_to_le16(sizeof(code) + dlen); - lh->cid = __constant_cpu_to_le16(L2CAP_CID_SMP); + lh->cid = cpu_to_le16(L2CAP_CID_SMP); memcpy(skb_put(skb, sizeof(code)), &code, sizeof(code)); @@ -217,31 +298,45 @@ static void build_pairing_cmd(struct l2cap_conn *conn, struct smp_cmd_pairing *req, struct smp_cmd_pairing *rsp, __u8 authreq) { - u8 dist_keys = 0; + struct smp_chan *smp = conn->smp_chan; + struct hci_conn *hcon = conn->hcon; + struct hci_dev *hdev = hcon->hdev; + u8 local_dist = 0, remote_dist = 0; if (test_bit(HCI_PAIRABLE, &conn->hcon->hdev->dev_flags)) { - dist_keys = SMP_DIST_ENC_KEY; + local_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN; + remote_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN; authreq |= SMP_AUTH_BONDING; } else { authreq &= ~SMP_AUTH_BONDING; } + if (test_bit(HCI_RPA_RESOLVING, &hdev->dev_flags)) + remote_dist |= SMP_DIST_ID_KEY; + + if (test_bit(HCI_PRIVACY, &hdev->dev_flags)) + local_dist |= SMP_DIST_ID_KEY; + if (rsp == NULL) { req->io_capability = conn->hcon->io_capability; req->oob_flag = SMP_OOB_NOT_PRESENT; req->max_key_size = SMP_MAX_ENC_KEY_SIZE; - req->init_key_dist = 0; - req->resp_key_dist = dist_keys; + req->init_key_dist = local_dist; + req->resp_key_dist = remote_dist; req->auth_req = (authreq & AUTH_REQ_MASK); + + smp->remote_key_dist = remote_dist; return; } rsp->io_capability = conn->hcon->io_capability; rsp->oob_flag = 
SMP_OOB_NOT_PRESENT; rsp->max_key_size = SMP_MAX_ENC_KEY_SIZE; - rsp->init_key_dist = 0; - rsp->resp_key_dist = req->resp_key_dist & dist_keys; + rsp->init_key_dist = req->init_key_dist & remote_dist; + rsp->resp_key_dist = req->resp_key_dist & local_dist; rsp->auth_req = (authreq & AUTH_REQ_MASK); + + smp->remote_key_dist = rsp->init_key_dist; } static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) @@ -257,11 +352,11 @@ static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) return 0; } -static void smp_failure(struct l2cap_conn *conn, u8 reason, u8 send) +static void smp_failure(struct l2cap_conn *conn, u8 reason) { struct hci_conn *hcon = conn->hcon; - if (send) + if (reason) smp_send_cmd(conn, SMP_CMD_PAIRING_FAIL, sizeof(reason), &reason); @@ -290,6 +385,16 @@ static const u8 gen_method[5][5] = { { CFM_PASSKEY, CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, OVERLAP }, }; +static u8 get_auth_method(struct smp_chan *smp, u8 local_io, u8 remote_io) +{ + /* If either side has unknown io_caps, use JUST WORKS */ + if (local_io > SMP_IO_KEYBOARD_DISPLAY || + remote_io > SMP_IO_KEYBOARD_DISPLAY) + return JUST_WORKS; + + return gen_method[remote_io][local_io]; +} + static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, u8 local_io, u8 remote_io) { @@ -301,33 +406,34 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, /* Initialize key for JUST WORKS */ memset(smp->tk, 0, sizeof(smp->tk)); - clear_bit(SMP_FLAG_TK_VALID, &smp->smp_flags); + clear_bit(SMP_FLAG_TK_VALID, &smp->flags); BT_DBG("tk_request: auth:%d lcl:%d rem:%d", auth, local_io, remote_io); /* If neither side wants MITM, use JUST WORKS */ - /* If either side has unknown io_caps, use JUST WORKS */ /* Otherwise, look up method from the table */ - if (!(auth & SMP_AUTH_MITM) || - local_io > SMP_IO_KEYBOARD_DISPLAY || - remote_io > SMP_IO_KEYBOARD_DISPLAY) + if (!(auth & SMP_AUTH_MITM)) method = JUST_WORKS; else - method = gen_method[remote_io][local_io]; + method = get_auth_method(smp, local_io, remote_io); /* If not bonding, don't ask user to confirm a Zero TK */ if (!(auth & SMP_AUTH_BONDING) && method == JUST_CFM) method = JUST_WORKS; + /* Don't confirm locally initiated pairing attempts */ + if (method == JUST_CFM && test_bit(SMP_FLAG_INITIATOR, &smp->flags)) + method = JUST_WORKS; + /* If Just Works, Continue with Zero TK */ if (method == JUST_WORKS) { - set_bit(SMP_FLAG_TK_VALID, &smp->smp_flags); + set_bit(SMP_FLAG_TK_VALID, &smp->flags); return 0; } /* Not Just Works/Confirm results in MITM Authentication */ if (method != JUST_CFM) - set_bit(SMP_FLAG_MITM_AUTH, &smp->smp_flags); + set_bit(SMP_FLAG_MITM_AUTH, &smp->flags); /* If both devices have Keyoard-Display I/O, the master * Confirms and the slave Enters the passkey. @@ -339,16 +445,14 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, method = REQ_PASSKEY; } - /* Generate random passkey. Not valid until confirmed. */ + /* Generate random passkey. 
*/ if (method == CFM_PASSKEY) { - u8 key[16]; - - memset(key, 0, sizeof(key)); + memset(smp->tk, 0, sizeof(smp->tk)); get_random_bytes(&passkey, sizeof(passkey)); passkey %= 1000000; - put_unaligned_le32(passkey, key); - swap128(key, smp->tk); + put_unaligned_le32(passkey, smp->tk); BT_DBG("PassKey: %d", passkey); + set_bit(SMP_FLAG_TK_VALID, &smp->flags); } hci_dev_lock(hcon->hdev); @@ -356,141 +460,120 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, if (method == REQ_PASSKEY) ret = mgmt_user_passkey_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type); - else + else if (method == JUST_CFM) ret = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, - cpu_to_le32(passkey), 0); + passkey, 1); + else + ret = mgmt_user_passkey_notify(hcon->hdev, &hcon->dst, + hcon->type, hcon->dst_type, + passkey, 0); hci_dev_unlock(hcon->hdev); return ret; } -static void confirm_work(struct work_struct *work) +static u8 smp_confirm(struct smp_chan *smp) { - struct smp_chan *smp = container_of(work, struct smp_chan, confirm); struct l2cap_conn *conn = smp->conn; - struct crypto_blkcipher *tfm; + struct hci_dev *hdev = conn->hcon->hdev; + struct crypto_blkcipher *tfm = hdev->tfm_aes; struct smp_cmd_pairing_confirm cp; int ret; - u8 res[16], reason; BT_DBG("conn %p", conn); - tfm = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) { - reason = SMP_UNSPECIFIED; - goto error; - } + /* Prevent mutual access to hdev->tfm_aes */ + hci_dev_lock(hdev); - smp->tfm = tfm; + ret = smp_c1(tfm, smp->tk, smp->prnd, smp->preq, smp->prsp, + conn->hcon->init_addr_type, &conn->hcon->init_addr, + conn->hcon->resp_addr_type, &conn->hcon->resp_addr, + cp.confirm_val); - if (conn->hcon->out) - ret = smp_c1(tfm, smp->tk, smp->prnd, smp->preq, smp->prsp, - conn->hcon->src_type, &conn->hcon->src, - conn->hcon->dst_type, &conn->hcon->dst, res); - else - ret = smp_c1(tfm, smp->tk, smp->prnd, smp->preq, smp->prsp, - conn->hcon->dst_type, &conn->hcon->dst, - conn->hcon->src_type, &conn->hcon->src, res); - if (ret) { - reason = SMP_UNSPECIFIED; - goto error; - } + hci_dev_unlock(hdev); - clear_bit(SMP_FLAG_CFM_PENDING, &smp->smp_flags); + if (ret) + return SMP_UNSPECIFIED; - swap128(res, cp.confirm_val); - smp_send_cmd(smp->conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cp), &cp); + clear_bit(SMP_FLAG_CFM_PENDING, &smp->flags); - return; + smp_send_cmd(smp->conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cp), &cp); -error: - smp_failure(conn, reason, 1); + return 0; } -static void random_work(struct work_struct *work) +static u8 smp_random(struct smp_chan *smp) { - struct smp_chan *smp = container_of(work, struct smp_chan, random); struct l2cap_conn *conn = smp->conn; struct hci_conn *hcon = conn->hcon; - struct crypto_blkcipher *tfm = smp->tfm; - u8 reason, confirm[16], res[16], key[16]; + struct hci_dev *hdev = hcon->hdev; + struct crypto_blkcipher *tfm = hdev->tfm_aes; + u8 confirm[16]; int ret; - if (IS_ERR_OR_NULL(tfm)) { - reason = SMP_UNSPECIFIED; - goto error; - } + if (IS_ERR_OR_NULL(tfm)) + return SMP_UNSPECIFIED; BT_DBG("conn %p %s", conn, conn->hcon->out ? 
"master" : "slave"); - if (hcon->out) - ret = smp_c1(tfm, smp->tk, smp->rrnd, smp->preq, smp->prsp, - hcon->src_type, &hcon->src, - hcon->dst_type, &hcon->dst, res); - else - ret = smp_c1(tfm, smp->tk, smp->rrnd, smp->preq, smp->prsp, - hcon->dst_type, &hcon->dst, - hcon->src_type, &hcon->src, res); - if (ret) { - reason = SMP_UNSPECIFIED; - goto error; - } + /* Prevent mutual access to hdev->tfm_aes */ + hci_dev_lock(hdev); + + ret = smp_c1(tfm, smp->tk, smp->rrnd, smp->preq, smp->prsp, + hcon->init_addr_type, &hcon->init_addr, + hcon->resp_addr_type, &hcon->resp_addr, confirm); + + hci_dev_unlock(hdev); - swap128(res, confirm); + if (ret) + return SMP_UNSPECIFIED; if (memcmp(smp->pcnf, confirm, sizeof(smp->pcnf)) != 0) { BT_ERR("Pairing failed (confirmation values mismatch)"); - reason = SMP_CONFIRM_FAILED; - goto error; + return SMP_CONFIRM_FAILED; } if (hcon->out) { - u8 stk[16], rand[8]; - __le16 ediv; + u8 stk[16]; + __le64 rand = 0; + __le16 ediv = 0; - memset(rand, 0, sizeof(rand)); - ediv = 0; - - smp_s1(tfm, smp->tk, smp->rrnd, smp->prnd, key); - swap128(key, stk); + smp_s1(tfm, smp->tk, smp->rrnd, smp->prnd, stk); memset(stk + smp->enc_key_size, 0, SMP_MAX_ENC_KEY_SIZE - smp->enc_key_size); - if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags)) { - reason = SMP_UNSPECIFIED; - goto error; - } + if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags)) + return SMP_UNSPECIFIED; hci_le_start_enc(hcon, ediv, rand, stk); hcon->enc_key_size = smp->enc_key_size; } else { - u8 stk[16], r[16], rand[8]; - __le16 ediv; - - memset(rand, 0, sizeof(rand)); - ediv = 0; + u8 stk[16], auth; + __le64 rand = 0; + __le16 ediv = 0; - swap128(smp->prnd, r); - smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(r), r); + smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), + smp->prnd); - smp_s1(tfm, smp->tk, smp->prnd, smp->rrnd, key); - swap128(key, stk); + smp_s1(tfm, smp->tk, smp->prnd, smp->rrnd, stk); memset(stk + smp->enc_key_size, 0, SMP_MAX_ENC_KEY_SIZE - smp->enc_key_size); + if (hcon->pending_sec_level == BT_SECURITY_HIGH) + auth = 1; + else + auth = 0; + hci_add_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, - HCI_SMP_STK_SLAVE, 0, 0, stk, smp->enc_key_size, + HCI_SMP_STK_SLAVE, auth, stk, smp->enc_key_size, ediv, rand); } - return; - -error: - smp_failure(conn, reason, 1); + return 0; } static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) @@ -501,9 +584,6 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) if (!smp) return NULL; - INIT_WORK(&smp->confirm, confirm_work); - INIT_WORK(&smp->random, random_work); - smp->conn = conn; conn->smp_chan = smp; conn->hcon->smp_conn = conn; @@ -516,11 +596,33 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) void smp_chan_destroy(struct l2cap_conn *conn) { struct smp_chan *smp = conn->smp_chan; + bool complete; BUG_ON(!smp); - if (smp->tfm) - crypto_free_blkcipher(smp->tfm); + complete = test_bit(SMP_FLAG_COMPLETE, &smp->flags); + mgmt_smp_complete(conn->hcon, complete); + + kfree(smp->csrk); + kfree(smp->slave_csrk); + + /* If pairing failed clean up any keys we might have */ + if (!complete) { + if (smp->ltk) { + list_del(&smp->ltk->list); + kfree(smp->ltk); + } + + if (smp->slave_ltk) { + list_del(&smp->slave_ltk->list); + kfree(smp->slave_ltk); + } + + if (smp->remote_irk) { + list_del(&smp->remote_irk->list); + kfree(smp->remote_irk); + } + } kfree(smp); conn->smp_chan = NULL; @@ -533,7 +635,6 @@ int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) struct 
l2cap_conn *conn = hcon->smp_conn; struct smp_chan *smp; u32 value; - u8 key[16]; BT_DBG(""); @@ -545,26 +646,28 @@ int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) switch (mgmt_op) { case MGMT_OP_USER_PASSKEY_REPLY: value = le32_to_cpu(passkey); - memset(key, 0, sizeof(key)); + memset(smp->tk, 0, sizeof(smp->tk)); BT_DBG("PassKey: %d", value); - put_unaligned_le32(value, key); - swap128(key, smp->tk); + put_unaligned_le32(value, smp->tk); /* Fall Through */ case MGMT_OP_USER_CONFIRM_REPLY: - set_bit(SMP_FLAG_TK_VALID, &smp->smp_flags); + set_bit(SMP_FLAG_TK_VALID, &smp->flags); break; case MGMT_OP_USER_PASSKEY_NEG_REPLY: case MGMT_OP_USER_CONFIRM_NEG_REPLY: - smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED, 1); + smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED); return 0; default: - smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED, 1); + smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED); return -EOPNOTSUPP; } /* If it is our turn to send Pairing Confirm, do so now */ - if (test_bit(SMP_FLAG_CFM_PENDING, &smp->smp_flags)) - queue_work(hcon->hdev->workqueue, &smp->confirm); + if (test_bit(SMP_FLAG_CFM_PENDING, &smp->flags)) { + u8 rsp = smp_confirm(smp); + if (rsp) + smp_failure(conn, rsp); + } return 0; } @@ -573,12 +676,14 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_pairing rsp, *req = (void *) skb->data; struct smp_chan *smp; - u8 key_size; - u8 auth = SMP_AUTH_NONE; + u8 key_size, auth, sec_level; int ret; BT_DBG("conn %p", conn); + if (skb->len < sizeof(*req)) + return SMP_INVALID_PARAMS; + if (conn->hcon->link_mode & HCI_LM_MASTER) return SMP_CMD_NOTSUPP; @@ -595,10 +700,21 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) skb_pull(skb, sizeof(*req)); /* We didn't start the pairing, so match remote */ - if (req->auth_req & SMP_AUTH_BONDING) - auth = req->auth_req; + auth = req->auth_req; + + sec_level = authreq_to_seclevel(auth); + if (sec_level > conn->hcon->pending_sec_level) + conn->hcon->pending_sec_level = sec_level; - conn->hcon->pending_sec_level = authreq_to_seclevel(auth); + /* If we need MITM check that it can be acheived */ + if (conn->hcon->pending_sec_level >= BT_SECURITY_HIGH) { + u8 method; + + method = get_auth_method(smp, conn->hcon->io_capability, + req->io_capability); + if (method == JUST_WORKS || method == JUST_CFM) + return SMP_AUTH_REQUIREMENTS; + } build_pairing_cmd(conn, req, &rsp, auth); @@ -606,9 +722,7 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) if (check_enc_key_size(conn, key_size)) return SMP_ENC_KEY_SIZE; - ret = smp_rand(smp->prnd); - if (ret) - return SMP_UNSPECIFIED; + get_random_bytes(smp->prnd, sizeof(smp->prnd)); smp->prsp[0] = SMP_CMD_PAIRING_RSP; memcpy(&smp->prsp[1], &rsp, sizeof(rsp)); @@ -620,6 +734,8 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) if (ret) return SMP_UNSPECIFIED; + clear_bit(SMP_FLAG_INITIATOR, &smp->flags); + return 0; } @@ -627,12 +743,14 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_pairing *req, *rsp = (void *) skb->data; struct smp_chan *smp = conn->smp_chan; - struct hci_dev *hdev = conn->hcon->hdev; u8 key_size, auth = SMP_AUTH_NONE; int ret; BT_DBG("conn %p", conn); + if (skb->len < sizeof(*rsp)) + return SMP_INVALID_PARAMS; + if (!(conn->hcon->link_mode & HCI_LM_MASTER)) return SMP_CMD_NOTSUPP; @@ -644,13 +762,26 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) if 
(check_enc_key_size(conn, key_size)) return SMP_ENC_KEY_SIZE; - ret = smp_rand(smp->prnd); - if (ret) - return SMP_UNSPECIFIED; + /* If we need MITM check that it can be acheived */ + if (conn->hcon->pending_sec_level >= BT_SECURITY_HIGH) { + u8 method; + + method = get_auth_method(smp, req->io_capability, + rsp->io_capability); + if (method == JUST_WORKS || method == JUST_CFM) + return SMP_AUTH_REQUIREMENTS; + } + + get_random_bytes(smp->prnd, sizeof(smp->prnd)); smp->prsp[0] = SMP_CMD_PAIRING_RSP; memcpy(&smp->prsp[1], rsp, sizeof(*rsp)); + /* Update remote key distribution in case the remote cleared + * some bits that we had enabled in our request. + */ + smp->remote_key_dist &= rsp->resp_key_dist; + if ((req->auth_req & SMP_AUTH_BONDING) && (rsp->auth_req & SMP_AUTH_BONDING)) auth = SMP_AUTH_BONDING; @@ -661,13 +792,11 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) if (ret) return SMP_UNSPECIFIED; - set_bit(SMP_FLAG_CFM_PENDING, &smp->smp_flags); + set_bit(SMP_FLAG_CFM_PENDING, &smp->flags); /* Can't compose response until we have been confirmed */ - if (!test_bit(SMP_FLAG_TK_VALID, &smp->smp_flags)) - return 0; - - queue_work(hdev->workqueue, &smp->confirm); + if (test_bit(SMP_FLAG_TK_VALID, &smp->flags)) + return smp_confirm(smp); return 0; } @@ -675,24 +804,22 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_chan *smp = conn->smp_chan; - struct hci_dev *hdev = conn->hcon->hdev; BT_DBG("conn %p %s", conn, conn->hcon->out ? "master" : "slave"); + if (skb->len < sizeof(smp->pcnf)) + return SMP_INVALID_PARAMS; + memcpy(smp->pcnf, skb->data, sizeof(smp->pcnf)); skb_pull(skb, sizeof(smp->pcnf)); - if (conn->hcon->out) { - u8 random[16]; - - swap128(smp->prnd, random); - smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(random), - random); - } else if (test_bit(SMP_FLAG_TK_VALID, &smp->smp_flags)) { - queue_work(hdev->workqueue, &smp->confirm); - } else { - set_bit(SMP_FLAG_CFM_PENDING, &smp->smp_flags); - } + if (conn->hcon->out) + smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), + smp->prnd); + else if (test_bit(SMP_FLAG_TK_VALID, &smp->flags)) + return smp_confirm(smp); + else + set_bit(SMP_FLAG_CFM_PENDING, &smp->flags); return 0; } @@ -700,16 +827,16 @@ static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_chan *smp = conn->smp_chan; - struct hci_dev *hdev = conn->hcon->hdev; BT_DBG("conn %p", conn); - swap128(skb->data, smp->rrnd); - skb_pull(skb, sizeof(smp->rrnd)); + if (skb->len < sizeof(smp->rrnd)) + return SMP_INVALID_PARAMS; - queue_work(hdev->workqueue, &smp->random); + memcpy(smp->rrnd, skb->data, sizeof(smp->rrnd)); + skb_pull(skb, sizeof(smp->rrnd)); - return 0; + return smp_random(smp); } static u8 smp_ltk_encrypt(struct l2cap_conn *conn, u8 sec_level) @@ -717,7 +844,8 @@ static u8 smp_ltk_encrypt(struct l2cap_conn *conn, u8 sec_level) struct smp_ltk *key; struct hci_conn *hcon = conn->hcon; - key = hci_find_ltk_by_addr(hcon->hdev, &hcon->dst, hcon->dst_type); + key = hci_find_ltk_by_addr(hcon->hdev, &hcon->dst, hcon->dst_type, + hcon->out); if (!key) return 0; @@ -739,13 +867,19 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) struct smp_cmd_pairing cp; struct hci_conn *hcon = conn->hcon; struct smp_chan *smp; + u8 sec_level; BT_DBG("conn %p", conn); + if 
(skb->len < sizeof(*rp)) + return SMP_INVALID_PARAMS; + if (!(conn->hcon->link_mode & HCI_LM_MASTER)) return SMP_CMD_NOTSUPP; - hcon->pending_sec_level = authreq_to_seclevel(rp->auth_req); + sec_level = authreq_to_seclevel(rp->auth_req); + if (sec_level > hcon->pending_sec_level) + hcon->pending_sec_level = sec_level; if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) return 0; @@ -765,29 +899,46 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp); + clear_bit(SMP_FLAG_INITIATOR, &smp->flags); + return 0; } +bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level) +{ + if (sec_level == BT_SECURITY_LOW) + return true; + + if (hcon->sec_level >= sec_level) + return true; + + return false; +} + int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) { struct l2cap_conn *conn = hcon->l2cap_data; - struct smp_chan *smp = conn->smp_chan; + struct smp_chan *smp; __u8 authreq; BT_DBG("conn %p hcon %p level 0x%2.2x", conn, hcon, sec_level); - if (!test_bit(HCI_LE_ENABLED, &hcon->hdev->dev_flags)) + /* This may be NULL if there's an unexpected disconnection */ + if (!conn) return 1; - if (sec_level == BT_SECURITY_LOW) + if (!test_bit(HCI_LE_ENABLED, &hcon->hdev->dev_flags)) return 1; - if (hcon->sec_level >= sec_level) + if (smp_sufficient_security(hcon, sec_level)) return 1; + if (sec_level > hcon->pending_sec_level) + hcon->pending_sec_level = sec_level; + if (hcon->link_mode & HCI_LM_MASTER) - if (smp_ltk_encrypt(conn, sec_level)) - goto done; + if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) + return 0; if (test_and_set_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) return 0; @@ -798,6 +949,13 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) authreq = seclevel_to_authreq(sec_level); + /* Require MITM if IO Capability allows or the security level + * requires it. 
+ */ + if (hcon->io_capability != HCI_IO_NO_INPUT_OUTPUT || + hcon->pending_sec_level > BT_SECURITY_MEDIUM) + authreq |= SMP_AUTH_MITM; + if (hcon->link_mode & HCI_LM_MASTER) { struct smp_cmd_pairing cp; @@ -812,8 +970,7 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) smp_send_cmd(conn, SMP_CMD_SECURITY_REQ, sizeof(cp), &cp); } -done: - hcon->pending_sec_level = sec_level; + set_bit(SMP_FLAG_INITIATOR, &smp->flags); return 0; } @@ -823,6 +980,15 @@ static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) struct smp_cmd_encrypt_info *rp = (void *) skb->data; struct smp_chan *smp = conn->smp_chan; + BT_DBG("conn %p", conn); + + if (skb->len < sizeof(*rp)) + return SMP_INVALID_PARAMS; + + /* Ignore this PDU if it wasn't requested */ + if (!(smp->remote_key_dist & SMP_DIST_ENC_KEY)) + return 0; + skb_pull(skb, sizeof(*rp)); memcpy(smp->tk, rp->ltk, sizeof(smp->tk)); @@ -836,16 +1002,138 @@ static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb) struct smp_chan *smp = conn->smp_chan; struct hci_dev *hdev = conn->hcon->hdev; struct hci_conn *hcon = conn->hcon; + struct smp_ltk *ltk; u8 authenticated; + BT_DBG("conn %p", conn); + + if (skb->len < sizeof(*rp)) + return SMP_INVALID_PARAMS; + + /* Ignore this PDU if it wasn't requested */ + if (!(smp->remote_key_dist & SMP_DIST_ENC_KEY)) + return 0; + + /* Mark the information as received */ + smp->remote_key_dist &= ~SMP_DIST_ENC_KEY; + skb_pull(skb, sizeof(*rp)); hci_dev_lock(hdev); authenticated = (hcon->sec_level == BT_SECURITY_HIGH); - hci_add_ltk(hdev, &hcon->dst, hcon->dst_type, HCI_SMP_LTK, 1, - authenticated, smp->tk, smp->enc_key_size, - rp->ediv, rp->rand); - smp_distribute_keys(conn, 1); + ltk = hci_add_ltk(hdev, &hcon->dst, hcon->dst_type, HCI_SMP_LTK, + authenticated, smp->tk, smp->enc_key_size, + rp->ediv, rp->rand); + smp->ltk = ltk; + if (!(smp->remote_key_dist & SMP_DIST_ID_KEY)) + smp_distribute_keys(conn); + hci_dev_unlock(hdev); + + return 0; +} + +static int smp_cmd_ident_info(struct l2cap_conn *conn, struct sk_buff *skb) +{ + struct smp_cmd_ident_info *info = (void *) skb->data; + struct smp_chan *smp = conn->smp_chan; + + BT_DBG(""); + + if (skb->len < sizeof(*info)) + return SMP_INVALID_PARAMS; + + /* Ignore this PDU if it wasn't requested */ + if (!(smp->remote_key_dist & SMP_DIST_ID_KEY)) + return 0; + + skb_pull(skb, sizeof(*info)); + + memcpy(smp->irk, info->irk, 16); + + return 0; +} + +static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, + struct sk_buff *skb) +{ + struct smp_cmd_ident_addr_info *info = (void *) skb->data; + struct smp_chan *smp = conn->smp_chan; + struct hci_conn *hcon = conn->hcon; + bdaddr_t rpa; + + BT_DBG(""); + + if (skb->len < sizeof(*info)) + return SMP_INVALID_PARAMS; + + /* Ignore this PDU if it wasn't requested */ + if (!(smp->remote_key_dist & SMP_DIST_ID_KEY)) + return 0; + + /* Mark the information as received */ + smp->remote_key_dist &= ~SMP_DIST_ID_KEY; + + skb_pull(skb, sizeof(*info)); + + /* Strictly speaking the Core Specification (4.1) allows sending + * an empty address which would force us to rely on just the IRK + * as "identity information". However, since such + * implementations are not known of and in order to not over + * complicate our implementation, simply pretend that we never + * received an IRK for such a device. 
+ */ + if (!bacmp(&info->bdaddr, BDADDR_ANY)) { + BT_ERR("Ignoring IRK with no identity address"); + smp_distribute_keys(conn); + return 0; + } + + bacpy(&smp->id_addr, &info->bdaddr); + smp->id_addr_type = info->addr_type; + + if (hci_bdaddr_is_rpa(&hcon->dst, hcon->dst_type)) + bacpy(&rpa, &hcon->dst); + else + bacpy(&rpa, BDADDR_ANY); + + smp->remote_irk = hci_add_irk(conn->hcon->hdev, &smp->id_addr, + smp->id_addr_type, smp->irk, &rpa); + + smp_distribute_keys(conn); + + return 0; +} + +static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) +{ + struct smp_cmd_sign_info *rp = (void *) skb->data; + struct smp_chan *smp = conn->smp_chan; + struct hci_dev *hdev = conn->hcon->hdev; + struct smp_csrk *csrk; + + BT_DBG("conn %p", conn); + + if (skb->len < sizeof(*rp)) + return SMP_INVALID_PARAMS; + + /* Ignore this PDU if it wasn't requested */ + if (!(smp->remote_key_dist & SMP_DIST_SIGN)) + return 0; + + /* Mark the information as received */ + smp->remote_key_dist &= ~SMP_DIST_SIGN; + + skb_pull(skb, sizeof(*rp)); + + hci_dev_lock(hdev); + csrk = kzalloc(sizeof(*csrk), GFP_KERNEL); + if (csrk) { + csrk->master = 0x01; + memcpy(csrk->val, rp->csrk, sizeof(csrk->val)); + } + smp->csrk = csrk; + if (!(smp->remote_key_dist & SMP_DIST_SIGN)) + smp_distribute_keys(conn); hci_dev_unlock(hdev); return 0; @@ -895,7 +1183,7 @@ int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb) break; case SMP_CMD_PAIRING_FAIL: - smp_failure(conn, skb->data[0], 0); + smp_failure(conn, 0); reason = 0; err = -EPERM; break; @@ -925,10 +1213,15 @@ int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb) break; case SMP_CMD_IDENT_INFO: + reason = smp_cmd_ident_info(conn, skb); + break; + case SMP_CMD_IDENT_ADDR_INFO: + reason = smp_cmd_ident_addr_info(conn, skb); + break; + case SMP_CMD_SIGN_INFO: - /* Just ignored */ - reason = 0; + reason = smp_cmd_sign_info(conn, skb); break; default: @@ -941,32 +1234,84 @@ int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb) done: if (reason) - smp_failure(conn, reason, 1); + smp_failure(conn, reason); kfree_skb(skb); return err; } -int smp_distribute_keys(struct l2cap_conn *conn, __u8 force) +static void smp_notify_keys(struct l2cap_conn *conn) +{ + struct smp_chan *smp = conn->smp_chan; + struct hci_conn *hcon = conn->hcon; + struct hci_dev *hdev = hcon->hdev; + struct smp_cmd_pairing *req = (void *) &smp->preq[1]; + struct smp_cmd_pairing *rsp = (void *) &smp->prsp[1]; + bool persistent; + + if (smp->remote_irk) { + mgmt_new_irk(hdev, smp->remote_irk); + /* Now that user space can be considered to know the + * identity address track the connection based on it + * from now on. + */ + bacpy(&hcon->dst, &smp->remote_irk->bdaddr); + hcon->dst_type = smp->remote_irk->addr_type; + l2cap_conn_update_id_addr(hcon); + } + + /* The LTKs and CSRKs should be persistent only if both sides + * had the bonding bit set in their authentication requests. 
+ */ + persistent = !!((req->auth_req & rsp->auth_req) & SMP_AUTH_BONDING); + + if (smp->csrk) { + smp->csrk->bdaddr_type = hcon->dst_type; + bacpy(&smp->csrk->bdaddr, &hcon->dst); + mgmt_new_csrk(hdev, smp->csrk, persistent); + } + + if (smp->slave_csrk) { + smp->slave_csrk->bdaddr_type = hcon->dst_type; + bacpy(&smp->slave_csrk->bdaddr, &hcon->dst); + mgmt_new_csrk(hdev, smp->slave_csrk, persistent); + } + + if (smp->ltk) { + smp->ltk->bdaddr_type = hcon->dst_type; + bacpy(&smp->ltk->bdaddr, &hcon->dst); + mgmt_new_ltk(hdev, smp->ltk, persistent); + } + + if (smp->slave_ltk) { + smp->slave_ltk->bdaddr_type = hcon->dst_type; + bacpy(&smp->slave_ltk->bdaddr, &hcon->dst); + mgmt_new_ltk(hdev, smp->slave_ltk, persistent); + } +} + +int smp_distribute_keys(struct l2cap_conn *conn) { struct smp_cmd_pairing *req, *rsp; struct smp_chan *smp = conn->smp_chan; + struct hci_conn *hcon = conn->hcon; + struct hci_dev *hdev = hcon->hdev; __u8 *keydist; - BT_DBG("conn %p force %d", conn, force); + BT_DBG("conn %p", conn); - if (!test_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) + if (!test_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) return 0; rsp = (void *) &smp->prsp[1]; /* The responder sends its keys first */ - if (!force && conn->hcon->out && (rsp->resp_key_dist & 0x07)) + if (hcon->out && (smp->remote_key_dist & 0x07)) return 0; req = (void *) &smp->preq[1]; - if (conn->hcon->out) { + if (hcon->out) { keydist = &rsp->init_key_dist; *keydist &= req->init_key_dist; } else { @@ -974,28 +1319,30 @@ int smp_distribute_keys(struct l2cap_conn *conn, __u8 force) *keydist &= req->resp_key_dist; } - BT_DBG("keydist 0x%x", *keydist); if (*keydist & SMP_DIST_ENC_KEY) { struct smp_cmd_encrypt_info enc; struct smp_cmd_master_ident ident; - struct hci_conn *hcon = conn->hcon; + struct smp_ltk *ltk; u8 authenticated; __le16 ediv; + __le64 rand; get_random_bytes(enc.ltk, sizeof(enc.ltk)); get_random_bytes(&ediv, sizeof(ediv)); - get_random_bytes(ident.rand, sizeof(ident.rand)); + get_random_bytes(&rand, sizeof(rand)); smp_send_cmd(conn, SMP_CMD_ENCRYPT_INFO, sizeof(enc), &enc); authenticated = hcon->sec_level == BT_SECURITY_HIGH; - hci_add_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, - HCI_SMP_LTK_SLAVE, 1, authenticated, - enc.ltk, smp->enc_key_size, ediv, ident.rand); + ltk = hci_add_ltk(hdev, &hcon->dst, hcon->dst_type, + HCI_SMP_LTK_SLAVE, authenticated, enc.ltk, + smp->enc_key_size, ediv, rand); + smp->slave_ltk = ltk; ident.ediv = ediv; + ident.rand = rand; smp_send_cmd(conn, SMP_CMD_MASTER_IDENT, sizeof(ident), &ident); @@ -1006,14 +1353,18 @@ int smp_distribute_keys(struct l2cap_conn *conn, __u8 force) struct smp_cmd_ident_addr_info addrinfo; struct smp_cmd_ident_info idinfo; - /* Send a dummy key */ - get_random_bytes(idinfo.irk, sizeof(idinfo.irk)); + memcpy(idinfo.irk, hdev->irk, sizeof(idinfo.irk)); smp_send_cmd(conn, SMP_CMD_IDENT_INFO, sizeof(idinfo), &idinfo); - /* Just public address */ - memset(&addrinfo, 0, sizeof(addrinfo)); - bacpy(&addrinfo.bdaddr, &conn->hcon->src); + /* The hci_conn contains the local identity address + * after the connection has been established. + * + * This is true even when the connection has been + * established using a resolvable random address. 
+ */ + bacpy(&addrinfo.bdaddr, &hcon->src); + addrinfo.addr_type = hcon->src_type; smp_send_cmd(conn, SMP_CMD_IDENT_ADDR_INFO, sizeof(addrinfo), &addrinfo); @@ -1023,20 +1374,33 @@ int smp_distribute_keys(struct l2cap_conn *conn, __u8 force) if (*keydist & SMP_DIST_SIGN) { struct smp_cmd_sign_info sign; + struct smp_csrk *csrk; - /* Send a dummy key */ + /* Generate a new random key */ get_random_bytes(sign.csrk, sizeof(sign.csrk)); + csrk = kzalloc(sizeof(*csrk), GFP_KERNEL); + if (csrk) { + csrk->master = 0x00; + memcpy(csrk->val, sign.csrk, sizeof(csrk->val)); + } + smp->slave_csrk = csrk; + smp_send_cmd(conn, SMP_CMD_SIGN_INFO, sizeof(sign), &sign); *keydist &= ~SMP_DIST_SIGN; } - if (conn->hcon->out || force) { - clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags); - cancel_delayed_work_sync(&conn->security_timer); - smp_chan_destroy(conn); - } + /* If there are still keys to be received wait for them */ + if ((smp->remote_key_dist & 0x07)) + return 0; + + clear_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags); + cancel_delayed_work_sync(&conn->security_timer); + set_bit(SMP_FLAG_COMPLETE, &smp->flags); + smp_notify_keys(conn); + + smp_chan_destroy(conn); return 0; } diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index f8ba07f3e5f..5a8dc36460a 100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -78,7 +78,7 @@ struct smp_cmd_encrypt_info { #define SMP_CMD_MASTER_IDENT 0x07 struct smp_cmd_master_ident { __le16 ediv; - __u8 rand[8]; + __le64 rand; } __packed; #define SMP_CMD_IDENT_INFO 0x08 @@ -111,36 +111,22 @@ struct smp_cmd_security_req { #define SMP_CMD_NOTSUPP 0x07 #define SMP_UNSPECIFIED 0x08 #define SMP_REPEATED_ATTEMPTS 0x09 +#define SMP_INVALID_PARAMS 0x0a #define SMP_MIN_ENC_KEY_SIZE 7 #define SMP_MAX_ENC_KEY_SIZE 16 -#define SMP_FLAG_TK_VALID 1 -#define SMP_FLAG_CFM_PENDING 2 -#define SMP_FLAG_MITM_AUTH 3 - -struct smp_chan { - struct l2cap_conn *conn; - u8 preq[7]; /* SMP Pairing Request */ - u8 prsp[7]; /* SMP Pairing Response */ - u8 prnd[16]; /* SMP Pairing Random (local) */ - u8 rrnd[16]; /* SMP Pairing Random (remote) */ - u8 pcnf[16]; /* SMP Pairing Confirm */ - u8 tk[16]; /* SMP Temporary Key */ - u8 enc_key_size; - unsigned long smp_flags; - struct crypto_blkcipher *tfm; - struct work_struct confirm; - struct work_struct random; - -}; - /* SMP Commands */ +bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level); int smp_conn_security(struct hci_conn *hcon, __u8 sec_level); int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb); -int smp_distribute_keys(struct l2cap_conn *conn, __u8 force); +int smp_distribute_keys(struct l2cap_conn *conn); int smp_user_confirm_reply(struct hci_conn *conn, u16 mgmt_op, __le32 passkey); void smp_chan_destroy(struct l2cap_conn *conn); +bool smp_irk_matches(struct crypto_blkcipher *tfm, u8 irk[16], + bdaddr_t *bdaddr); +int smp_generate_rpa(struct crypto_blkcipher *tfm, u8 irk[16], bdaddr_t *rpa); + #endif /* __SMP_H */ diff --git a/net/bridge/Makefile b/net/bridge/Makefile index e85498b2f16..8590b942bff 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BRIDGE) += bridge.o bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ - br_ioctl.o br_notify.o br_stp.o br_stp_bpdu.o \ + br_ioctl.o br_stp.o br_stp_bpdu.o \ br_stp_if.o br_stp_timer.o br_netlink.o bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o @@ -16,4 +16,4 @@ bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o 
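The smp.c and smp.h hunks above keep a running record of which keys the peer still owes us in smp->remote_key_dist: the mask is seeded when the Pairing Request/Response is exchanged, each Master Identification, Identity Address Information and Signing Information handler clears its bit once that key has arrived, and smp_distribute_keys() only completes pairing once the low three bits (ENC_KEY, ID_KEY, SIGN) are all clear. A minimal standalone sketch of that bookkeeping, using hypothetical demo_* names rather than code from the patch:

#include <stdint.h>
#include <stdio.h>

/* Same bit values the kernel uses for SMP_DIST_ENC_KEY/ID_KEY/SIGN. */
#define DIST_ENC_KEY	0x01
#define DIST_ID_KEY	0x02
#define DIST_SIGN	0x04

struct demo_smp {
	uint8_t remote_key_dist;	/* keys still expected from the peer */
};

/* Called when a key-carrying PDU arrives; returns 1 once nothing is pending.
 * The first check mirrors the "Ignore this PDU if it wasn't requested"
 * guards added in the handlers above. */
static int demo_key_received(struct demo_smp *smp, uint8_t dist_bit)
{
	if (!(smp->remote_key_dist & dist_bit))
		return 0;			/* key was never negotiated */

	smp->remote_key_dist &= ~dist_bit;	/* mark as received */

	return !(smp->remote_key_dist &
		 (DIST_ENC_KEY | DIST_ID_KEY | DIST_SIGN));
}

int main(void)
{
	struct demo_smp smp = { .remote_key_dist = DIST_ENC_KEY | DIST_ID_KEY };

	printf("done=%d\n", demo_key_received(&smp, DIST_ENC_KEY));	/* done=0 */
	printf("done=%d\n", demo_key_received(&smp, DIST_ID_KEY));	/* done=1 */
	return 0;
}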
-obj-$(CONFIG_BRIDGE_NF_EBTABLES) += netfilter/ +obj-$(CONFIG_NETFILTER) += netfilter/ diff --git a/net/bridge/br.c b/net/bridge/br.c index ba780cc8e51..1a755a1e541 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -22,14 +22,127 @@ #include "br_private.h" -static const struct stp_proto br_stp_proto = { - .rcv = br_stp_rcv, +/* + * Handle changes in state of network devices enslaved to a bridge. + * + * Note: don't care about up/down if bridge itself is down, because + * port state is checked when bridge is brought up. + */ +static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net_bridge_port *p; + struct net_bridge *br; + bool changed_addr; + int err; + + /* register of bridge completed, add sysfs entries */ + if ((dev->priv_flags & IFF_EBRIDGE) && event == NETDEV_REGISTER) { + br_sysfs_addbr(dev); + return NOTIFY_DONE; + } + + /* not a port of a bridge */ + p = br_port_get_rtnl(dev); + if (!p) + return NOTIFY_DONE; + + br = p->br; + + switch (event) { + case NETDEV_CHANGEMTU: + dev_set_mtu(br->dev, br_min_mtu(br)); + break; + + case NETDEV_CHANGEADDR: + spin_lock_bh(&br->lock); + br_fdb_changeaddr(p, dev->dev_addr); + changed_addr = br_stp_recalculate_bridge_id(br); + spin_unlock_bh(&br->lock); + + if (changed_addr) + call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); + + break; + + case NETDEV_CHANGE: + br_port_carrier_check(p); + break; + + case NETDEV_FEAT_CHANGE: + netdev_update_features(br->dev); + break; + + case NETDEV_DOWN: + spin_lock_bh(&br->lock); + if (br->dev->flags & IFF_UP) + br_stp_disable_port(p); + spin_unlock_bh(&br->lock); + break; + + case NETDEV_UP: + if (netif_running(br->dev) && netif_oper_up(dev)) { + spin_lock_bh(&br->lock); + br_stp_enable_port(p); + spin_unlock_bh(&br->lock); + } + break; + + case NETDEV_UNREGISTER: + br_del_if(br, dev); + break; + + case NETDEV_CHANGENAME: + err = br_sysfs_renameif(p); + if (err) + return notifier_from_errno(err); + break; + + case NETDEV_PRE_TYPE_CHANGE: + /* Forbid underlaying device to change its type. 
*/ + return NOTIFY_BAD; + + case NETDEV_RESEND_IGMP: + /* Propagate to master device */ + call_netdevice_notifiers(event, br->dev); + break; + } + + /* Events that may cause spanning tree to refresh */ + if (event == NETDEV_CHANGEADDR || event == NETDEV_UP || + event == NETDEV_CHANGE || event == NETDEV_DOWN) + br_ifinfo_notify(RTM_NEWLINK, p); + + return NOTIFY_DONE; +} + +static struct notifier_block br_device_notifier = { + .notifier_call = br_device_event }; +static void __net_exit br_net_exit(struct net *net) +{ + struct net_device *dev; + LIST_HEAD(list); + + rtnl_lock(); + for_each_netdev(net, dev) + if (dev->priv_flags & IFF_EBRIDGE) + br_dev_delete(dev, &list); + + unregister_netdevice_many(&list); + rtnl_unlock(); + +} + static struct pernet_operations br_net_ops = { .exit = br_net_exit, }; +static const struct stp_proto br_stp_proto = { + .rcv = br_stp_rcv, +}; + static int __init br_init(void) { int err; diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index f00cfd2a014..568cccd39a3 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -32,7 +32,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) const unsigned char *dest = skb->data; struct net_bridge_fdb_entry *dst; struct net_bridge_mdb_entry *mdst; - struct br_cpu_netstats *brstats = this_cpu_ptr(br->stats); + struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats); u16 vid = 0; rcu_read_lock(); @@ -49,14 +49,14 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) brstats->tx_bytes += skb->len; u64_stats_update_end(&brstats->syncp); - if (!br_allowed_ingress(br, br_get_vlan_info(br), skb, &vid)) - goto out; - BR_INPUT_SKB_CB(skb)->brdev = dev; skb_reset_mac_header(skb); skb_pull(skb, ETH_HLEN); + if (!br_allowed_ingress(br, br_get_vlan_info(br), skb, &vid)) + goto out; + if (is_broadcast_ether_addr(dest)) br_flood_deliver(br, skb, false); else if (is_multicast_ether_addr(dest)) { @@ -88,18 +88,11 @@ out: static int br_dev_init(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); - int i; - br->stats = alloc_percpu(struct br_cpu_netstats); + br->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!br->stats) return -ENOMEM; - for_each_possible_cpu(i) { - struct br_cpu_netstats *br_dev_stats; - br_dev_stats = per_cpu_ptr(br->stats, i); - u64_stats_init(&br_dev_stats->syncp); - } - return 0; } @@ -119,6 +112,12 @@ static void br_dev_set_multicast_list(struct net_device *dev) { } +static void br_dev_change_rx_flags(struct net_device *dev, int change) +{ + if (change & IFF_PROMISC) + br_manage_promisc(netdev_priv(dev)); +} + static int br_dev_stop(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); @@ -135,17 +134,17 @@ static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct net_bridge *br = netdev_priv(dev); - struct br_cpu_netstats tmp, sum = { 0 }; + struct pcpu_sw_netstats tmp, sum = { 0 }; unsigned int cpu; for_each_possible_cpu(cpu) { unsigned int start; - const struct br_cpu_netstats *bstats + const struct pcpu_sw_netstats *bstats = per_cpu_ptr(br->stats, cpu); do { - start = u64_stats_fetch_begin_bh(&bstats->syncp); + start = u64_stats_fetch_begin_irq(&bstats->syncp); memcpy(&tmp, bstats, sizeof(tmp)); - } while (u64_stats_fetch_retry_bh(&bstats->syncp, start)); + } while (u64_stats_fetch_retry_irq(&bstats->syncp, start)); sum.tx_bytes += tmp.tx_bytes; sum.tx_packets += tmp.tx_packets; sum.rx_bytes += tmp.rx_bytes; @@ -187,8 +186,7 @@ static int 
br_set_mac_address(struct net_device *dev, void *p) spin_lock_bh(&br->lock); if (!ether_addr_equal(dev->dev_addr, addr->sa_data)) { - memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); - br_fdb_change_mac_address(br, addr->sa_data); + /* Mac address will be changed in br_stp_change_bridge_id(). */ br_stp_change_bridge_id(br, addr->sa_data); } spin_unlock_bh(&br->lock); @@ -226,8 +224,34 @@ static void br_netpoll_cleanup(struct net_device *dev) br_netpoll_disable(p); } -static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni, - gfp_t gfp) +static int __br_netpoll_enable(struct net_bridge_port *p) +{ + struct netpoll *np; + int err; + + np = kzalloc(sizeof(*p->np), GFP_KERNEL); + if (!np) + return -ENOMEM; + + err = __netpoll_setup(np, p->dev); + if (err) { + kfree(np); + return err; + } + + p->np = np; + return err; +} + +int br_netpoll_enable(struct net_bridge_port *p) +{ + if (!p->br->dev->npinfo) + return 0; + + return __br_netpoll_enable(p); +} + +static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p; @@ -236,7 +260,7 @@ static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni, list_for_each_entry(p, &br->port_list, list) { if (!p->dev) continue; - err = br_netpoll_enable(p, gfp); + err = __br_netpoll_enable(p); if (err) goto fail; } @@ -249,28 +273,6 @@ fail: goto out; } -int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp) -{ - struct netpoll *np; - int err; - - if (!p->br->dev->npinfo) - return 0; - - np = kzalloc(sizeof(*p->np), gfp); - if (!np) - return -ENOMEM; - - err = __netpoll_setup(np, p->dev, gfp); - if (err) { - kfree(np); - return err; - } - - p->np = np; - return err; -} - void br_netpoll_disable(struct net_bridge_port *p) { struct netpoll *np = p->np; @@ -313,6 +315,7 @@ static const struct net_device_ops br_netdev_ops = { .ndo_get_stats64 = br_get_stats64, .ndo_set_mac_address = br_set_mac_address, .ndo_set_rx_mode = br_dev_set_multicast_list, + .ndo_change_rx_flags = br_dev_change_rx_flags, .ndo_change_mtu = br_change_mtu, .ndo_do_ioctl = br_dev_ioctl, #ifdef CONFIG_NET_POLL_CONTROLLER @@ -352,14 +355,15 @@ void br_dev_setup(struct net_device *dev) dev->netdev_ops = &br_netdev_ops; dev->destructor = br_dev_free; - SET_ETHTOOL_OPS(dev, &br_ethtool_ops); + dev->ethtool_ops = &br_ethtool_ops; SET_NETDEV_DEVTYPE(dev, &br_type); dev->tx_queue_len = 0; dev->priv_flags = IFF_EBRIDGE; dev->features = COMMON_FEATURES | NETIF_F_LLTX | NETIF_F_NETNS_LOCAL | - NETIF_F_HW_VLAN_CTAG_TX; - dev->hw_features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX; + NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; + dev->hw_features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; dev->vlan_features = COMMON_FEATURES; br->dev = dev; @@ -370,10 +374,11 @@ void br_dev_setup(struct net_device *dev) br->bridge_id.prio[0] = 0x80; br->bridge_id.prio[1] = 0x00; - memcpy(br->group_addr, eth_reserved_addr_base, ETH_ALEN); + ether_addr_copy(br->group_addr, eth_reserved_addr_base); br->stp_enabled = BR_NO_STP; br->group_fwd_mask = BR_GROUPFWD_DEFAULT; + br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT; br->designated_root = br->bridge_id; br->bridge_max_age = br->max_age = 20 * HZ; @@ -384,4 +389,5 @@ void br_dev_setup(struct net_device *dev) br_netfilter_rtable_init(br); br_stp_timer_init(br); br_multicast_init(br); + br_vlan_init(br); } diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 33e8f23acdd..b524c36c127 100644 --- 
a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -27,6 +27,9 @@ #include "br_private.h" static struct kmem_cache *br_fdb_cache __read_mostly; +static struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head, + const unsigned char *addr, + __u16 vid); static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid); static void fdb_notify(struct net_bridge *br, @@ -82,18 +85,114 @@ static void fdb_rcu_free(struct rcu_head *head) kmem_cache_free(br_fdb_cache, ent); } +/* When a static FDB entry is added, the mac address from the entry is + * added to the bridge private HW address list and all required ports + * are then updated with the new information. + * Called under RTNL. + */ +static void fdb_add_hw(struct net_bridge *br, const unsigned char *addr) +{ + int err; + struct net_bridge_port *p, *tmp; + + ASSERT_RTNL(); + + list_for_each_entry(p, &br->port_list, list) { + if (!br_promisc_port(p)) { + err = dev_uc_add(p->dev, addr); + if (err) + goto undo; + } + } + + return; +undo: + list_for_each_entry(tmp, &br->port_list, list) { + if (tmp == p) + break; + if (!br_promisc_port(tmp)) + dev_uc_del(tmp->dev, addr); + } +} + +/* When a static FDB entry is deleted, the HW address from that entry is + * also removed from the bridge private HW address list and updates all + * the ports with needed information. + * Called under RTNL. + */ +static void fdb_del_hw(struct net_bridge *br, const unsigned char *addr) +{ + struct net_bridge_port *p; + + ASSERT_RTNL(); + + list_for_each_entry(p, &br->port_list, list) { + if (!br_promisc_port(p)) + dev_uc_del(p->dev, addr); + } +} + static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f) { + if (f->is_static) + fdb_del_hw(br, f->addr.addr); + hlist_del_rcu(&f->hlist); fdb_notify(br, f, RTM_DELNEIGH); call_rcu(&f->rcu, fdb_rcu_free); } +/* Delete a local entry if no other port had the same address. */ +static void fdb_delete_local(struct net_bridge *br, + const struct net_bridge_port *p, + struct net_bridge_fdb_entry *f) +{ + const unsigned char *addr = f->addr.addr; + u16 vid = f->vlan_id; + struct net_bridge_port *op; + + /* Maybe another port has same hw addr? */ + list_for_each_entry(op, &br->port_list, list) { + if (op != p && ether_addr_equal(op->dev->dev_addr, addr) && + (!vid || nbp_vlan_find(op, vid))) { + f->dst = op; + f->added_by_user = 0; + return; + } + } + + /* Maybe bridge device has same hw addr? */ + if (p && ether_addr_equal(br->dev->dev_addr, addr) && + (!vid || br_vlan_find(br, vid))) { + f->dst = NULL; + f->added_by_user = 0; + return; + } + + fdb_delete(br, f); +} + +void br_fdb_find_delete_local(struct net_bridge *br, + const struct net_bridge_port *p, + const unsigned char *addr, u16 vid) +{ + struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; + struct net_bridge_fdb_entry *f; + + spin_lock_bh(&br->hash_lock); + f = fdb_find(head, addr, vid); + if (f && f->is_local && !f->added_by_user && f->dst == p) + fdb_delete_local(br, p, f); + spin_unlock_bh(&br->hash_lock); +} + void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) { struct net_bridge *br = p->br; - bool no_vlan = (nbp_get_vlan_info(p) == NULL) ? 
true : false; + struct net_port_vlans *pv = nbp_get_vlan_info(p); + bool no_vlan = !pv; int i; + u16 vid; spin_lock_bh(&br->hash_lock); @@ -104,38 +203,34 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) struct net_bridge_fdb_entry *f; f = hlist_entry(h, struct net_bridge_fdb_entry, hlist); - if (f->dst == p && f->is_local) { - /* maybe another port has same hw addr? */ - struct net_bridge_port *op; - u16 vid = f->vlan_id; - list_for_each_entry(op, &br->port_list, list) { - if (op != p && - ether_addr_equal(op->dev->dev_addr, - f->addr.addr) && - nbp_vlan_find(op, vid)) { - f->dst = op; - goto insert; - } - } - + if (f->dst == p && f->is_local && !f->added_by_user) { /* delete old one */ - fdb_delete(br, f); -insert: - /* insert new address, may fail if invalid - * address or dup. - */ - fdb_insert(br, p, newaddr, vid); + fdb_delete_local(br, p, f); /* if this port has no vlan information * configured, we can safely be done at * this point. */ if (no_vlan) - goto done; + goto insert; } } } +insert: + /* insert new address, may fail if invalid address or dup. */ + fdb_insert(br, p, newaddr, 0); + + if (no_vlan) + goto done; + + /* Now add entries for every VLAN configured on the port. + * This function runs under RTNL so the bitmap will not change + * from under us. + */ + for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) + fdb_insert(br, p, newaddr, vid); + done: spin_unlock_bh(&br->hash_lock); } @@ -146,10 +241,12 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) struct net_port_vlans *pv; u16 vid = 0; + spin_lock_bh(&br->hash_lock); + /* If old entry was unassociated with any port, then delete it. */ f = __br_fdb_get(br, br->dev->dev_addr, 0); if (f && f->is_local && !f->dst) - fdb_delete(br, f); + fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, 0); @@ -159,14 +256,16 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) */ pv = br_get_vlan_info(br); if (!pv) - return; + goto out; for_each_set_bit_from(vid, pv->vlan_bitmap, VLAN_N_VID) { f = __br_fdb_get(br, br->dev->dev_addr, vid); if (f && f->is_local && !f->dst) - fdb_delete(br, f); + fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, vid); } +out: + spin_unlock_bh(&br->hash_lock); } void br_fdb_cleanup(unsigned long _data) @@ -235,25 +334,11 @@ void br_fdb_delete_by_port(struct net_bridge *br, if (f->is_static && !do_all) continue; - /* - * if multiple ports all have the same device address - * then when one port is deleted, assign - * the local entry to other port - */ - if (f->is_local) { - struct net_bridge_port *op; - list_for_each_entry(op, &br->port_list, list) { - if (op != p && - ether_addr_equal(op->dev->dev_addr, - f->addr.addr)) { - f->dst = op; - goto skip_delete; - } - } - } - fdb_delete(br, f); - skip_delete: ; + if (f->is_local) + fdb_delete_local(br, p, f); + else + fdb_delete(br, f); } } spin_unlock_bh(&br->hash_lock); @@ -397,6 +482,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, fdb->vlan_id = vid; fdb->is_local = 0; fdb->is_static = 0; + fdb->added_by_user = 0; fdb->updated = fdb->used = jiffies; hlist_add_head_rcu(&fdb->hlist, head); } @@ -430,6 +516,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, return -ENOMEM; fdb->is_local = fdb->is_static = 1; + fdb_add_hw(br, addr); fdb_notify(br, fdb, RTM_NEWNEIGH); return 0; } @@ -447,10 +534,11 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, } void br_fdb_update(struct 
net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid) + const unsigned char *addr, u16 vid, bool added_by_user) { struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; struct net_bridge_fdb_entry *fdb; + bool fdb_modified = false; /* some users want to always flood. */ if (hold_time(br) == 0) @@ -471,15 +559,25 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, source->dev->name); } else { /* fastpath: update of existing entry */ - fdb->dst = source; + if (unlikely(source != fdb->dst)) { + fdb->dst = source; + fdb_modified = true; + } fdb->updated = jiffies; + if (unlikely(added_by_user)) + fdb->added_by_user = 1; + if (unlikely(fdb_modified)) + fdb_notify(br, fdb, RTM_NEWNEIGH); } } else { spin_lock(&br->hash_lock); if (likely(!fdb_find(head, addr, vid))) { fdb = fdb_create(head, source, addr, vid); - if (fdb) + if (fdb) { + if (unlikely(added_by_user)) + fdb->added_by_user = 1; fdb_notify(br, fdb, RTM_NEWNEIGH); + } } /* else we lose race and someone else inserts * it first, don't bother updating @@ -524,6 +622,8 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->addr)) goto nla_put_failure; + if (nla_put_u32(skb, NDA_MASTER, br->dev->ifindex)) + goto nla_put_failure; ci.ndm_used = jiffies_to_clock_t(now - fdb->used); ci.ndm_confirmed = 0; ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); @@ -545,6 +645,7 @@ static inline size_t fdb_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ + + nla_total_size(sizeof(u32)) /* NDA_MASTER */ + nla_total_size(sizeof(u16)) /* NDA_VLAN */ + nla_total_size(sizeof(struct nda_cacheinfo)); } @@ -570,8 +671,7 @@ static void fdb_notify(struct net_bridge *br, rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); + rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } /* Dump information about entries, in response to GETNEIGH */ @@ -638,16 +738,29 @@ static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr, } if (fdb_to_nud(fdb) != state) { - if (state & NUD_PERMANENT) - fdb->is_local = fdb->is_static = 1; - else if (state & NUD_NOARP) { + if (state & NUD_PERMANENT) { + fdb->is_local = 1; + if (!fdb->is_static) { + fdb->is_static = 1; + fdb_add_hw(br, addr); + } + } else if (state & NUD_NOARP) { + fdb->is_local = 0; + if (!fdb->is_static) { + fdb->is_static = 1; + fdb_add_hw(br, addr); + } + } else { fdb->is_local = 0; - fdb->is_static = 1; - } else - fdb->is_local = fdb->is_static = 0; + if (fdb->is_static) { + fdb->is_static = 0; + fdb_del_hw(br, addr); + } + } modified = true; } + fdb->added_by_user = 1; fdb->used = jiffies; if (modified) { @@ -665,7 +778,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge_port *p, if (ndm->ndm_flags & NTF_USE) { rcu_read_lock(); - br_fdb_update(p->br, p, addr, vid); + br_fdb_update(p->br, p, addr, vid, true); rcu_read_unlock(); } else { spin_lock_bh(&p->br->hash_lock); @@ -750,8 +863,7 @@ out: return err; } -int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr, - u16 vlan) +static int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr, u16 vlan) { struct hlist_head *head = &br->hash[br_mac_hash(addr, vlan)]; struct net_bridge_fdb_entry *fdb; @@ -834,3 +946,59 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], out: return err; } + +int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p) +{ 
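	/* Sync every static FDB address into the port's unicast filter with
	 * dev_uc_add(); if any addition fails, the rollback path below walks
	 * the table again and dev_uc_del()s the addresses that were already
	 * programmed, leaving the port's filter unchanged.
	 */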
+ struct net_bridge_fdb_entry *fdb, *tmp; + int i; + int err; + + ASSERT_RTNL(); + + for (i = 0; i < BR_HASH_SIZE; i++) { + hlist_for_each_entry(fdb, &br->hash[i], hlist) { + /* We only care for static entries */ + if (!fdb->is_static) + continue; + + err = dev_uc_add(p->dev, fdb->addr.addr); + if (err) + goto rollback; + } + } + return 0; + +rollback: + for (i = 0; i < BR_HASH_SIZE; i++) { + hlist_for_each_entry(tmp, &br->hash[i], hlist) { + /* If we reached the fdb that failed, we can stop */ + if (tmp == fdb) + break; + + /* We only care for static entries */ + if (!tmp->is_static) + continue; + + dev_uc_del(p->dev, tmp->addr.addr); + } + } + return err; +} + +void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p) +{ + struct net_bridge_fdb_entry *fdb; + int i; + + ASSERT_RTNL(); + + for (i = 0; i < BR_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(fdb, &br->hash[i], hlist) { + /* We only care for static entries */ + if (!fdb->is_static) + continue; + + dev_uc_del(p->dev, fdb->addr.addr); + } + } +} diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 4b81b147178..056b67b0e27 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -26,25 +26,20 @@ static int deliver_clone(const struct net_bridge_port *prev, void (*__packet_hook)(const struct net_bridge_port *p, struct sk_buff *skb)); -/* Don't forward packets to originating port or forwarding diasabled */ +/* Don't forward packets to originating port or forwarding disabled */ static inline int should_deliver(const struct net_bridge_port *p, const struct sk_buff *skb) { - return (((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && + return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && br_allowed_egress(p->br, nbp_get_vlan_info(p), skb) && - p->state == BR_STATE_FORWARDING); -} - -static inline unsigned int packet_length(const struct sk_buff *skb) -{ - return skb->len - (skb->protocol == htons(ETH_P_8021Q) ? VLAN_HLEN : 0); + p->state == BR_STATE_FORWARDING; } int br_dev_queue_push_xmit(struct sk_buff *skb) { /* ip_fragment doesn't copy the MAC header */ if (nf_bridge_maybe_copy_header(skb) || - (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb))) { + !is_skb_forwardable(skb->dev, skb)) { kfree_skb(skb); } else { skb_push(skb, ETH_HLEN); @@ -71,7 +66,7 @@ static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) skb->dev = to->dev; if (unlikely(netpoll_tx_running(to->br->dev))) { - if (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb)) + if (!is_skb_forwardable(skb->dev, skb)) kfree_skb(skb); else { skb_push(skb, ETH_HLEN); diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 4bf02adb5dc..3eca3fdf8fe 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -61,7 +61,7 @@ static int port_cost(struct net_device *dev) } -/* Check for port carrier transistions. */ +/* Check for port carrier transitions. */ void br_port_carrier_check(struct net_bridge_port *p) { struct net_device *dev = p->dev; @@ -85,6 +85,111 @@ void br_port_carrier_check(struct net_bridge_port *p) spin_unlock_bh(&br->lock); } +static void br_port_set_promisc(struct net_bridge_port *p) +{ + int err = 0; + + if (br_promisc_port(p)) + return; + + err = dev_set_promiscuity(p->dev, 1); + if (err) + return; + + br_fdb_unsync_static(p->br, p); + p->flags |= BR_PROMISC; +} + +static void br_port_clear_promisc(struct net_bridge_port *p) +{ + int err; + + /* Check if the port is already non-promisc or if it doesn't + * support UNICAST filtering. 
Without unicast filtering support + * we'll end up re-enabling promisc mode anyway, so just check for + * it here. + */ + if (!br_promisc_port(p) || !(p->dev->priv_flags & IFF_UNICAST_FLT)) + return; + + /* Since we'll be clearing the promisc mode, program the port + * first so that we don't have interruption in traffic. + */ + err = br_fdb_sync_static(p->br, p); + if (err) + return; + + dev_set_promiscuity(p->dev, -1); + p->flags &= ~BR_PROMISC; +} + +/* When a port is added or removed or when certain port flags + * change, this function is called to automatically manage + * promiscuity setting of all the bridge ports. We are always called + * under RTNL so can skip using rcu primitives. + */ +void br_manage_promisc(struct net_bridge *br) +{ + struct net_bridge_port *p; + bool set_all = false; + + /* If vlan filtering is disabled or bridge interface is placed + * into promiscuous mode, place all ports in promiscuous mode. + */ + if ((br->dev->flags & IFF_PROMISC) || !br_vlan_enabled(br)) + set_all = true; + + list_for_each_entry(p, &br->port_list, list) { + if (set_all) { + br_port_set_promisc(p); + } else { + /* If the number of auto-ports is <= 1, then all other + * ports will have their output configuration + * statically specified through fdbs. Since ingress + * on the auto-port becomes forwarding/egress to other + * ports and egress configuration is statically known, + * we can say that ingress configuration of the + * auto-port is also statically known. + * This lets us disable promiscuous mode and write + * this config to hw. + */ + if (br->auto_cnt == 0 || + (br->auto_cnt == 1 && br_auto_port(p))) + br_port_clear_promisc(p); + else + br_port_set_promisc(p); + } + } +} + +static void nbp_update_port_count(struct net_bridge *br) +{ + struct net_bridge_port *p; + u32 cnt = 0; + + list_for_each_entry(p, &br->port_list, list) { + if (br_auto_port(p)) + cnt++; + } + if (br->auto_cnt != cnt) { + br->auto_cnt = cnt; + br_manage_promisc(br); + } +} + +static void nbp_delete_promisc(struct net_bridge_port *p) +{ + /* If port is currently promiscuous, unset promiscuity. + * Otherwise, it is a static port so remove all addresses + * from it. 
+ */ + dev_set_allmulti(p->dev, -1); + if (br_promisc_port(p)) + dev_set_promiscuity(p->dev, -1); + else + br_fdb_unsync_static(p->br, p); +} + static void release_nbp(struct kobject *kobj) { struct net_bridge_port *p @@ -133,7 +238,7 @@ static void del_nbp(struct net_bridge_port *p) sysfs_remove_link(br->ifobj, p->dev->name); - dev_set_promiscuity(dev, -1); + nbp_delete_promisc(p); spin_lock_bh(&br->lock); br_stp_disable_port(p); @@ -141,10 +246,11 @@ static void del_nbp(struct net_bridge_port *p) br_ifinfo_notify(RTM_DELLINK, p); + list_del_rcu(&p->list); + nbp_vlan_flush(p); br_fdb_delete_by_port(br, p, 1); - - list_del_rcu(&p->list); + nbp_update_port_count(br); dev->priv_flags &= ~IFF_BRIDGE_PORT; @@ -353,7 +459,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) call_netdevice_notifiers(NETDEV_JOIN, dev); - err = dev_set_promiscuity(dev, 1); + err = dev_set_allmulti(dev, 1); if (err) goto put_back; @@ -366,7 +472,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if (err) goto err2; - err = br_netpoll_enable(p, GFP_KERNEL); + err = br_netpoll_enable(p); if (err) goto err3; @@ -384,11 +490,16 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) list_add_rcu(&p->list, &br->port_list); + nbp_update_port_count(br); + netdev_update_features(br->dev); if (br->dev->needed_headroom < dev->needed_headroom) br->dev->needed_headroom = dev->needed_headroom; + if (br_fdb_insert(br, p, dev->dev_addr, 0)) + netdev_err(dev, "failed insert local address bridge forwarding table\n"); + spin_lock_bh(&br->lock); changed_addr = br_stp_recalculate_bridge_id(br); @@ -404,9 +515,6 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) dev_set_mtu(br->dev, br_min_mtu(br)); - if (br_fdb_insert(br, p, dev->dev_addr, 0)) - netdev_err(dev, "failed insert local address bridge forwarding table\n"); - kobject_uevent(&p->kobj, KOBJ_ADD); return 0; @@ -421,7 +529,7 @@ err2: kobject_put(&p->kobj); p = NULL; /* kobject_put frees */ err1: - dev_set_promiscuity(dev, -1); + dev_set_allmulti(dev, -1); put_back: dev_put(dev); kfree(p); @@ -456,17 +564,10 @@ int br_del_if(struct net_bridge *br, struct net_device *dev) return 0; } -void __net_exit br_net_exit(struct net *net) +void br_port_flags_change(struct net_bridge_port *p, unsigned long mask) { - struct net_device *dev; - LIST_HEAD(list); - - rtnl_lock(); - for_each_netdev(net, dev) - if (dev->priv_flags & IFF_EBRIDGE) - br_dev_delete(dev, &list); - - unregister_netdevice_many(&list); - rtnl_unlock(); + struct net_bridge *br = p->br; + if (mask & BR_AUTO_MASK) + nbp_update_port_count(br); } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 7e73c32e205..366c4364907 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -28,7 +28,8 @@ static int br_pass_frame_up(struct sk_buff *skb) { struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev; struct net_bridge *br = netdev_priv(brdev); - struct br_cpu_netstats *brstats = this_cpu_ptr(br->stats); + struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats); + struct net_port_vlans *pv; u64_stats_update_begin(&brstats->syncp); brstats->rx_packets++; @@ -39,18 +40,18 @@ static int br_pass_frame_up(struct sk_buff *skb) * packet is allowed except in promisc modue when someone * may be running packet capture. 
*/ + pv = br_get_vlan_info(br); if (!(brdev->flags & IFF_PROMISC) && - !br_allowed_egress(br, br_get_vlan_info(br), skb)) { + !br_allowed_egress(br, pv, skb)) { kfree_skb(skb); return NET_RX_DROP; } - skb = br_handle_vlan(br, br_get_vlan_info(br), skb); - if (!skb) - return NET_RX_DROP; - indev = skb->dev; skb->dev = brdev; + skb = br_handle_vlan(br, pv, skb); + if (!skb) + return NET_RX_DROP; return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, indev, NULL, netif_receive_skb); @@ -72,12 +73,12 @@ int br_handle_frame_finish(struct sk_buff *skb) goto drop; if (!br_allowed_ingress(p->br, nbp_get_vlan_info(p), skb, &vid)) - goto drop; + goto out; /* insert into forwarding database after filtering to avoid spoofing */ br = p->br; if (p->flags & BR_LEARNING) - br_fdb_update(br, p, eth_hdr(skb)->h_source, vid); + br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false); if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) && br_multicast_rcv(br, p, skb, vid)) @@ -146,9 +147,9 @@ static int br_handle_local_finish(struct sk_buff *skb) struct net_bridge_port *p = br_port_get_rcu(skb->dev); u16 vid = 0; - br_vlan_get_tag(skb, &vid); - if (p->flags & BR_LEARNING) - br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid); + /* check if vlan is allowed, to avoid spoofing */ + if (p->flags & BR_LEARNING && br_should_learn(p, skb, &vid)) + br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false); return 0; /* process further */ } @@ -176,6 +177,8 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) p = br_port_get_rcu(skb->dev); if (unlikely(is_link_local_ether_addr(dest))) { + u16 fwd_mask = p->br->group_fwd_mask_required; + /* * See IEEE 802.1D Table 7-10 Reserved addresses * @@ -193,7 +196,8 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) case 0x00: /* Bridge Group Address */ /* If STP is turned off, then must forward to keep loop detection */ - if (p->br->stp_enabled == BR_NO_STP) + if (p->br->stp_enabled == BR_NO_STP || + fwd_mask & (1u << dest[5])) goto forward; break; @@ -202,7 +206,8 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) default: /* Allow selective forwarding for most other protocols */ - if (p->br->group_fwd_mask & (1u << dest[5])) + fwd_mask |= p->br->group_fwd_mask; + if (fwd_mask & (1u << dest[5])) goto forward; } diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index cd8c3a44ab7..a9a4a1b7863 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -381,7 +381,7 @@ int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) { struct net_bridge *br = netdev_priv(dev); - switch(cmd) { + switch (cmd) { case SIOCDEVPRIVATE: return old_dev_ioctl(dev, rq, cmd); diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index b7b1914dfa2..5df05269d17 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -418,13 +418,13 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) ip.proto = entry->addr.proto; if (ip.proto == htons(ETH_P_IP)) { - if (timer_pending(&br->ip4_querier.timer)) + if (timer_pending(&br->ip4_other_query.timer)) return -EBUSY; ip.u.ip4 = entry->addr.u.ip4; #if IS_ENABLED(CONFIG_IPV6) } else { - if (timer_pending(&br->ip6_querier.timer)) + if (timer_pending(&br->ip6_other_query.timer)) return -EBUSY; ip.u.ip6 = entry->addr.u.ip6; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 4c214b2b88e..abfa0b65a11 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -11,6 +11,7 @@ */ #include <linux/err.h> +#include <linux/export.h> 
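/* In the reworked multicast code below, ip4_own_query/ip6_own_query time the
 * queries this bridge sends itself, while ip4_other_query/ip6_other_query
 * track a querier elected elsewhere on the segment; br_multicast_send_query()
 * backs off while the other querier's timer is still pending.
 */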
#include <linux/if_ether.h> #include <linux/igmp.h> #include <linux/jhash.h> @@ -35,7 +36,7 @@ #include "br_private.h" static void br_multicast_start_querier(struct net_bridge *br, - struct bridge_mcast_query *query); + struct bridge_mcast_own_query *query); unsigned int br_mdb_rehash_seq; static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b) @@ -363,7 +364,7 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, skb_reset_mac_header(skb); eth = eth_hdr(skb); - memcpy(eth->h_source, br->dev->dev_addr, ETH_ALEN); + ether_addr_copy(eth->h_source, br->dev->dev_addr); eth->h_dest[0] = 1; eth->h_dest[1] = 0; eth->h_dest[2] = 0x5e; @@ -433,7 +434,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, skb_reset_mac_header(skb); eth = eth_hdr(skb); - memcpy(eth->h_source, br->dev->dev_addr, ETH_ALEN); + ether_addr_copy(eth->h_source, br->dev->dev_addr); eth->h_proto = htons(ETH_P_IPV6); skb_put(skb, sizeof(*eth)); @@ -761,7 +762,7 @@ static void br_multicast_local_router_expired(unsigned long data) } static void br_multicast_querier_expired(struct net_bridge *br, - struct bridge_mcast_query *query) + struct bridge_mcast_own_query *query) { spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || br->multicast_disabled) @@ -777,7 +778,7 @@ static void br_ip4_multicast_querier_expired(unsigned long data) { struct net_bridge *br = (void *)data; - br_multicast_querier_expired(br, &br->ip4_query); + br_multicast_querier_expired(br, &br->ip4_own_query); } #if IS_ENABLED(CONFIG_IPV6) @@ -785,10 +786,22 @@ static void br_ip6_multicast_querier_expired(unsigned long data) { struct net_bridge *br = (void *)data; - br_multicast_querier_expired(br, &br->ip6_query); + br_multicast_querier_expired(br, &br->ip6_own_query); } #endif +static void br_multicast_select_own_querier(struct net_bridge *br, + struct br_ip *ip, + struct sk_buff *skb) +{ + if (ip->proto == htons(ETH_P_IP)) + br->ip4_querier.addr.u.ip4 = ip_hdr(skb)->saddr; +#if IS_ENABLED(CONFIG_IPV6) + else + br->ip6_querier.addr.u.ip6 = ipv6_hdr(skb)->saddr; +#endif +} + static void __br_multicast_send_query(struct net_bridge *br, struct net_bridge_port *port, struct br_ip *ip) @@ -804,17 +817,19 @@ static void __br_multicast_send_query(struct net_bridge *br, skb->dev = port->dev; NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev, dev_queue_xmit); - } else + } else { + br_multicast_select_own_querier(br, ip, skb); netif_rx(skb); + } } static void br_multicast_send_query(struct net_bridge *br, struct net_bridge_port *port, - struct bridge_mcast_query *query) + struct bridge_mcast_own_query *own_query) { unsigned long time; struct br_ip br_group; - struct bridge_mcast_querier *querier = NULL; + struct bridge_mcast_other_query *other_query = NULL; if (!netif_running(br->dev) || br->multicast_disabled || !br->multicast_querier) @@ -822,31 +837,32 @@ static void br_multicast_send_query(struct net_bridge *br, memset(&br_group.u, 0, sizeof(br_group.u)); - if (port ? (query == &port->ip4_query) : - (query == &br->ip4_query)) { - querier = &br->ip4_querier; + if (port ? 
(own_query == &port->ip4_own_query) : + (own_query == &br->ip4_own_query)) { + other_query = &br->ip4_other_query; br_group.proto = htons(ETH_P_IP); #if IS_ENABLED(CONFIG_IPV6) } else { - querier = &br->ip6_querier; + other_query = &br->ip6_other_query; br_group.proto = htons(ETH_P_IPV6); #endif } - if (!querier || timer_pending(&querier->timer)) + if (!other_query || timer_pending(&other_query->timer)) return; __br_multicast_send_query(br, port, &br_group); time = jiffies; - time += query->startup_sent < br->multicast_startup_query_count ? + time += own_query->startup_sent < br->multicast_startup_query_count ? br->multicast_startup_query_interval : br->multicast_query_interval; - mod_timer(&query->timer, time); + mod_timer(&own_query->timer, time); } -static void br_multicast_port_query_expired(struct net_bridge_port *port, - struct bridge_mcast_query *query) +static void +br_multicast_port_query_expired(struct net_bridge_port *port, + struct bridge_mcast_own_query *query) { struct net_bridge *br = port->br; @@ -868,7 +884,7 @@ static void br_ip4_multicast_port_query_expired(unsigned long data) { struct net_bridge_port *port = (void *)data; - br_multicast_port_query_expired(port, &port->ip4_query); + br_multicast_port_query_expired(port, &port->ip4_own_query); } #if IS_ENABLED(CONFIG_IPV6) @@ -876,7 +892,7 @@ static void br_ip6_multicast_port_query_expired(unsigned long data) { struct net_bridge_port *port = (void *)data; - br_multicast_port_query_expired(port, &port->ip6_query); + br_multicast_port_query_expired(port, &port->ip6_own_query); } #endif @@ -886,11 +902,11 @@ void br_multicast_add_port(struct net_bridge_port *port) setup_timer(&port->multicast_router_timer, br_multicast_router_expired, (unsigned long)port); - setup_timer(&port->ip4_query.timer, br_ip4_multicast_port_query_expired, - (unsigned long)port); + setup_timer(&port->ip4_own_query.timer, + br_ip4_multicast_port_query_expired, (unsigned long)port); #if IS_ENABLED(CONFIG_IPV6) - setup_timer(&port->ip6_query.timer, br_ip6_multicast_port_query_expired, - (unsigned long)port); + setup_timer(&port->ip6_own_query.timer, + br_ip6_multicast_port_query_expired, (unsigned long)port); #endif } @@ -899,7 +915,7 @@ void br_multicast_del_port(struct net_bridge_port *port) del_timer_sync(&port->multicast_router_timer); } -static void br_multicast_enable(struct bridge_mcast_query *query) +static void br_multicast_enable(struct bridge_mcast_own_query *query) { query->startup_sent = 0; @@ -916,9 +932,9 @@ void br_multicast_enable_port(struct net_bridge_port *port) if (br->multicast_disabled || !netif_running(br->dev)) goto out; - br_multicast_enable(&port->ip4_query); + br_multicast_enable(&port->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) - br_multicast_enable(&port->ip6_query); + br_multicast_enable(&port->ip6_own_query); #endif out: @@ -938,9 +954,9 @@ void br_multicast_disable_port(struct net_bridge_port *port) if (!hlist_unhashed(&port->rlist)) hlist_del_init_rcu(&port->rlist); del_timer(&port->multicast_router_timer); - del_timer(&port->ip4_query.timer); + del_timer(&port->ip4_own_query.timer); #if IS_ENABLED(CONFIG_IPV6) - del_timer(&port->ip6_query.timer); + del_timer(&port->ip6_own_query.timer); #endif spin_unlock(&br->multicast_lock); } @@ -1064,15 +1080,80 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, } #endif +static bool br_ip4_multicast_select_querier(struct net_bridge *br, + struct net_bridge_port *port, + __be32 saddr) +{ + if (!timer_pending(&br->ip4_own_query.timer) && + 
!timer_pending(&br->ip4_other_query.timer)) + goto update; + + if (!br->ip4_querier.addr.u.ip4) + goto update; + + if (ntohl(saddr) <= ntohl(br->ip4_querier.addr.u.ip4)) + goto update; + + return false; + +update: + br->ip4_querier.addr.u.ip4 = saddr; + + /* update protected by general multicast_lock by caller */ + rcu_assign_pointer(br->ip4_querier.port, port); + + return true; +} + +#if IS_ENABLED(CONFIG_IPV6) +static bool br_ip6_multicast_select_querier(struct net_bridge *br, + struct net_bridge_port *port, + struct in6_addr *saddr) +{ + if (!timer_pending(&br->ip6_own_query.timer) && + !timer_pending(&br->ip6_other_query.timer)) + goto update; + + if (ipv6_addr_cmp(saddr, &br->ip6_querier.addr.u.ip6) <= 0) + goto update; + + return false; + +update: + br->ip6_querier.addr.u.ip6 = *saddr; + + /* update protected by general multicast_lock by caller */ + rcu_assign_pointer(br->ip6_querier.port, port); + + return true; +} +#endif + +static bool br_multicast_select_querier(struct net_bridge *br, + struct net_bridge_port *port, + struct br_ip *saddr) +{ + switch (saddr->proto) { + case htons(ETH_P_IP): + return br_ip4_multicast_select_querier(br, port, saddr->u.ip4); +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + return br_ip6_multicast_select_querier(br, port, &saddr->u.ip6); +#endif + } + + return false; +} + static void -br_multicast_update_querier_timer(struct net_bridge *br, - struct bridge_mcast_querier *querier, - unsigned long max_delay) +br_multicast_update_query_timer(struct net_bridge *br, + struct bridge_mcast_other_query *query, + unsigned long max_delay) { - if (!timer_pending(&querier->timer)) - querier->delay_time = jiffies + max_delay; + if (!timer_pending(&query->timer)) + query->delay_time = jiffies + max_delay; - mod_timer(&querier->timer, jiffies + br->multicast_querier_interval); + mod_timer(&query->timer, jiffies + br->multicast_querier_interval); } /* @@ -1125,15 +1206,14 @@ timer: static void br_multicast_query_received(struct net_bridge *br, struct net_bridge_port *port, - struct bridge_mcast_querier *querier, - int saddr, + struct bridge_mcast_other_query *query, + struct br_ip *saddr, unsigned long max_delay) { - if (saddr) - br_multicast_update_querier_timer(br, querier, max_delay); - else if (timer_pending(&querier->timer)) + if (!br_multicast_select_querier(br, port, saddr)) return; + br_multicast_update_query_timer(br, query, max_delay); br_multicast_mark_router(br, port); } @@ -1148,6 +1228,7 @@ static int br_ip4_multicast_query(struct net_bridge *br, struct igmpv3_query *ih3; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; + struct br_ip saddr; unsigned long max_delay; unsigned long now = jiffies; __be32 group; @@ -1181,11 +1262,22 @@ static int br_ip4_multicast_query(struct net_bridge *br, IGMPV3_MRC(ih3->code) * (HZ / IGMP_TIMER_SCALE) : 1; } - br_multicast_query_received(br, port, &br->ip4_querier, !!iph->saddr, - max_delay); + /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer + * all-systems destination addresses (224.0.0.1) for general queries + */ + if (!group && iph->daddr != htonl(INADDR_ALLHOSTS_GROUP)) { + err = -EINVAL; + goto out; + } + + if (!group) { + saddr.proto = htons(ETH_P_IP); + saddr.u.ip4 = iph->saddr; - if (!group) + br_multicast_query_received(br, port, &br->ip4_other_query, + &saddr, max_delay); goto out; + } mp = br_mdb_ip4_get(mlock_dereference(br->mdb, br), group, vid); if (!mp) @@ -1225,9 +1317,11 @@ static int br_ip6_multicast_query(struct net_bridge *br, struct mld2_query 
*mld2q; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; + struct br_ip saddr; unsigned long max_delay; unsigned long now = jiffies; const struct in6_addr *group = NULL; + bool is_general_query; int err = 0; spin_lock(&br->multicast_lock); @@ -1235,6 +1329,12 @@ static int br_ip6_multicast_query(struct net_bridge *br, (port && port->state == BR_STATE_DISABLED)) goto out; + /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */ + if (!(ipv6_addr_type(&ip6h->saddr) & IPV6_ADDR_LINKLOCAL)) { + err = -EINVAL; + goto out; + } + if (skb->len == sizeof(*mld)) { if (!pskb_may_pull(skb, sizeof(*mld))) { err = -EINVAL; @@ -1256,11 +1356,26 @@ static int br_ip6_multicast_query(struct net_bridge *br, max_delay = max(msecs_to_jiffies(mldv2_mrc(mld2q)), 1UL); } - br_multicast_query_received(br, port, &br->ip6_querier, - !ipv6_addr_any(&ip6h->saddr), max_delay); + is_general_query = group && ipv6_addr_any(group); + + /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer + * all-nodes destination address (ff02::1) for general queries + */ + if (is_general_query && !ipv6_addr_is_ll_all_nodes(&ip6h->daddr)) { + err = -EINVAL; + goto out; + } + + if (is_general_query) { + saddr.proto = htons(ETH_P_IPV6); + saddr.u.ip6 = ip6h->saddr; - if (!group) + br_multicast_query_received(br, port, &br->ip6_other_query, + &saddr, max_delay); goto out; + } else if (!group) { + goto out; + } mp = br_mdb_ip6_get(mlock_dereference(br->mdb, br), group, vid); if (!mp) @@ -1288,11 +1403,12 @@ out: } #endif -static void br_multicast_leave_group(struct net_bridge *br, - struct net_bridge_port *port, - struct br_ip *group, - struct bridge_mcast_querier *querier, - struct bridge_mcast_query *query) +static void +br_multicast_leave_group(struct net_bridge *br, + struct net_bridge_port *port, + struct br_ip *group, + struct bridge_mcast_other_query *other_query, + struct bridge_mcast_own_query *own_query) { struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; @@ -1303,7 +1419,7 @@ static void br_multicast_leave_group(struct net_bridge *br, spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || (port && port->state == BR_STATE_DISABLED) || - timer_pending(&querier->timer)) + timer_pending(&other_query->timer)) goto out; mdb = mlock_dereference(br->mdb, br); @@ -1317,7 +1433,7 @@ static void br_multicast_leave_group(struct net_bridge *br, time = jiffies + br->multicast_last_member_count * br->multicast_last_member_interval; - mod_timer(&query->timer, time); + mod_timer(&own_query->timer, time); for (p = mlock_dereference(mp->ports, br); p != NULL; @@ -1398,17 +1514,19 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br, __u16 vid) { struct br_ip br_group; - struct bridge_mcast_query *query = port ? &port->ip4_query : - &br->ip4_query; + struct bridge_mcast_own_query *own_query; if (ipv4_is_local_multicast(group)) return; + own_query = port ? &port->ip4_own_query : &br->ip4_own_query; + br_group.u.ip4 = group; br_group.proto = htons(ETH_P_IP); br_group.vid = vid; - br_multicast_leave_group(br, port, &br_group, &br->ip4_querier, query); + br_multicast_leave_group(br, port, &br_group, &br->ip4_other_query, + own_query); } #if IS_ENABLED(CONFIG_IPV6) @@ -1418,18 +1536,19 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, __u16 vid) { struct br_ip br_group; - struct bridge_mcast_query *query = port ? 
&port->ip6_query : - &br->ip6_query; - + struct bridge_mcast_own_query *own_query; if (ipv6_addr_is_ll_all_nodes(group)) return; + own_query = port ? &port->ip6_own_query : &br->ip6_own_query; + br_group.u.ip6 = *group; br_group.proto = htons(ETH_P_IPV6); br_group.vid = vid; - br_multicast_leave_group(br, port, &br_group, &br->ip6_querier, query); + br_multicast_leave_group(br, port, &br_group, &br->ip6_other_query, + own_query); } #endif @@ -1696,12 +1815,14 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, } static void br_multicast_query_expired(struct net_bridge *br, - struct bridge_mcast_query *query) + struct bridge_mcast_own_query *query, + struct bridge_mcast_querier *querier) { spin_lock(&br->multicast_lock); if (query->startup_sent < br->multicast_startup_query_count) query->startup_sent++; + rcu_assign_pointer(querier, NULL); br_multicast_send_query(br, NULL, query); spin_unlock(&br->multicast_lock); } @@ -1710,7 +1831,7 @@ static void br_ip4_multicast_query_expired(unsigned long data) { struct net_bridge *br = (void *)data; - br_multicast_query_expired(br, &br->ip4_query); + br_multicast_query_expired(br, &br->ip4_own_query, &br->ip4_querier); } #if IS_ENABLED(CONFIG_IPV6) @@ -1718,7 +1839,7 @@ static void br_ip6_multicast_query_expired(unsigned long data) { struct net_bridge *br = (void *)data; - br_multicast_query_expired(br, &br->ip6_query); + br_multicast_query_expired(br, &br->ip6_own_query, &br->ip6_querier); } #endif @@ -1740,28 +1861,30 @@ void br_multicast_init(struct net_bridge *br) br->multicast_querier_interval = 255 * HZ; br->multicast_membership_interval = 260 * HZ; - br->ip4_querier.delay_time = 0; + br->ip4_other_query.delay_time = 0; + br->ip4_querier.port = NULL; #if IS_ENABLED(CONFIG_IPV6) - br->ip6_querier.delay_time = 0; + br->ip6_other_query.delay_time = 0; + br->ip6_querier.port = NULL; #endif spin_lock_init(&br->multicast_lock); setup_timer(&br->multicast_router_timer, br_multicast_local_router_expired, 0); - setup_timer(&br->ip4_querier.timer, br_ip4_multicast_querier_expired, - (unsigned long)br); - setup_timer(&br->ip4_query.timer, br_ip4_multicast_query_expired, + setup_timer(&br->ip4_other_query.timer, + br_ip4_multicast_querier_expired, (unsigned long)br); + setup_timer(&br->ip4_own_query.timer, br_ip4_multicast_query_expired, (unsigned long)br); #if IS_ENABLED(CONFIG_IPV6) - setup_timer(&br->ip6_querier.timer, br_ip6_multicast_querier_expired, - (unsigned long)br); - setup_timer(&br->ip6_query.timer, br_ip6_multicast_query_expired, + setup_timer(&br->ip6_other_query.timer, + br_ip6_multicast_querier_expired, (unsigned long)br); + setup_timer(&br->ip6_own_query.timer, br_ip6_multicast_query_expired, (unsigned long)br); #endif } static void __br_multicast_open(struct net_bridge *br, - struct bridge_mcast_query *query) + struct bridge_mcast_own_query *query) { query->startup_sent = 0; @@ -1773,9 +1896,9 @@ static void __br_multicast_open(struct net_bridge *br, void br_multicast_open(struct net_bridge *br) { - __br_multicast_open(br, &br->ip4_query); + __br_multicast_open(br, &br->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) - __br_multicast_open(br, &br->ip6_query); + __br_multicast_open(br, &br->ip6_own_query); #endif } @@ -1788,11 +1911,11 @@ void br_multicast_stop(struct net_bridge *br) int i; del_timer_sync(&br->multicast_router_timer); - del_timer_sync(&br->ip4_querier.timer); - del_timer_sync(&br->ip4_query.timer); + del_timer_sync(&br->ip4_other_query.timer); + del_timer_sync(&br->ip4_own_query.timer); #if 
IS_ENABLED(CONFIG_IPV6) - del_timer_sync(&br->ip6_querier.timer); - del_timer_sync(&br->ip6_query.timer); + del_timer_sync(&br->ip6_other_query.timer); + del_timer_sync(&br->ip6_own_query.timer); #endif spin_lock_bh(&br->multicast_lock); @@ -1896,7 +2019,7 @@ unlock: } static void br_multicast_start_querier(struct net_bridge *br, - struct bridge_mcast_query *query) + struct bridge_mcast_own_query *query) { struct net_bridge_port *port; @@ -1907,11 +2030,11 @@ static void br_multicast_start_querier(struct net_bridge *br, port->state == BR_STATE_BLOCKING) continue; - if (query == &br->ip4_query) - br_multicast_enable(&port->ip4_query); + if (query == &br->ip4_own_query) + br_multicast_enable(&port->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) else - br_multicast_enable(&port->ip6_query); + br_multicast_enable(&port->ip6_own_query); #endif } } @@ -1947,9 +2070,9 @@ rollback: goto rollback; } - br_multicast_start_querier(br, &br->ip4_query); + br_multicast_start_querier(br, &br->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) - br_multicast_start_querier(br, &br->ip6_query); + br_multicast_start_querier(br, &br->ip6_own_query); #endif unlock: @@ -1974,16 +2097,16 @@ int br_multicast_set_querier(struct net_bridge *br, unsigned long val) max_delay = br->multicast_query_response_interval; - if (!timer_pending(&br->ip4_querier.timer)) - br->ip4_querier.delay_time = jiffies + max_delay; + if (!timer_pending(&br->ip4_other_query.timer)) + br->ip4_other_query.delay_time = jiffies + max_delay; - br_multicast_start_querier(br, &br->ip4_query); + br_multicast_start_querier(br, &br->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) - if (!timer_pending(&br->ip6_querier.timer)) - br->ip6_querier.delay_time = jiffies + max_delay; + if (!timer_pending(&br->ip6_other_query.timer)) + br->ip6_other_query.delay_time = jiffies + max_delay; - br_multicast_start_querier(br, &br->ip6_query); + br_multicast_start_querier(br, &br->ip6_own_query); #endif unlock: @@ -1998,7 +2121,7 @@ int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val) u32 old; struct net_bridge_mdb_htable *mdb; - spin_lock(&br->multicast_lock); + spin_lock_bh(&br->multicast_lock); if (!netif_running(br->dev)) goto unlock; @@ -2030,7 +2153,113 @@ rollback: } unlock: - spin_unlock(&br->multicast_lock); + spin_unlock_bh(&br->multicast_lock); return err; } + +/** + * br_multicast_list_adjacent - Returns snooped multicast addresses + * @dev: The bridge port adjacent to which to retrieve addresses + * @br_ip_list: The list to store found, snooped multicast IP addresses in + * + * Creates a list of IP addresses (struct br_ip_list) sensed by the multicast + * snooping feature on all bridge ports of dev's bridge device, excluding + * the addresses from dev itself. + * + * Returns the number of items added to br_ip_list. 
+ * + * Notes: + * - br_ip_list needs to be initialized by caller + * - br_ip_list might contain duplicates in the end + * (needs to be taken care of by caller) + * - br_ip_list needs to be freed by caller + */ +int br_multicast_list_adjacent(struct net_device *dev, + struct list_head *br_ip_list) +{ + struct net_bridge *br; + struct net_bridge_port *port; + struct net_bridge_port_group *group; + struct br_ip_list *entry; + int count = 0; + + rcu_read_lock(); + if (!br_ip_list || !br_port_exists(dev)) + goto unlock; + + port = br_port_get_rcu(dev); + if (!port || !port->br) + goto unlock; + + br = port->br; + + list_for_each_entry_rcu(port, &br->port_list, list) { + if (!port->dev || port->dev == dev) + continue; + + hlist_for_each_entry_rcu(group, &port->mglist, mglist) { + entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) + goto unlock; + + entry->addr = group->addr; + list_add(&entry->list, br_ip_list); + count++; + } + } + +unlock: + rcu_read_unlock(); + return count; +} +EXPORT_SYMBOL_GPL(br_multicast_list_adjacent); + +/** + * br_multicast_has_querier_adjacent - Checks for a querier behind a bridge port + * @dev: The bridge port adjacent to which to check for a querier + * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 + * + * Checks whether the given interface has a bridge on top and if so returns + * true if a selected querier is behind one of the other ports of this + * bridge. Otherwise returns false. + */ +bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto) +{ + struct net_bridge *br; + struct net_bridge_port *port; + bool ret = false; + + rcu_read_lock(); + if (!br_port_exists(dev)) + goto unlock; + + port = br_port_get_rcu(dev); + if (!port || !port->br) + goto unlock; + + br = port->br; + + switch (proto) { + case ETH_P_IP: + if (!timer_pending(&br->ip4_other_query.timer) || + rcu_dereference(br->ip4_querier.port) == port) + goto unlock; + break; +#if IS_ENABLED(CONFIG_IPV6) + case ETH_P_IPV6: + if (!timer_pending(&br->ip6_other_query.timer) || + rcu_dereference(br->ip6_querier.port) == port) + goto unlock; + break; +#endif + default: + goto unlock; + } + + ret = true; +unlock: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(br_multicast_has_querier_adjacent); diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 80cad2cf02a..a615264cf01 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -167,7 +167,7 @@ void br_netfilter_rtable_init(struct net_bridge *br) rt->dst.dev = br->dev; rt->dst.path = &rt->dst; dst_init_metrics(&rt->dst, br_dst_default_metrics, true); - rt->dst.flags = DST_NOXFRM | DST_NOPEER | DST_FAKE_RTABLE; + rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; rt->dst.ops = &fake_dst_ops; } @@ -506,7 +506,7 @@ bridged_dnat: 1); return 0; } - memcpy(eth_hdr(skb)->h_dest, dev->dev_addr, ETH_ALEN); + ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); skb->pkt_type = PACKET_HOST; } } else { @@ -535,7 +535,7 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct if (brnf_pass_vlan_indev == 0 || !vlan_tx_tag_present(skb)) return br; - vlan = __vlan_find_dev_deep(br, skb->vlan_proto, + vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto, vlan_tx_tag_get(skb) & VLAN_VID_MASK); return vlan ? 
vlan : br; @@ -859,12 +859,12 @@ static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops, return NF_STOLEN; } -#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV4) +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) static int br_nf_dev_queue_xmit(struct sk_buff *skb) { int ret; - if (skb->nfct != NULL && skb->protocol == htons(ETH_P_IP) && + if (skb->protocol == htons(ETH_P_IP) && skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu && !skb_is_gso(skb)) { if (br_parse_ip_options(skb)) @@ -1001,7 +1001,7 @@ static struct nf_hook_ops br_nf_ops[] __read_mostly = { #ifdef CONFIG_SYSCTL static int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, - void __user * buffer, size_t * lenp, loff_t * ppos) + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index f75d92e4f96..26edb518b83 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -195,8 +195,7 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port) rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_LINK, err); + rtnl_set_sk_err(net, RTNLGRP_LINK, err); } @@ -329,6 +328,7 @@ static void br_set_port_flag(struct net_bridge_port *p, struct nlattr *tb[], static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) { int err; + unsigned long old_flags = p->flags; br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE); br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD); @@ -354,6 +354,8 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) if (err) return err; } + + br_port_flags_change(p, old_flags ^ p->flags); return 0; } @@ -373,7 +375,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh) p = br_port_get_rtnl(dev); /* We want to accept dev as bridge itself if the AF_SPEC - * is set to see if someone is setting vlan info on the brigde + * is set to see if someone is setting vlan info on the bridge */ if (!p && !afspec) return -EINVAL; @@ -389,7 +391,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh) err = br_setport(p, tb); spin_unlock_bh(&p->br->lock); } else { - /* Binary compatability with old RSTP */ + /* Binary compatibility with old RSTP */ if (nla_len(protinfo) < sizeof(u8)) return -EINVAL; @@ -446,6 +448,20 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[]) return 0; } +static int br_dev_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct net_bridge *br = netdev_priv(dev); + + if (tb[IFLA_ADDRESS]) { + spin_lock_bh(&br->lock); + br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS])); + spin_unlock_bh(&br->lock); + } + + return register_netdevice(dev); +} + static size_t br_get_link_af_size(const struct net_device *dev) { struct net_port_vlans *pv; @@ -474,6 +490,7 @@ struct rtnl_link_ops br_link_ops __read_mostly = { .priv_size = sizeof(struct net_bridge), .setup = br_dev_setup, .validate = br_validate, + .newlink = br_dev_newlink, .dellink = br_dev_delete, }; @@ -482,9 +499,7 @@ int __init br_netlink_init(void) int err; br_mdb_init(); - err = rtnl_af_register(&br_af_ops); - if (err) - goto out; + rtnl_af_register(&br_af_ops); err = rtnl_link_register(&br_link_ops); if (err) @@ -494,7 +509,6 @@ int __init br_netlink_init(void) out_af: rtnl_af_unregister(&br_af_ops); -out: br_mdb_uninit(); return err; } diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c deleted file mode 100644 index 2998dd1769a..00000000000 --- 
a/net/bridge/br_notify.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Device event handling - * Linux ethernet bridge - * - * Authors: - * Lennert Buytenhek <buytenh@gnu.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/rtnetlink.h> -#include <net/net_namespace.h> - -#include "br_private.h" - -static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr); - -struct notifier_block br_device_notifier = { - .notifier_call = br_device_event -}; - -/* - * Handle changes in state of network devices enslaved to a bridge. - * - * Note: don't care about up/down if bridge itself is down, because - * port state is checked when bridge is brought up. - */ -static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct net_bridge_port *p; - struct net_bridge *br; - bool changed_addr; - int err; - - /* register of bridge completed, add sysfs entries */ - if ((dev->priv_flags & IFF_EBRIDGE) && event == NETDEV_REGISTER) { - br_sysfs_addbr(dev); - return NOTIFY_DONE; - } - - /* not a port of a bridge */ - p = br_port_get_rtnl(dev); - if (!p) - return NOTIFY_DONE; - - br = p->br; - - switch (event) { - case NETDEV_CHANGEMTU: - dev_set_mtu(br->dev, br_min_mtu(br)); - break; - - case NETDEV_CHANGEADDR: - spin_lock_bh(&br->lock); - br_fdb_changeaddr(p, dev->dev_addr); - changed_addr = br_stp_recalculate_bridge_id(br); - spin_unlock_bh(&br->lock); - - if (changed_addr) - call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); - - break; - - case NETDEV_CHANGE: - br_port_carrier_check(p); - break; - - case NETDEV_FEAT_CHANGE: - netdev_update_features(br->dev); - break; - - case NETDEV_DOWN: - spin_lock_bh(&br->lock); - if (br->dev->flags & IFF_UP) - br_stp_disable_port(p); - spin_unlock_bh(&br->lock); - break; - - case NETDEV_UP: - if (netif_running(br->dev) && netif_oper_up(dev)) { - spin_lock_bh(&br->lock); - br_stp_enable_port(p); - spin_unlock_bh(&br->lock); - } - break; - - case NETDEV_UNREGISTER: - br_del_if(br, dev); - break; - - case NETDEV_CHANGENAME: - err = br_sysfs_renameif(p); - if (err) - return notifier_from_errno(err); - break; - - case NETDEV_PRE_TYPE_CHANGE: - /* Forbid underlaying device to change its type. 
*/ - return NOTIFY_BAD; - - case NETDEV_RESEND_IGMP: - /* Propagate to master device */ - call_netdevice_notifiers(event, br->dev); - break; - } - - /* Events that may cause spanning tree to refresh */ - if (event == NETDEV_CHANGEADDR || event == NETDEV_UP || - event == NETDEV_CHANGE || event == NETDEV_DOWN) - br_ifinfo_notify(RTM_NEWLINK, p); - - return NOTIFY_DONE; -} diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 045d56eaeca..23caf5b0309 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -35,6 +35,8 @@ #define BR_GROUPFWD_DEFAULT 0 /* Don't allow forwarding control protocols like STP and LLDP */ #define BR_GROUPFWD_RESTRICTED 0x4007u +/* The Nearest Customer Bridge Group Address, 01-80-C2-00-00-[00,0B,0C,0D,0F] */ +#define BR_GROUPFWD_8021AD 0xB801u /* Path to usermode spanning tree program */ #define BR_STP_PROG "/sbin/bridge-stp" @@ -46,38 +48,32 @@ typedef __u16 port_id; struct bridge_id { unsigned char prio[2]; - unsigned char addr[6]; + unsigned char addr[ETH_ALEN]; }; struct mac_addr { - unsigned char addr[6]; -}; - -struct br_ip -{ - union { - __be32 ip4; -#if IS_ENABLED(CONFIG_IPV6) - struct in6_addr ip6; -#endif - } u; - __be16 proto; - __u16 vid; + unsigned char addr[ETH_ALEN]; }; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING /* our own querier */ -struct bridge_mcast_query { +struct bridge_mcast_own_query { struct timer_list timer; u32 startup_sent; }; /* other querier */ -struct bridge_mcast_querier { +struct bridge_mcast_other_query { struct timer_list timer; unsigned long delay_time; }; + +/* selected querier */ +struct bridge_mcast_querier { + struct br_ip addr; + struct net_bridge_port __rcu *port; +}; #endif struct net_port_vlans { @@ -104,6 +100,7 @@ struct net_bridge_fdb_entry mac_addr addr; unsigned char is_local; unsigned char is_static; + unsigned char added_by_user; __u16 vlan_id; }; @@ -173,11 +170,13 @@ struct net_bridge_port #define BR_ADMIN_COST 0x00000010 #define BR_LEARNING 0x00000020 #define BR_FLOOD 0x00000040 +#define BR_AUTO_MASK (BR_FLOOD | BR_LEARNING) +#define BR_PROMISC 0x00000080 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING - struct bridge_mcast_query ip4_query; + struct bridge_mcast_own_query ip4_own_query; #if IS_ENABLED(CONFIG_IPV6) - struct bridge_mcast_query ip6_query; + struct bridge_mcast_own_query ip6_own_query; #endif /* IS_ENABLED(CONFIG_IPV6) */ unsigned char multicast_router; struct timer_list multicast_router_timer; @@ -197,6 +196,9 @@ struct net_bridge_port #endif }; +#define br_auto_port(p) ((p)->flags & BR_AUTO_MASK) +#define br_promisc_port(p) ((p)->flags & BR_PROMISC) + #define br_port_exists(dev) (dev->priv_flags & IFF_BRIDGE_PORT) static inline struct net_bridge_port *br_port_get_rcu(const struct net_device *dev) @@ -210,21 +212,13 @@ static inline struct net_bridge_port *br_port_get_rtnl(const struct net_device * rtnl_dereference(dev->rx_handler_data) : NULL; } -struct br_cpu_netstats { - u64 rx_packets; - u64 rx_bytes; - u64 tx_packets; - u64 tx_bytes; - struct u64_stats_sync syncp; -}; - struct net_bridge { spinlock_t lock; struct list_head port_list; struct net_device *dev; - struct br_cpu_netstats __percpu *stats; + struct pcpu_sw_netstats __percpu *stats; spinlock_t hash_lock; struct hlist_head hash[BR_HASH_SIZE]; #ifdef CONFIG_BRIDGE_NETFILTER @@ -234,6 +228,7 @@ struct net_bridge bool nf_call_arptables; #endif u16 group_fwd_mask; + u16 group_fwd_mask_required; /* STP */ bridge_id designated_root; @@ -248,6 +243,7 @@ struct net_bridge unsigned long bridge_forward_delay; u8 group_addr[ETH_ALEN]; + bool 
group_addr_set; u16 root_port; enum { @@ -284,11 +280,13 @@ struct net_bridge struct hlist_head router_list; struct timer_list multicast_router_timer; + struct bridge_mcast_other_query ip4_other_query; + struct bridge_mcast_own_query ip4_own_query; struct bridge_mcast_querier ip4_querier; - struct bridge_mcast_query ip4_query; #if IS_ENABLED(CONFIG_IPV6) + struct bridge_mcast_other_query ip6_other_query; + struct bridge_mcast_own_query ip6_own_query; struct bridge_mcast_querier ip6_querier; - struct bridge_mcast_query ip6_query; #endif /* IS_ENABLED(CONFIG_IPV6) */ #endif @@ -297,8 +295,10 @@ struct net_bridge struct timer_list topology_change_timer; struct timer_list gc_timer; struct kobject *ifobj; + u32 auto_cnt; #ifdef CONFIG_BRIDGE_VLAN_FILTERING u8 vlan_enabled; + __be16 vlan_proto; struct net_port_vlans __rcu *vlan_info; #endif }; @@ -334,8 +334,6 @@ struct br_input_skb_cb { #define br_debug(br, format, args...) \ pr_debug("%s: " format, (br)->dev->name, ##args) -extern struct notifier_block br_device_notifier; - /* called under bridge lock */ static inline int br_is_root_bridge(const struct net_bridge *br) { @@ -356,7 +354,7 @@ static inline void br_netpoll_send_skb(const struct net_bridge_port *p, netpoll_send_skb(np, skb); } -int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp); +int br_netpoll_enable(struct net_bridge_port *p); void br_netpoll_disable(struct net_bridge_port *p); #else static inline void br_netpoll_send_skb(const struct net_bridge_port *p, @@ -364,7 +362,7 @@ static inline void br_netpoll_send_skb(const struct net_bridge_port *p, { } -static inline int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp) +static inline int br_netpoll_enable(struct net_bridge_port *p) { return 0; } @@ -378,6 +376,9 @@ static inline void br_netpoll_disable(struct net_bridge_port *p) int br_fdb_init(void); void br_fdb_fini(void); void br_fdb_flush(struct net_bridge *br); +void br_fdb_find_delete_local(struct net_bridge *br, + const struct net_bridge_port *p, + const unsigned char *addr, u16 vid); void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr); void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr); void br_fdb_cleanup(unsigned long arg); @@ -391,8 +392,7 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count, int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid); void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid); -int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr, u16 vid); + const unsigned char *addr, u16 vid, bool added_by_user); int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr); @@ -400,6 +400,8 @@ int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 nlh_flags); int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, int idx); +int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p); +void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p); /* br_forward.c */ void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb); @@ -415,12 +417,13 @@ void br_flood_forward(struct net_bridge *br, struct sk_buff *skb, void br_port_carrier_check(struct net_bridge_port *p); int br_add_bridge(struct net *net, const char *name); int br_del_bridge(struct net *net, const char *name); -void 
br_net_exit(struct net *net); int br_add_if(struct net_bridge *br, struct net_device *dev); int br_del_if(struct net_bridge *br, struct net_device *dev); int br_min_mtu(const struct net_bridge *br); netdev_features_t br_features_recompute(struct net_bridge *br, netdev_features_t features); +void br_port_flags_change(struct net_bridge_port *port, unsigned long mask); +void br_manage_promisc(struct net_bridge *br); /* br_input.c */ int br_handle_frame_finish(struct sk_buff *skb); @@ -491,7 +494,7 @@ static inline bool br_multicast_is_router(struct net_bridge *br) static inline bool __br_multicast_querier_exists(struct net_bridge *br, - struct bridge_mcast_querier *querier) + struct bridge_mcast_other_query *querier) { return time_is_before_jiffies(querier->delay_time) && (br->multicast_querier || timer_pending(&querier->timer)); @@ -502,10 +505,10 @@ static inline bool br_multicast_querier_exists(struct net_bridge *br, { switch (eth->h_proto) { case (htons(ETH_P_IP)): - return __br_multicast_querier_exists(br, &br->ip4_querier); + return __br_multicast_querier_exists(br, &br->ip4_other_query); #if IS_ENABLED(CONFIG_IPV6) case (htons(ETH_P_IPV6)): - return __br_multicast_querier_exists(br, &br->ip6_querier); + return __br_multicast_querier_exists(br, &br->ip6_other_query); #endif default: return false; @@ -587,13 +590,18 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, struct sk_buff *skb, u16 *vid); bool br_allowed_egress(struct net_bridge *br, const struct net_port_vlans *v, const struct sk_buff *skb); +bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid); struct sk_buff *br_handle_vlan(struct net_bridge *br, const struct net_port_vlans *v, struct sk_buff *skb); int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags); int br_vlan_delete(struct net_bridge *br, u16 vid); void br_vlan_flush(struct net_bridge *br); +bool br_vlan_find(struct net_bridge *br, u16 vid); +void br_recalculate_fwd_mask(struct net_bridge *br); int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); +int br_vlan_set_proto(struct net_bridge *br, unsigned long val); +void br_vlan_init(struct net_bridge *br); int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags); int nbp_vlan_delete(struct net_bridge_port *port, u16 vid); void nbp_vlan_flush(struct net_bridge_port *port); @@ -637,6 +645,10 @@ static inline u16 br_get_pvid(const struct net_port_vlans *v) return v->pvid ?: VLAN_N_VID; } +static inline int br_vlan_enabled(struct net_bridge *br) +{ + return br->vlan_enabled; +} #else static inline bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, @@ -653,6 +665,12 @@ static inline bool br_allowed_egress(struct net_bridge *br, return true; } +static inline bool br_should_learn(struct net_bridge_port *p, + struct sk_buff *skb, u16 *vid) +{ + return true; +} + static inline struct sk_buff *br_handle_vlan(struct net_bridge *br, const struct net_port_vlans *v, struct sk_buff *skb) @@ -674,6 +692,19 @@ static inline void br_vlan_flush(struct net_bridge *br) { } +static inline bool br_vlan_find(struct net_bridge *br, u16 vid) +{ + return false; +} + +static inline void br_recalculate_fwd_mask(struct net_bridge *br) +{ +} + +static inline void br_vlan_init(struct net_bridge *br) +{ +} + static inline int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags) { return -EOPNOTSUPP; @@ -712,6 +743,11 @@ static inline u16 br_get_pvid(const struct net_port_vlans *v) { return VLAN_N_VID; /* Returns invalid vid */ } + +static 
inline int br_vlan_enabled(struct net_bridge *br) +{ + return 0; +} #endif /* br_netfilter.c */ @@ -721,7 +757,7 @@ void br_netfilter_fini(void); void br_netfilter_rtable_init(struct net_bridge *); #else #define br_netfilter_init() (0) -#define br_netfilter_fini() do { } while(0) +#define br_netfilter_fini() do { } while (0) #define br_netfilter_rtable_init(x) #endif diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 656a6f3e40d..189ba1e7d85 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -194,6 +194,8 @@ void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr) wasroot = br_is_root_bridge(br); + br_fdb_change_mac_address(br, addr); + memcpy(oldaddr, br->bridge_id.addr, ETH_ALEN); memcpy(br->bridge_id.addr, addr, ETH_ALEN); memcpy(br->dev->dev_addr, addr, ETH_ALEN); diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c index 950663d4d33..558c46d19e0 100644 --- a/net/bridge/br_stp_timer.c +++ b/net/bridge/br_stp_timer.c @@ -110,7 +110,7 @@ static void br_tcn_timer_expired(unsigned long arg) if (!br_is_root_bridge(br) && (br->dev->flags & IFF_UP)) { br_transmit_tcn(br); - mod_timer(&br->tcn_timer,jiffies + br->bridge_hello_time); + mod_timer(&br->tcn_timer, jiffies + br->bridge_hello_time); } spin_unlock(&br->lock); } diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 3b9637fb793..c9e2572b15f 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -49,53 +49,51 @@ static ssize_t store_bridge_parm(struct device *d, } -static ssize_t show_forward_delay(struct device *d, +static ssize_t forward_delay_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay)); } -static ssize_t store_forward_delay(struct device *d, +static ssize_t forward_delay_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_set_forward_delay); } -static DEVICE_ATTR(forward_delay, S_IRUGO | S_IWUSR, - show_forward_delay, store_forward_delay); +static DEVICE_ATTR_RW(forward_delay); -static ssize_t show_hello_time(struct device *d, struct device_attribute *attr, +static ssize_t hello_time_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%lu\n", jiffies_to_clock_t(to_bridge(d)->hello_time)); } -static ssize_t store_hello_time(struct device *d, +static ssize_t hello_time_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_set_hello_time); } -static DEVICE_ATTR(hello_time, S_IRUGO | S_IWUSR, show_hello_time, - store_hello_time); +static DEVICE_ATTR_RW(hello_time); -static ssize_t show_max_age(struct device *d, struct device_attribute *attr, +static ssize_t max_age_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%lu\n", jiffies_to_clock_t(to_bridge(d)->max_age)); } -static ssize_t store_max_age(struct device *d, struct device_attribute *attr, +static ssize_t max_age_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_set_max_age); } -static DEVICE_ATTR(max_age, S_IRUGO | S_IWUSR, show_max_age, store_max_age); +static DEVICE_ATTR_RW(max_age); -static ssize_t show_ageing_time(struct device *d, +static ssize_t ageing_time_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = 
to_bridge(d); @@ -108,16 +106,15 @@ static int set_ageing_time(struct net_bridge *br, unsigned long val) return 0; } -static ssize_t store_ageing_time(struct device *d, +static ssize_t ageing_time_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_ageing_time); } -static DEVICE_ATTR(ageing_time, S_IRUGO | S_IWUSR, show_ageing_time, - store_ageing_time); +static DEVICE_ATTR_RW(ageing_time); -static ssize_t show_stp_state(struct device *d, +static ssize_t stp_state_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); @@ -125,7 +122,7 @@ static ssize_t show_stp_state(struct device *d, } -static ssize_t store_stp_state(struct device *d, +static ssize_t stp_state_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { @@ -147,20 +144,21 @@ static ssize_t store_stp_state(struct device *d, return len; } -static DEVICE_ATTR(stp_state, S_IRUGO | S_IWUSR, show_stp_state, - store_stp_state); +static DEVICE_ATTR_RW(stp_state); -static ssize_t show_group_fwd_mask(struct device *d, - struct device_attribute *attr, char *buf) +static ssize_t group_fwd_mask_show(struct device *d, + struct device_attribute *attr, + char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%#x\n", br->group_fwd_mask); } -static ssize_t store_group_fwd_mask(struct device *d, - struct device_attribute *attr, const char *buf, - size_t len) +static ssize_t group_fwd_mask_store(struct device *d, + struct device_attribute *attr, + const char *buf, + size_t len) { struct net_bridge *br = to_bridge(d); char *endp; @@ -180,10 +178,9 @@ static ssize_t store_group_fwd_mask(struct device *d, return len; } -static DEVICE_ATTR(group_fwd_mask, S_IRUGO | S_IWUSR, show_group_fwd_mask, - store_group_fwd_mask); +static DEVICE_ATTR_RW(group_fwd_mask); -static ssize_t show_priority(struct device *d, struct device_attribute *attr, +static ssize_t priority_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); @@ -197,93 +194,91 @@ static int set_priority(struct net_bridge *br, unsigned long val) return 0; } -static ssize_t store_priority(struct device *d, struct device_attribute *attr, - const char *buf, size_t len) +static ssize_t priority_store(struct device *d, struct device_attribute *attr, + const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_priority); } -static DEVICE_ATTR(priority, S_IRUGO | S_IWUSR, show_priority, store_priority); +static DEVICE_ATTR_RW(priority); -static ssize_t show_root_id(struct device *d, struct device_attribute *attr, +static ssize_t root_id_show(struct device *d, struct device_attribute *attr, char *buf) { return br_show_bridge_id(buf, &to_bridge(d)->designated_root); } -static DEVICE_ATTR(root_id, S_IRUGO, show_root_id, NULL); +static DEVICE_ATTR_RO(root_id); -static ssize_t show_bridge_id(struct device *d, struct device_attribute *attr, +static ssize_t bridge_id_show(struct device *d, struct device_attribute *attr, char *buf) { return br_show_bridge_id(buf, &to_bridge(d)->bridge_id); } -static DEVICE_ATTR(bridge_id, S_IRUGO, show_bridge_id, NULL); +static DEVICE_ATTR_RO(bridge_id); -static ssize_t show_root_port(struct device *d, struct device_attribute *attr, +static ssize_t root_port_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", to_bridge(d)->root_port); } -static DEVICE_ATTR(root_port, S_IRUGO, show_root_port, NULL); 
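Editor's note on the sysfs conversion above: each handler pair is renamed to <name>_show()/<name>_store() so the DEVICE_ATTR_RW()/DEVICE_ATTR_RO()/DEVICE_ATTR_WO() helpers can derive the callback names by token pasting. As a rough sketch (the authoritative macro lives in include/linux/device.h; the expansion below is an approximation, not copied from this tree), DEVICE_ATTR_RW(forward_delay) behaves like:

static struct device_attribute dev_attr_forward_delay =
	__ATTR(forward_delay, S_IWUSR | S_IRUGO,
	       forward_delay_show, forward_delay_store);

Attributes whose callbacks keep legacy names therefore cannot use the short macros, which is why every show/store function in this file is renamed in the same patch.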
+static DEVICE_ATTR_RO(root_port); -static ssize_t show_root_path_cost(struct device *d, +static ssize_t root_path_cost_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", to_bridge(d)->root_path_cost); } -static DEVICE_ATTR(root_path_cost, S_IRUGO, show_root_path_cost, NULL); +static DEVICE_ATTR_RO(root_path_cost); -static ssize_t show_topology_change(struct device *d, +static ssize_t topology_change_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", to_bridge(d)->topology_change); } -static DEVICE_ATTR(topology_change, S_IRUGO, show_topology_change, NULL); +static DEVICE_ATTR_RO(topology_change); -static ssize_t show_topology_change_detected(struct device *d, +static ssize_t topology_change_detected_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->topology_change_detected); } -static DEVICE_ATTR(topology_change_detected, S_IRUGO, - show_topology_change_detected, NULL); +static DEVICE_ATTR_RO(topology_change_detected); -static ssize_t show_hello_timer(struct device *d, +static ssize_t hello_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->hello_timer)); } -static DEVICE_ATTR(hello_timer, S_IRUGO, show_hello_timer, NULL); +static DEVICE_ATTR_RO(hello_timer); -static ssize_t show_tcn_timer(struct device *d, struct device_attribute *attr, +static ssize_t tcn_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->tcn_timer)); } -static DEVICE_ATTR(tcn_timer, S_IRUGO, show_tcn_timer, NULL); +static DEVICE_ATTR_RO(tcn_timer); -static ssize_t show_topology_change_timer(struct device *d, +static ssize_t topology_change_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->topology_change_timer)); } -static DEVICE_ATTR(topology_change_timer, S_IRUGO, show_topology_change_timer, - NULL); +static DEVICE_ATTR_RO(topology_change_timer); -static ssize_t show_gc_timer(struct device *d, struct device_attribute *attr, +static ssize_t gc_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->gc_timer)); } -static DEVICE_ATTR(gc_timer, S_IRUGO, show_gc_timer, NULL); +static DEVICE_ATTR_RO(gc_timer); -static ssize_t show_group_addr(struct device *d, +static ssize_t group_addr_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); @@ -293,7 +288,7 @@ static ssize_t show_group_addr(struct device *d, br->group_addr[4], br->group_addr[5]); } -static ssize_t store_group_addr(struct device *d, +static ssize_t group_addr_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { @@ -317,17 +312,25 @@ static ssize_t store_group_addr(struct device *d, new_addr[5] == 3) /* 802.1X PAE address */ return -EINVAL; + if (!rtnl_trylock()) + return restart_syscall(); + spin_lock_bh(&br->lock); for (i = 0; i < 6; i++) br->group_addr[i] = new_addr[i]; spin_unlock_bh(&br->lock); + + br->group_addr_set = true; + br_recalculate_fwd_mask(br); + + rtnl_unlock(); + return len; } -static DEVICE_ATTR(group_addr, S_IRUGO | S_IWUSR, - show_group_addr, store_group_addr); 
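Editor's note on group_addr_store() above: it now has to hold the RTNL before updating group_addr and recalculating the forwarding mask, and it uses the usual sysfs idiom of rtnl_trylock() plus restart_syscall(), so the write(2) is retried from userspace rather than sleeping on the lock inside a sysfs store. A hedged outline of that pattern (example_store() and the precise locking requirements are illustrative assumptions, not part of this patch):

static ssize_t example_store(struct device *d, struct device_attribute *attr,
			     const char *buf, size_t len)
{
	struct net_bridge *br = to_bridge(d);

	if (!rtnl_trylock())
		return restart_syscall();	/* let userspace retry the write */

	spin_lock_bh(&br->lock);
	/* ... update bridge state guarded by br->lock ... */
	spin_unlock_bh(&br->lock);

	br_recalculate_fwd_mask(br);	/* called under RTNL, as in group_addr_store() */

	rtnl_unlock();
	return len;
}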
+static DEVICE_ATTR_RW(group_addr); -static ssize_t store_flush(struct device *d, +static ssize_t flush_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { @@ -339,26 +342,25 @@ static ssize_t store_flush(struct device *d, br_fdb_flush(br); return len; } -static DEVICE_ATTR(flush, S_IWUSR, NULL, store_flush); +static DEVICE_ATTR_WO(flush); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING -static ssize_t show_multicast_router(struct device *d, +static ssize_t multicast_router_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->multicast_router); } -static ssize_t store_multicast_router(struct device *d, +static ssize_t multicast_router_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_multicast_set_router); } -static DEVICE_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router, - store_multicast_router); +static DEVICE_ATTR_RW(multicast_router); -static ssize_t show_multicast_snooping(struct device *d, +static ssize_t multicast_snooping_show(struct device *d, struct device_attribute *attr, char *buf) { @@ -366,18 +368,17 @@ static ssize_t show_multicast_snooping(struct device *d, return sprintf(buf, "%d\n", !br->multicast_disabled); } -static ssize_t store_multicast_snooping(struct device *d, +static ssize_t multicast_snooping_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_multicast_toggle); } -static DEVICE_ATTR(multicast_snooping, S_IRUGO | S_IWUSR, - show_multicast_snooping, store_multicast_snooping); +static DEVICE_ATTR_RW(multicast_snooping); -static ssize_t show_multicast_query_use_ifaddr(struct device *d, - struct device_attribute *attr, - char *buf) +static ssize_t multicast_query_use_ifaddr_show(struct device *d, + struct device_attribute *attr, + char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->multicast_query_use_ifaddr); @@ -390,17 +391,15 @@ static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val) } static ssize_t -store_multicast_query_use_ifaddr(struct device *d, +multicast_query_use_ifaddr_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_query_use_ifaddr); } -static DEVICE_ATTR(multicast_query_use_ifaddr, S_IRUGO | S_IWUSR, - show_multicast_query_use_ifaddr, - store_multicast_query_use_ifaddr); +static DEVICE_ATTR_RW(multicast_query_use_ifaddr); -static ssize_t show_multicast_querier(struct device *d, +static ssize_t multicast_querier_show(struct device *d, struct device_attribute *attr, char *buf) { @@ -408,16 +407,15 @@ static ssize_t show_multicast_querier(struct device *d, return sprintf(buf, "%d\n", br->multicast_querier); } -static ssize_t store_multicast_querier(struct device *d, +static ssize_t multicast_querier_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_multicast_set_querier); } -static DEVICE_ATTR(multicast_querier, S_IRUGO | S_IWUSR, - show_multicast_querier, store_multicast_querier); +static DEVICE_ATTR_RW(multicast_querier); -static ssize_t show_hash_elasticity(struct device *d, +static ssize_t hash_elasticity_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); @@ -430,31 +428,29 @@ static int set_elasticity(struct net_bridge *br, 
unsigned long val) return 0; } -static ssize_t store_hash_elasticity(struct device *d, +static ssize_t hash_elasticity_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_elasticity); } -static DEVICE_ATTR(hash_elasticity, S_IRUGO | S_IWUSR, show_hash_elasticity, - store_hash_elasticity); +static DEVICE_ATTR_RW(hash_elasticity); -static ssize_t show_hash_max(struct device *d, struct device_attribute *attr, +static ssize_t hash_max_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->hash_max); } -static ssize_t store_hash_max(struct device *d, struct device_attribute *attr, +static ssize_t hash_max_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_multicast_set_hash_max); } -static DEVICE_ATTR(hash_max, S_IRUGO | S_IWUSR, show_hash_max, - store_hash_max); +static DEVICE_ATTR_RW(hash_max); -static ssize_t show_multicast_last_member_count(struct device *d, +static ssize_t multicast_last_member_count_show(struct device *d, struct device_attribute *attr, char *buf) { @@ -468,17 +464,15 @@ static int set_last_member_count(struct net_bridge *br, unsigned long val) return 0; } -static ssize_t store_multicast_last_member_count(struct device *d, +static ssize_t multicast_last_member_count_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_last_member_count); } -static DEVICE_ATTR(multicast_last_member_count, S_IRUGO | S_IWUSR, - show_multicast_last_member_count, - store_multicast_last_member_count); +static DEVICE_ATTR_RW(multicast_last_member_count); -static ssize_t show_multicast_startup_query_count( +static ssize_t multicast_startup_query_count_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); @@ -491,17 +485,15 @@ static int set_startup_query_count(struct net_bridge *br, unsigned long val) return 0; } -static ssize_t store_multicast_startup_query_count( +static ssize_t multicast_startup_query_count_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_startup_query_count); } -static DEVICE_ATTR(multicast_startup_query_count, S_IRUGO | S_IWUSR, - show_multicast_startup_query_count, - store_multicast_startup_query_count); +static DEVICE_ATTR_RW(multicast_startup_query_count); -static ssize_t show_multicast_last_member_interval( +static ssize_t multicast_last_member_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); @@ -515,17 +507,15 @@ static int set_last_member_interval(struct net_bridge *br, unsigned long val) return 0; } -static ssize_t store_multicast_last_member_interval( +static ssize_t multicast_last_member_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_last_member_interval); } -static DEVICE_ATTR(multicast_last_member_interval, S_IRUGO | S_IWUSR, - show_multicast_last_member_interval, - store_multicast_last_member_interval); +static DEVICE_ATTR_RW(multicast_last_member_interval); -static ssize_t show_multicast_membership_interval( +static ssize_t multicast_membership_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); @@ -539,17 +529,15 
@@ static int set_membership_interval(struct net_bridge *br, unsigned long val) return 0; } -static ssize_t store_multicast_membership_interval( +static ssize_t multicast_membership_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_membership_interval); } -static DEVICE_ATTR(multicast_membership_interval, S_IRUGO | S_IWUSR, - show_multicast_membership_interval, - store_multicast_membership_interval); +static DEVICE_ATTR_RW(multicast_membership_interval); -static ssize_t show_multicast_querier_interval(struct device *d, +static ssize_t multicast_querier_interval_show(struct device *d, struct device_attribute *attr, char *buf) { @@ -564,17 +552,15 @@ static int set_querier_interval(struct net_bridge *br, unsigned long val) return 0; } -static ssize_t store_multicast_querier_interval(struct device *d, +static ssize_t multicast_querier_interval_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_querier_interval); } -static DEVICE_ATTR(multicast_querier_interval, S_IRUGO | S_IWUSR, - show_multicast_querier_interval, - store_multicast_querier_interval); +static DEVICE_ATTR_RW(multicast_querier_interval); -static ssize_t show_multicast_query_interval(struct device *d, +static ssize_t multicast_query_interval_show(struct device *d, struct device_attribute *attr, char *buf) { @@ -589,17 +575,15 @@ static int set_query_interval(struct net_bridge *br, unsigned long val) return 0; } -static ssize_t store_multicast_query_interval(struct device *d, +static ssize_t multicast_query_interval_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_query_interval); } -static DEVICE_ATTR(multicast_query_interval, S_IRUGO | S_IWUSR, - show_multicast_query_interval, - store_multicast_query_interval); +static DEVICE_ATTR_RW(multicast_query_interval); -static ssize_t show_multicast_query_response_interval( +static ssize_t multicast_query_response_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); @@ -614,17 +598,15 @@ static int set_query_response_interval(struct net_bridge *br, unsigned long val) return 0; } -static ssize_t store_multicast_query_response_interval( +static ssize_t multicast_query_response_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_query_response_interval); } -static DEVICE_ATTR(multicast_query_response_interval, S_IRUGO | S_IWUSR, - show_multicast_query_response_interval, - store_multicast_query_response_interval); +static DEVICE_ATTR_RW(multicast_query_response_interval); -static ssize_t show_multicast_startup_query_interval( +static ssize_t multicast_startup_query_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); @@ -639,18 +621,16 @@ static int set_startup_query_interval(struct net_bridge *br, unsigned long val) return 0; } -static ssize_t store_multicast_startup_query_interval( +static ssize_t multicast_startup_query_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_startup_query_interval); } -static DEVICE_ATTR(multicast_startup_query_interval, S_IRUGO | S_IWUSR, - show_multicast_startup_query_interval, - store_multicast_startup_query_interval); +static 
DEVICE_ATTR_RW(multicast_startup_query_interval); #endif #ifdef CONFIG_BRIDGE_NETFILTER -static ssize_t show_nf_call_iptables( +static ssize_t nf_call_iptables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); @@ -663,16 +643,15 @@ static int set_nf_call_iptables(struct net_bridge *br, unsigned long val) return 0; } -static ssize_t store_nf_call_iptables( +static ssize_t nf_call_iptables_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_nf_call_iptables); } -static DEVICE_ATTR(nf_call_iptables, S_IRUGO | S_IWUSR, - show_nf_call_iptables, store_nf_call_iptables); +static DEVICE_ATTR_RW(nf_call_iptables); -static ssize_t show_nf_call_ip6tables( +static ssize_t nf_call_ip6tables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); @@ -685,16 +664,15 @@ static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val) return 0; } -static ssize_t store_nf_call_ip6tables( +static ssize_t nf_call_ip6tables_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_nf_call_ip6tables); } -static DEVICE_ATTR(nf_call_ip6tables, S_IRUGO | S_IWUSR, - show_nf_call_ip6tables, store_nf_call_ip6tables); +static DEVICE_ATTR_RW(nf_call_ip6tables); -static ssize_t show_nf_call_arptables( +static ssize_t nf_call_arptables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); @@ -707,17 +685,16 @@ static int set_nf_call_arptables(struct net_bridge *br, unsigned long val) return 0; } -static ssize_t store_nf_call_arptables( +static ssize_t nf_call_arptables_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_nf_call_arptables); } -static DEVICE_ATTR(nf_call_arptables, S_IRUGO | S_IWUSR, - show_nf_call_arptables, store_nf_call_arptables); +static DEVICE_ATTR_RW(nf_call_arptables); #endif #ifdef CONFIG_BRIDGE_VLAN_FILTERING -static ssize_t show_vlan_filtering(struct device *d, +static ssize_t vlan_filtering_show(struct device *d, struct device_attribute *attr, char *buf) { @@ -725,14 +702,29 @@ static ssize_t show_vlan_filtering(struct device *d, return sprintf(buf, "%d\n", br->vlan_enabled); } -static ssize_t store_vlan_filtering(struct device *d, +static ssize_t vlan_filtering_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_vlan_filter_toggle); } -static DEVICE_ATTR(vlan_filtering, S_IRUGO | S_IWUSR, - show_vlan_filtering, store_vlan_filtering); +static DEVICE_ATTR_RW(vlan_filtering); + +static ssize_t vlan_protocol_show(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%#06x\n", ntohs(br->vlan_proto)); +} + +static ssize_t vlan_protocol_store(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, br_vlan_set_proto); +} +static DEVICE_ATTR_RW(vlan_protocol); #endif static struct attribute *bridge_attrs[] = { @@ -778,6 +770,7 @@ static struct attribute *bridge_attrs[] = { #endif #ifdef CONFIG_BRIDGE_VLAN_FILTERING &dev_attr_vlan_filtering.attr, + &dev_attr_vlan_protocol.attr, #endif NULL }; diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 2a2cdb756d5..e561cd59b8a 100644 --- 
a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -26,7 +26,7 @@ struct brport_attribute { int (*store)(struct net_bridge_port *, unsigned long); }; -#define BRPORT_ATTR(_name,_mode,_show,_store) \ +#define BRPORT_ATTR(_name, _mode, _show, _store) \ const struct brport_attribute brport_attr_##_name = { \ .attr = {.name = __stringify(_name), \ .mode = _mode }, \ @@ -41,20 +41,30 @@ static ssize_t show_##_name(struct net_bridge_port *p, char *buf) \ } \ static int store_##_name(struct net_bridge_port *p, unsigned long v) \ { \ - unsigned long flags = p->flags; \ - if (v) \ - flags |= _mask; \ - else \ - flags &= ~_mask; \ - if (flags != p->flags) { \ - p->flags = flags; \ - br_ifinfo_notify(RTM_NEWLINK, p); \ - } \ - return 0; \ + return store_flag(p, v, _mask); \ } \ static BRPORT_ATTR(_name, S_IRUGO | S_IWUSR, \ show_##_name, store_##_name) +static int store_flag(struct net_bridge_port *p, unsigned long v, + unsigned long mask) +{ + unsigned long flags; + + flags = p->flags; + + if (v) + flags |= mask; + else + flags &= ~mask; + + if (flags != p->flags) { + p->flags = flags; + br_port_flags_change(p, mask); + br_ifinfo_notify(RTM_NEWLINK, p); + } + return 0; +} static ssize_t show_path_cost(struct net_bridge_port *p, char *buf) { @@ -209,21 +219,21 @@ static const struct brport_attribute *brport_attrs[] = { #define to_brport_attr(_at) container_of(_at, struct brport_attribute, attr) #define to_brport(obj) container_of(obj, struct net_bridge_port, kobj) -static ssize_t brport_show(struct kobject * kobj, - struct attribute * attr, char * buf) +static ssize_t brport_show(struct kobject *kobj, + struct attribute *attr, char *buf) { - struct brport_attribute * brport_attr = to_brport_attr(attr); - struct net_bridge_port * p = to_brport(kobj); + struct brport_attribute *brport_attr = to_brport_attr(attr); + struct net_bridge_port *p = to_brport(kobj); return brport_attr->show(p, buf); } -static ssize_t brport_store(struct kobject * kobj, - struct attribute * attr, - const char * buf, size_t count) +static ssize_t brport_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t count) { - struct brport_attribute * brport_attr = to_brport_attr(attr); - struct net_bridge_port * p = to_brport(kobj); + struct brport_attribute *brport_attr = to_brport_attr(attr); + struct net_bridge_port *p = to_brport(kobj); ssize_t ret = -EINVAL; char *endp; unsigned long val; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index af5ebd18d70..2b2774fe070 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -60,7 +60,7 @@ static int __vlan_add(struct net_port_vlans *v, u16 vid, u16 flags) * that ever changes this code will allow tagged * traffic to enter the bridge. 
*/ - err = vlan_vid_add(dev, htons(ETH_P_8021Q), vid); + err = vlan_vid_add(dev, br->vlan_proto, vid); if (err) return err; } @@ -80,7 +80,7 @@ static int __vlan_add(struct net_port_vlans *v, u16 vid, u16 flags) out_filt: if (p) - vlan_vid_del(dev, htons(ETH_P_8021Q), vid); + vlan_vid_del(dev, br->vlan_proto, vid); return err; } @@ -92,16 +92,18 @@ static int __vlan_del(struct net_port_vlans *v, u16 vid) __vlan_delete_pvid(v, vid); clear_bit(vid, v->untagged_bitmap); - if (v->port_idx) - vlan_vid_del(v->parent.port->dev, htons(ETH_P_8021Q), vid); + if (v->port_idx) { + struct net_bridge_port *p = v->parent.port; + vlan_vid_del(p->dev, p->br->vlan_proto, vid); + } clear_bit(vid, v->vlan_bitmap); v->num_vlans--; if (bitmap_empty(v->vlan_bitmap, VLAN_N_VID)) { if (v->port_idx) - rcu_assign_pointer(v->parent.port->vlan_info, NULL); + RCU_INIT_POINTER(v->parent.port->vlan_info, NULL); else - rcu_assign_pointer(v->parent.br->vlan_info, NULL); + RCU_INIT_POINTER(v->parent.br->vlan_info, NULL); kfree_rcu(v, rcu); } return 0; @@ -113,28 +115,12 @@ static void __vlan_flush(struct net_port_vlans *v) v->pvid = 0; bitmap_zero(v->vlan_bitmap, VLAN_N_VID); if (v->port_idx) - rcu_assign_pointer(v->parent.port->vlan_info, NULL); + RCU_INIT_POINTER(v->parent.port->vlan_info, NULL); else - rcu_assign_pointer(v->parent.br->vlan_info, NULL); + RCU_INIT_POINTER(v->parent.br->vlan_info, NULL); kfree_rcu(v, rcu); } -/* Strip the tag from the packet. Will return skb with tci set 0. */ -static struct sk_buff *br_vlan_untag(struct sk_buff *skb) -{ - if (skb->protocol != htons(ETH_P_8021Q)) { - skb->vlan_tci = 0; - return skb; - } - - skb->vlan_tci = 0; - skb = vlan_untag(skb); - if (skb) - skb->vlan_tci = 0; - - return skb; -} - struct sk_buff *br_handle_vlan(struct net_bridge *br, const struct net_port_vlans *pv, struct sk_buff *skb) @@ -144,34 +130,27 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, if (!br->vlan_enabled) goto out; + /* Vlan filter table must be configured at this point. The + * only exception is the bridge is set in promisc mode and the + * packet is destined for the bridge device. In this case + * pass the packet as is. + */ + if (!pv) { + if ((br->dev->flags & IFF_PROMISC) && skb->dev == br->dev) { + goto out; + } else { + kfree_skb(skb); + return NULL; + } + } + /* At this point, we know that the frame was filtered and contains * a valid vlan id. If the vlan id is set in the untagged bitmap, - * send untagged; otherwise, send taged. + * send untagged; otherwise, send tagged. */ br_vlan_get_tag(skb, &vid); if (test_bit(vid, pv->untagged_bitmap)) - skb = br_vlan_untag(skb); - else { - /* Egress policy says "send tagged". If output device - * is the bridge, we need to add the VLAN header - * ourselves since we'll be going through the RX path. - * Sending to ports puts the frame on the TX path and - * we let dev_hard_start_xmit() add the header. - */ - if (skb->protocol != htons(ETH_P_8021Q) && - pv->port_idx == 0) { - /* vlan_put_tag expects skb->data to point to - * mac header. - */ - skb_push(skb, ETH_HLEN); - skb = __vlan_put_tag(skb, skb->vlan_proto, skb->vlan_tci); - if (!skb) - goto out; - /* put skb->data back to where it was */ - skb_pull(skb, ETH_HLEN); - skb->vlan_tci = 0; - } - } + skb->vlan_tci = 0; out: return skb; @@ -181,7 +160,8 @@ out: bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, struct sk_buff *skb, u16 *vid) { - int err; + bool tagged; + __be16 proto; /* If VLAN filtering is disabled on the bridge, all packets are * permitted. 
@@ -193,9 +173,43 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, * rejected. */ if (!v) - return false; + goto drop; + + proto = br->vlan_proto; + + /* If vlan tx offload is disabled on bridge device and frame was + * sent from vlan device on the bridge device, it does not have + * HW accelerated vlan tag. + */ + if (unlikely(!vlan_tx_tag_present(skb) && + skb->protocol == proto)) { + skb = vlan_untag(skb); + if (unlikely(!skb)) + return false; + } + + if (!br_vlan_get_tag(skb, vid)) { + /* Tagged frame */ + if (skb->vlan_proto != proto) { + /* Protocol-mismatch, empty out vlan_tci for new tag */ + skb_push(skb, ETH_HLEN); + skb = __vlan_put_tag(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); + if (unlikely(!skb)) + return false; + + skb_pull(skb, ETH_HLEN); + skb_reset_mac_len(skb); + *vid = 0; + tagged = false; + } else { + tagged = true; + } + } else { + /* Untagged frame */ + tagged = false; + } - err = br_vlan_get_tag(skb, vid); if (!*vid) { u16 pvid = br_get_pvid(v); @@ -204,15 +218,15 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, * vlan untagged or priority-tagged traffic belongs to. */ if (pvid == VLAN_N_VID) - return false; + goto drop; /* PVID is set on this port. Any untagged or priority-tagged * ingress frame is considered to belong to this vlan. */ *vid = pvid; - if (likely(err)) + if (likely(!tagged)) /* Untagged Frame. */ - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), pvid); + __vlan_hwaccel_put_tag(skb, proto, pvid); else /* Priority-tagged Frame. * At this point, We know that skb->vlan_tci had @@ -227,7 +241,8 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, /* Frame had a valid vlan tag. See if vlan is allowed */ if (test_bit(*vid, v->vlan_bitmap)) return true; - +drop: + kfree_skb(skb); return false; } @@ -251,6 +266,36 @@ bool br_allowed_egress(struct net_bridge *br, return false; } +/* Called under RCU */ +bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) +{ + struct net_bridge *br = p->br; + struct net_port_vlans *v; + + if (!br->vlan_enabled) + return true; + + v = rcu_dereference(p->vlan_info); + if (!v) + return false; + + if (!br_vlan_get_tag(skb, vid) && skb->vlan_proto != br->vlan_proto) + *vid = 0; + + if (!*vid) { + *vid = br_get_pvid(v); + if (*vid == VLAN_N_VID) + return false; + + return true; + } + + if (test_bit(*vid, v->vlan_bitmap)) + return true; + + return false; +} + /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. */ @@ -296,9 +341,7 @@ int br_vlan_delete(struct net_bridge *br, u16 vid) if (!pv) return -EINVAL; - spin_lock_bh(&br->hash_lock); - fdb_delete_by_addr(br, br->dev->dev_addr, vid); - spin_unlock_bh(&br->hash_lock); + br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid); __vlan_del(pv, vid); return 0; @@ -316,6 +359,52 @@ void br_vlan_flush(struct net_bridge *br) __vlan_flush(pv); } +bool br_vlan_find(struct net_bridge *br, u16 vid) +{ + struct net_port_vlans *pv; + bool found = false; + + rcu_read_lock(); + pv = rcu_dereference(br->vlan_info); + + if (!pv) + goto out; + + if (test_bit(vid, pv->vlan_bitmap)) + found = true; + +out: + rcu_read_unlock(); + return found; +} + +/* Must be protected by RTNL. 
*/ +static void recalculate_group_addr(struct net_bridge *br) +{ + if (br->group_addr_set) + return; + + spin_lock_bh(&br->lock); + if (!br->vlan_enabled || br->vlan_proto == htons(ETH_P_8021Q)) { + /* Bridge Group Address */ + br->group_addr[5] = 0x00; + } else { /* vlan_enabled && ETH_P_8021AD */ + /* Provider Bridge Group Address */ + br->group_addr[5] = 0x08; + } + spin_unlock_bh(&br->lock); +} + +/* Must be protected by RTNL. */ +void br_recalculate_fwd_mask(struct net_bridge *br) +{ + if (!br->vlan_enabled || br->vlan_proto == htons(ETH_P_8021Q)) + br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT; + else /* vlan_enabled && ETH_P_8021AD */ + br->group_fwd_mask_required = BR_GROUPFWD_8021AD & + ~(1u << br->group_addr[5]); +} + int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) { if (!rtnl_trylock()) @@ -325,12 +414,88 @@ int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) goto unlock; br->vlan_enabled = val; + br_manage_promisc(br); + recalculate_group_addr(br); + br_recalculate_fwd_mask(br); unlock: rtnl_unlock(); return 0; } +int br_vlan_set_proto(struct net_bridge *br, unsigned long val) +{ + int err = 0; + struct net_bridge_port *p; + struct net_port_vlans *pv; + __be16 proto, oldproto; + u16 vid, errvid; + + if (val != ETH_P_8021Q && val != ETH_P_8021AD) + return -EPROTONOSUPPORT; + + if (!rtnl_trylock()) + return restart_syscall(); + + proto = htons(val); + if (br->vlan_proto == proto) + goto unlock; + + /* Add VLANs for the new proto to the device filter. */ + list_for_each_entry(p, &br->port_list, list) { + pv = rtnl_dereference(p->vlan_info); + if (!pv) + continue; + + for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { + err = vlan_vid_add(p->dev, proto, vid); + if (err) + goto err_filt; + } + } + + oldproto = br->vlan_proto; + br->vlan_proto = proto; + + recalculate_group_addr(br); + br_recalculate_fwd_mask(br); + + /* Delete VLANs for the old proto from the device filter. */ + list_for_each_entry(p, &br->port_list, list) { + pv = rtnl_dereference(p->vlan_info); + if (!pv) + continue; + + for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) + vlan_vid_del(p->dev, oldproto, vid); + } + +unlock: + rtnl_unlock(); + return err; + +err_filt: + errvid = vid; + for_each_set_bit(vid, pv->vlan_bitmap, errvid) + vlan_vid_del(p->dev, proto, vid); + + list_for_each_entry_continue_reverse(p, &br->port_list, list) { + pv = rtnl_dereference(p->vlan_info); + if (!pv) + continue; + + for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) + vlan_vid_del(p->dev, proto, vid); + } + + goto unlock; +} + +void br_vlan_init(struct net_bridge *br) +{ + br->vlan_proto = htons(ETH_P_8021Q); +} + /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. 
*/ @@ -380,9 +545,7 @@ int nbp_vlan_delete(struct net_bridge_port *port, u16 vid) if (!pv) return -EINVAL; - spin_lock_bh(&port->br->hash_lock); - fdb_delete_by_addr(port->br, port->dev->dev_addr, vid); - spin_unlock_bh(&port->br->hash_lock); + br_fdb_find_delete_local(port->br, port, port->dev->dev_addr, vid); return __vlan_del(pv, vid); } @@ -399,7 +562,7 @@ void nbp_vlan_flush(struct net_bridge_port *port) return; for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) - vlan_vid_del(port->dev, htons(ETH_P_8021Q), vid); + vlan_vid_del(port->dev, port->br->vlan_proto, vid); __vlan_flush(pv); } diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index 5ca74a0e595..629dc77874a 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -2,14 +2,23 @@ # Bridge netfilter configuration # # -config NF_TABLES_BRIDGE - depends on NF_TABLES +menuconfig NF_TABLES_BRIDGE + depends on BRIDGE && NETFILTER && NF_TABLES tristate "Ethernet Bridge nf_tables support" +if NF_TABLES_BRIDGE + +config NFT_BRIDGE_META + tristate "Netfilter nf_table bridge meta support" + depends on NFT_META + help + Add support for bridge dedicated meta key. + +endif # NF_TABLES_BRIDGE + menuconfig BRIDGE_NF_EBTABLES tristate "Ethernet Bridge tables (ebtables) support" - depends on BRIDGE && NETFILTER - select NETFILTER_XTABLES + depends on BRIDGE && NETFILTER && NETFILTER_XTABLES help ebtables is a general, extensible frame/packet identification framework. Say 'Y' or 'M' here if you want to do Ethernet diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile index ea7629f58b3..6f2f3943d66 100644 --- a/net/bridge/netfilter/Makefile +++ b/net/bridge/netfilter/Makefile @@ -3,6 +3,7 @@ # obj-$(CONFIG_NF_TABLES_BRIDGE) += nf_tables_bridge.o +obj-$(CONFIG_NFT_BRIDGE_META) += nft_meta_bridge.o obj-$(CONFIG_BRIDGE_NF_EBTABLES) += ebtables.o diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c index 3fb3c848aff..9024283d2bc 100644 --- a/net/bridge/netfilter/ebt_among.c +++ b/net/bridge/netfilter/ebt_among.c @@ -28,7 +28,7 @@ static bool ebt_mac_wormhash_contains(const struct ebt_mac_wormhash *wh, uint32_t cmp[2] = { 0, 0 }; int key = ((const unsigned char *)mac)[5]; - memcpy(((char *) cmp) + 2, mac, ETH_ALEN); + ether_addr_copy(((char *) cmp) + 2, mac); start = wh->table[key]; limit = wh->table[key + 1]; if (ip) { diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c index c59f7bfae6e..4e0b0c35932 100644 --- a/net/bridge/netfilter/ebt_dnat.c +++ b/net/bridge/netfilter/ebt_dnat.c @@ -22,7 +22,7 @@ ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par) if (!skb_make_writable(skb, 0)) return EBT_DROP; - memcpy(eth_hdr(skb)->h_dest, info->mac, ETH_ALEN); + ether_addr_copy(eth_hdr(skb)->h_dest, info->mac); return info->target; } diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 19c37a4929b..5322a36867a 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -96,7 +96,7 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, bitmask = NF_LOG_MASK; if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto == - htons(ETH_P_IP)){ + htons(ETH_P_IP)) { const struct iphdr *ih; struct iphdr _iph; diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c index 46624bb6d9b..203964997a5 100644 --- a/net/bridge/netfilter/ebt_redirect.c +++ b/net/bridge/netfilter/ebt_redirect.c @@ -25,10 +25,10 @@ ebt_redirect_tg(struct sk_buff *skb, 
const struct xt_action_param *par) if (par->hooknum != NF_BR_BROUTING) /* rcu_read_lock()ed by nf_hook_slow */ - memcpy(eth_hdr(skb)->h_dest, - br_port_get_rcu(par->in)->br->dev->dev_addr, ETH_ALEN); + ether_addr_copy(eth_hdr(skb)->h_dest, + br_port_get_rcu(par->in)->br->dev->dev_addr); else - memcpy(eth_hdr(skb)->h_dest, par->in->dev_addr, ETH_ALEN); + ether_addr_copy(eth_hdr(skb)->h_dest, par->in->dev_addr); skb->pkt_type = PACKET_HOST; return info->target; } diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c index f8f0bd1a1d5..e56ccd060d2 100644 --- a/net/bridge/netfilter/ebt_snat.c +++ b/net/bridge/netfilter/ebt_snat.c @@ -24,7 +24,7 @@ ebt_snat_tg(struct sk_buff *skb, const struct xt_action_param *par) if (!skb_make_writable(skb, 0)) return EBT_DROP; - memcpy(eth_hdr(skb)->h_source, info->mac, ETH_ALEN); + ether_addr_copy(eth_hdr(skb)->h_source, info->mac); if (!(info->target & NAT_ARP_BIT) && eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) { const struct arphdr *ap; @@ -35,7 +35,7 @@ ebt_snat_tg(struct sk_buff *skb, const struct xt_action_param *par) return EBT_DROP; if (ap->ar_hln != ETH_ALEN) goto out; - if (skb_store_bits(skb, sizeof(_ah), info->mac,ETH_ALEN)) + if (skb_store_bits(skb, sizeof(_ah), info->mac, ETH_ALEN)) return EBT_DROP; } out: diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index dbd1c783431..d2cdf5d6e98 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -23,8 +23,7 @@ static struct ebt_entries initial_chain = { .policy = EBT_ACCEPT, }; -static struct ebt_replace_kernel initial_table = -{ +static struct ebt_replace_kernel initial_table = { .name = "broute", .valid_hooks = 1 << NF_BR_BROUTING, .entries_size = sizeof(struct ebt_entries), @@ -41,8 +40,7 @@ static int check(const struct ebt_table_info *info, unsigned int valid_hooks) return 0; } -static const struct ebt_table broute_table = -{ +static const struct ebt_table broute_table = { .name = "broute", .table = &initial_table, .valid_hooks = 1 << NF_BR_BROUTING, diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index bb2da7b706e..ce205aabf9c 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -14,8 +14,7 @@ #define FILTER_VALID_HOOKS ((1 << NF_BR_LOCAL_IN) | (1 << NF_BR_FORWARD) | \ (1 << NF_BR_LOCAL_OUT)) -static struct ebt_entries initial_chains[] = -{ +static struct ebt_entries initial_chains[] = { { .name = "INPUT", .policy = EBT_ACCEPT, @@ -30,8 +29,7 @@ static struct ebt_entries initial_chains[] = }, }; -static struct ebt_replace_kernel initial_table = -{ +static struct ebt_replace_kernel initial_table = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .entries_size = 3 * sizeof(struct ebt_entries), @@ -50,8 +48,7 @@ static int check(const struct ebt_table_info *info, unsigned int valid_hooks) return 0; } -static const struct ebt_table frame_filter = -{ +static const struct ebt_table frame_filter = { .name = "filter", .table = &initial_table, .valid_hooks = FILTER_VALID_HOOKS, diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index bd238f1f105..a0ac2984fb6 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -14,8 +14,7 @@ #define NAT_VALID_HOOKS ((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT) | \ (1 << NF_BR_POST_ROUTING)) -static struct ebt_entries initial_chains[] = -{ +static struct ebt_entries initial_chains[] = { { .name = 
"PREROUTING", .policy = EBT_ACCEPT, @@ -30,8 +29,7 @@ static struct ebt_entries initial_chains[] = } }; -static struct ebt_replace_kernel initial_table = -{ +static struct ebt_replace_kernel initial_table = { .name = "nat", .valid_hooks = NAT_VALID_HOOKS, .entries_size = 3 * sizeof(struct ebt_entries), @@ -50,8 +48,7 @@ static int check(const struct ebt_table_info *info, unsigned int valid_hooks) return 0; } -static struct ebt_table frame_nat = -{ +static struct ebt_table frame_nat = { .name = "nat", .table = &initial_table, .valid_hooks = NAT_VALID_HOOKS, diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index ac780242838..1059ed3bc25 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -118,10 +118,10 @@ ebt_dev_check(const char *entry, const struct net_device *device) /* 1 is the wildcard token */ while (entry[i] != '\0' && entry[i] != 1 && entry[i] == devname[i]) i++; - return (devname[i] != entry[i] && entry[i] != 1); + return devname[i] != entry[i] && entry[i] != 1; } -#define FWINV2(bool,invflg) ((bool) ^ !!(e->invflags & invflg)) +#define FWINV2(bool, invflg) ((bool) ^ !!(e->invflags & invflg)) /* process standard matches */ static inline int ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, @@ -1044,10 +1044,9 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, if (repl->num_counters && copy_to_user(repl->counters, counterstmp, repl->num_counters * sizeof(struct ebt_counter))) { - ret = -EFAULT; + /* Silent error, can't fail, new table is already in place */ + net_warn_ratelimited("ebtables: counters copy to user failed while replacing table\n"); } - else - ret = 0; /* decrease module count and free resources */ EBT_ENTRY_ITERATE(table->entries, table->entries_size, @@ -1441,7 +1440,7 @@ static int copy_everything_to_user(struct ebt_table *t, void __user *user, return -EFAULT; if (*len != sizeof(struct ebt_replace) + entries_size + - (tmp.num_counters? nentries * sizeof(struct ebt_counter): 0)) + (tmp.num_counters ? 
nentries * sizeof(struct ebt_counter) : 0)) return -EINVAL; if (tmp.nentries != nentries) { @@ -1477,7 +1476,7 @@ static int do_ebt_set_ctl(struct sock *sk, if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; - switch(cmd) { + switch (cmd) { case EBT_SO_SET_ENTRIES: ret = do_replace(net, user, len); break; @@ -1507,10 +1506,10 @@ static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) if (!t) return ret; - switch(cmd) { + switch (cmd) { case EBT_SO_GET_INFO: case EBT_SO_GET_INIT_INFO: - if (*len != sizeof(struct ebt_replace)){ + if (*len != sizeof(struct ebt_replace)) { ret = -EINVAL; mutex_unlock(&ebt_mutex); break; @@ -1525,7 +1524,7 @@ static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) tmp.valid_hooks = t->table->valid_hooks; } mutex_unlock(&ebt_mutex); - if (copy_to_user(user, &tmp, *len) != 0){ + if (copy_to_user(user, &tmp, *len) != 0) { BUGPRINT("c2u Didn't work\n"); ret = -EFAULT; break; @@ -2375,8 +2374,7 @@ static int compat_do_ebt_get_ctl(struct sock *sk, int cmd, } #endif -static struct nf_sockopt_ops ebt_sockopts = -{ +static struct nf_sockopt_ops ebt_sockopts = { .pf = PF_INET, .set_optmin = EBT_BASE_CTL, .set_optmax = EBT_SO_SET_MAX + 1, diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c index cf54b22818c..5bcc0d8b31f 100644 --- a/net/bridge/netfilter/nf_tables_bridge.c +++ b/net/bridge/netfilter/nf_tables_bridge.c @@ -14,10 +14,30 @@ #include <linux/netfilter_bridge.h> #include <net/netfilter/nf_tables.h> +static unsigned int +nft_do_chain_bridge(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo(&pkt, ops, skb, in, out); + + return nft_do_chain(&pkt, ops); +} + static struct nft_af_info nft_af_bridge __read_mostly = { .family = NFPROTO_BRIDGE, .nhooks = NF_BR_NUMHOOKS, .owner = THIS_MODULE, + .nops = 1, + .hooks = { + [NF_BR_LOCAL_IN] = nft_do_chain_bridge, + [NF_BR_FORWARD] = nft_do_chain_bridge, + [NF_BR_LOCAL_OUT] = nft_do_chain_bridge, + }, }; static int nf_tables_bridge_init_net(struct net *net) @@ -48,32 +68,14 @@ static struct pernet_operations nf_tables_bridge_net_ops = { .exit = nf_tables_bridge_exit_net, }; -static unsigned int -nft_do_chain_bridge(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo(&pkt, ops, skb, in, out); - - return nft_do_chain_pktinfo(&pkt, ops); -} - -static struct nf_chain_type filter_bridge = { - .family = NFPROTO_BRIDGE, +static const struct nf_chain_type filter_bridge = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_BRIDGE, + .owner = THIS_MODULE, .hook_mask = (1 << NF_BR_LOCAL_IN) | (1 << NF_BR_FORWARD) | (1 << NF_BR_LOCAL_OUT), - .fn = { - [NF_BR_LOCAL_IN] = nft_do_chain_bridge, - [NF_BR_FORWARD] = nft_do_chain_bridge, - [NF_BR_LOCAL_OUT] = nft_do_chain_bridge, - }, }; static int __init nf_tables_bridge_init(void) diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c new file mode 100644 index 00000000000..4f02109d708 --- /dev/null +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2014 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 
as + * published by the Free Software Foundation. + * + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_meta.h> + +#include "../br_private.h" + +static void nft_meta_bridge_get_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + const struct net_device *in = pkt->in, *out = pkt->out; + struct nft_data *dest = &data[priv->dreg]; + const struct net_bridge_port *p; + + switch (priv->key) { + case NFT_META_BRI_IIFNAME: + if (in == NULL || (p = br_port_get_rcu(in)) == NULL) + goto err; + break; + case NFT_META_BRI_OIFNAME: + if (out == NULL || (p = br_port_get_rcu(out)) == NULL) + goto err; + break; + default: + goto out; + } + + strncpy((char *)dest->data, p->br->dev->name, sizeof(dest->data)); + return; +out: + return nft_meta_get_eval(expr, data, pkt); +err: + data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_meta *priv = nft_expr_priv(expr); + int err; + + priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); + switch (priv->key) { + case NFT_META_BRI_IIFNAME: + case NFT_META_BRI_OIFNAME: + break; + default: + return nft_meta_get_init(ctx, expr, tb); + } + + priv->dreg = ntohl(nla_get_be32(tb[NFTA_META_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + + err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); + if (err < 0) + return err; + + return 0; +} + +static struct nft_expr_type nft_meta_bridge_type; +static const struct nft_expr_ops nft_meta_bridge_get_ops = { + .type = &nft_meta_bridge_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), + .eval = nft_meta_bridge_get_eval, + .init = nft_meta_bridge_get_init, + .dump = nft_meta_get_dump, +}; + +static const struct nft_expr_ops nft_meta_bridge_set_ops = { + .type = &nft_meta_bridge_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), + .eval = nft_meta_set_eval, + .init = nft_meta_set_init, + .dump = nft_meta_set_dump, +}; + +static const struct nft_expr_ops * +nft_meta_bridge_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_META_KEY] == NULL) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG]) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_META_DREG]) + return &nft_meta_bridge_get_ops; + + if (tb[NFTA_META_SREG]) + return &nft_meta_bridge_set_ops; + + return ERR_PTR(-EINVAL); +} + +static struct nft_expr_type nft_meta_bridge_type __read_mostly = { + .family = NFPROTO_BRIDGE, + .name = "meta", + .select_ops = &nft_meta_bridge_select_ops, + .policy = nft_meta_policy, + .maxattr = NFTA_META_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_meta_bridge_module_init(void) +{ + return nft_register_expr(&nft_meta_bridge_type); +} + +static void __exit nft_meta_bridge_module_exit(void) +{ + nft_unregister_expr(&nft_meta_bridge_type); +} + +module_init(nft_meta_bridge_module_init); +module_exit(nft_meta_bridge_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>"); +MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "meta"); diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 4dca159435c..edbca468fa7 100644 --- 
a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -22,6 +22,7 @@ #include <net/pkt_sched.h> #include <net/caif/caif_device.h> #include <net/caif/caif_layer.h> +#include <net/caif/caif_dev.h> #include <net/caif/cfpkt.h> #include <net/caif/cfcnfg.h> #include <net/caif/cfserl.h> diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index d6be3edb7a4..e8437094d15 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -124,7 +124,6 @@ static void caif_flow_ctrl(struct sock *sk, int mode) static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err; - int skb_len; unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); @@ -153,14 +152,13 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) * may be freed by other threads of control pulling packets * from the queue. */ - skb_len = skb->len; spin_lock_irqsave(&list->lock, flags); if (!sock_flag(sk, SOCK_DEAD)) __skb_queue_tail(list, skb); spin_unlock_irqrestore(&list->lock, flags); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb_len); + sk->sk_data_ready(sk); else kfree_skb(skb); return 0; diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c index 75ed04b78fa..ba02db02290 100644 --- a/net/caif/caif_usb.c +++ b/net/caif/caif_usb.c @@ -14,6 +14,7 @@ #include <linux/mii.h> #include <linux/usb.h> #include <linux/usb/usbnet.h> +#include <linux/etherdevice.h> #include <net/netns/generic.h> #include <net/caif/caif_dev.h> #include <net/caif/caif_layer.h> @@ -105,8 +106,8 @@ static struct cflayer *cfusbl_create(int phyid, u8 ethaddr[ETH_ALEN], * 5-11 source address * 12-13 protocol type */ - memcpy(&this->tx_eth_hdr[ETH_ALEN], braddr, ETH_ALEN); - memcpy(&this->tx_eth_hdr[ETH_ALEN], ethaddr, ETH_ALEN); + ether_addr_copy(&this->tx_eth_hdr[ETH_ALEN], braddr); + ether_addr_copy(&this->tx_eth_hdr[ETH_ALEN], ethaddr); this->tx_eth_hdr[12] = cpu_to_be16(ETH_P_802_EX1) & 0xff; this->tx_eth_hdr[13] = (cpu_to_be16(ETH_P_802_EX1) >> 8) & 0xff; pr_debug("caif ethernet TX-header dst:%pM src:%pM type:%02x%02x\n", diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c index 353f793d1b3..a6e11546305 100644 --- a/net/caif/cfsrvl.c +++ b/net/caif/cfsrvl.c @@ -15,6 +15,7 @@ #include <net/caif/caif_layer.h> #include <net/caif/cfsrvl.h> #include <net/caif/cfpkt.h> +#include <net/caif/caif_dev.h> #define SRVL_CTRL_PKT_SIZE 1 #define SRVL_FLOW_OFF 0x81 diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 7344a8fa1bb..4589ff67bfa 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -285,7 +285,7 @@ static int chnl_net_open(struct net_device *dev) goto error; } - lldev = dev_get_by_index(dev_net(dev), llifindex); + lldev = __dev_get_by_index(dev_net(dev), llifindex); if (lldev == NULL) { pr_debug("no interface?\n"); @@ -307,7 +307,6 @@ static int chnl_net_open(struct net_device *dev) mtu = min_t(int, dev->mtu, lldev->mtu - (headroom + tailroom)); mtu = min_t(int, GPRS_PDP_MTU, mtu); dev_set_mtu(dev, mtu); - dev_put(lldev); if (mtu < 100) { pr_warn("CAIF Interface MTU too small (%d)\n", mtu); diff --git a/net/can/af_can.c b/net/can/af_can.c index d249874a366..ce82337521f 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -57,6 +57,7 @@ #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> +#include <linux/can/skb.h> #include <linux/ratelimit.h> #include <net/net_namespace.h> #include <net/sock.h> @@ -290,7 +291,7 @@ int can_send(struct sk_buff *skb, int loop) return -ENOMEM; } - newskb->sk = 
skb->sk; + can_skb_set_owner(newskb, skb->sk); newskb->ip_summed = CHECKSUM_UNNECESSARY; newskb->pkt_type = PACKET_BROADCAST; } @@ -337,6 +338,29 @@ static struct dev_rcv_lists *find_dev_rcv_lists(struct net_device *dev) } /** + * effhash - hash function for 29 bit CAN identifier reduction + * @can_id: 29 bit CAN identifier + * + * Description: + * To reduce the linear traversal in one linked list of _single_ EFF CAN + * frame subscriptions the 29 bit identifier is mapped to 10 bits. + * (see CAN_EFF_RCV_HASH_BITS definition) + * + * Return: + * Hash value from 0x000 - 0x3FF ( enforced by CAN_EFF_RCV_HASH_BITS mask ) + */ +static unsigned int effhash(canid_t can_id) +{ + unsigned int hash; + + hash = can_id; + hash ^= can_id >> CAN_EFF_RCV_HASH_BITS; + hash ^= can_id >> (2 * CAN_EFF_RCV_HASH_BITS); + + return hash & ((1 << CAN_EFF_RCV_HASH_BITS) - 1); +} + +/** * find_rcv_list - determine optimal filterlist inside device filter struct * @can_id: pointer to CAN identifier of a given can_filter * @mask: pointer to CAN mask of a given can_filter @@ -399,10 +423,8 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask, !(*can_id & CAN_RTR_FLAG)) { if (*can_id & CAN_EFF_FLAG) { - if (*mask == (CAN_EFF_MASK | CAN_EFF_RTR_FLAGS)) { - /* RFC: a future use-case for hash-tables? */ - return &d->rx[RX_EFF]; - } + if (*mask == (CAN_EFF_MASK | CAN_EFF_RTR_FLAGS)) + return &d->rx_eff[effhash(*can_id)]; } else { if (*mask == (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS)) return &d->rx_sff[*can_id]; @@ -631,7 +653,7 @@ static int can_rcv_filter(struct dev_rcv_lists *d, struct sk_buff *skb) return matches; if (can_id & CAN_EFF_FLAG) { - hlist_for_each_entry_rcu(r, &d->rx[RX_EFF], list) { + hlist_for_each_entry_rcu(r, &d->rx_eff[effhash(can_id)], list) { if (r->can_id == can_id) { deliver(skb, r); matches++; diff --git a/net/can/af_can.h b/net/can/af_can.h index 6de58b40535..fca0fe9fc45 100644 --- a/net/can/af_can.h +++ b/net/can/af_can.h @@ -59,12 +59,17 @@ struct receiver { char *ident; }; -enum { RX_ERR, RX_ALL, RX_FIL, RX_INV, RX_EFF, RX_MAX }; +#define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS) +#define CAN_EFF_RCV_HASH_BITS 10 +#define CAN_EFF_RCV_ARRAY_SZ (1 << CAN_EFF_RCV_HASH_BITS) + +enum { RX_ERR, RX_ALL, RX_FIL, RX_INV, RX_MAX }; /* per device receive filters linked at dev->ml_priv */ struct dev_rcv_lists { struct hlist_head rx[RX_MAX]; - struct hlist_head rx_sff[0x800]; + struct hlist_head rx_sff[CAN_SFF_RCV_ARRAY_SZ]; + struct hlist_head rx_eff[CAN_EFF_RCV_ARRAY_SZ]; int remove_on_zero_entries; int entries; }; diff --git a/net/can/bcm.c b/net/can/bcm.c index 46f20bfafc0..dcb75c0e66c 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -268,7 +268,7 @@ static void bcm_can_tx(struct bcm_op *op) /* send with loopback */ skb->dev = dev; - skb->sk = op->sk; + can_skb_set_owner(skb, op->sk); can_send(skb, 1); /* update statistics */ @@ -1223,7 +1223,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk) can_skb_prv(skb)->ifindex = dev->ifindex; skb->dev = dev; - skb->sk = sk; + can_skb_set_owner(skb, sk); err = can_send(skb, 1); /* send with loopback */ dev_put(dev); @@ -1256,8 +1256,7 @@ static int bcm_sendmsg(struct kiocb *iocb, struct socket *sock, if (!ifindex && msg->msg_name) { /* no bound device as default => check msg_name */ - struct sockaddr_can *addr = - (struct sockaddr_can *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name); if (msg->msg_namelen < sizeof(*addr)) return -EINVAL; @@ -1568,6 +1567,7 @@ static int bcm_recvmsg(struct 
kiocb *iocb, struct socket *sock, sock_recv_ts_and_drops(msg, sk, skb); if (msg->msg_name) { + __sockaddr_check_size(sizeof(struct sockaddr_can)); msg->msg_namelen = sizeof(struct sockaddr_can); memcpy(msg->msg_name, skb->cb, msg->msg_namelen); } diff --git a/net/can/gw.c b/net/can/gw.c index 3f9b0f3a281..050a2110d43 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -804,7 +804,7 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) u8 limhops = 0; int err = 0; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (nlmsg_len(nlh) < sizeof(*r)) @@ -839,23 +839,21 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) if (!gwj->ccgw.src_idx || !gwj->ccgw.dst_idx) goto out; - gwj->src.dev = dev_get_by_index(&init_net, gwj->ccgw.src_idx); + gwj->src.dev = __dev_get_by_index(&init_net, gwj->ccgw.src_idx); if (!gwj->src.dev) goto out; - /* check for CAN netdev not using header_ops - see gw_rcv() */ - if (gwj->src.dev->type != ARPHRD_CAN || gwj->src.dev->header_ops) - goto put_src_out; + if (gwj->src.dev->type != ARPHRD_CAN) + goto out; - gwj->dst.dev = dev_get_by_index(&init_net, gwj->ccgw.dst_idx); + gwj->dst.dev = __dev_get_by_index(&init_net, gwj->ccgw.dst_idx); if (!gwj->dst.dev) - goto put_src_out; + goto out; - /* check for CAN netdev not using header_ops - see gw_rcv() */ - if (gwj->dst.dev->type != ARPHRD_CAN || gwj->dst.dev->header_ops) - goto put_src_dst_out; + if (gwj->dst.dev->type != ARPHRD_CAN) + goto out; gwj->limit_hops = limhops; @@ -864,11 +862,6 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) err = cgw_register_filter(gwj); if (!err) hlist_add_head_rcu(&gwj->list, &cgw_list); - -put_src_dst_out: - dev_put(gwj->dst.dev); -put_src_out: - dev_put(gwj->src.dev); out: if (err) kmem_cache_free(cgw_cache, gwj); @@ -900,7 +893,7 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh) u8 limhops = 0; int err = 0; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (nlmsg_len(nlh) < sizeof(*r)) diff --git a/net/can/proc.c b/net/can/proc.c index b543470c8f8..1a19b985a86 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -80,7 +80,6 @@ static const char rx_list_name[][8] = { [RX_ALL] = "rx_all", [RX_FIL] = "rx_fil", [RX_INV] = "rx_inv", - [RX_EFF] = "rx_eff", }; /* @@ -389,25 +388,26 @@ static const struct file_operations can_rcvlist_proc_fops = { .release = single_release, }; -static inline void can_rcvlist_sff_proc_show_one(struct seq_file *m, - struct net_device *dev, - struct dev_rcv_lists *d) +static inline void can_rcvlist_proc_show_array(struct seq_file *m, + struct net_device *dev, + struct hlist_head *rcv_array, + unsigned int rcv_array_sz) { - int i; + unsigned int i; int all_empty = 1; /* check whether at least one list is non-empty */ - for (i = 0; i < 0x800; i++) - if (!hlist_empty(&d->rx_sff[i])) { + for (i = 0; i < rcv_array_sz; i++) + if (!hlist_empty(&rcv_array[i])) { all_empty = 0; break; } if (!all_empty) { can_print_recv_banner(m); - for (i = 0; i < 0x800; i++) { - if (!hlist_empty(&d->rx_sff[i])) - can_print_rcvlist(m, &d->rx_sff[i], dev); + for (i = 0; i < rcv_array_sz; i++) { + if (!hlist_empty(&rcv_array[i])) + can_print_rcvlist(m, &rcv_array[i], dev); } } else seq_printf(m, " (%s: no entry)\n", DNAME(dev)); @@ -425,12 +425,15 @@ static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v) /* sff receive list for 'all' CAN devices (dev == NULL) */ d = &can_rx_alldev_list; - can_rcvlist_sff_proc_show_one(m, 
NULL, d); + can_rcvlist_proc_show_array(m, NULL, d->rx_sff, ARRAY_SIZE(d->rx_sff)); /* sff receive list for registered CAN devices */ for_each_netdev_rcu(&init_net, dev) { - if (dev->type == ARPHRD_CAN && dev->ml_priv) - can_rcvlist_sff_proc_show_one(m, dev, dev->ml_priv); + if (dev->type == ARPHRD_CAN && dev->ml_priv) { + d = dev->ml_priv; + can_rcvlist_proc_show_array(m, dev, d->rx_sff, + ARRAY_SIZE(d->rx_sff)); + } } rcu_read_unlock(); @@ -452,6 +455,49 @@ static const struct file_operations can_rcvlist_sff_proc_fops = { .release = single_release, }; + +static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v) +{ + struct net_device *dev; + struct dev_rcv_lists *d; + + /* RX_EFF */ + seq_puts(m, "\nreceive list 'rx_eff':\n"); + + rcu_read_lock(); + + /* eff receive list for 'all' CAN devices (dev == NULL) */ + d = &can_rx_alldev_list; + can_rcvlist_proc_show_array(m, NULL, d->rx_eff, ARRAY_SIZE(d->rx_eff)); + + /* eff receive list for registered CAN devices */ + for_each_netdev_rcu(&init_net, dev) { + if (dev->type == ARPHRD_CAN && dev->ml_priv) { + d = dev->ml_priv; + can_rcvlist_proc_show_array(m, dev, d->rx_eff, + ARRAY_SIZE(d->rx_eff)); + } + } + + rcu_read_unlock(); + + seq_putc(m, '\n'); + return 0; +} + +static int can_rcvlist_eff_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, can_rcvlist_eff_proc_show, NULL); +} + +static const struct file_operations can_rcvlist_eff_proc_fops = { + .owner = THIS_MODULE, + .open = can_rcvlist_eff_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /* * proc utility functions */ @@ -491,8 +537,8 @@ void can_init_proc(void) &can_rcvlist_proc_fops, (void *)RX_FIL); pde_rcvlist_inv = proc_create_data(CAN_PROC_RCVLIST_INV, 0644, can_dir, &can_rcvlist_proc_fops, (void *)RX_INV); - pde_rcvlist_eff = proc_create_data(CAN_PROC_RCVLIST_EFF, 0644, can_dir, - &can_rcvlist_proc_fops, (void *)RX_EFF); + pde_rcvlist_eff = proc_create(CAN_PROC_RCVLIST_EFF, 0644, can_dir, + &can_rcvlist_eff_proc_fops); pde_rcvlist_sff = proc_create(CAN_PROC_RCVLIST_SFF, 0644, can_dir, &can_rcvlist_sff_proc_fops); } diff --git a/net/can/raw.c b/net/can/raw.c index 641e1c89512..081e81fd017 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -121,13 +121,9 @@ static void raw_rcv(struct sk_buff *oskb, void *data) if (!ro->recv_own_msgs && oskb->sk == sk) return; - /* do not pass frames with DLC > 8 to a legacy socket */ - if (!ro->fd_frames) { - struct canfd_frame *cfd = (struct canfd_frame *)oskb->data; - - if (unlikely(cfd->len > CAN_MAX_DLEN)) - return; - } + /* do not pass non-CAN2.0 frames to a legacy socket */ + if (!ro->fd_frames && oskb->len != CAN_MTU) + return; /* clone the given skb to be able to enqueue it into the rcv queue */ skb = skb_clone(oskb, GFP_ATOMIC); @@ -675,8 +671,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct socket *sock, int err; if (msg->msg_name) { - struct sockaddr_can *addr = - (struct sockaddr_can *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name); if (msg->msg_namelen < sizeof(*addr)) return -EINVAL; @@ -716,6 +711,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct socket *sock, skb->dev = dev; skb->sk = sk; + skb->priority = sk->sk_priority; err = can_send(skb, ro->loopback); @@ -738,9 +734,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; - struct raw_sock *ro = raw_sk(sk); struct sk_buff *skb; - int rxmtu; int err = 0; int noblock; @@ 
-751,20 +745,10 @@ static int raw_recvmsg(struct kiocb *iocb, struct socket *sock, if (!skb) return err; - /* - * when serving a legacy socket the DLC <= 8 is already checked inside - * raw_rcv(). Now check if we need to pass a canfd_frame to a legacy - * socket and cut the possible CANFD_MTU/CAN_MTU length to CAN_MTU - */ - if (!ro->fd_frames) - rxmtu = CAN_MTU; - else - rxmtu = skb->len; - - if (size < rxmtu) + if (size < skb->len) msg->msg_flags |= MSG_TRUNC; else - size = rxmtu; + size = skb->len; err = memcpy_toiovec(msg->msg_iov, skb->data, size); if (err < 0) { @@ -775,6 +759,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct socket *sock, sock_recv_ts_and_drops(msg, sk, skb); if (msg->msg_name) { + __sockaddr_check_size(sizeof(struct sockaddr_can)); msg->msg_namelen = sizeof(struct sockaddr_can); memcpy(msg->msg_name, skb->cb, msg->msg_namelen); } diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c index bf3e6a13c21..621b5f65407 100644 --- a/net/ceph/buffer.c +++ b/net/ceph/buffer.c @@ -6,6 +6,7 @@ #include <linux/ceph/buffer.h> #include <linux/ceph/decode.h> +#include <linux/ceph/libceph.h> /* for ceph_kv{malloc,free} */ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) { @@ -15,16 +16,10 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) if (!b) return NULL; - b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN); - if (b->vec.iov_base) { - b->is_vmalloc = false; - } else { - b->vec.iov_base = __vmalloc(len, gfp | __GFP_HIGHMEM, PAGE_KERNEL); - if (!b->vec.iov_base) { - kfree(b); - return NULL; - } - b->is_vmalloc = true; + b->vec.iov_base = ceph_kvmalloc(len, gfp); + if (!b->vec.iov_base) { + kfree(b); + return NULL; } kref_init(&b->kref); @@ -40,12 +35,7 @@ void ceph_buffer_release(struct kref *kref) struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref); dout("buffer_release %p\n", b); - if (b->vec.iov_base) { - if (b->is_vmalloc) - vfree(b->vec.iov_base); - else - kfree(b->vec.iov_base); - } + ceph_kvfree(b->vec.iov_base); kfree(b); } EXPORT_SYMBOL(ceph_buffer_release); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 34b11ee8124..1675021d8c1 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -15,6 +15,7 @@ #include <linux/slab.h> #include <linux/statfs.h> #include <linux/string.h> +#include <linux/vmalloc.h> #include <linux/nsproxy.h> #include <net/net_namespace.h> @@ -71,6 +72,8 @@ const char *ceph_msg_type_name(int type) case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack"; case CEPH_MSG_STATFS: return "statfs"; case CEPH_MSG_STATFS_REPLY: return "statfs_reply"; + case CEPH_MSG_MON_GET_VERSION: return "mon_get_version"; + case CEPH_MSG_MON_GET_VERSION_REPLY: return "mon_get_version_reply"; case CEPH_MSG_MDS_MAP: return "mds_map"; case CEPH_MSG_CLIENT_SESSION: return "client_session"; case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect"; @@ -170,6 +173,25 @@ int ceph_compare_options(struct ceph_options *new_opt, } EXPORT_SYMBOL(ceph_compare_options); +void *ceph_kvmalloc(size_t size, gfp_t flags) +{ + if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + void *ptr = kmalloc(size, flags | __GFP_NOWARN); + if (ptr) + return ptr; + } + + return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); +} + +void ceph_kvfree(const void *ptr) +{ + if (is_vmalloc_addr(ptr)) + vfree(ptr); + else + kfree(ptr); +} + static int parse_fsid(const char *str, struct ceph_fsid *fsid) { @@ -461,8 +483,8 @@ EXPORT_SYMBOL(ceph_client_id); * create a fresh client instance */ struct ceph_client 
*ceph_create_client(struct ceph_options *opt, void *private, - unsigned int supported_features, - unsigned int required_features) + u64 supported_features, + u64 required_features) { struct ceph_client *client; struct ceph_entity_addr *myaddr = NULL; diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 089613234f0..16bc199d9a6 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -116,11 +116,14 @@ void crush_destroy(struct crush_map *map) if (map->rules) { __u32 b; for (b = 0; b < map->max_rules; b++) - kfree(map->rules[b]); + crush_destroy_rule(map->rules[b]); kfree(map->rules); } kfree(map); } - +void crush_destroy_rule(struct crush_rule *rule) +{ + kfree(rule); +} diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index cbd06a91941..a1ef53c0441 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -189,7 +189,7 @@ static int terminal(int x) static int bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r) { - int n, l; + int n; __u32 w; __u64 t; @@ -197,6 +197,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket, n = bucket->num_nodes >> 1; while (!terminal(n)) { + int l; /* pick point in [0, w) */ w = bucket->node_weights[n]; t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r, @@ -264,8 +265,12 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r) * true if device is marked "out" (failed, fully offloaded) * of the cluster */ -static int is_out(const struct crush_map *map, const __u32 *weight, int item, int x) +static int is_out(const struct crush_map *map, + const __u32 *weight, int weight_max, + int item, int x) { + if (item >= weight_max) + return 1; if (weight[item] >= 0x10000) return 0; if (weight[item] == 0) @@ -277,7 +282,7 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in } /** - * crush_choose - choose numrep distinct items of given type + * crush_choose_firstn - choose numrep distinct items of given type * @map: the crush_map * @bucket: the bucket we are choose an item from * @x: crush input value @@ -285,18 +290,28 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in * @type: the type of item to choose * @out: pointer to output vector * @outpos: our position in that vector - * @firstn: true if choosing "first n" items, false if choosing "indep" - * @recurse_to_leaf: true if we want one device under each item of given type - * @descend_once: true if we should only try one descent before giving up + * @tries: number of attempts to make + * @recurse_tries: number of attempts to have recursive chooseleaf make + * @local_retries: localized retries + * @local_fallback_retries: localized fallback retries + * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) + * @vary_r: pass r to recursive calls * @out2: second output vector for leaf items (if @recurse_to_leaf) + * @parent_r: r value passed from the parent */ -static int crush_choose(const struct crush_map *map, - struct crush_bucket *bucket, - const __u32 *weight, - int x, int numrep, int type, - int *out, int outpos, - int firstn, int recurse_to_leaf, - int descend_once, int *out2) +static int crush_choose_firstn(const struct crush_map *map, + struct crush_bucket *bucket, + const __u32 *weight, int weight_max, + int x, int numrep, int type, + int *out, int outpos, + unsigned int tries, + unsigned int recurse_tries, + unsigned int local_retries, + unsigned int local_fallback_retries, + int recurse_to_leaf, + 
unsigned int vary_r, + int *out2, + int parent_r) { int rep; unsigned int ftotal, flocal; @@ -308,8 +323,11 @@ static int crush_choose(const struct crush_map *map, int itemtype; int collide, reject; - dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", - bucket->id, x, outpos, numrep); + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", + recurse_to_leaf ? "_LEAF" : "", + bucket->id, x, outpos, numrep, + tries, recurse_tries, local_retries, local_fallback_retries, + parent_r); for (rep = outpos; rep < numrep; rep++) { /* keep trying until we get a non-out, non-colliding item */ @@ -324,36 +342,18 @@ static int crush_choose(const struct crush_map *map, do { collide = 0; retry_bucket = 0; - r = rep; - if (in->alg == CRUSH_BUCKET_UNIFORM) { - /* be careful */ - if (firstn || (__u32)numrep >= in->size) - /* r' = r + f_total */ - r += ftotal; - else if (in->size % numrep == 0) - /* r'=r+(n+1)*f_local */ - r += (numrep+1) * - (flocal+ftotal); - else - /* r' = r + n*f_local */ - r += numrep * (flocal+ftotal); - } else { - if (firstn) - /* r' = r + f_total */ - r += ftotal; - else - /* r' = r + n*f_local */ - r += numrep * (flocal+ftotal); - } + r = rep + parent_r; + /* r' = r + f_total */ + r += ftotal; /* bucket choose */ if (in->size == 0) { reject = 1; goto reject; } - if (map->choose_local_fallback_tries > 0 && + if (local_fallback_retries > 0 && flocal >= (in->size>>1) && - flocal > map->choose_local_fallback_tries) + flocal > local_fallback_retries) item = bucket_perm_choose(in, x, r); else item = crush_bucket_choose(in, x, r); @@ -394,14 +394,23 @@ static int crush_choose(const struct crush_map *map, reject = 0; if (!collide && recurse_to_leaf) { if (item < 0) { - if (crush_choose(map, + int sub_r; + if (vary_r) + sub_r = r >> (vary_r-1); + else + sub_r = 0; + if (crush_choose_firstn(map, map->buckets[-1-item], - weight, + weight, weight_max, x, outpos+1, 0, out2, outpos, - firstn, 0, - map->chooseleaf_descend_once, - NULL) <= outpos) + recurse_tries, 0, + local_retries, + local_fallback_retries, + 0, + vary_r, + NULL, + sub_r) <= outpos) /* didn't get leaf */ reject = 1; } else { @@ -414,6 +423,7 @@ static int crush_choose(const struct crush_map *map, /* out? 
*/ if (itemtype == 0) reject = is_out(map, weight, + weight_max, item, x); else reject = 0; @@ -424,17 +434,14 @@ reject: ftotal++; flocal++; - if (reject && descend_once) - /* let outer call try again */ - skip_rep = 1; - else if (collide && flocal <= map->choose_local_tries) + if (collide && flocal <= local_retries) /* retry locally a few times */ retry_bucket = 1; - else if (map->choose_local_fallback_tries > 0 && - flocal <= in->size + map->choose_local_fallback_tries) + else if (local_fallback_retries > 0 && + flocal <= in->size + local_fallback_retries) /* exhaustive bucket search */ retry_bucket = 1; - else if (ftotal <= map->choose_total_tries) + else if (ftotal < tries) /* then retry descent */ retry_descent = 1; else @@ -464,21 +471,179 @@ reject: /** + * crush_choose_indep: alternative breadth-first positionally stable mapping + * + */ +static void crush_choose_indep(const struct crush_map *map, + struct crush_bucket *bucket, + const __u32 *weight, int weight_max, + int x, int left, int numrep, int type, + int *out, int outpos, + unsigned int tries, + unsigned int recurse_tries, + int recurse_to_leaf, + int *out2, + int parent_r) +{ + struct crush_bucket *in = bucket; + int endpos = outpos + left; + int rep; + unsigned int ftotal; + int r; + int i; + int item = 0; + int itemtype; + int collide; + + dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", + bucket->id, x, outpos, numrep); + + /* initially my result is undefined */ + for (rep = outpos; rep < endpos; rep++) { + out[rep] = CRUSH_ITEM_UNDEF; + if (out2) + out2[rep] = CRUSH_ITEM_UNDEF; + } + + for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) { + for (rep = outpos; rep < endpos; rep++) { + if (out[rep] != CRUSH_ITEM_UNDEF) + continue; + + in = bucket; /* initial bucket */ + + /* choose through intervening buckets */ + for (;;) { + /* note: we base the choice on the position + * even in the nested call. that means that + * if the first layer chooses the same bucket + * in a different position, we will tend to + * choose a different item in that bucket. + * this will involve more devices in data + * movement and tend to distribute the load. + */ + r = rep + parent_r; + + /* be careful */ + if (in->alg == CRUSH_BUCKET_UNIFORM && + in->size % numrep == 0) + /* r'=r+(n+1)*f_total */ + r += (numrep+1) * ftotal; + else + /* r' = r + n*f_total */ + r += numrep * ftotal; + + /* bucket choose */ + if (in->size == 0) { + dprintk(" empty bucket\n"); + break; + } + + item = crush_bucket_choose(in, x, r); + if (item >= map->max_devices) { + dprintk(" bad item %d\n", item); + out[rep] = CRUSH_ITEM_NONE; + if (out2) + out2[rep] = CRUSH_ITEM_NONE; + left--; + break; + } + + /* desired type? */ + if (item < 0) + itemtype = map->buckets[-1-item]->type; + else + itemtype = 0; + dprintk(" item %d type %d\n", item, itemtype); + + /* keep going? */ + if (itemtype != type) { + if (item >= 0 || + (-1-item) >= map->max_buckets) { + dprintk(" bad item type %d\n", type); + out[rep] = CRUSH_ITEM_NONE; + if (out2) + out2[rep] = + CRUSH_ITEM_NONE; + left--; + break; + } + in = map->buckets[-1-item]; + continue; + } + + /* collision? 
*/ + collide = 0; + for (i = outpos; i < endpos; i++) { + if (out[i] == item) { + collide = 1; + break; + } + } + if (collide) + break; + + if (recurse_to_leaf) { + if (item < 0) { + crush_choose_indep(map, + map->buckets[-1-item], + weight, weight_max, + x, 1, numrep, 0, + out2, rep, + recurse_tries, 0, + 0, NULL, r); + if (out2[rep] == CRUSH_ITEM_NONE) { + /* placed nothing; no leaf */ + break; + } + } else { + /* we already have a leaf! */ + out2[rep] = item; + } + } + + /* out? */ + if (itemtype == 0 && + is_out(map, weight, weight_max, item, x)) + break; + + /* yay! */ + out[rep] = item; + left--; + break; + } + } + } + for (rep = outpos; rep < endpos; rep++) { + if (out[rep] == CRUSH_ITEM_UNDEF) { + out[rep] = CRUSH_ITEM_NONE; + } + if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) { + out2[rep] = CRUSH_ITEM_NONE; + } + } +} + +/** * crush_do_rule - calculate a mapping with the given input and rule * @map: the crush_map * @ruleno: the rule id * @x: hash input * @result: pointer to result vector * @result_max: maximum result size + * @weight: weight vector (for map leaves) + * @weight_max: size of weight vector + * @scratch: scratch vector for private use; must be >= 3 * result_max */ int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, - const __u32 *weight) + const __u32 *weight, int weight_max, + int *scratch) { int result_len; - int a[CRUSH_MAX_SET]; - int b[CRUSH_MAX_SET]; - int c[CRUSH_MAX_SET]; + int *a = scratch; + int *b = scratch + result_max; + int *c = scratch + result_max*2; int recurse_to_leaf; int *w; int wsize = 0; @@ -489,8 +654,20 @@ int crush_do_rule(const struct crush_map *map, __u32 step; int i, j; int numrep; - int firstn; - const int descend_once = 0; + /* + * the original choose_total_tries value was off by one (it + * counted "retries" and not "tries"). add one. 
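The hunks above split the old crush_choose() into a "first n" variant and a positionally stable "indep" variant; in both, the replica rank r is offset by the accumulated failure count (r' = r + f_total) and the choice is retried until a non-colliding, non-rejected item is found or the try budget runs out. Below is a minimal userspace sketch of just that retry-and-collision skeleton for the first-n path; pick_item(), TOY_BUCKET_SIZE and TOY_TRIES are invented stand-ins (the real code hashes via crush_bucket_choose() and also rejects "out" items by weight, which is omitted here).

#include <stdio.h>
#include <stdint.h>

#define TOY_BUCKET_SIZE 7
#define TOY_TRIES       19   /* plays the role of choose_total_tries + 1 */

/* stand-in for crush_bucket_choose(): any stable hash of (x, r) works */
static int pick_item(int x, int r)
{
        uint32_t h = (uint32_t)(x * 2654435761u) ^ (uint32_t)(r * 40503u);
        return (int)(h % TOY_BUCKET_SIZE);
}

/* choose numrep distinct items, "first n" style */
static int choose_firstn(int x, int numrep, int *out)
{
        int outpos = 0;

        for (int rep = 0; rep < numrep; rep++) {
                unsigned int ftotal = 0;
                int item = -1, collide;

                do {
                        /* r' = r + f_total: shift the rank on every failure */
                        int r = rep + (int)ftotal;

                        item = pick_item(x, r);

                        collide = 0;
                        for (int i = 0; i < outpos; i++)
                                if (out[i] == item)
                                        collide = 1;

                        ftotal++;
                } while (collide && ftotal < TOY_TRIES);

                if (collide)
                        continue;       /* give up on this replica slot */
                out[outpos++] = item;
        }
        return outpos;
}

int main(void)
{
        int out[3];
        int n = choose_firstn(0x2a, 3, out);

        for (int i = 0; i < n; i++)
                printf("replica %d -> item %d\n", i, out[i]);
        return 0;
}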
+ */ + int choose_tries = map->choose_total_tries + 1; + int choose_leaf_tries = 0; + /* + * the local tries values were counted as "retries", though, + * and need no adjustment + */ + int choose_local_retries = map->choose_local_tries; + int choose_local_fallback_retries = map->choose_local_fallback_tries; + + int vary_r = map->chooseleaf_vary_r; if ((__u32)ruleno >= map->max_rules) { dprintk(" bad ruleno %d\n", ruleno); @@ -503,29 +680,54 @@ int crush_do_rule(const struct crush_map *map, o = b; for (step = 0; step < rule->len; step++) { + int firstn = 0; struct crush_rule_step *curstep = &rule->steps[step]; - firstn = 0; switch (curstep->op) { case CRUSH_RULE_TAKE: w[0] = curstep->arg1; wsize = 1; break; - case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: + case CRUSH_RULE_SET_CHOOSE_TRIES: + if (curstep->arg1 > 0) + choose_tries = curstep->arg1; + break; + + case CRUSH_RULE_SET_CHOOSELEAF_TRIES: + if (curstep->arg1 > 0) + choose_leaf_tries = curstep->arg1; + break; + + case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: + if (curstep->arg1 >= 0) + choose_local_retries = curstep->arg1; + break; + + case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: + if (curstep->arg1 >= 0) + choose_local_fallback_retries = curstep->arg1; + break; + + case CRUSH_RULE_SET_CHOOSELEAF_VARY_R: + if (curstep->arg1 >= 0) + vary_r = curstep->arg1; + break; + + case CRUSH_RULE_CHOOSELEAF_FIRSTN: case CRUSH_RULE_CHOOSE_FIRSTN: firstn = 1; /* fall through */ - case CRUSH_RULE_CHOOSE_LEAF_INDEP: + case CRUSH_RULE_CHOOSELEAF_INDEP: case CRUSH_RULE_CHOOSE_INDEP: if (wsize == 0) break; recurse_to_leaf = curstep->op == - CRUSH_RULE_CHOOSE_LEAF_FIRSTN || + CRUSH_RULE_CHOOSELEAF_FIRSTN || curstep->op == - CRUSH_RULE_CHOOSE_LEAF_INDEP; + CRUSH_RULE_CHOOSELEAF_INDEP; /* reset output */ osize = 0; @@ -543,22 +745,53 @@ int crush_do_rule(const struct crush_map *map, continue; } j = 0; - osize += crush_choose(map, - map->buckets[-1-w[i]], - weight, - x, numrep, - curstep->arg2, - o+osize, j, - firstn, - recurse_to_leaf, - descend_once, c+osize); + if (firstn) { + int recurse_tries; + if (choose_leaf_tries) + recurse_tries = + choose_leaf_tries; + else if (map->chooseleaf_descend_once) + recurse_tries = 1; + else + recurse_tries = choose_tries; + osize += crush_choose_firstn( + map, + map->buckets[-1-w[i]], + weight, weight_max, + x, numrep, + curstep->arg2, + o+osize, j, + choose_tries, + recurse_tries, + choose_local_retries, + choose_local_fallback_retries, + recurse_to_leaf, + vary_r, + c+osize, + 0); + } else { + crush_choose_indep( + map, + map->buckets[-1-w[i]], + weight, weight_max, + x, numrep, numrep, + curstep->arg2, + o+osize, j, + choose_tries, + choose_leaf_tries ? + choose_leaf_tries : 1, + recurse_to_leaf, + c+osize, + 0); + osize += numrep; + } } if (recurse_to_leaf) /* copy final _leaf_ values to output set */ memcpy(o, c, osize*sizeof(*o)); - /* swap t and w arrays */ + /* swap o and w arrays */ tmp = o; o = w; w = tmp; diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 83661cdc076..d1a62c69a9f 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -53,34 +53,55 @@ static int osdmap_show(struct seq_file *s, void *p) { int i; struct ceph_client *client = s->private; + struct ceph_osdmap *map = client->osdc.osdmap; struct rb_node *n; - if (client->osdc.osdmap == NULL) + if (map == NULL) return 0; - seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch); + + seq_printf(s, "epoch %d\n", map->epoch); seq_printf(s, "flags%s%s\n", - (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ? 
- " NEARFULL" : "", - (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? - " FULL" : ""); - for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { + (map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "", + (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : ""); + + for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { struct ceph_pg_pool_info *pool = rb_entry(n, struct ceph_pg_pool_info, node); - seq_printf(s, "pg_pool %llu pg_num %d / %d\n", - (unsigned long long)pool->id, pool->pg_num, - pool->pg_num_mask); + + seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n", + pool->id, pool->pg_num, pool->pg_num_mask, + pool->read_tier, pool->write_tier); } - for (i = 0; i < client->osdc.osdmap->max_osd; i++) { - struct ceph_entity_addr *addr = - &client->osdc.osdmap->osd_addr[i]; - int state = client->osdc.osdmap->osd_state[i]; + for (i = 0; i < map->max_osd; i++) { + struct ceph_entity_addr *addr = &map->osd_addr[i]; + int state = map->osd_state[i]; char sb[64]; - seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n", + seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n", i, ceph_pr_addr(&addr->in_addr), - ((client->osdc.osdmap->osd_weight[i]*100) >> 16), - ceph_osdmap_state_str(sb, sizeof(sb), state)); + ((map->osd_weight[i]*100) >> 16), + ceph_osdmap_state_str(sb, sizeof(sb), state), + ((ceph_get_primary_affinity(map, i)*100) >> 16)); + } + for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) { + struct ceph_pg_mapping *pg = + rb_entry(n, struct ceph_pg_mapping, node); + + seq_printf(s, "pg_temp %llu.%x [", pg->pgid.pool, + pg->pgid.seed); + for (i = 0; i < pg->pg_temp.len; i++) + seq_printf(s, "%s%d", (i == 0 ? "" : ","), + pg->pg_temp.osds[i]); + seq_printf(s, "]\n"); } + for (n = rb_first(&map->primary_temp); n; n = rb_next(n)) { + struct ceph_pg_mapping *pg = + rb_entry(n, struct ceph_pg_mapping, node); + + seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool, + pg->pgid.seed, pg->primary_temp.osd); + } + return 0; } @@ -105,9 +126,13 @@ static int monc_show(struct seq_file *s, void *p) req = rb_entry(rp, struct ceph_mon_generic_request, node); op = le16_to_cpu(req->request->hdr.type); if (op == CEPH_MSG_STATFS) - seq_printf(s, "%lld statfs\n", req->tid); + seq_printf(s, "%llu statfs\n", req->tid); + else if (op == CEPH_MSG_POOLOP) + seq_printf(s, "%llu poolop\n", req->tid); + else if (op == CEPH_MSG_MON_GET_VERSION) + seq_printf(s, "%llu mon_get_version", req->tid); else - seq_printf(s, "%lld unknown\n", req->tid); + seq_printf(s, "%llu unknown\n", req->tid); } mutex_unlock(&monc->mutex); @@ -132,7 +157,8 @@ static int osdc_show(struct seq_file *s, void *pp) req->r_osd ? 
req->r_osd->o_osd : -1, req->r_pgid.pool, req->r_pgid.seed); - seq_printf(s, "%.*s", req->r_oid_len, req->r_oid); + seq_printf(s, "%.*s", req->r_base_oid.name_len, + req->r_base_oid.name); if (req->r_reassert_version.epoch) seq_printf(s, "\t%u'%llu", diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 4a5df7b1cc9..1948d592aa5 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -15,6 +15,7 @@ #include <linux/dns_resolver.h> #include <net/tcp.h> +#include <linux/ceph/ceph_features.h> #include <linux/ceph/libceph.h> #include <linux/ceph/messenger.h> #include <linux/ceph/decode.h> @@ -382,7 +383,7 @@ static void con_sock_state_closed(struct ceph_connection *con) */ /* data available on socket, or listen socket received a connect */ -static void ceph_sock_data_ready(struct sock *sk, int count_unused) +static void ceph_sock_data_ready(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; if (atomic_read(&con->msgr->stopping)) { @@ -556,7 +557,7 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, return r; } -static int ceph_tcp_sendpage(struct socket *sock, struct page *page, +static int __ceph_tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, bool more) { int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR); @@ -569,6 +570,24 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page, return ret; } +static int ceph_tcp_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, bool more) +{ + int ret; + struct kvec iov; + + /* sendpage cannot properly handle pages with page_count == 0, + * we need to fallback to sendmsg if that's the case */ + if (page_count(page) >= 1) + return __ceph_tcp_sendpage(sock, page, offset, size, more); + + iov.iov_base = kmap(page) + offset; + iov.iov_len = size; + ret = ceph_tcp_sendmsg(sock, &iov, 1, size, more); + kunmap(page); + + return ret; +} /* * Shutdown/close the socket for the given connection. 
@@ -777,13 +796,12 @@ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, bio = data->bio; BUG_ON(!bio); - BUG_ON(!bio->bi_vcnt); cursor->resid = min(length, data->bio_length); cursor->bio = bio; - cursor->vector_index = 0; - cursor->vector_offset = 0; - cursor->last_piece = length <= bio->bi_io_vec[0].bv_len; + cursor->bvec_iter = bio->bi_iter; + cursor->last_piece = + cursor->resid <= bio_iter_len(bio, cursor->bvec_iter); } static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, @@ -792,71 +810,67 @@ static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, { struct ceph_msg_data *data = cursor->data; struct bio *bio; - struct bio_vec *bio_vec; - unsigned int index; + struct bio_vec bio_vec; BUG_ON(data->type != CEPH_MSG_DATA_BIO); bio = cursor->bio; BUG_ON(!bio); - index = cursor->vector_index; - BUG_ON(index >= (unsigned int) bio->bi_vcnt); + bio_vec = bio_iter_iovec(bio, cursor->bvec_iter); - bio_vec = &bio->bi_io_vec[index]; - BUG_ON(cursor->vector_offset >= bio_vec->bv_len); - *page_offset = (size_t) (bio_vec->bv_offset + cursor->vector_offset); + *page_offset = (size_t) bio_vec.bv_offset; BUG_ON(*page_offset >= PAGE_SIZE); if (cursor->last_piece) /* pagelist offset is always 0 */ *length = cursor->resid; else - *length = (size_t) (bio_vec->bv_len - cursor->vector_offset); + *length = (size_t) bio_vec.bv_len; BUG_ON(*length > cursor->resid); BUG_ON(*page_offset + *length > PAGE_SIZE); - return bio_vec->bv_page; + return bio_vec.bv_page; } static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { struct bio *bio; - struct bio_vec *bio_vec; - unsigned int index; + struct bio_vec bio_vec; BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO); bio = cursor->bio; BUG_ON(!bio); - index = cursor->vector_index; - BUG_ON(index >= (unsigned int) bio->bi_vcnt); - bio_vec = &bio->bi_io_vec[index]; + bio_vec = bio_iter_iovec(bio, cursor->bvec_iter); /* Advance the cursor offset */ BUG_ON(cursor->resid < bytes); cursor->resid -= bytes; - cursor->vector_offset += bytes; - if (cursor->vector_offset < bio_vec->bv_len) + + bio_advance_iter(bio, &cursor->bvec_iter, bytes); + + if (bytes < bio_vec.bv_len) return false; /* more bytes to process in this segment */ - BUG_ON(cursor->vector_offset != bio_vec->bv_len); /* Move on to the next segment, and possibly the next bio */ - if (++index == (unsigned int) bio->bi_vcnt) { + if (!cursor->bvec_iter.bi_size) { bio = bio->bi_next; - index = 0; + cursor->bio = bio; + if (bio) + cursor->bvec_iter = bio->bi_iter; + else + memset(&cursor->bvec_iter, 0, + sizeof(cursor->bvec_iter)); } - cursor->bio = bio; - cursor->vector_index = index; - cursor->vector_offset = 0; if (!cursor->last_piece) { BUG_ON(!cursor->resid); BUG_ON(!bio); /* A short read is OK, so use <= rather than == */ - if (cursor->resid <= bio->bi_io_vec[index].bv_len) + if (cursor->resid <= bio_iter_len(bio, cursor->bvec_iter)) cursor->last_piece = true; } @@ -923,6 +937,9 @@ static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor, if (!bytes || cursor->page_offset) return false; /* more bytes to process in the current page */ + if (!cursor->resid) + return false; /* no more data */ + /* Move on to the next page; offset is already at 0 */ BUG_ON(cursor->page_index >= cursor->page_count); @@ -1008,6 +1025,9 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, if (!bytes || cursor->offset & ~PAGE_MASK) return false; /* more bytes to process in the current 
page */ + if (!cursor->resid) + return false; /* no more data */ + /* Move on to the next page */ BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); @@ -1865,7 +1885,9 @@ int ceph_parse_ips(const char *c, const char *end, port = (port * 10) + (*p - '0'); p++; } - if (port > 65535 || port == 0) + if (port == 0) + port = CEPH_MON_PORT; + else if (port > 65535) goto bad; } else { port = CEPH_MON_PORT; @@ -1945,7 +1967,8 @@ static int process_connect(struct ceph_connection *con) { u64 sup_feat = con->msgr->supported_features; u64 req_feat = con->msgr->required_features; - u64 server_feat = le64_to_cpu(con->in_reply.features); + u64 server_feat = ceph_sanitize_features( + le64_to_cpu(con->in_reply.features)); int ret; dout("process_connect on %p tag %d\n", con, (int)con->in_tag); @@ -2853,8 +2876,8 @@ static void con_fault(struct ceph_connection *con) */ void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr, - u32 supported_features, - u32 required_features, + u64 supported_features, + u64 required_features, bool nocrc) { msgr->supported_features = supported_features; @@ -3126,15 +3149,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, INIT_LIST_HEAD(&m->data); /* front */ - m->front_max = front_len; if (front_len) { - if (front_len > PAGE_CACHE_SIZE) { - m->front.iov_base = __vmalloc(front_len, flags, - PAGE_KERNEL); - m->front_is_vmalloc = true; - } else { - m->front.iov_base = kmalloc(front_len, flags); - } + m->front.iov_base = ceph_kvmalloc(front_len, flags); if (m->front.iov_base == NULL) { dout("ceph_msg_new can't allocate %d bytes\n", front_len); @@ -3143,7 +3159,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, } else { m->front.iov_base = NULL; } - m->front.iov_len = front_len; + m->front_alloc_len = m->front.iov_len = front_len; dout("ceph_msg_new %p front %d\n", m, front_len); return m; @@ -3256,10 +3272,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) void ceph_msg_kfree(struct ceph_msg *m) { dout("msg_kfree %p\n", m); - if (m->front_is_vmalloc) - vfree(m->front.iov_base); - else - kfree(m->front.iov_base); + ceph_kvfree(m->front.iov_base); kmem_cache_free(ceph_msg_cache, m); } @@ -3301,8 +3314,8 @@ EXPORT_SYMBOL(ceph_msg_last_put); void ceph_msg_dump(struct ceph_msg *msg) { - pr_debug("msg_dump %p (front_max %d length %zd)\n", msg, - msg->front_max, msg->data_length); + pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg, + msg->front_alloc_len, msg->data_length); print_hex_dump(KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 16, 1, &msg->hdr, sizeof(msg->hdr), true); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 1fe25cd29d0..067d3af2eaf 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -152,7 +152,7 @@ static int __open_session(struct ceph_mon_client *monc) /* initiatiate authentication handshake */ ret = ceph_auth_build_hello(monc->auth, monc->m_auth->front.iov_base, - monc->m_auth->front_max); + monc->m_auth->front_alloc_len); __send_prepared_auth_request(monc, ret); } else { dout("open_session mon%d already open\n", monc->cur_mon); @@ -196,7 +196,7 @@ static void __send_subscribe(struct ceph_mon_client *monc) int num; p = msg->front.iov_base; - end = p + msg->front_max; + end = p + msg->front_alloc_len; num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap; ceph_encode_32(&p, num); @@ -296,6 +296,33 @@ void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) __send_subscribe(monc); mutex_unlock(&monc->mutex); } 
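ceph_msg_new() and ceph_msg_kfree() above now go through ceph_kvmalloc()/ceph_kvfree(): try a contiguous kmalloc for small-to-medium sizes and fall back to vmalloc otherwise, then free with whichever release matches the address. A rough userspace analog of that two-tier policy follows; the 32 KiB cutoff mirrors PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER on 4 KiB-page systems, and the from_small_pool flag replaces is_vmalloc_addr(), which has no userspace equivalent. All names here are illustrative only.

#include <stdlib.h>
#include <stdio.h>

#define SMALL_POOL_MAX (4096u << 3)     /* ~PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER */

struct kvbuf {
        void *ptr;
        int from_small_pool;    /* kernel code uses is_vmalloc_addr() instead */
};

/* try the "cheap" allocator first, fall back to the "big" one */
static int kvbuf_alloc(struct kvbuf *b, size_t size)
{
        if (size <= SMALL_POOL_MAX) {
                b->ptr = malloc(size);          /* stands in for kmalloc */
                if (b->ptr) {
                        b->from_small_pool = 1;
                        return 0;
                }
        }
        b->ptr = calloc(1, size);               /* stands in for __vmalloc */
        b->from_small_pool = 0;
        return b->ptr ? 0 : -1;
}

static void kvbuf_free(struct kvbuf *b)
{
        /* the kernel picks vfree() vs kfree() by address range; userspace free() covers both */
        free(b->ptr);
        b->ptr = NULL;
}

int main(void)
{
        struct kvbuf small, big;

        if (kvbuf_alloc(&small, 512) == 0 && kvbuf_alloc(&big, 1 << 20) == 0)
                printf("small from_small_pool=%d, big from_small_pool=%d\n",
                       small.from_small_pool, big.from_small_pool);
        kvbuf_free(&small);
        kvbuf_free(&big);
        return 0;
}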
+EXPORT_SYMBOL(ceph_monc_request_next_osdmap); + +int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, + unsigned long timeout) +{ + unsigned long started = jiffies; + int ret; + + mutex_lock(&monc->mutex); + while (monc->have_osdmap < epoch) { + mutex_unlock(&monc->mutex); + + if (timeout != 0 && time_after_eq(jiffies, started + timeout)) + return -ETIMEDOUT; + + ret = wait_event_interruptible_timeout(monc->client->auth_wq, + monc->have_osdmap >= epoch, timeout); + if (ret < 0) + return ret; + + mutex_lock(&monc->mutex); + } + + mutex_unlock(&monc->mutex); + return 0; +} +EXPORT_SYMBOL(ceph_monc_wait_osdmap); /* * @@ -477,14 +504,13 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con, return m; } -static int do_generic_request(struct ceph_mon_client *monc, - struct ceph_mon_generic_request *req) +static int __do_generic_request(struct ceph_mon_client *monc, u64 tid, + struct ceph_mon_generic_request *req) { int err; /* register request */ - mutex_lock(&monc->mutex); - req->tid = ++monc->last_tid; + req->tid = tid != 0 ? tid : ++monc->last_tid; req->request->hdr.tid = cpu_to_le64(req->tid); __insert_generic_request(monc, req); monc->num_generic_requests++; @@ -496,13 +522,24 @@ static int do_generic_request(struct ceph_mon_client *monc, mutex_lock(&monc->mutex); rb_erase(&req->node, &monc->generic_request_tree); monc->num_generic_requests--; - mutex_unlock(&monc->mutex); if (!err) err = req->result; return err; } +static int do_generic_request(struct ceph_mon_client *monc, + struct ceph_mon_generic_request *req) +{ + int err; + + mutex_lock(&monc->mutex); + err = __do_generic_request(monc, 0, req); + mutex_unlock(&monc->mutex); + + return err; +} + /* * statfs */ @@ -579,6 +616,96 @@ out: } EXPORT_SYMBOL(ceph_monc_do_statfs); +static void handle_get_version_reply(struct ceph_mon_client *monc, + struct ceph_msg *msg) +{ + struct ceph_mon_generic_request *req; + u64 tid = le64_to_cpu(msg->hdr.tid); + void *p = msg->front.iov_base; + void *end = p + msg->front_alloc_len; + u64 handle; + + dout("%s %p tid %llu\n", __func__, msg, tid); + + ceph_decode_need(&p, end, 2*sizeof(u64), bad); + handle = ceph_decode_64(&p); + if (tid != 0 && tid != handle) + goto bad; + + mutex_lock(&monc->mutex); + req = __lookup_generic_req(monc, handle); + if (req) { + *(u64 *)req->buf = ceph_decode_64(&p); + req->result = 0; + get_generic_request(req); + } + mutex_unlock(&monc->mutex); + if (req) { + complete_all(&req->completion); + put_generic_request(req); + } + + return; +bad: + pr_err("corrupt mon_get_version reply\n"); + ceph_msg_dump(msg); +} + +/* + * Send MMonGetVersion and wait for the reply. 
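ceph_monc_wait_osdmap() above loops under the monitor client mutex, sleeping in wait_event_interruptible_timeout() until have_osdmap reaches the requested epoch or the deadline passes. The same shape in POSIX threads, offered only as a sketch; the map_state struct and field names are invented for illustration and do not appear in the patch.

#include <pthread.h>
#include <time.h>
#include <errno.h>

struct map_state {
        pthread_mutex_t lock;
        pthread_cond_t  updated;        /* signalled whenever have_epoch changes */
        unsigned int    have_epoch;
};

/* return 0 once have_epoch >= want, or a negative errno (e.g. -ETIMEDOUT) */
static int wait_for_epoch(struct map_state *ms, unsigned int want,
                          const struct timespec *deadline)
{
        int ret = 0;

        pthread_mutex_lock(&ms->lock);
        while (ms->have_epoch < want) {
                /* re-check the condition after every wakeup, like the kernel loop */
                int err = pthread_cond_timedwait(&ms->updated, &ms->lock, deadline);
                if (err) {
                        ret = -err;
                        break;
                }
        }
        pthread_mutex_unlock(&ms->lock);
        return ret;
}

static void *bump(void *arg)
{
        struct map_state *ms = arg;

        pthread_mutex_lock(&ms->lock);
        ms->have_epoch = 42;
        pthread_cond_broadcast(&ms->updated);
        pthread_mutex_unlock(&ms->lock);
        return NULL;
}

int main(void)
{
        struct map_state ms = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .updated = PTHREAD_COND_INITIALIZER,
                .have_epoch = 0,
        };
        struct timespec deadline;
        pthread_t t;

        clock_gettime(CLOCK_REALTIME, &deadline);
        deadline.tv_sec += 5;

        pthread_create(&t, NULL, bump, &ms);
        int ret = wait_for_epoch(&ms, 42, &deadline);
        pthread_join(t, NULL);
        return ret ? 1 : 0;
}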
+ * + * @what: one of "mdsmap", "osdmap" or "monmap" + */ +int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what, + u64 *newest) +{ + struct ceph_mon_generic_request *req; + void *p, *end; + u64 tid; + int err; + + req = kzalloc(sizeof(*req), GFP_NOFS); + if (!req) + return -ENOMEM; + + kref_init(&req->kref); + req->buf = newest; + req->buf_len = sizeof(*newest); + init_completion(&req->completion); + + req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION, + sizeof(u64) + sizeof(u32) + strlen(what), + GFP_NOFS, true); + if (!req->request) { + err = -ENOMEM; + goto out; + } + + req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024, + GFP_NOFS, true); + if (!req->reply) { + err = -ENOMEM; + goto out; + } + + p = req->request->front.iov_base; + end = p + req->request->front_alloc_len; + + /* fill out request */ + mutex_lock(&monc->mutex); + tid = ++monc->last_tid; + ceph_encode_64(&p, tid); /* handle */ + ceph_encode_string(&p, end, what, strlen(what)); + + err = __do_generic_request(monc, tid, req); + + mutex_unlock(&monc->mutex); +out: + kref_put(&req->kref, release_generic_request); + return err; +} +EXPORT_SYMBOL(ceph_monc_do_get_version); + /* * pool ops */ @@ -897,7 +1024,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc, ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, msg->front.iov_len, monc->m_auth->front.iov_base, - monc->m_auth->front_max); + monc->m_auth->front_alloc_len); if (ret < 0) { monc->client->auth_err = ret; wake_up_all(&monc->client->auth_wq); @@ -939,7 +1066,7 @@ static int __validate_auth(struct ceph_mon_client *monc) return 0; ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base, - monc->m_auth->front_max); + monc->m_auth->front_alloc_len); if (ret <= 0) return ret; /* either an error, or no need to authenticate */ __send_prepared_auth_request(monc, ret); @@ -981,6 +1108,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) handle_statfs_reply(monc, msg); break; + case CEPH_MSG_MON_GET_VERSION_REPLY: + handle_get_version_reply(monc, msg); + break; + case CEPH_MSG_POOLOP_REPLY: handle_poolop_reply(monc, msg); break; @@ -1029,6 +1160,15 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, case CEPH_MSG_AUTH_REPLY: m = ceph_msg_get(monc->m_auth_reply); break; + case CEPH_MSG_MON_GET_VERSION_REPLY: + if (le64_to_cpu(hdr->tid) != 0) + return get_generic_reply(con, hdr, skip); + + /* + * Older OSDs don't set reply tid even if the orignal + * request had a non-zero tid. Workaround this weirdness + * by falling through to the allocate case. 
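ceph_monc_do_get_version() above fills the request front with a 64-bit handle (the tid) followed by a length-prefixed string naming the map, via ceph_encode_64() and ceph_encode_string(). A minimal little-endian encoder with the same layout, purely illustrative (the helper names and buffer handling are not from the patch):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

static void put_le64(unsigned char **p, uint64_t v)
{
        for (int i = 0; i < 8; i++)
                *(*p)++ = (unsigned char)(v >> (8 * i));
}

static void put_le32(unsigned char **p, uint32_t v)
{
        for (int i = 0; i < 4; i++)
                *(*p)++ = (unsigned char)(v >> (8 * i));
}

/* layout: u64 handle, then u32 length + raw bytes of the string */
static size_t encode_get_version(unsigned char *buf, uint64_t tid, const char *what)
{
        unsigned char *p = buf;
        uint32_t len = (uint32_t)strlen(what);

        put_le64(&p, tid);      /* handle: echoed back in the reply */
        put_le32(&p, len);
        memcpy(p, what, len);
        p += len;

        return (size_t)(p - buf);
}

int main(void)
{
        unsigned char buf[64];
        size_t n = encode_get_version(buf, 7, "osdmap");

        printf("encoded %zu bytes\n", n);
        return 0;
}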
+ */ case CEPH_MSG_MON_MAP: case CEPH_MSG_MDS_MAP: case CEPH_MSG_OSD_MAP: diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 2b4b32aaa89..05be0c18169 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -338,7 +338,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, msg_size = 4 + 4 + 8 + 8 + 4+8; msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ msg_size += 1 + 8 + 4 + 4; /* pg_t */ - msg_size += 4 + MAX_OBJ_NAME_SIZE; + msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ msg_size += 2 + num_ops*sizeof(struct ceph_osd_op); msg_size += 8; /* snapid */ msg_size += 8; /* snap_seq */ @@ -368,6 +368,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, INIT_LIST_HEAD(&req->r_req_lru_item); INIT_LIST_HEAD(&req->r_osd_item); + req->r_base_oloc.pool = -1; + req->r_target_oloc.pool = -1; + /* create reply message */ if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); @@ -433,6 +436,7 @@ static bool osd_req_opcode_valid(u16 opcode) case CEPH_OSD_OP_OMAPCLEAR: case CEPH_OSD_OP_OMAPRMKEYS: case CEPH_OSD_OP_OMAP_CMP: + case CEPH_OSD_OP_SETALLOCHINT: case CEPH_OSD_OP_CLONERANGE: case CEPH_OSD_OP_ASSERT_SRC_VERSION: case CEPH_OSD_OP_SRC_CMPXATTR: @@ -588,6 +592,26 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_watch_init); +void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, + unsigned int which, + u64 expected_object_size, + u64 expected_write_size) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, + CEPH_OSD_OP_SETALLOCHINT); + + op->alloc_hint.expected_object_size = expected_object_size; + op->alloc_hint.expected_write_size = expected_write_size; + + /* + * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed + * not worth a feature bit. Set FAILOK per-op flag to make + * sure older osds don't trip over an unsupported opcode. 
+ */ + op->flags |= CEPH_OSD_OP_FLAG_FAILOK; +} +EXPORT_SYMBOL(osd_req_op_alloc_hint_init); + static void ceph_osdc_msg_data_add(struct ceph_msg *msg, struct ceph_osd_data *osd_data) { @@ -678,6 +702,12 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->watch.ver = cpu_to_le64(src->watch.ver); dst->watch.flag = src->watch.flag; break; + case CEPH_OSD_OP_SETALLOCHINT: + dst->alloc_hint.expected_object_size = + cpu_to_le64(src->alloc_hint.expected_object_size); + dst->alloc_hint.expected_write_size = + cpu_to_le64(src->alloc_hint.expected_write_size); + break; default: pr_err("unsupported osd opcode %s\n", ceph_osd_op_name(src->op)); @@ -685,7 +715,9 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, return 0; } + dst->op = cpu_to_le16(src->op); + dst->flags = cpu_to_le32(src->flags); dst->payload_len = cpu_to_le32(src->payload_len); return request_data_len; @@ -761,11 +793,11 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (num_ops > 1) osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); - req->r_file_layout = *layout; /* keep a copy */ + req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); - snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", - vino.ino, objnum); - req->r_oid_len = strlen(req->r_oid); + snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name), + "%llx.%08llx", vino.ino, objnum); + req->r_base_oid.name_len = strlen(req->r_base_oid.name); return req; } @@ -1044,8 +1076,8 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) !ceph_con_opened(&osd->o_con)) { struct ceph_osd_request *req; - dout(" osd addr hasn't changed and connection never opened," - " letting msgr retry"); + dout("osd addr hasn't changed and connection never opened, " + "letting msgr retry\n"); /* touch each r_stamp for handle_timeout()'s benfit */ list_for_each_entry(req, &osd->o_requests, r_osd_item) req->r_stamp = jiffies; @@ -1232,6 +1264,61 @@ void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, EXPORT_SYMBOL(ceph_osdc_set_request_linger); /* + * Returns whether a request should be blocked from being sent + * based on the current osdmap and osd_client settings. + * + * Caller should hold map_sem for read. + */ +static bool __req_should_be_paused(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); + bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); + return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) || + (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); +} + +/* + * Calculate mapping of a request to a PG. Takes tiering into account. 
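__req_should_be_paused() above holds a read back while PAUSERD is set and holds a write back while PAUSEWR or FULL is set. The predicate reduces to a few flag tests; here it is isolated as a plain C function. The bit values are illustrative only, the real ones live in the Ceph headers.

#include <stdbool.h>
#include <stdio.h>

/* illustrative bit values; not the real CEPH_OSD_FLAG_*/CEPH_OSDMAP_* constants */
#define REQ_READ        0x01
#define REQ_WRITE       0x02
#define MAP_PAUSERD     0x04
#define MAP_PAUSEWR     0x08
#define MAP_FULL        0x10

static bool req_should_be_paused(unsigned req_flags, unsigned map_flags)
{
        bool pauserd = map_flags & MAP_PAUSERD;
        bool pausewr = (map_flags & MAP_PAUSEWR) || (map_flags & MAP_FULL);

        return ((req_flags & REQ_READ) && pauserd) ||
               ((req_flags & REQ_WRITE) && pausewr);
}

int main(void)
{
        printf("write vs FULL map: %d\n",
               req_should_be_paused(REQ_WRITE, MAP_FULL));      /* 1: held back */
        printf("read vs FULL map:  %d\n",
               req_should_be_paused(REQ_READ, MAP_FULL));       /* 0: reads still flow */
        return 0;
}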
+ */ +static int __calc_request_pg(struct ceph_osdmap *osdmap, + struct ceph_osd_request *req, + struct ceph_pg *pg_out) +{ + bool need_check_tiering; + + need_check_tiering = false; + if (req->r_target_oloc.pool == -1) { + req->r_target_oloc = req->r_base_oloc; /* struct */ + need_check_tiering = true; + } + if (req->r_target_oid.name_len == 0) { + ceph_oid_copy(&req->r_target_oid, &req->r_base_oid); + need_check_tiering = true; + } + + if (need_check_tiering && + (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { + struct ceph_pg_pool_info *pi; + + pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool); + if (pi) { + if ((req->r_flags & CEPH_OSD_FLAG_READ) && + pi->read_tier >= 0) + req->r_target_oloc.pool = pi->read_tier; + if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && + pi->write_tier >= 0) + req->r_target_oloc.pool = pi->write_tier; + } + /* !pi is caught in ceph_oloc_oid_to_pg() */ + } + + return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc, + &req->r_target_oid, pg_out); +} + +/* * Pick an osd (the first 'up' osd in the pg), allocate the osd struct * (as needed), and set the request r_osd appropriately. If there is * no up osd, set r_osd to NULL. Move the request to the appropriate list @@ -1246,30 +1333,35 @@ static int __map_request(struct ceph_osd_client *osdc, { struct ceph_pg pgid; int acting[CEPH_PG_MAX_SIZE]; - int o = -1, num = 0; + int num, o; int err; + bool was_paused; dout("map_request %p tid %lld\n", req, req->r_tid); - err = ceph_calc_ceph_pg(&pgid, req->r_oid, osdc->osdmap, - ceph_file_layout_pg_pool(req->r_file_layout)); + + err = __calc_request_pg(osdc->osdmap, req, &pgid); if (err) { list_move(&req->r_req_lru_item, &osdc->req_notarget); return err; } req->r_pgid = pgid; - err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); - if (err > 0) { - o = acting[0]; - num = err; - } + num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o); + if (num < 0) + num = 0; + + was_paused = req->r_paused; + req->r_paused = __req_should_be_paused(osdc, req); + if (was_paused && !req->r_paused) + force_resend = 1; if ((!force_resend && req->r_osd && req->r_osd->o_osd == o && req->r_sent >= req->r_osd->o_incarnation && req->r_num_pg_osds == num && memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || - (req->r_osd == NULL && o == -1)) + (req->r_osd == NULL && o == -1) || + req->r_paused) return 0; /* no change */ dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", @@ -1331,7 +1423,7 @@ static void __send_request(struct ceph_osd_client *osdc, /* fill in message content that changes each time we send it */ put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); put_unaligned_le32(req->r_flags, req->r_request_flags); - put_unaligned_le64(req->r_pgid.pool, req->r_request_pool); + put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool); p = req->r_request_pgid; ceph_encode_64(&p, req->r_pgid.pool); ceph_encode_32(&p, req->r_pgid.seed); @@ -1362,6 +1454,40 @@ static void __send_queued(struct ceph_osd_client *osdc) } /* + * Caller should hold map_sem for read and request_mutex. 
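__calc_request_pg() above resolves the target object locator before hashing to a PG: unless the op carries IGNORE_OVERLAY, reads are redirected to the pool's read_tier and writes to its write_tier whenever those are set (>= 0). A compact sketch of just that pool-selection step, with invented structures and flag values:

#include <stdio.h>
#include <stdint.h>

#define FLAG_READ            0x1
#define FLAG_WRITE           0x2
#define FLAG_IGNORE_OVERLAY  0x4

struct pool_info {
        int64_t id;
        int64_t read_tier;      /* -1 when the pool has no cache tier */
        int64_t write_tier;
};

/* pick the pool the request should actually be sent to */
static int64_t target_pool(const struct pool_info *pi, int64_t base_pool,
                           unsigned flags)
{
        int64_t pool = base_pool;

        if (flags & FLAG_IGNORE_OVERLAY)
                return pool;

        if ((flags & FLAG_READ) && pi->read_tier >= 0)
                pool = pi->read_tier;
        if ((flags & FLAG_WRITE) && pi->write_tier >= 0)
                pool = pi->write_tier;

        return pool;
}

int main(void)
{
        struct pool_info pi = { .id = 3, .read_tier = 10, .write_tier = 11 };

        printf("read  -> pool %lld\n", (long long)target_pool(&pi, 3, FLAG_READ));
        printf("write -> pool %lld\n", (long long)target_pool(&pi, 3, FLAG_WRITE));
        return 0;
}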
+ */ +static int __ceph_osdc_start_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req, + bool nofail) +{ + int rc; + + __register_request(osdc, req); + req->r_sent = 0; + req->r_got_reply = 0; + rc = __map_request(osdc, req, 0); + if (rc < 0) { + if (nofail) { + dout("osdc_start_request failed map, " + " will retry %lld\n", req->r_tid); + rc = 0; + } else { + __unregister_request(osdc, req); + } + return rc; + } + + if (req->r_osd == NULL) { + dout("send_request %p no up osds in pg\n", req); + ceph_monc_request_next_osdmap(&osdc->client->monc); + } else { + __send_queued(osdc); + } + + return 0; +} + +/* * Timeout callback, called every N seconds when 1 or more osd * requests has been active for more than N seconds. When this * happens, we ping all OSDs with requests who have timed out to @@ -1432,6 +1558,109 @@ static void handle_osds_timeout(struct work_struct *work) round_jiffies_relative(delay)); } +static int ceph_oloc_decode(void **p, void *end, + struct ceph_object_locator *oloc) +{ + u8 struct_v, struct_cv; + u32 len; + void *struct_end; + int ret = 0; + + ceph_decode_need(p, end, 1 + 1 + 4, e_inval); + struct_v = ceph_decode_8(p); + struct_cv = ceph_decode_8(p); + if (struct_v < 3) { + pr_warn("got v %d < 3 cv %d of ceph_object_locator\n", + struct_v, struct_cv); + goto e_inval; + } + if (struct_cv > 6) { + pr_warn("got v %d cv %d > 6 of ceph_object_locator\n", + struct_v, struct_cv); + goto e_inval; + } + len = ceph_decode_32(p); + ceph_decode_need(p, end, len, e_inval); + struct_end = *p + len; + + oloc->pool = ceph_decode_64(p); + *p += 4; /* skip preferred */ + + len = ceph_decode_32(p); + if (len > 0) { + pr_warn("ceph_object_locator::key is set\n"); + goto e_inval; + } + + if (struct_v >= 5) { + len = ceph_decode_32(p); + if (len > 0) { + pr_warn("ceph_object_locator::nspace is set\n"); + goto e_inval; + } + } + + if (struct_v >= 6) { + s64 hash = ceph_decode_64(p); + if (hash != -1) { + pr_warn("ceph_object_locator::hash is set\n"); + goto e_inval; + } + } + + /* skip the rest */ + *p = struct_end; +out: + return ret; + +e_inval: + ret = -EINVAL; + goto out; +} + +static int ceph_redirect_decode(void **p, void *end, + struct ceph_request_redirect *redir) +{ + u8 struct_v, struct_cv; + u32 len; + void *struct_end; + int ret; + + ceph_decode_need(p, end, 1 + 1 + 4, e_inval); + struct_v = ceph_decode_8(p); + struct_cv = ceph_decode_8(p); + if (struct_cv > 1) { + pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n", + struct_v, struct_cv); + goto e_inval; + } + len = ceph_decode_32(p); + ceph_decode_need(p, end, len, e_inval); + struct_end = *p + len; + + ret = ceph_oloc_decode(p, end, &redir->oloc); + if (ret) + goto out; + + len = ceph_decode_32(p); + if (len > 0) { + pr_warn("ceph_request_redirect::object_name is set\n"); + goto e_inval; + } + + len = ceph_decode_32(p); + *p += len; /* skip osd_instructions */ + + /* skip the rest */ + *p = struct_end; +out: + return ret; + +e_inval: + ret = -EINVAL; + goto out; +} + static void complete_request(struct ceph_osd_request *req) { complete_all(&req->r_safe_completion); /* fsync waiter */ @@ -1446,6 +1675,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, { void *p, *end; struct ceph_osd_request *req; + struct ceph_request_redirect redir; u64 tid; int object_len; unsigned int numops; @@ -1484,6 +1714,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, osdmap_epoch = ceph_decode_32(&p); /* lookup */ + down_read(&osdc->map_sem); 
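ceph_oloc_decode() and ceph_redirect_decode() above follow the usual Ceph wire envelope: a struct version byte, a compat version byte, a u32 byte length, then the payload, and anything the decoder does not understand is skipped by jumping to struct_end. A stripped-down decoder for that envelope on a flat little-endian buffer; the function names and the payload layout in main() are invented for the example.

#include <stdint.h>
#include <stdio.h>

static uint32_t get_le32(const unsigned char **p)
{
        uint32_t v = (uint32_t)(*p)[0] | (uint32_t)(*p)[1] << 8 |
                     (uint32_t)(*p)[2] << 16 | (uint32_t)(*p)[3] << 24;
        *p += 4;
        return v;
}

/* decode header: v, compat, len; refuse envelopes newer than we support */
static int decode_envelope(const unsigned char **p, const unsigned char *end,
                           unsigned max_compat, const unsigned char **struct_end)
{
        if (end - *p < 1 + 1 + 4)
                return -1;

        unsigned char v = *(*p)++;
        unsigned char compat = *(*p)++;
        uint32_t len = get_le32(p);

        (void)v;
        if (compat > max_compat)
                return -1;              /* encoder requires more than we implement */
        if ((size_t)(end - *p) < len)
                return -1;              /* truncated buffer */

        *struct_end = *p + len;         /* caller skips here when done */
        return 0;
}

int main(void)
{
        /* v=3, compat=1, len=4, then 4 payload bytes */
        unsigned char buf[] = { 3, 1, 4, 0, 0, 0, 0xde, 0xad, 0xbe, 0xef };
        const unsigned char *p = buf, *end = buf + sizeof(buf), *struct_end;

        if (decode_envelope(&p, end, 1, &struct_end) == 0) {
                /* ...parse the fields we know from p..., then skip the rest */
                p = struct_end;
                printf("consumed %zu bytes\n", (size_t)(p - buf));
        }
        return 0;
}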
mutex_lock(&osdc->request_mutex); req = __lookup_request(osdc, tid); if (req == NULL) { @@ -1525,10 +1756,40 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, for (i = 0; i < numops; i++) req->r_reply_op_result[i] = ceph_decode_32(&p); - already_completed = req->r_got_reply; + if (le16_to_cpu(msg->hdr.version) >= 6) { + p += 8 + 4; /* skip replay_version */ + p += 8; /* skip user_version */ - if (!req->r_got_reply) { + err = ceph_redirect_decode(&p, end, &redir); + if (err) + goto bad_put; + } else { + redir.oloc.pool = -1; + } + if (redir.oloc.pool != -1) { + dout("redirect pool %lld\n", redir.oloc.pool); + + __unregister_request(osdc, req); + + req->r_target_oloc = redir.oloc; /* struct */ + + /* + * Start redirect requests with nofail=true. If + * mapping fails, request will end up on the notarget + * list, waiting for the new osdmap (which can take + * a while), even though the original request mapped + * successfully. In the future we might want to follow + * original request's nofail setting here. + */ + err = __ceph_osdc_start_request(osdc, req, true); + BUG_ON(err); + + goto out_unlock; + } + + already_completed = req->r_got_reply; + if (!req->r_got_reply) { req->r_result = result; dout("handle_reply result %d bytes %d\n", req->r_result, bytes); @@ -1542,8 +1803,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, req->r_got_reply = 1; } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { dout("handle_reply tid %llu dup ack\n", tid); - mutex_unlock(&osdc->request_mutex); - goto done; + goto out_unlock; } dout("handle_reply tid %llu flags %d\n", tid, flags); @@ -1558,6 +1818,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, __unregister_request(osdc, req); mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); if (!already_completed) { if (req->r_unsafe_callback && @@ -1575,15 +1836,27 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, complete_request(req); } -done: +out: dout("req=%p req->r_linger=%d\n", req, req->r_linger); ceph_osdc_put_request(req); return; +out_unlock: + mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); + goto out; bad_put: + req->r_result = -EIO; + __unregister_request(osdc, req); + if (req->r_callback) + req->r_callback(req, msg); + else + complete_all(&req->r_completion); + complete_request(req); ceph_osdc_put_request(req); bad_mutex: mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); bad: pr_err("corrupt osd_op_reply got %d %d\n", (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len)); @@ -1613,14 +1886,17 @@ static void reset_changed_osds(struct ceph_osd_client *osdc) * * Caller should hold map_sem for read. */ -static void kick_requests(struct ceph_osd_client *osdc, int force_resend) +static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, + bool force_resend_writes) { struct ceph_osd_request *req, *nreq; struct rb_node *p; int needmap = 0; int err; + bool force_resend_req; - dout("kick_requests %s\n", force_resend ? " (force resend)" : ""); + dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "", + force_resend_writes ? 
" (force resend writes)" : ""); mutex_lock(&osdc->request_mutex); for (p = rb_first(&osdc->requests); p; ) { req = rb_entry(p, struct ceph_osd_request, r_node); @@ -1645,7 +1921,10 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) continue; } - err = __map_request(osdc, req, force_resend); + force_resend_req = force_resend || + (force_resend_writes && + req->r_flags & CEPH_OSD_FLAG_WRITE); + err = __map_request(osdc, req, force_resend_req); if (err < 0) continue; /* error */ if (req->r_osd == NULL) { @@ -1665,7 +1944,8 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) r_linger_item) { dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); - err = __map_request(osdc, req, force_resend); + err = __map_request(osdc, req, + force_resend || force_resend_writes); dout("__map_request returned %d\n", err); if (err == 0) continue; /* no change and no osd was specified */ @@ -1707,6 +1987,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) struct ceph_osdmap *newmap = NULL, *oldmap; int err; struct ceph_fsid fsid; + bool was_full; dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0); p = msg->front.iov_base; @@ -1720,6 +2001,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) down_write(&osdc->map_sem); + was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); + /* incremental maps */ ceph_decode_32_safe(&p, end, nr_maps, bad); dout(" %d inc maps\n", nr_maps); @@ -1744,7 +2027,10 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) ceph_osdmap_destroy(osdc->osdmap); osdc->osdmap = newmap; } - kick_requests(osdc, 0); + was_full = was_full || + ceph_osdmap_flag(osdc->osdmap, + CEPH_OSDMAP_FULL); + kick_requests(osdc, 0, was_full); } else { dout("ignoring incremental map %u len %d\n", epoch, maplen); @@ -1774,7 +2060,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) int skipped_map = 0; dout("taking full map %u len %d\n", epoch, maplen); - newmap = osdmap_decode(&p, p+maplen); + newmap = ceph_osdmap_decode(&p, p+maplen); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad; @@ -1787,7 +2073,10 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) skipped_map = 1; ceph_osdmap_destroy(oldmap); } - kick_requests(osdc, skipped_map); + was_full = was_full || + ceph_osdmap_flag(osdc->osdmap, + CEPH_OSDMAP_FULL); + kick_requests(osdc, skipped_map, was_full); } p += maplen; nr_maps--; @@ -1804,7 +2093,9 @@ done: * we find out when we are no longer full and stop returning * ENOSPC. 
*/ - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) ceph_monc_request_next_osdmap(&osdc->client->monc); mutex_lock(&osdc->request_mutex); @@ -1818,7 +2109,6 @@ bad: pr_err("osdc handle_map corrupt msg\n"); ceph_msg_dump(msg); up_write(&osdc->map_sem); - return; } /* @@ -2017,7 +2307,6 @@ done_err: bad: pr_err("osdc handle_watch_notify corrupt msg\n"); - return; } /* @@ -2068,10 +2357,11 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, ceph_encode_32(&p, -1); /* preferred */ /* oid */ - ceph_encode_32(&p, req->r_oid_len); - memcpy(p, req->r_oid, req->r_oid_len); - dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len); - p += req->r_oid_len; + ceph_encode_32(&p, req->r_base_oid.name_len); + memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len); + dout("oid '%.*s' len %d\n", req->r_base_oid.name_len, + req->r_base_oid.name, req->r_base_oid.name_len); + p += req->r_base_oid.name_len; /* ops--can imply data */ ceph_encode_16(&p, (u16)req->r_num_ops); @@ -2125,34 +2415,16 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req, bool nofail) { - int rc = 0; + int rc; down_read(&osdc->map_sem); mutex_lock(&osdc->request_mutex); - __register_request(osdc, req); - req->r_sent = 0; - req->r_got_reply = 0; - rc = __map_request(osdc, req, 0); - if (rc < 0) { - if (nofail) { - dout("osdc_start_request failed map, " - " will retry %lld\n", req->r_tid); - rc = 0; - } else { - __unregister_request(osdc, req); - } - goto out_unlock; - } - if (req->r_osd == NULL) { - dout("send_request %p no up osds in pg\n", req); - ceph_monc_request_next_osdmap(&osdc->client->monc); - } else { - __send_queued(osdc); - } - rc = 0; -out_unlock: + + rc = __ceph_osdc_start_request(osdc, req, nofail); + mutex_unlock(&osdc->request_mutex); up_read(&osdc->map_sem); + return rc; } EXPORT_SYMBOL(ceph_osdc_start_request); @@ -2219,7 +2491,7 @@ EXPORT_SYMBOL(ceph_osdc_sync); * Call all pending notify callbacks - for use after a watch is * unregistered, to make sure no more callbacks for it will be invoked */ -extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc) +void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc) { flush_workqueue(osdc->notify_wq); } @@ -2278,9 +2550,12 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) err = -ENOMEM; osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify"); if (!osdc->notify_wq) - goto out_msgpool; + goto out_msgpool_reply; + return 0; +out_msgpool_reply: + ceph_msgpool_destroy(&osdc->msgpool_op_reply); out_msgpool: ceph_msgpool_destroy(&osdc->msgpool_op); out_mempool: @@ -2454,7 +2729,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, struct ceph_osd_client *osdc = osd->o_osdc; struct ceph_msg *m; struct ceph_osd_request *req; - int front = le32_to_cpu(hdr->front_len); + int front_len = le32_to_cpu(hdr->front_len); int data_len = le32_to_cpu(hdr->data_len); u64 tid; @@ -2474,12 +2749,13 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, req->r_reply, req->r_reply->con); ceph_msg_revoke_incoming(req->r_reply); - if (front > req->r_reply->front.iov_len) { + if (front_len > req->r_reply->front_alloc_len) { pr_warning("get_reply front %d > preallocated %d (%u#%llu)\n", - front, (int)req->r_reply->front.iov_len, + front_len, req->r_reply->front_alloc_len, 
(unsigned int)con->peer_name.type, le64_to_cpu(con->peer_name.num)); - m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS, false); + m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, + false); if (!m) goto out; ceph_msg_put(req->r_reply); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index dbd9a479242..c547e46084d 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -329,6 +329,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end) dout("crush decode tunable chooseleaf_descend_once = %d", c->chooseleaf_descend_once); + ceph_decode_need(p, end, sizeof(u8), done); + c->chooseleaf_vary_r = ceph_decode_8(p); + dout("crush decode tunable chooseleaf_vary_r = %d", + c->chooseleaf_vary_r); + done: dout("crush_decode success\n"); return c; @@ -343,7 +348,7 @@ bad: /* * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid - * to a set of osds) + * to a set of osds) and primary_temp (explicit primary setting) */ static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) { @@ -464,6 +469,11 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id) return NULL; } +struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id) +{ + return __lookup_pg_pool(&map->pg_pools, id); +} + const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id) { struct ceph_pg_pool_info *pi; @@ -501,7 +511,7 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) kfree(pi); } -static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) +static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) { u8 ev, cv; unsigned len, num; @@ -514,8 +524,8 @@ static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv); return -EINVAL; } - if (cv > 7) { - pr_warning("got v %d cv %d > 7 of ceph_pg_pool\n", ev, cv); + if (cv > 9) { + pr_warning("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv); return -EINVAL; } len = ceph_decode_32(p); @@ -543,12 +553,34 @@ static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) *p += len; } - /* skip removed snaps */ + /* skip removed_snaps */ num = ceph_decode_32(p); *p += num * (8 + 8); *p += 8; /* skip auid */ pi->flags = ceph_decode_64(p); + *p += 4; /* skip crash_replay_interval */ + + if (ev >= 7) + *p += 1; /* skip min_size */ + + if (ev >= 8) + *p += 8 + 8; /* skip quota_max_* */ + + if (ev >= 9) { + /* skip tiers */ + num = ceph_decode_32(p); + *p += num * 8; + + *p += 8; /* skip tier_of */ + *p += 1; /* skip cache_mode */ + + pi->read_tier = ceph_decode_64(p); + pi->write_tier = ceph_decode_64(p); + } else { + pi->read_tier = -1; + pi->write_tier = -1; + } /* ignore the rest */ @@ -560,7 +592,7 @@ bad: return -EINVAL; } -static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) +static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map) { struct ceph_pg_pool_info *pi; u32 num, len; @@ -606,6 +638,13 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) rb_erase(&pg->node, &map->pg_temp); kfree(pg); } + while (!RB_EMPTY_ROOT(&map->primary_temp)) { + struct ceph_pg_mapping *pg = + rb_entry(rb_first(&map->primary_temp), + struct ceph_pg_mapping, node); + rb_erase(&pg->node, &map->primary_temp); + kfree(pg); + } while (!RB_EMPTY_ROOT(&map->pg_pools)) { struct ceph_pg_pool_info *pi = rb_entry(rb_first(&map->pg_pools), @@ -615,186 +654,516 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) kfree(map->osd_state); 
kfree(map->osd_weight); kfree(map->osd_addr); + kfree(map->osd_primary_affinity); kfree(map); } /* - * adjust max osd value. reallocate arrays. + * Adjust max_osd value, (re)allocate arrays. + * + * The new elements are properly initialized. */ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) { u8 *state; - struct ceph_entity_addr *addr; u32 *weight; + struct ceph_entity_addr *addr; + int i; - state = kcalloc(max, sizeof(*state), GFP_NOFS); - addr = kcalloc(max, sizeof(*addr), GFP_NOFS); - weight = kcalloc(max, sizeof(*weight), GFP_NOFS); - if (state == NULL || addr == NULL || weight == NULL) { + state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); + weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); + addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS); + if (!state || !weight || !addr) { kfree(state); - kfree(addr); kfree(weight); + kfree(addr); + return -ENOMEM; } - /* copy old? */ - if (map->osd_state) { - memcpy(state, map->osd_state, map->max_osd*sizeof(*state)); - memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr)); - memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight)); - kfree(map->osd_state); - kfree(map->osd_addr); - kfree(map->osd_weight); + for (i = map->max_osd; i < max; i++) { + state[i] = 0; + weight[i] = CEPH_OSD_OUT; + memset(addr + i, 0, sizeof(*addr)); } map->osd_state = state; map->osd_weight = weight; map->osd_addr = addr; + + if (map->osd_primary_affinity) { + u32 *affinity; + + affinity = krealloc(map->osd_primary_affinity, + max*sizeof(*affinity), GFP_NOFS); + if (!affinity) + return -ENOMEM; + + for (i = map->max_osd; i < max; i++) + affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + + map->osd_primary_affinity = affinity; + } + map->max_osd = max; + return 0; } +#define OSDMAP_WRAPPER_COMPAT_VER 7 +#define OSDMAP_CLIENT_DATA_COMPAT_VER 1 + /* - * decode a full map. + * Return 0 or error. On success, *v is set to 0 for old (v6) osdmaps, + * to struct_v of the client_data section for new (v7 and above) + * osdmaps. 
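osdmap_set_max_osd() above now grows the per-OSD arrays with krealloc() and explicitly initializes only the newly added tail (state 0, weight marked out, zeroed address, default primary affinity). The same grow-and-initialize pattern in plain C with realloc(); the struct and the simplified error handling are illustrative, not the kernel code.

#include <stdlib.h>
#include <stdio.h>

struct osd_arrays {
        unsigned char *state;
        unsigned int  *weight;
        int            max_osd;
};

static int set_max_osd(struct osd_arrays *m, int max)
{
        unsigned char *state  = realloc(m->state,  max * sizeof(*state));
        unsigned int  *weight = realloc(m->weight, max * sizeof(*weight));

        if (!state || !weight) {
                /* free whichever blocks we still own (new on success, old on failure) */
                free(state ? state : m->state);
                free(weight ? weight : m->weight);
                m->state = NULL;
                m->weight = NULL;
                return -1;
        }

        /* old entries are preserved by realloc; only the tail needs defaults */
        for (int i = m->max_osd; i < max; i++) {
                state[i] = 0;
                weight[i] = 0;          /* new OSDs start out, as in the hunk above */
        }

        m->state = state;
        m->weight = weight;
        m->max_osd = max;
        return 0;
}

int main(void)
{
        struct osd_arrays m = { 0 };

        if (set_max_osd(&m, 4) == 0 && set_max_osd(&m, 8) == 0)
                printf("max_osd=%d weight[7]=0x%x (new osds start out)\n",
                       m.max_osd, m.weight[7]);
        free(m.state);
        free(m.weight);
        return 0;
}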
*/ -struct ceph_osdmap *osdmap_decode(void **p, void *end) +static int get_osdmap_client_data_v(void **p, void *end, + const char *prefix, u8 *v) { - struct ceph_osdmap *map; - u16 version; - u32 len, max, i; - int err = -EINVAL; - void *start = *p; - struct ceph_pg_pool_info *pi; + u8 struct_v; + + ceph_decode_8_safe(p, end, struct_v, e_inval); + if (struct_v >= 7) { + u8 struct_compat; + + ceph_decode_8_safe(p, end, struct_compat, e_inval); + if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) { + pr_warning("got v %d cv %d > %d of %s ceph_osdmap\n", + struct_v, struct_compat, + OSDMAP_WRAPPER_COMPAT_VER, prefix); + return -EINVAL; + } + *p += 4; /* ignore wrapper struct_len */ + + ceph_decode_8_safe(p, end, struct_v, e_inval); + ceph_decode_8_safe(p, end, struct_compat, e_inval); + if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) { + pr_warning("got v %d cv %d > %d of %s ceph_osdmap client data\n", + struct_v, struct_compat, + OSDMAP_CLIENT_DATA_COMPAT_VER, prefix); + return -EINVAL; + } + *p += 4; /* ignore client data struct_len */ + } else { + u16 version; + + *p -= 1; + ceph_decode_16_safe(p, end, version, e_inval); + if (version < 6) { + pr_warning("got v %d < 6 of %s ceph_osdmap\n", version, + prefix); + return -EINVAL; + } - dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); + /* old osdmap enconding */ + struct_v = 0; + } - map = kzalloc(sizeof(*map), GFP_NOFS); - if (map == NULL) - return ERR_PTR(-ENOMEM); - map->pg_temp = RB_ROOT; + *v = struct_v; + return 0; - ceph_decode_16_safe(p, end, version, bad); - if (version > 6) { - pr_warning("got unknown v %d > 6 of osdmap\n", version); - goto bad; +e_inval: + return -EINVAL; +} + +static int __decode_pools(void **p, void *end, struct ceph_osdmap *map, + bool incremental) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + struct ceph_pg_pool_info *pi; + u64 pool; + int ret; + + ceph_decode_64_safe(p, end, pool, e_inval); + + pi = __lookup_pg_pool(&map->pg_pools, pool); + if (!incremental || !pi) { + pi = kzalloc(sizeof(*pi), GFP_NOFS); + if (!pi) + return -ENOMEM; + + pi->id = pool; + + ret = __insert_pg_pool(&map->pg_pools, pi); + if (ret) { + kfree(pi); + return ret; + } + } + + ret = decode_pool(p, end, pi); + if (ret) + return ret; } - if (version < 6) { - pr_warning("got old v %d < 6 of osdmap\n", version); - goto bad; + + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_pools(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_pools(p, end, map, false); +} + +static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_pools(p, end, map, true); +} + +static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map, + bool incremental) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + struct ceph_pg pgid; + u32 len, i; + int ret; + + ret = ceph_decode_pgid(p, end, &pgid); + if (ret) + return ret; + + ceph_decode_32_safe(p, end, len, e_inval); + + ret = __remove_pg_mapping(&map->pg_temp, pgid); + BUG_ON(!incremental && ret != -ENOENT); + + if (!incremental || len > 0) { + struct ceph_pg_mapping *pg; + + ceph_decode_need(p, end, len*sizeof(u32), e_inval); + + if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) + return -EINVAL; + + pg = kzalloc(sizeof(*pg) + len*sizeof(u32), GFP_NOFS); + if (!pg) + return -ENOMEM; + + pg->pgid = pgid; + pg->pg_temp.len = len; + for (i = 0; i < len; i++) + pg->pg_temp.osds[i] = ceph_decode_32(p); + + ret = __insert_pg_mapping(pg, &map->pg_temp); + if (ret) { + 
kfree(pg); + return ret; + } + } + } + + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_pg_temp(p, end, map, false); +} + +static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_pg_temp(p, end, map, true); +} + +static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map, + bool incremental) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + struct ceph_pg pgid; + u32 osd; + int ret; + + ret = ceph_decode_pgid(p, end, &pgid); + if (ret) + return ret; + + ceph_decode_32_safe(p, end, osd, e_inval); + + ret = __remove_pg_mapping(&map->primary_temp, pgid); + BUG_ON(!incremental && ret != -ENOENT); + + if (!incremental || osd != (u32)-1) { + struct ceph_pg_mapping *pg; + + pg = kzalloc(sizeof(*pg), GFP_NOFS); + if (!pg) + return -ENOMEM; + + pg->pgid = pgid; + pg->primary_temp.osd = osd; + + ret = __insert_pg_mapping(pg, &map->primary_temp); + if (ret) { + kfree(pg); + return ret; + } + } } - ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad); + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_primary_temp(p, end, map, false); +} + +static int decode_new_primary_temp(void **p, void *end, + struct ceph_osdmap *map) +{ + return __decode_primary_temp(p, end, map, true); +} + +u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) +{ + BUG_ON(osd >= map->max_osd); + + if (!map->osd_primary_affinity) + return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + + return map->osd_primary_affinity[osd]; +} + +static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) +{ + BUG_ON(osd >= map->max_osd); + + if (!map->osd_primary_affinity) { + int i; + + map->osd_primary_affinity = kmalloc(map->max_osd*sizeof(u32), + GFP_NOFS); + if (!map->osd_primary_affinity) + return -ENOMEM; + + for (i = 0; i < map->max_osd; i++) + map->osd_primary_affinity[i] = + CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + } + + map->osd_primary_affinity[osd] = aff; + + return 0; +} + +static int decode_primary_affinity(void **p, void *end, + struct ceph_osdmap *map) +{ + u32 len, i; + + ceph_decode_32_safe(p, end, len, e_inval); + if (len == 0) { + kfree(map->osd_primary_affinity); + map->osd_primary_affinity = NULL; + return 0; + } + if (len != map->max_osd) + goto e_inval; + + ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval); + + for (i = 0; i < map->max_osd; i++) { + int ret; + + ret = set_primary_affinity(map, i, ceph_decode_32(p)); + if (ret) + return ret; + } + + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_new_primary_affinity(void **p, void *end, + struct ceph_osdmap *map) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + u32 osd, aff; + int ret; + + ceph_decode_32_safe(p, end, osd, e_inval); + ceph_decode_32_safe(p, end, aff, e_inval); + + ret = set_primary_affinity(map, osd, aff); + if (ret) + return ret; + + pr_info("osd%d primary-affinity 0x%x\n", osd, aff); + } + + return 0; + +e_inval: + return -EINVAL; +} + +/* + * decode a full map. 
+ */ +static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) +{ + u8 struct_v; + u32 epoch = 0; + void *start = *p; + u32 max; + u32 len, i; + int err; + + dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); + + err = get_osdmap_client_data_v(p, end, "full", &struct_v); + if (err) + goto bad; + + /* fsid, epoch, created, modified */ + ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) + + sizeof(map->created) + sizeof(map->modified), e_inval); ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); - map->epoch = ceph_decode_32(p); + epoch = map->epoch = ceph_decode_32(p); ceph_decode_copy(p, &map->created, sizeof(map->created)); ceph_decode_copy(p, &map->modified, sizeof(map->modified)); - ceph_decode_32_safe(p, end, max, bad); - while (max--) { - ceph_decode_need(p, end, 8 + 2, bad); - err = -ENOMEM; - pi = kzalloc(sizeof(*pi), GFP_NOFS); - if (!pi) - goto bad; - pi->id = ceph_decode_64(p); - err = __decode_pool(p, end, pi); - if (err < 0) { - kfree(pi); - goto bad; - } - __insert_pg_pool(&map->pg_pools, pi); - } + /* pools */ + err = decode_pools(p, end, map); + if (err) + goto bad; - err = __decode_pool_names(p, end, map); - if (err < 0) { - dout("fail to decode pool names"); + /* pool_name */ + err = decode_pool_names(p, end, map); + if (err) goto bad; - } - ceph_decode_32_safe(p, end, map->pool_max, bad); + ceph_decode_32_safe(p, end, map->pool_max, e_inval); - ceph_decode_32_safe(p, end, map->flags, bad); + ceph_decode_32_safe(p, end, map->flags, e_inval); - max = ceph_decode_32(p); + /* max_osd */ + ceph_decode_32_safe(p, end, max, e_inval); /* (re)alloc osd arrays */ err = osdmap_set_max_osd(map, max); - if (err < 0) + if (err) goto bad; - dout("osdmap_decode max_osd = %d\n", map->max_osd); - /* osds */ - err = -EINVAL; + /* osd_state, osd_weight, osd_addrs->client_addr */ ceph_decode_need(p, end, 3*sizeof(u32) + map->max_osd*(1 + sizeof(*map->osd_weight) + - sizeof(*map->osd_addr)), bad); - *p += 4; /* skip length field (should match max) */ + sizeof(*map->osd_addr)), e_inval); + + if (ceph_decode_32(p) != map->max_osd) + goto e_inval; + ceph_decode_copy(p, map->osd_state, map->max_osd); - *p += 4; /* skip length field (should match max) */ + if (ceph_decode_32(p) != map->max_osd) + goto e_inval; + for (i = 0; i < map->max_osd; i++) map->osd_weight[i] = ceph_decode_32(p); - *p += 4; /* skip length field (should match max) */ + if (ceph_decode_32(p) != map->max_osd) + goto e_inval; + ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr)); for (i = 0; i < map->max_osd; i++) ceph_decode_addr(&map->osd_addr[i]); /* pg_temp */ - ceph_decode_32_safe(p, end, len, bad); - for (i = 0; i < len; i++) { - int n, j; - struct ceph_pg pgid; - struct ceph_pg_mapping *pg; + err = decode_pg_temp(p, end, map); + if (err) + goto bad; - err = ceph_decode_pgid(p, end, &pgid); + /* primary_temp */ + if (struct_v >= 1) { + err = decode_primary_temp(p, end, map); if (err) goto bad; - ceph_decode_need(p, end, sizeof(u32), bad); - n = ceph_decode_32(p); - err = -EINVAL; - if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) - goto bad; - ceph_decode_need(p, end, n * sizeof(u32), bad); - err = -ENOMEM; - pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); - if (!pg) - goto bad; - pg->pgid = pgid; - pg->len = n; - for (j = 0; j < n; j++) - pg->osds[j] = ceph_decode_32(p); + } - err = __insert_pg_mapping(pg, &map->pg_temp); + /* primary_affinity */ + if (struct_v >= 2) { + err = decode_primary_affinity(p, end, map); if (err) goto bad; - dout(" added pg_temp %lld.%x 
len %d\n", pgid.pool, pgid.seed, - len); + } else { + /* XXX can this happen? */ + kfree(map->osd_primary_affinity); + map->osd_primary_affinity = NULL; } /* crush */ - ceph_decode_32_safe(p, end, len, bad); - dout("osdmap_decode crush len %d from off 0x%x\n", len, - (int)(*p - start)); - ceph_decode_need(p, end, len, bad); - map->crush = crush_decode(*p, end); - *p += len; + ceph_decode_32_safe(p, end, len, e_inval); + map->crush = crush_decode(*p, min(*p + len, end)); if (IS_ERR(map->crush)) { err = PTR_ERR(map->crush); map->crush = NULL; goto bad; } + *p += len; - /* ignore the rest of the map */ + /* ignore the rest */ *p = end; - dout("osdmap_decode done %p %p\n", *p, end); - return map; + dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); + return 0; +e_inval: + err = -EINVAL; bad: - dout("osdmap_decode fail err %d\n", err); - ceph_osdmap_destroy(map); - return ERR_PTR(err); + pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n", + err, epoch, (int)(*p - start), *p, start, end); + print_hex_dump(KERN_DEBUG, "osdmap: ", + DUMP_PREFIX_OFFSET, 16, 1, + start, end - start, true); + return err; +} + +/* + * Allocate and decode a full map. + */ +struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) +{ + struct ceph_osdmap *map; + int ret; + + map = kzalloc(sizeof(*map), GFP_NOFS); + if (!map) + return ERR_PTR(-ENOMEM); + + map->pg_temp = RB_ROOT; + map->primary_temp = RB_ROOT; + mutex_init(&map->crush_scratch_mutex); + + ret = osdmap_decode(p, end, map); + if (ret) { + ceph_osdmap_destroy(map); + return ERR_PTR(ret); + } + + return map; } /* @@ -813,17 +1182,18 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, __s64 new_pool_max; __s32 new_flags, max; void *start = *p; - int err = -EINVAL; - u16 version; + int err; + u8 struct_v; + + dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); - ceph_decode_16_safe(p, end, version, bad); - if (version != 6) { - pr_warning("got unknown v %d != 6 of inc osdmap\n", version); + err = get_osdmap_client_data_v(p, end, "inc", &struct_v); + if (err) goto bad; - } - ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32), - bad); + /* fsid, epoch, modified, new_pool_max, new_flags */ + ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) + + sizeof(u64) + sizeof(u32), e_inval); ceph_decode_copy(p, &fsid, sizeof(fsid)); epoch = ceph_decode_32(p); BUG_ON(epoch != map->epoch+1); @@ -832,21 +1202,22 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, new_flags = ceph_decode_32(p); /* full map? */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); if (len > 0) { dout("apply_incremental full map len %d, %p to %p\n", len, *p, end); - return osdmap_decode(p, min(*p+len, end)); + return ceph_osdmap_decode(p, min(*p+len, end)); } /* new crush? */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); if (len > 0) { - dout("apply_incremental new crush map len %d, %p to %p\n", - len, *p, end); newcrush = crush_decode(*p, min(*p+len, end)); - if (IS_ERR(newcrush)) - return ERR_CAST(newcrush); + if (IS_ERR(newcrush)) { + err = PTR_ERR(newcrush); + newcrush = NULL; + goto bad; + } *p += len; } @@ -856,13 +1227,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (new_pool_max >= 0) map->pool_max = new_pool_max; - ceph_decode_need(p, end, 5*sizeof(u32), bad); - /* new max? 
*/ - max = ceph_decode_32(p); + ceph_decode_32_safe(p, end, max, e_inval); if (max >= 0) { err = osdmap_set_max_osd(map, max); - if (err < 0) + if (err) goto bad; } @@ -875,51 +1244,34 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, newcrush = NULL; } - /* new_pool */ - ceph_decode_32_safe(p, end, len, bad); - while (len--) { - struct ceph_pg_pool_info *pi; + /* new_pools */ + err = decode_new_pools(p, end, map); + if (err) + goto bad; - ceph_decode_64_safe(p, end, pool, bad); - pi = __lookup_pg_pool(&map->pg_pools, pool); - if (!pi) { - pi = kzalloc(sizeof(*pi), GFP_NOFS); - if (!pi) { - err = -ENOMEM; - goto bad; - } - pi->id = pool; - __insert_pg_pool(&map->pg_pools, pi); - } - err = __decode_pool(p, end, pi); - if (err < 0) - goto bad; - } - if (version >= 5) { - err = __decode_pool_names(p, end, map); - if (err < 0) - goto bad; - } + /* new_pool_names */ + err = decode_pool_names(p, end, map); + if (err) + goto bad; /* old_pool */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); while (len--) { struct ceph_pg_pool_info *pi; - ceph_decode_64_safe(p, end, pool, bad); + ceph_decode_64_safe(p, end, pool, e_inval); pi = __lookup_pg_pool(&map->pg_pools, pool); if (pi) __remove_pg_pool(&map->pg_pools, pi); } /* new_up */ - err = -EINVAL; - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); while (len--) { u32 osd; struct ceph_entity_addr addr; - ceph_decode_32_safe(p, end, osd, bad); - ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad); + ceph_decode_32_safe(p, end, osd, e_inval); + ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval); ceph_decode_addr(&addr); pr_info("osd%d up\n", osd); BUG_ON(osd >= map->max_osd); @@ -928,11 +1280,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_state */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); while (len--) { u32 osd; u8 xorstate; - ceph_decode_32_safe(p, end, osd, bad); + ceph_decode_32_safe(p, end, osd, e_inval); xorstate = **(u8 **)p; (*p)++; /* clean flag */ if (xorstate == 0) @@ -944,10 +1296,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_weight */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); while (len--) { u32 osd, off; - ceph_decode_need(p, end, sizeof(u32)*2, bad); + ceph_decode_need(p, end, sizeof(u32)*2, e_inval); osd = ceph_decode_32(p); off = ceph_decode_32(p); pr_info("osd%d weight 0x%x %s\n", osd, off, @@ -958,56 +1310,35 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_pg_temp */ - ceph_decode_32_safe(p, end, len, bad); - while (len--) { - struct ceph_pg_mapping *pg; - int j; - struct ceph_pg pgid; - u32 pglen; + err = decode_new_pg_temp(p, end, map); + if (err) + goto bad; - err = ceph_decode_pgid(p, end, &pgid); + /* new_primary_temp */ + if (struct_v >= 1) { + err = decode_new_primary_temp(p, end, map); if (err) goto bad; - ceph_decode_need(p, end, sizeof(u32), bad); - pglen = ceph_decode_32(p); - if (pglen) { - ceph_decode_need(p, end, pglen*sizeof(u32), bad); - - /* removing existing (if any) */ - (void) __remove_pg_mapping(&map->pg_temp, pgid); + } - /* insert */ - err = -EINVAL; - if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) - goto bad; - err = -ENOMEM; - pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); - if (!pg) - goto bad; - pg->pgid = pgid; - pg->len = pglen; - for (j = 0; j < pglen; j++) - pg->osds[j] = ceph_decode_32(p); - err = 
__insert_pg_mapping(pg, &map->pg_temp); - if (err) { - kfree(pg); - goto bad; - } - dout(" added pg_temp %lld.%x len %d\n", pgid.pool, - pgid.seed, pglen); - } else { - /* remove */ - __remove_pg_mapping(&map->pg_temp, pgid); - } + /* new_primary_affinity */ + if (struct_v >= 2) { + err = decode_new_primary_affinity(p, end, map); + if (err) + goto bad; } /* ignore the rest */ *p = end; + + dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); return map; +e_inval: + err = -EINVAL; bad: - pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n", - epoch, (int)(*p - start), *p, start, end); + pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n", + err, epoch, (int)(*p - start), *p, start, end); print_hex_dump(KERN_DEBUG, "osdmap: ", DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); @@ -1090,71 +1421,275 @@ invalid: EXPORT_SYMBOL(ceph_calc_file_object_mapping); /* - * calculate an object layout (i.e. pgid) from an oid, - * file_layout, and osdmap + * Calculate mapping of a (oloc, oid) pair to a PG. Should only be + * called with target's (oloc, oid), since tiering isn't taken into + * account. */ -int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, - struct ceph_osdmap *osdmap, uint64_t pool) +int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, + struct ceph_object_locator *oloc, + struct ceph_object_id *oid, + struct ceph_pg *pg_out) { - struct ceph_pg_pool_info *pool_info; + struct ceph_pg_pool_info *pi; - BUG_ON(!osdmap); - pool_info = __lookup_pg_pool(&osdmap->pg_pools, pool); - if (!pool_info) + pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool); + if (!pi) return -EIO; - pg->pool = pool; - pg->seed = ceph_str_hash(pool_info->object_hash, oid, strlen(oid)); - dout("%s '%s' pgid %lld.%x\n", __func__, oid, pg->pool, pg->seed); + pg_out->pool = oloc->pool; + pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, + oid->name_len); + + dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name, + pg_out->pool, pg_out->seed); return 0; } -EXPORT_SYMBOL(ceph_calc_ceph_pg); +EXPORT_SYMBOL(ceph_oloc_oid_to_pg); + +static int do_crush(struct ceph_osdmap *map, int ruleno, int x, + int *result, int result_max, + const __u32 *weight, int weight_max) +{ + int r; + + BUG_ON(result_max > CEPH_PG_MAX_SIZE); + + mutex_lock(&map->crush_scratch_mutex); + r = crush_do_rule(map->crush, ruleno, x, result, result_max, + weight, weight_max, map->crush_scratch_ary); + mutex_unlock(&map->crush_scratch_mutex); + + return r; +} /* - * Calculate raw osd vector for the given pgid. Return pointer to osd - * array, or NULL on failure. + * Calculate raw (crush) set for given pgid. + * + * Return raw set length, or error. */ -static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *osds, int *num) +static int pg_to_raw_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pool, + struct ceph_pg pgid, u32 pps, int *osds) { - struct ceph_pg_mapping *pg; - struct ceph_pg_pool_info *pool; int ruleno; - int r; - u32 pps; + int len; - pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); - if (!pool) - return NULL; + /* crush */ + ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, + pool->type, pool->size); + if (ruleno < 0) { + pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", + pgid.pool, pool->crush_ruleset, pool->type, + pool->size); + return -ENOENT; + } - /* pg_temp? 
*/ + len = do_crush(osdmap, ruleno, pps, osds, + min_t(int, pool->size, CEPH_PG_MAX_SIZE), + osdmap->osd_weight, osdmap->max_osd); + if (len < 0) { + pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", + len, ruleno, pgid.pool, pool->crush_ruleset, + pool->type, pool->size); + return len; + } + + return len; +} + +/* + * Given raw set, calculate up set and up primary. + * + * Return up set length. *primary is set to up primary osd id, or -1 + * if up set is empty. + */ +static int raw_to_up_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pool, + int *osds, int len, int *primary) +{ + int up_primary = -1; + int i; + + if (ceph_can_shift_osds(pool)) { + int removed = 0; + + for (i = 0; i < len; i++) { + if (ceph_osd_is_down(osdmap, osds[i])) { + removed++; + continue; + } + if (removed) + osds[i - removed] = osds[i]; + } + + len -= removed; + if (len > 0) + up_primary = osds[0]; + } else { + for (i = len - 1; i >= 0; i--) { + if (ceph_osd_is_down(osdmap, osds[i])) + osds[i] = CRUSH_ITEM_NONE; + else + up_primary = osds[i]; + } + } + + *primary = up_primary; + return len; +} + +static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, + struct ceph_pg_pool_info *pool, + int *osds, int len, int *primary) +{ + int i; + int pos = -1; + + /* + * Do we have any non-default primary_affinity values for these + * osds? + */ + if (!osdmap->osd_primary_affinity) + return; + + for (i = 0; i < len; i++) { + int osd = osds[i]; + + if (osd != CRUSH_ITEM_NONE && + osdmap->osd_primary_affinity[osd] != + CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) { + break; + } + } + if (i == len) + return; + + /* + * Pick the primary. Feed both the seed (for the pg) and the + * osd into the hash/rng so that a proportional fraction of an + * osd's pgs get rejected as primary. + */ + for (i = 0; i < len; i++) { + int osd = osds[i]; + u32 aff; + + if (osd == CRUSH_ITEM_NONE) + continue; + + aff = osdmap->osd_primary_affinity[osd]; + if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY && + (crush_hash32_2(CRUSH_HASH_RJENKINS1, + pps, osd) >> 16) >= aff) { + /* + * We chose not to use this primary. Note it + * anyway as a fallback in case we don't pick + * anyone else, but keep looking. + */ + if (pos < 0) + pos = i; + } else { + pos = i; + break; + } + } + if (pos < 0) + return; + + *primary = osds[pos]; + + if (ceph_can_shift_osds(pool) && pos > 0) { + /* move the new primary to the front */ + for (i = pos; i > 0; i--) + osds[i] = osds[i - 1]; + osds[0] = *primary; + } +} + +/* + * Given up set, apply pg_temp and primary_temp mappings. + * + * Return acting set length. *primary is set to acting primary osd id, + * or -1 if acting set is empty. + */ +static int apply_temps(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pool, struct ceph_pg pgid, + int *osds, int len, int *primary) +{ + struct ceph_pg_mapping *pg; + int temp_len; + int temp_primary; + int i; + + /* raw_pg -> pg */ pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, pool->pg_num_mask); + + /* pg_temp? 
*/ pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); if (pg) { - *num = pg->len; - return pg->osds; + temp_len = 0; + temp_primary = -1; + + for (i = 0; i < pg->pg_temp.len; i++) { + if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { + if (ceph_can_shift_osds(pool)) + continue; + else + osds[temp_len++] = CRUSH_ITEM_NONE; + } else { + osds[temp_len++] = pg->pg_temp.osds[i]; + } + } + + /* apply pg_temp's primary */ + for (i = 0; i < temp_len; i++) { + if (osds[i] != CRUSH_ITEM_NONE) { + temp_primary = osds[i]; + break; + } + } + } else { + temp_len = len; + temp_primary = *primary; } - /* crush */ - ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, - pool->type, pool->size); - if (ruleno < 0) { - pr_err("no crush rule pool %lld ruleset %d type %d size %d\n", - pgid.pool, pool->crush_ruleset, pool->type, - pool->size); - return NULL; + /* primary_temp? */ + pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); + if (pg) + temp_primary = pg->primary_temp.osd; + + *primary = temp_primary; + return temp_len; +} + +/* + * Calculate acting set for given pgid. + * + * Return acting set length, or error. *primary is set to acting + * primary osd id, or -1 if acting set is empty or on error. + */ +int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, + int *osds, int *primary) +{ + struct ceph_pg_pool_info *pool; + u32 pps; + int len; + + pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); + if (!pool) { + *primary = -1; + return -ENOENT; } if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { - /* hash pool id and seed sothat pool PGs do not overlap */ + /* hash pool id and seed so that pool PGs do not overlap */ pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, ceph_stable_mod(pgid.seed, pool->pgp_num, pool->pgp_num_mask), pgid.pool); } else { /* - * legacy ehavior: add ps and pool together. this is + * legacy behavior: add ps and pool together. this is * not a great approach because the PGs from each pool * will overlap on top of each other: 0.5 == 1.4 == * 2.3 == ... @@ -1163,38 +1698,20 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, pool->pgp_num_mask) + (unsigned)pgid.pool; } - r = crush_do_rule(osdmap->crush, ruleno, pps, osds, - min_t(int, pool->size, *num), - osdmap->osd_weight); - if (r < 0) { - pr_err("error %d from crush rule: pool %lld ruleset %d type %d" - " size %d\n", r, pgid.pool, pool->crush_ruleset, - pool->type, pool->size); - return NULL; + + len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds); + if (len < 0) { + *primary = -1; + return len; } - *num = r; - return osds; -} -/* - * Return acting set for given pgid. 
- */ -int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *acting) -{ - int rawosds[CEPH_PG_MAX_SIZE], *osds; - int i, o, num = CEPH_PG_MAX_SIZE; + len = raw_to_up_osds(osdmap, pool, osds, len, primary); - osds = calc_pg_raw(osdmap, pgid, rawosds, &num); - if (!osds) - return -1; + apply_primary_affinity(osdmap, pps, pool, osds, len, primary); + + len = apply_temps(osdmap, pool, pgid, osds, len, primary); - /* primary is first up osd */ - o = 0; - for (i = 0; i < num; i++) - if (ceph_osd_is_up(osdmap, osds[i])) - acting[o++] = osds[i]; - return o; + return len; } /* @@ -1202,17 +1719,11 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, */ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) { - int rawosds[CEPH_PG_MAX_SIZE], *osds; - int i, num = CEPH_PG_MAX_SIZE; + int osds[CEPH_PG_MAX_SIZE]; + int primary; - osds = calc_pg_raw(osdmap, pgid, rawosds, &num); - if (!osds) - return -1; + ceph_calc_pg_acting(osdmap, pgid, osds, &primary); - /* primary is first up osd */ - for (i = 0; i < num; i++) - if (ceph_osd_is_up(osdmap, osds[i])) - return osds[i]; - return -1; + return primary; } EXPORT_SYMBOL(ceph_calc_pg_primary); diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index 815a2249cfa..555013034f7 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -53,7 +53,10 @@ void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty) set_page_dirty_lock(pages[i]); put_page(pages[i]); } - kfree(pages); + if (is_vmalloc_addr(pages)) + vfree(pages); + else + kfree(pages); } EXPORT_SYMBOL(ceph_put_page_vector); @@ -165,36 +168,6 @@ void ceph_copy_from_page_vector(struct page **pages, EXPORT_SYMBOL(ceph_copy_from_page_vector); /* - * copy user data from a page vector into a user pointer - */ -int ceph_copy_page_vector_to_user(struct page **pages, - void __user *data, - loff_t off, size_t len) -{ - int i = 0; - int po = off & ~PAGE_CACHE_MASK; - int left = len; - int l, bad; - - while (left > 0) { - l = min_t(int, left, PAGE_CACHE_SIZE-po); - bad = copy_to_user(data, page_address(pages[i]) + po, l); - if (bad == l) - return -EFAULT; - data += l - bad; - left -= l - bad; - if (po) { - po += l - bad; - if (po == PAGE_CACHE_SIZE) - po = 0; - } - i++; - } - return len; -} -EXPORT_SYMBOL(ceph_copy_page_vector_to_user); - -/* * Zero an extent within a page vector. Offset is relative to the * start of the first page. 
*/ diff --git a/net/compat.c b/net/compat.c index dd32e34c1e2..bc8aeefddf3 100644 --- a/net/compat.c +++ b/net/compat.c @@ -85,7 +85,7 @@ int verify_compat_iovec(struct msghdr *kern_msg, struct iovec *kern_iov, { int tot_len; - if (kern_msg->msg_namelen) { + if (kern_msg->msg_name && kern_msg->msg_namelen) { if (mode == VERIFY_READ) { int err = move_addr_to_kernel(kern_msg->msg_name, kern_msg->msg_namelen, @@ -93,10 +93,11 @@ int verify_compat_iovec(struct msghdr *kern_msg, struct iovec *kern_iov, if (err < 0) return err; } - if (kern_msg->msg_name) - kern_msg->msg_name = kern_address; - } else + kern_msg->msg_name = kern_address; + } else { kern_msg->msg_name = NULL; + kern_msg->msg_namelen = 0; + } tot_len = iov_from_user_compat_to_kern(kern_iov, (struct compat_iovec __user *)kern_msg->msg_iov, @@ -384,8 +385,8 @@ static int compat_sock_setsockopt(struct socket *sock, int level, int optname, return sock_setsockopt(sock, level, optname, optval, optlen); } -asmlinkage long compat_sys_setsockopt(int fd, int level, int optname, - char __user *optval, unsigned int optlen) +COMPAT_SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, + char __user *, optval, unsigned int, optlen) { int err; struct socket *sock = sockfd_lookup(fd, &err); @@ -504,8 +505,8 @@ int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *usersta } EXPORT_SYMBOL(compat_sock_get_timestampns); -asmlinkage long compat_sys_getsockopt(int fd, int level, int optname, - char __user *optval, int __user *optlen) +COMPAT_SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, + char __user *, optval, int __user *, optlen) { int err; struct socket *sock = sockfd_lookup(fd, &err); @@ -735,15 +736,15 @@ static unsigned char nas[21] = { }; #undef AL -asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned int flags) +COMPAT_SYSCALL_DEFINE3(sendmsg, int, fd, struct compat_msghdr __user *, msg, unsigned int, flags) { if (flags & MSG_CMSG_COMPAT) return -EINVAL; return __sys_sendmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); } -asmlinkage long compat_sys_sendmmsg(int fd, struct compat_mmsghdr __user *mmsg, - unsigned int vlen, unsigned int flags) +COMPAT_SYSCALL_DEFINE4(sendmmsg, int, fd, struct compat_mmsghdr __user *, mmsg, + unsigned int, vlen, unsigned int, flags) { if (flags & MSG_CMSG_COMPAT) return -EINVAL; @@ -751,28 +752,28 @@ asmlinkage long compat_sys_sendmmsg(int fd, struct compat_mmsghdr __user *mmsg, flags | MSG_CMSG_COMPAT); } -asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, unsigned int flags) +COMPAT_SYSCALL_DEFINE3(recvmsg, int, fd, struct compat_msghdr __user *, msg, unsigned int, flags) { if (flags & MSG_CMSG_COMPAT) return -EINVAL; return __sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); } -asmlinkage long compat_sys_recv(int fd, void __user *buf, size_t len, unsigned int flags) +COMPAT_SYSCALL_DEFINE4(recv, int, fd, void __user *, buf, compat_size_t, len, unsigned int, flags) { return sys_recv(fd, buf, len, flags | MSG_CMSG_COMPAT); } -asmlinkage long compat_sys_recvfrom(int fd, void __user *buf, size_t len, - unsigned int flags, struct sockaddr __user *addr, - int __user *addrlen) +COMPAT_SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, buf, compat_size_t, len, + unsigned int, flags, struct sockaddr __user *, addr, + int __user *, addrlen) { return sys_recvfrom(fd, buf, len, flags | MSG_CMSG_COMPAT, addr, addrlen); } -asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr 
__user *mmsg, - unsigned int vlen, unsigned int flags, - struct compat_timespec __user *timeout) +COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg, + unsigned int, vlen, unsigned int, flags, + struct compat_timespec __user *, timeout) { int datagrams; struct timespec ktspec; @@ -780,27 +781,22 @@ asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, if (flags & MSG_CMSG_COMPAT) return -EINVAL; - if (COMPAT_USE_64BIT_TIME) - return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, - flags | MSG_CMSG_COMPAT, - (struct timespec *) timeout); - if (timeout == NULL) return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, flags | MSG_CMSG_COMPAT, NULL); - if (get_compat_timespec(&ktspec, timeout)) + if (compat_get_timespec(&ktspec, timeout)) return -EFAULT; datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, flags | MSG_CMSG_COMPAT, &ktspec); - if (datagrams > 0 && put_compat_timespec(&ktspec, timeout)) + if (datagrams > 0 && compat_put_timespec(&ktspec, timeout)) datagrams = -EFAULT; return datagrams; } -asmlinkage long compat_sys_socketcall(int call, u32 __user *args) +COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) { int ret; u32 a[6]; diff --git a/net/core/Makefile b/net/core/Makefile index b33b996f5dd..71093d94ad2 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ - sock_diag.o dev_ioctl.o + sock_diag.o dev_ioctl.o tso.o obj-$(CONFIG_XFRM) += flow.o obj-y += net-sysfs.o @@ -21,4 +21,6 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o -obj-$(CONFIG_NETPRIO_CGROUP) += netprio_cgroup.o +obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o +obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o +obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o diff --git a/net/core/datagram.c b/net/core/datagram.c index a16ed7bbe37..488dd1a825c 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -740,17 +740,37 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) netdev_rx_csum_fault(skb->dev); - skb->ip_summed = CHECKSUM_UNNECESSARY; } + skb->csum_valid = !sum; return sum; } EXPORT_SYMBOL(__skb_checksum_complete_head); __sum16 __skb_checksum_complete(struct sk_buff *skb) { - return __skb_checksum_complete_head(skb, skb->len); + __wsum csum; + __sum16 sum; + + csum = skb_checksum(skb, 0, skb->len, 0); + + /* skb->csum holds pseudo checksum */ + sum = csum_fold(csum_add(skb->csum, csum)); + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev); + } + + /* Save full packet checksum */ + skb->csum = csum; + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum_complete_sw = 1; + skb->csum_valid = !sum; + + return sum; } EXPORT_SYMBOL(__skb_checksum_complete); diff --git a/net/core/dev.c b/net/core/dev.c index 355df36360b..367a586d0c8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -147,6 +147,11 @@ struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; struct list_head ptype_all 
__read_mostly; /* Taps */ static struct list_head offload_base __read_mostly; +static int netif_rx_internal(struct sk_buff *skb); +static int call_netdevice_notifiers_info(unsigned long val, + struct net_device *dev, + struct netdev_notifier_info *info); + /* * The @dev_base_head list is protected by @dev_base_lock and the rtnl * semaphore. @@ -480,7 +485,7 @@ EXPORT_SYMBOL(dev_add_offload); * and must not be freed until after all the CPU's have gone * through a quiescent state. */ -void __dev_remove_offload(struct packet_offload *po) +static void __dev_remove_offload(struct packet_offload *po) { struct list_head *head = &offload_base; struct packet_offload *po1; @@ -498,7 +503,6 @@ void __dev_remove_offload(struct packet_offload *po) out: spin_unlock(&offload_lock); } -EXPORT_SYMBOL(__dev_remove_offload); /** * dev_remove_offload - remove packet offload handler @@ -1118,6 +1122,8 @@ rollback: write_seqcount_end(&devnet_rename_seq); + netdev_adjacent_rename_links(dev, oldname); + write_lock_bh(&dev_base_lock); hlist_del_rcu(&dev->name_hlist); write_unlock_bh(&dev_base_lock); @@ -1137,6 +1143,7 @@ rollback: err = ret; write_seqcount_begin(&devnet_rename_seq); memcpy(dev->name, oldname, IFNAMSIZ); + memcpy(oldname, newname, IFNAMSIZ); goto rollback; } else { pr_err("%s: name change rollback failed: %d\n", @@ -1203,7 +1210,11 @@ EXPORT_SYMBOL(netdev_features_change); void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { - call_netdevice_notifiers(NETDEV_CHANGE, dev); + struct netdev_notifier_change_info change_info; + + change_info.flags_changed = 0; + call_netdevice_notifiers_info(NETDEV_CHANGE, dev, + &change_info.info); rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); } } @@ -1241,7 +1252,7 @@ static int __dev_open(struct net_device *dev) * If we don't do this there is a chance ndo_poll_controller * or ndo_poll may be running while we open the device */ - netpoll_rx_disable(dev); + netpoll_poll_disable(dev); ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); ret = notifier_to_errno(ret); @@ -1256,7 +1267,7 @@ static int __dev_open(struct net_device *dev) if (!ret && ops->ndo_open) ret = ops->ndo_open(dev); - netpoll_rx_enable(dev); + netpoll_poll_enable(dev); if (ret) clear_bit(__LINK_STATE_START, &dev->state); @@ -1309,6 +1320,9 @@ static int __dev_close_many(struct list_head *head) might_sleep(); list_for_each_entry(dev, head, close_list) { + /* Temporarily disable netpoll until the interface is down */ + netpoll_poll_disable(dev); + call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); clear_bit(__LINK_STATE_START, &dev->state); @@ -1319,7 +1333,7 @@ static int __dev_close_many(struct list_head *head) * dev->stop() will invoke napi_disable() on all of it's * napi_struct instances on this device. */ - smp_mb__after_clear_bit(); /* Commit netif_running(). */ + smp_mb__after_atomic(); /* Commit netif_running(). 
*/ } dev_deactivate_many(head); @@ -1339,6 +1353,7 @@ static int __dev_close_many(struct list_head *head) dev->flags &= ~IFF_UP; net_dmaengine_put(); + netpoll_poll_enable(dev); } return 0; @@ -1349,14 +1364,10 @@ static int __dev_close(struct net_device *dev) int retval; LIST_HEAD(single); - /* Temporarily disable netpoll until the interface is down */ - netpoll_rx_disable(dev); - list_add(&dev->close_list, &single); retval = __dev_close_many(&single); list_del(&single); - netpoll_rx_enable(dev); return retval; } @@ -1394,14 +1405,9 @@ int dev_close(struct net_device *dev) if (dev->flags & IFF_UP) { LIST_HEAD(single); - /* Block netpoll rx while the interface is going down */ - netpoll_rx_disable(dev); - list_add(&dev->close_list, &single); dev_close_many(&single); list_del(&single); - - netpoll_rx_enable(dev); } return 0; } @@ -1566,14 +1572,14 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); * are as for raw_notifier_call_chain(). */ -int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev, - struct netdev_notifier_info *info) +static int call_netdevice_notifiers_info(unsigned long val, + struct net_device *dev, + struct netdev_notifier_info *info) { ASSERT_RTNL(); netdev_notifier_info_init(info, dev); return raw_notifier_call_chain(&netdev_chain, val, info); } -EXPORT_SYMBOL(call_netdevice_notifiers_info); /** * call_netdevice_notifiers - call all network notifier blocks @@ -1641,8 +1647,7 @@ static inline void net_timestamp_set(struct sk_buff *skb) __net_timestamp(SKB); \ } \ -static inline bool is_skb_forwardable(struct net_device *dev, - struct sk_buff *skb) +bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb) { unsigned int len; @@ -1661,6 +1666,30 @@ static inline bool is_skb_forwardable(struct net_device *dev, return false; } +EXPORT_SYMBOL_GPL(is_skb_forwardable); + +int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { + if (skb_copy_ubufs(skb, GFP_ATOMIC)) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; + } + } + + if (unlikely(!is_skb_forwardable(dev, skb))) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; + } + + skb_scrub_packet(skb, true); + skb->protocol = eth_type_trans(skb, dev); + + return 0; +} +EXPORT_SYMBOL_GPL(__dev_forward_skb); /** * dev_forward_skb - loopback an skb to another netif @@ -1682,24 +1711,7 @@ static inline bool is_skb_forwardable(struct net_device *dev, */ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - if (skb_copy_ubufs(skb, GFP_ATOMIC)) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - return NET_RX_DROP; - } - } - - if (unlikely(!is_skb_forwardable(dev, skb))) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - return NET_RX_DROP; - } - - skb_scrub_packet(skb, true); - skb->protocol = eth_type_trans(skb, dev); - - return netif_rx(skb); + return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); @@ -2079,7 +2091,7 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) } EXPORT_SYMBOL(netif_set_real_num_tx_queues); -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS /** * netif_set_real_num_rx_queues - set actual number of RX queues used * @dev: Network device @@ -2282,10 +2294,10 @@ out: } EXPORT_SYMBOL(skb_checksum_help); -__be16 skb_network_protocol(struct sk_buff *skb) +__be16 skb_network_protocol(struct sk_buff *skb, int *depth) { + 
unsigned int vlan_depth = skb->mac_len; __be16 type = skb->protocol; - int vlan_depth = ETH_HLEN; /* Tunnel gso handlers can set protocol to ethernet. */ if (type == htons(ETH_P_TEB)) { @@ -2298,17 +2310,34 @@ __be16 skb_network_protocol(struct sk_buff *skb) type = eth->h_proto; } - while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { - struct vlan_hdr *vh; + /* if skb->protocol is 802.1Q/AD then the header should already be + * present at mac_len - VLAN_HLEN (if mac_len > 0), or at + * ETH_HLEN otherwise + */ + if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { + if (vlan_depth) { + if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN))) + return 0; + vlan_depth -= VLAN_HLEN; + } else { + vlan_depth = ETH_HLEN; + } + do { + struct vlan_hdr *vh; - if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) - return 0; + if (unlikely(!pskb_may_pull(skb, + vlan_depth + VLAN_HLEN))) + return 0; - vh = (struct vlan_hdr *)(skb->data + vlan_depth); - type = vh->h_vlan_encapsulated_proto; - vlan_depth += VLAN_HLEN; + vh = (struct vlan_hdr *)(skb->data + vlan_depth); + type = vh->h_vlan_encapsulated_proto; + vlan_depth += VLAN_HLEN; + } while (type == htons(ETH_P_8021Q) || + type == htons(ETH_P_8021AD)); } + *depth = vlan_depth; + return type; } @@ -2322,12 +2351,13 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, { struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_offload *ptype; - __be16 type = skb_network_protocol(skb); + int vlan_depth = skb->mac_len; + __be16 type = skb_network_protocol(skb, &vlan_depth); if (unlikely(!type)) return ERR_PTR(-EINVAL); - __skb_pull(skb, skb->mac_len); + __skb_pull(skb, vlan_depth); rcu_read_lock(); list_for_each_entry_rcu(ptype, &offload_base, list) { @@ -2454,13 +2484,8 @@ static void dev_gso_skb_destructor(struct sk_buff *skb) { struct dev_gso_cb *cb; - do { - struct sk_buff *nskb = skb->next; - - skb->next = nskb->next; - nskb->next = NULL; - kfree_skb(nskb); - } while (skb->next); + kfree_skb_list(skb->next); + skb->next = NULL; cb = DEV_GSO_CB(skb); if (cb->destructor) @@ -2495,11 +2520,39 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) return 0; } +/* If MPLS offload request, verify we are testing hardware MPLS features + * instead of standard features for the netdev. 
+ */ +#ifdef CONFIG_NET_MPLS_GSO +static netdev_features_t net_mpls_features(struct sk_buff *skb, + netdev_features_t features, + __be16 type) +{ + if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC)) + features &= skb->dev->mpls_features; + + return features; +} +#else +static netdev_features_t net_mpls_features(struct sk_buff *skb, + netdev_features_t features, + __be16 type) +{ + return features; +} +#endif + static netdev_features_t harmonize_features(struct sk_buff *skb, netdev_features_t features) { + int tmp; + __be16 type; + + type = skb_network_protocol(skb, &tmp); + features = net_mpls_features(skb, features, type); + if (skb->ip_summed != CHECKSUM_NONE && - !can_checksum_protocol(features, skb_network_protocol(skb))) { + !can_checksum_protocol(features, type)) { features &= ~NETIF_F_ALL_CSUM; } else if (illegal_highdma(skb->dev, skb)) { features &= ~NETIF_F_SG; @@ -2536,7 +2589,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) EXPORT_SYMBOL(netif_skb_features); int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq, void *accel_priv) + struct netdev_queue *txq) { const struct net_device_ops *ops = dev->netdev_ops; int rc = NETDEV_TX_OK; @@ -2602,13 +2655,10 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, dev_queue_xmit_nit(skb, dev); skb_len = skb->len; - if (accel_priv) - rc = ops->ndo_dfwd_start_xmit(skb, dev, accel_priv); - else - rc = ops->ndo_start_xmit(skb, dev); - + trace_net_dev_start_xmit(skb, dev); + rc = ops->ndo_start_xmit(skb, dev); trace_net_dev_xmit(skb, rc, dev, skb_len); - if (rc == NETDEV_TX_OK && txq) + if (rc == NETDEV_TX_OK) txq_trans_update(txq); return rc; } @@ -2624,10 +2674,8 @@ gso: dev_queue_xmit_nit(nskb, dev); skb_len = nskb->len; - if (accel_priv) - rc = ops->ndo_dfwd_start_xmit(nskb, dev, accel_priv); - else - rc = ops->ndo_start_xmit(nskb, dev); + trace_net_dev_start_xmit(nskb, dev); + rc = ops->ndo_start_xmit(nskb, dev); trace_net_dev_xmit(nskb, rc, dev, skb_len); if (unlikely(rc != NETDEV_TX_OK)) { if (rc & ~NETDEV_TX_MASK) @@ -2747,7 +2795,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, return rc; } -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) static void skb_update_prio(struct sk_buff *skb) { struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); @@ -2784,8 +2832,9 @@ int dev_loopback_xmit(struct sk_buff *skb) EXPORT_SYMBOL(dev_loopback_xmit); /** - * dev_queue_xmit - transmit a buffer + * __dev_queue_xmit - transmit a buffer * @skb: buffer to transmit + * @accel_priv: private data used for L2 forwarding offload * * Queue a buffer for transmission to a network device. The caller must * have set the device and priority and built the buffer before calling @@ -2808,7 +2857,7 @@ EXPORT_SYMBOL(dev_loopback_xmit); * the BH enable code must have IRQs enabled so that it will not deadlock. 
* --BLG */ -int dev_queue_xmit(struct sk_buff *skb) +static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) { struct net_device *dev = skb->dev; struct netdev_queue *txq; @@ -2824,7 +2873,7 @@ int dev_queue_xmit(struct sk_buff *skb) skb_update_prio(skb); - txq = netdev_pick_tx(dev, skb); + txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); #ifdef CONFIG_NET_CLS_ACT @@ -2860,7 +2909,7 @@ int dev_queue_xmit(struct sk_buff *skb) if (!netif_xmit_stopped(txq)) { __this_cpu_inc(xmit_recursion); - rc = dev_hard_start_xmit(skb, dev, txq, NULL); + rc = dev_hard_start_xmit(skb, dev, txq); __this_cpu_dec(xmit_recursion); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); @@ -2883,14 +2932,26 @@ recursion_alert: rc = -ENETDOWN; rcu_read_unlock_bh(); + atomic_long_inc(&dev->tx_dropped); kfree_skb(skb); return rc; out: rcu_read_unlock_bh(); return rc; } + +int dev_queue_xmit(struct sk_buff *skb) +{ + return __dev_queue_xmit(skb, NULL); +} EXPORT_SYMBOL(dev_queue_xmit); +int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) +{ + return __dev_queue_xmit(skb, accel_priv); +} +EXPORT_SYMBOL(dev_queue_xmit_accel); + /*======================================================================= Receiver routines @@ -2944,7 +3005,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; - flow_id = skb->rxhash & flow_table->mask; + flow_id = skb_get_hash(skb) & flow_table->mask; rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) @@ -2978,6 +3039,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_sock_flow_table *sock_flow_table; int cpu = -1; u16 tcpu; + u32 hash; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); @@ -3006,7 +3068,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } skb_reset_network_header(skb); - if (!skb_get_rxhash(skb)) + hash = skb_get_hash(skb); + if (!hash) goto done; flow_table = rcu_dereference(rxqueue->rps_flow_table); @@ -3015,11 +3078,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, u16 next_cpu; struct rps_dev_flow *rflow; - rflow = &flow_table->flows[skb->rxhash & flow_table->mask]; + rflow = &flow_table->flows[hash & flow_table->mask]; tcpu = rflow->cpu; - next_cpu = sock_flow_table->ents[skb->rxhash & - sock_flow_table->mask]; + next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask]; /* * If the desired CPU (where last recvmsg was done) is @@ -3048,7 +3110,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } if (map) { - tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; + tcpu = map->cpus[((u64) hash * map->len) >> 32]; if (cpu_online(tcpu)) { cpu = tcpu; @@ -3151,7 +3213,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); if (fl) { - new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1); + new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); old_flow = fl->history[fl->history_head]; fl->history[fl->history_head] = new_flow; @@ -3219,29 +3281,10 @@ enqueue: return NET_RX_DROP; } -/** - * netif_rx - post buffer to the network code - * @skb: buffer to post - * - * This function receives a packet from a device driver and queues it for - * the upper (protocol) levels to process. It always succeeds. The buffer - * may be dropped during processing for congestion control or by the - * protocol layers. 
- * - * return values: - * NET_RX_SUCCESS (no congestion) - * NET_RX_DROP (packet was dropped) - * - */ - -int netif_rx(struct sk_buff *skb) +static int netif_rx_internal(struct sk_buff *skb) { int ret; - /* if netpoll wants it, pretend we never saw it */ - if (netpoll_rx(skb)) - return NET_RX_DROP; - net_timestamp_check(netdev_tstamp_prequeue, skb); trace_netif_rx(skb); @@ -3270,14 +3313,38 @@ int netif_rx(struct sk_buff *skb) } return ret; } + +/** + * netif_rx - post buffer to the network code + * @skb: buffer to post + * + * This function receives a packet from a device driver and queues it for + * the upper (protocol) levels to process. It always succeeds. The buffer + * may be dropped during processing for congestion control or by the + * protocol layers. + * + * return values: + * NET_RX_SUCCESS (no congestion) + * NET_RX_DROP (packet was dropped) + * + */ + +int netif_rx(struct sk_buff *skb) +{ + trace_netif_rx_entry(skb); + + return netif_rx_internal(skb); +} EXPORT_SYMBOL(netif_rx); int netif_rx_ni(struct sk_buff *skb) { int err; + trace_netif_rx_ni_entry(skb); + preempt_disable(); - err = netif_rx(skb); + err = netif_rx_internal(skb); if (local_softirq_pending()) do_softirq(); preempt_enable(); @@ -3328,7 +3395,7 @@ static void net_tx_action(struct softirq_action *h) root_lock = qdisc_lock(q); if (spin_trylock(root_lock)) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); @@ -3338,7 +3405,7 @@ static void net_tx_action(struct softirq_action *h) &q->state)) { __netif_reschedule(q); } else { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); } @@ -3424,7 +3491,7 @@ out: * @rx_handler: receive handler to register * @rx_handler_data: data pointer that is used by rx handler * - * Register a receive hander for a device. This handler will then be + * Register a receive handler for a device. This handler will then be * called from __netif_receive_skb. A negative errno code is returned * on a failure. * @@ -3478,11 +3545,11 @@ EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); static bool skb_pfmemalloc_protocol(struct sk_buff *skb) { switch (skb->protocol) { - case __constant_htons(ETH_P_ARP): - case __constant_htons(ETH_P_IP): - case __constant_htons(ETH_P_IPV6): - case __constant_htons(ETH_P_8021Q): - case __constant_htons(ETH_P_8021AD): + case htons(ETH_P_ARP): + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): return true; default: return false; @@ -3503,10 +3570,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) trace_netif_receive_skb(skb); - /* if we've gotten here through NAPI, check netpoll */ - if (netpoll_receive_skb(skb)) - goto out; - orig_dev = skb->dev; skb_reset_network_header(skb); @@ -3633,7 +3696,6 @@ drop: unlock: rcu_read_unlock(); -out: return ret; } @@ -3662,22 +3724,7 @@ static int __netif_receive_skb(struct sk_buff *skb) return ret; } -/** - * netif_receive_skb - process receive buffer from network - * @skb: buffer to process - * - * netif_receive_skb() is the main receive data processing function. - * It always succeeds. The buffer may be dropped during processing - * for congestion control or by the protocol layers. - * - * This function may only be called from softirq context and interrupts - * should be enabled. 
- * - * Return values (usually ignored): - * NET_RX_SUCCESS: no congestion - * NET_RX_DROP: packet was dropped - */ -int netif_receive_skb(struct sk_buff *skb) +static int netif_receive_skb_internal(struct sk_buff *skb) { net_timestamp_check(netdev_tstamp_prequeue, skb); @@ -3703,6 +3750,28 @@ int netif_receive_skb(struct sk_buff *skb) #endif return __netif_receive_skb(skb); } + +/** + * netif_receive_skb - process receive buffer from network + * @skb: buffer to process + * + * netif_receive_skb() is the main receive data processing function. + * It always succeeds. The buffer may be dropped during processing + * for congestion control or by the protocol layers. + * + * This function may only be called from softirq context and interrupts + * should be enabled. + * + * Return values (usually ignored): + * NET_RX_SUCCESS: no congestion + * NET_RX_DROP: packet was dropped + */ +int netif_receive_skb(struct sk_buff *skb) +{ + trace_netif_receive_skb_entry(skb); + + return netif_receive_skb_internal(skb); +} EXPORT_SYMBOL(netif_receive_skb); /* Network device is going away, flush any packets still pending @@ -3752,7 +3821,7 @@ static int napi_gro_complete(struct sk_buff *skb) if (ptype->type != type || !ptype->callbacks.gro_complete) continue; - err = ptype->callbacks.gro_complete(skb); + err = ptype->callbacks.gro_complete(skb, 0); break; } rcu_read_unlock(); @@ -3764,7 +3833,7 @@ static int napi_gro_complete(struct sk_buff *skb) } out: - return netif_receive_skb(skb); + return netif_receive_skb_internal(skb); } /* napi->gro_list contains packets ordered by age. @@ -3800,21 +3869,66 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff *p; unsigned int maclen = skb->dev->hard_header_len; + u32 hash = skb_get_hash_raw(skb); for (p = napi->gro_list; p; p = p->next) { unsigned long diffs; + NAPI_GRO_CB(p)->flush = 0; + + if (hash != skb_get_hash_raw(p)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), - skb_gro_mac_header(skb)); + skb_mac_header(skb)); else if (!diffs) diffs = memcmp(skb_mac_header(p), - skb_gro_mac_header(skb), + skb_mac_header(skb), maclen); NAPI_GRO_CB(p)->same_flow = !diffs; - NAPI_GRO_CB(p)->flush = 0; + } +} + +static void skb_gro_reset_offset(struct sk_buff *skb) +{ + const struct skb_shared_info *pinfo = skb_shinfo(skb); + const skb_frag_t *frag0 = &pinfo->frags[0]; + + NAPI_GRO_CB(skb)->data_offset = 0; + NAPI_GRO_CB(skb)->frag0 = NULL; + NAPI_GRO_CB(skb)->frag0_len = 0; + + if (skb_mac_header(skb) == skb_tail_pointer(skb) && + pinfo->nr_frags && + !PageHighMem(skb_frag_page(frag0))) { + NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); + NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); + } +} + +static void gro_pull_from_frag0(struct sk_buff *skb, int grow) +{ + struct skb_shared_info *pinfo = skb_shinfo(skb); + + BUG_ON(skb->end - skb->tail < grow); + + memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); + + skb->data_len -= grow; + skb->tail += grow; + + pinfo->frags[0].page_offset += grow; + skb_frag_size_sub(&pinfo->frags[0], grow); + + if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { + skb_frag_unref(skb, 0); + memmove(pinfo->frags, pinfo->frags + 1, + --pinfo->nr_frags * sizeof(pinfo->frags[0])); } } @@ -3826,14 +3940,16 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff struct list_head *head = 
&offload_base; int same_flow; enum gro_result ret; + int grow; - if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) + if (!(skb->dev->features & NETIF_F_GRO)) goto normal; if (skb_is_gso(skb) || skb_has_frag_list(skb)) goto normal; gro_list_prepare(napi, skb); + NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */ rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { @@ -3845,6 +3961,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; + NAPI_GRO_CB(skb)->udp_mark = 0; pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); break; @@ -3869,39 +3986,35 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (same_flow) goto ok; - if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS) + if (NAPI_GRO_CB(skb)->flush) goto normal; - napi->gro_count++; + if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { + struct sk_buff *nskb = napi->gro_list; + + /* locate the end of the list to select the 'oldest' flow */ + while (nskb->next) { + pp = &nskb->next; + nskb = *pp; + } + *pp = NULL; + nskb->next = NULL; + napi_gro_complete(nskb); + } else { + napi->gro_count++; + } NAPI_GRO_CB(skb)->count = 1; NAPI_GRO_CB(skb)->age = jiffies; + NAPI_GRO_CB(skb)->last = skb; skb_shinfo(skb)->gso_size = skb_gro_len(skb); skb->next = napi->gro_list; napi->gro_list = skb; ret = GRO_HELD; pull: - if (skb_headlen(skb) < skb_gro_offset(skb)) { - int grow = skb_gro_offset(skb) - skb_headlen(skb); - - BUG_ON(skb->end - skb->tail < grow); - - memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); - - skb->tail += grow; - skb->data_len -= grow; - - skb_shinfo(skb)->frags[0].page_offset += grow; - skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow); - - if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) { - skb_frag_unref(skb, 0); - memmove(skb_shinfo(skb)->frags, - skb_shinfo(skb)->frags + 1, - --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); - } - } - + grow = skb_gro_offset(skb) - skb_headlen(skb); + if (grow > 0) + gro_pull_from_frag0(skb, grow); ok: return ret; @@ -3910,12 +4023,39 @@ normal: goto pull; } +struct packet_offload *gro_find_receive_by_type(__be16 type) +{ + struct list_head *offload_head = &offload_base; + struct packet_offload *ptype; + + list_for_each_entry_rcu(ptype, offload_head, list) { + if (ptype->type != type || !ptype->callbacks.gro_receive) + continue; + return ptype; + } + return NULL; +} +EXPORT_SYMBOL(gro_find_receive_by_type); + +struct packet_offload *gro_find_complete_by_type(__be16 type) +{ + struct list_head *offload_head = &offload_base; + struct packet_offload *ptype; + + list_for_each_entry_rcu(ptype, offload_head, list) { + if (ptype->type != type || !ptype->callbacks.gro_complete) + continue; + return ptype; + } + return NULL; +} +EXPORT_SYMBOL(gro_find_complete_by_type); static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) { switch (ret) { case GRO_NORMAL: - if (netif_receive_skb(skb)) + if (netif_receive_skb_internal(skb)) ret = GRO_DROP; break; @@ -3938,25 +4078,10 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) return ret; } -static void skb_gro_reset_offset(struct sk_buff *skb) -{ - const struct skb_shared_info *pinfo = skb_shinfo(skb); - const skb_frag_t *frag0 = &pinfo->frags[0]; - - NAPI_GRO_CB(skb)->data_offset = 0; - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; - - if (skb_mac_header(skb) == 
skb_tail_pointer(skb) && - pinfo->nr_frags && - !PageHighMem(skb_frag_page(frag0))) { - NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); - NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); - } -} - gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { + trace_napi_gro_receive_entry(skb); + skb_gro_reset_offset(skb); return napi_skb_finish(dev_gro_receive(napi, skb), skb); @@ -3971,6 +4096,9 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->vlan_tci = 0; skb->dev = napi->dev; skb->skb_iif = 0; + skb->encapsulation = 0; + skb_shinfo(skb)->gso_type = 0; + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); napi->skb = skb; } @@ -3987,17 +4115,16 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_get_frags); -static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, - gro_result_t ret) +static gro_result_t napi_frags_finish(struct napi_struct *napi, + struct sk_buff *skb, + gro_result_t ret) { switch (ret) { case GRO_NORMAL: case GRO_HELD: + __skb_push(skb, ETH_HLEN); skb->protocol = eth_type_trans(skb, skb->dev); - - if (ret == GRO_HELD) - skb_gro_pull(skb, -ETH_HLEN); - else if (netif_receive_skb(skb)) + if (ret == GRO_NORMAL && netif_receive_skb_internal(skb)) ret = GRO_DROP; break; @@ -4013,39 +4140,42 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff * return ret; } +/* Upper GRO stack assumes network header starts at gro_offset=0 + * Drivers could call both napi_gro_frags() and napi_gro_receive() + * We copy ethernet header into skb->data to have a common layout. + */ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) { struct sk_buff *skb = napi->skb; - struct ethhdr *eth; - unsigned int hlen; - unsigned int off; + const struct ethhdr *eth; + unsigned int hlen = sizeof(*eth); napi->skb = NULL; skb_reset_mac_header(skb); skb_gro_reset_offset(skb); - off = skb_gro_offset(skb); - hlen = off + sizeof(*eth); - eth = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, hlen)) { - eth = skb_gro_header_slow(skb, hlen, off); + eth = skb_gro_header_fast(skb, 0); + if (unlikely(skb_gro_header_hard(skb, hlen))) { + eth = skb_gro_header_slow(skb, hlen, 0); if (unlikely(!eth)) { napi_reuse_skb(napi, skb); - skb = NULL; - goto out; + return NULL; } + } else { + gro_pull_from_frag0(skb, hlen); + NAPI_GRO_CB(skb)->frag0 += hlen; + NAPI_GRO_CB(skb)->frag0_len -= hlen; } - - skb_gro_pull(skb, sizeof(*eth)); + __skb_pull(skb, hlen); /* * This works because the only protocols we care about don't require - * special handling. We'll fix it up properly at the end. + * special handling. + * We'll fix it up properly in napi_frags_finish() */ skb->protocol = eth->h_proto; -out: return skb; } @@ -4056,12 +4186,14 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) if (!skb) return GRO_DROP; + trace_napi_gro_frags_entry(skb); + return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); } EXPORT_SYMBOL(napi_gro_frags); /* - * net_rps_action sends any pending IPI's for rps. + * net_rps_action_and_irq_enable sends any pending IPI's for rps. * Note: called with local irq disabled, but exits with local irq enabled. 
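[Editor's note, not part of the patch] The napi_frags_skb() comment above says the Ethernet header is now copied into skb->data so that napi_gro_frags() and napi_gro_receive() present the same layout to the GRO stack. A minimal sketch of the driver side of that path, assuming a hypothetical page-based RX handler (my_rx_one and its parameters are invented for illustration):

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/mm.h>

/* Hypothetical page-based RX completion handler using the frags API. */
static void my_rx_one(struct napi_struct *napi, struct page *page,
		      unsigned int len, unsigned int truesize)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (unlikely(!skb)) {
		put_page(page);		/* no skb available, drop the buffer */
		return;
	}

	/* Attach the received buffer as frag 0; the Ethernet header stays
	 * in the page, napi_frags_skb() pulls it into skb->data later.
	 */
	skb_add_rx_frag(skb, 0, page, 0, len, truesize);

	napi_gro_frags(napi);		/* feeds dev_gro_receive() above */
}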
*/ static void net_rps_action_and_irq_enable(struct softnet_data *sd) @@ -4079,8 +4211,8 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) struct softnet_data *next = remsd->rps_ipi_next; if (cpu_online(remsd->cpu)) - __smp_call_function_single(remsd->cpu, - &remsd->csd, 0); + smp_call_function_single_async(remsd->cpu, + &remsd->csd); remsd = next; } } else @@ -4104,9 +4236,8 @@ static int process_backlog(struct napi_struct *napi, int quota) #endif napi->weight = weight_p; local_irq_disable(); - while (work < quota) { + while (1) { struct sk_buff *skb; - unsigned int qlen; while ((skb = __skb_dequeue(&sd->process_queue))) { local_irq_enable(); @@ -4120,24 +4251,24 @@ static int process_backlog(struct napi_struct *napi, int quota) } rps_lock(sd); - qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen) - skb_queue_splice_tail_init(&sd->input_pkt_queue, - &sd->process_queue); - - if (qlen < quota - work) { + if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). * only current cpu owns and manipulates this napi, - * and NAPI_STATE_SCHED is the only possible flag set on backlog. - * we can use a plain write instead of clear_bit(), + * and NAPI_STATE_SCHED is the only possible flag set + * on backlog. + * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. */ list_del(&napi->poll_list); napi->state = 0; + rps_unlock(sd); - quota = work + qlen; + break; } + + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); rps_unlock(sd); } local_irq_enable(); @@ -4167,7 +4298,7 @@ void __napi_complete(struct napi_struct *n) BUG_ON(n->gro_list); list_del(&n->poll_list); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); } EXPORT_SYMBOL(__napi_complete); @@ -4266,17 +4397,10 @@ EXPORT_SYMBOL(netif_napi_add); void netif_napi_del(struct napi_struct *napi) { - struct sk_buff *skb, *next; - list_del_init(&napi->dev_list); napi_free_frags(napi); - for (skb = napi->gro_list; skb; skb = next) { - next = skb->next; - skb->next = NULL; - kfree_skb(skb); - } - + kfree_skb_list(napi->gro_list); napi->gro_list = NULL; napi->gro_count = 0; } @@ -4393,19 +4517,6 @@ struct netdev_adjacent { struct rcu_head rcu; }; -static struct netdev_adjacent *__netdev_find_adj_rcu(struct net_device *dev, - struct net_device *adj_dev, - struct list_head *adj_list) -{ - struct netdev_adjacent *adj; - - list_for_each_entry_rcu(adj, adj_list, list) { - if (adj->dev == adj_dev) - return adj; - } - return NULL; -} - static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, struct net_device *adj_dev, struct list_head *adj_list) @@ -4444,13 +4555,12 @@ EXPORT_SYMBOL(netdev_has_upper_dev); * Find out if a device is linked to an upper device and return true in case * it is. The caller must hold the RTNL lock. */ -bool netdev_has_any_upper_dev(struct net_device *dev) +static bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->all_adj_list.upper); } -EXPORT_SYMBOL(netdev_has_any_upper_dev); /** * netdev_master_upper_dev_get - Get master upper device @@ -4487,6 +4597,32 @@ void *netdev_adjacent_get_private(struct list_head *adj_list) EXPORT_SYMBOL(netdev_adjacent_get_private); /** + * netdev_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. 
The caller must hold RCU read lock. + */ +struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); + + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->adj_list.upper) + return NULL; + + *iter = &upper->list; + + return upper->dev; +} +EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); + +/** * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list * @dev: device * @iter: list_head ** of the current position @@ -4499,7 +4635,7 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, { struct netdev_adjacent *upper; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); @@ -4533,8 +4669,7 @@ void *netdev_lower_get_next_private(struct net_device *dev, if (&lower->list == &dev->adj_list.lower) return NULL; - if (iter) - *iter = lower->list.next; + *iter = lower->list.next; return lower->private; } @@ -4562,14 +4697,60 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev, if (&lower->list == &dev->adj_list.lower) return NULL; - if (iter) - *iter = &lower->list; + *iter = &lower->list; return lower->private; } EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); /** + * netdev_lower_get_next - Get the next device from the lower neighbour + * list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent from the dev's lower neighbour + * list, starting from iter position. The caller must hold RTNL lock or + * its own locking that guarantees that the neighbour lower + * list will remain unchainged. + */ +void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + + return lower->dev; +} +EXPORT_SYMBOL(netdev_lower_get_next); + +/** + * netdev_lower_get_first_private_rcu - Get the first ->private from the + * lower neighbour list, RCU + * variant + * @dev: device + * + * Gets the first netdev_adjacent->private from the dev's lower neighbour + * list. The caller must hold RCU read lock. + */ +void *netdev_lower_get_first_private_rcu(struct net_device *dev) +{ + struct netdev_adjacent *lower; + + lower = list_first_or_null_rcu(&dev->adj_list.lower, + struct netdev_adjacent, list); + if (lower) + return lower->private; + return NULL; +} +EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); + +/** * netdev_master_upper_dev_get_rcu - Get master upper device * @dev: device * @@ -4588,13 +4769,36 @@ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) } EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); +static int netdev_adjacent_sysfs_add(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list) +{ + char linkname[IFNAMSIZ+7]; + sprintf(linkname, dev_list == &dev->adj_list.upper ? + "upper_%s" : "lower_%s", adj_dev->name); + return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), + linkname); +} +static void netdev_adjacent_sysfs_del(struct net_device *dev, + char *name, + struct list_head *dev_list) +{ + char linkname[IFNAMSIZ+7]; + sprintf(linkname, dev_list == &dev->adj_list.upper ? 
+ "upper_%s" : "lower_%s", name); + sysfs_remove_link(&(dev->dev.kobj), linkname); +} + +#define netdev_adjacent_is_neigh_list(dev, dev_list) \ + (dev_list == &dev->adj_list.upper || \ + dev_list == &dev->adj_list.lower) + static int __netdev_adjacent_dev_insert(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list, void *private, bool master) { struct netdev_adjacent *adj; - char linkname[IFNAMSIZ+7]; int ret; adj = __netdev_find_adj(dev, adj_dev, dev_list); @@ -4617,16 +4821,8 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, pr_debug("dev_hold for %s, because of link added from %s to %s\n", adj_dev->name, dev->name, adj_dev->name); - if (dev_list == &dev->adj_list.lower) { - sprintf(linkname, "lower_%s", adj_dev->name); - ret = sysfs_create_link(&(dev->dev.kobj), - &(adj_dev->dev.kobj), linkname); - if (ret) - goto free_adj; - } else if (dev_list == &dev->adj_list.upper) { - sprintf(linkname, "upper_%s", adj_dev->name); - ret = sysfs_create_link(&(dev->dev.kobj), - &(adj_dev->dev.kobj), linkname); + if (netdev_adjacent_is_neigh_list(dev, dev_list)) { + ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); if (ret) goto free_adj; } @@ -4646,14 +4842,8 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, return 0; remove_symlinks: - if (dev_list == &dev->adj_list.lower) { - sprintf(linkname, "lower_%s", adj_dev->name); - sysfs_remove_link(&(dev->dev.kobj), linkname); - } else if (dev_list == &dev->adj_list.upper) { - sprintf(linkname, "upper_%s", adj_dev->name); - sysfs_remove_link(&(dev->dev.kobj), linkname); - } - + if (netdev_adjacent_is_neigh_list(dev, dev_list)) + netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); free_adj: kfree(adj); dev_put(adj_dev); @@ -4661,12 +4851,11 @@ free_adj: return ret; } -void __netdev_adjacent_dev_remove(struct net_device *dev, - struct net_device *adj_dev, - struct list_head *dev_list) +static void __netdev_adjacent_dev_remove(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list) { struct netdev_adjacent *adj; - char linkname[IFNAMSIZ+7]; adj = __netdev_find_adj(dev, adj_dev, dev_list); @@ -4686,13 +4875,8 @@ void __netdev_adjacent_dev_remove(struct net_device *dev, if (adj->master) sysfs_remove_link(&(dev->dev.kobj), "master"); - if (dev_list == &dev->adj_list.lower) { - sprintf(linkname, "lower_%s", adj_dev->name); - sysfs_remove_link(&(dev->dev.kobj), linkname); - } else if (dev_list == &dev->adj_list.upper) { - sprintf(linkname, "upper_%s", adj_dev->name); - sysfs_remove_link(&(dev->dev.kobj), linkname); - } + if (netdev_adjacent_is_neigh_list(dev, dev_list)) + netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); list_del_rcu(&adj->list); pr_debug("dev_put for %s, because link removed from %s to %s\n", @@ -4701,11 +4885,11 @@ void __netdev_adjacent_dev_remove(struct net_device *dev, kfree_rcu(adj, rcu); } -int __netdev_adjacent_dev_link_lists(struct net_device *dev, - struct net_device *upper_dev, - struct list_head *up_list, - struct list_head *down_list, - void *private, bool master) +static int __netdev_adjacent_dev_link_lists(struct net_device *dev, + struct net_device *upper_dev, + struct list_head *up_list, + struct list_head *down_list, + void *private, bool master) { int ret; @@ -4724,8 +4908,8 @@ int __netdev_adjacent_dev_link_lists(struct net_device *dev, return 0; } -int __netdev_adjacent_dev_link(struct net_device *dev, - struct net_device *upper_dev) +static int __netdev_adjacent_dev_link(struct net_device *dev, + struct net_device 
*upper_dev) { return __netdev_adjacent_dev_link_lists(dev, upper_dev, &dev->all_adj_list.upper, @@ -4733,26 +4917,26 @@ int __netdev_adjacent_dev_link(struct net_device *dev, NULL, false); } -void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, - struct net_device *upper_dev, - struct list_head *up_list, - struct list_head *down_list) +static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, + struct net_device *upper_dev, + struct list_head *up_list, + struct list_head *down_list) { __netdev_adjacent_dev_remove(dev, upper_dev, up_list); __netdev_adjacent_dev_remove(upper_dev, dev, down_list); } -void __netdev_adjacent_dev_unlink(struct net_device *dev, - struct net_device *upper_dev) +static void __netdev_adjacent_dev_unlink(struct net_device *dev, + struct net_device *upper_dev) { __netdev_adjacent_dev_unlink_lists(dev, upper_dev, &dev->all_adj_list.upper, &upper_dev->all_adj_list.lower); } -int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, - struct net_device *upper_dev, - void *private, bool master) +static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, + struct net_device *upper_dev, + void *private, bool master) { int ret = __netdev_adjacent_dev_link(dev, upper_dev); @@ -4771,8 +4955,8 @@ int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, return 0; } -void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, - struct net_device *upper_dev) +static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, + struct net_device *upper_dev) { __netdev_adjacent_dev_unlink(dev, upper_dev); __netdev_adjacent_dev_unlink_lists(dev, upper_dev, @@ -4961,20 +5145,24 @@ void netdev_upper_dev_unlink(struct net_device *dev, } EXPORT_SYMBOL(netdev_upper_dev_unlink); -void *netdev_lower_dev_get_private_rcu(struct net_device *dev, - struct net_device *lower_dev) +void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) { - struct netdev_adjacent *lower; + struct netdev_adjacent *iter; - if (!lower_dev) - return NULL; - lower = __netdev_find_adj_rcu(dev, lower_dev, &dev->adj_list.lower); - if (!lower) - return NULL; + list_for_each_entry(iter, &dev->adj_list.upper, list) { + netdev_adjacent_sysfs_del(iter->dev, oldname, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.lower); + } - return lower->private; + list_for_each_entry(iter, &dev->adj_list.lower, list) { + netdev_adjacent_sysfs_del(iter->dev, oldname, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.upper); + } } -EXPORT_SYMBOL(netdev_lower_dev_get_private_rcu); void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev) @@ -4991,6 +5179,30 @@ void *netdev_lower_dev_get_private(struct net_device *dev, } EXPORT_SYMBOL(netdev_lower_dev_get_private); + +int dev_get_nest_level(struct net_device *dev, + bool (*type_check)(struct net_device *dev)) +{ + struct net_device *lower = NULL; + struct list_head *iter; + int max_nest = -1; + int nest; + + ASSERT_RTNL(); + + netdev_for_each_lower_dev(dev, lower, iter) { + nest = dev_get_nest_level(lower, type_check); + if (max_nest < nest) + max_nest = nest; + } + + if (type_check(dev)) + max_nest++; + + return max_nest; +} +EXPORT_SYMBOL(dev_get_nest_level); + static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; @@ -5308,6 +5520,17 @@ int dev_change_flags(struct net_device *dev, unsigned int flags) } 
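[Editor's note, not part of the patch] dev_get_nest_level() below walks the lower-device list recursively and counts how many devices of a given type are stacked underneath. A hedged sketch of how a stacked virtual-device driver might use it when it is created on top of another device, typically to pick a distinct lock nesting level; the priv layout, ops struct and function names here are hypothetical:

#include <linux/netdevice.h>

/* Hypothetical private data for a stacked virtual device. */
struct my_stacked_priv {
	unsigned int nest_level;
};

extern const struct net_device_ops my_stacked_netdev_ops;	/* assumed */

static bool is_my_stacked_dev(struct net_device *dev)
{
	return dev->netdev_ops == &my_stacked_netdev_ops;
}

/* Called under RTNL when stacking on top of @lower: remember how deep
 * this device sits so per-level lock classes can be chosen later.
 */
static void my_stacked_set_nest_level(struct net_device *dev,
				      struct net_device *lower)
{
	struct my_stacked_priv *priv = netdev_priv(dev);

	priv->nest_level = dev_get_nest_level(lower, is_my_stacked_dev) + 1;
}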
EXPORT_SYMBOL(dev_change_flags); +static int __dev_set_mtu(struct net_device *dev, int new_mtu) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_change_mtu) + return ops->ndo_change_mtu(dev, new_mtu); + + dev->mtu = new_mtu; + return 0; +} + /** * dev_set_mtu - Change maximum transfer unit * @dev: device @@ -5317,8 +5540,7 @@ EXPORT_SYMBOL(dev_change_flags); */ int dev_set_mtu(struct net_device *dev, int new_mtu) { - const struct net_device_ops *ops = dev->netdev_ops; - int err; + int err, orig_mtu; if (new_mtu == dev->mtu) return 0; @@ -5330,14 +5552,25 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) if (!netif_device_present(dev)) return -ENODEV; - err = 0; - if (ops->ndo_change_mtu) - err = ops->ndo_change_mtu(dev, new_mtu); - else - dev->mtu = new_mtu; + err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); + err = notifier_to_errno(err); + if (err) + return err; - if (!err) - call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); + orig_mtu = dev->mtu; + err = __dev_set_mtu(dev, new_mtu); + + if (!err) { + err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); + err = notifier_to_errno(err); + if (err) { + /* setting mtu back and notifying everyone again, + * so that they have a chance to revert changes. + */ + __dev_set_mtu(dev, orig_mtu); + call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); + } + } return err; } EXPORT_SYMBOL(dev_set_mtu); @@ -5439,7 +5672,7 @@ static int dev_new_index(struct net *net) /* Delayed registration/unregisteration */ static LIST_HEAD(net_todo_list); -static DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); +DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); static void net_set_todo(struct net_device *dev) { @@ -5496,10 +5729,6 @@ static void rollback_registered_many(struct list_head *head) */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); - /* * Flush the unicast and multicast chains */ @@ -5509,6 +5738,10 @@ static void rollback_registered_many(struct list_head *head) if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); + /* Notifier chain MUST detach us all upper devices. 
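[Editor's note, not part of the patch] The dev_set_mtu() rework above sends NETDEV_PRECHANGEMTU before touching the device and reverts via __dev_set_mtu() if a NETDEV_CHANGEMTU listener objects. A hedged sketch of a netdevice notifier that uses the new pre-change hook to veto an MTU change; my_subsys_busy() is a placeholder for real per-device state:

#include <linux/netdevice.h>
#include <linux/notifier.h>
#include <linux/errno.h>

static bool my_subsys_busy(struct net_device *dev)
{
	return false;	/* placeholder for real per-device state */
}

static int my_mtu_event(struct notifier_block *nb, unsigned long event,
			void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_PRECHANGEMTU:
		/* An error here makes dev_set_mtu() bail out before
		 * __dev_set_mtu() touches the device.
		 */
		if (my_subsys_busy(dev))
			return notifier_from_errno(-EBUSY);
		break;
	case NETDEV_CHANGEMTU:
		/* dev->mtu already holds the new (or reverted) value. */
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block my_mtu_nb = {
	.notifier_call = my_mtu_event,
};
/* registered elsewhere with register_netdevice_notifier(&my_mtu_nb) */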
*/ WARN_ON(netdev_has_any_upper_dev(dev)); @@ -5592,6 +5825,13 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, } } +#ifdef CONFIG_NET_RX_BUSY_POLL + if (dev->netdev_ops->ndo_busy_poll) + features |= NETIF_F_BUSY_POLL; + else +#endif + features &= ~NETIF_F_BUSY_POLL; + return features; } @@ -5691,7 +5931,7 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, } EXPORT_SYMBOL(netif_stacked_transfer_operstate); -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS static int netif_alloc_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; @@ -5727,10 +5967,7 @@ static void netdev_init_one_queue(struct net_device *dev, static void netif_free_tx_queues(struct net_device *dev) { - if (is_vmalloc_addr(dev->_tx)) - vfree(dev->_tx); - else - kfree(dev->_tx); + kvfree(dev->_tx); } static int netif_alloc_netdev_queues(struct net_device *dev) @@ -5830,13 +6067,8 @@ int register_netdevice(struct net_device *dev) dev->features |= NETIF_F_SOFT_FEATURES; dev->wanted_features = dev->features & dev->hw_features; - /* Turn on no cache copy if HW is doing checksum */ if (!(dev->flags & IFF_LOOPBACK)) { dev->hw_features |= NETIF_F_NOCACHE_COPY; - if (dev->features & NETIF_F_ALL_CSUM) { - dev->wanted_features |= NETIF_F_NOCACHE_COPY; - dev->features |= NETIF_F_NOCACHE_COPY; - } } /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. @@ -6172,6 +6404,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, netdev_stats_to_stats64(storage, &dev->stats); } storage->rx_dropped += atomic_long_read(&dev->rx_dropped); + storage->tx_dropped += atomic_long_read(&dev->tx_dropped); return storage; } EXPORT_SYMBOL(dev_get_stats); @@ -6208,10 +6441,7 @@ void netdev_freemem(struct net_device *dev) { char *addr = (char *)dev - dev->padded; - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); + kvfree(addr); } /** @@ -6223,7 +6453,7 @@ void netdev_freemem(struct net_device *dev) * @rxqs: the number of RX subqueues to allocate * * Allocates a struct net_device with private data area for driver use - * and performs basic initialization. Also allocates subquue structs + * and performs basic initialization. Also allocates subqueue structs * for each queue on the device. */ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, @@ -6241,7 +6471,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, return NULL; } -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS if (rxqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); return NULL; @@ -6297,7 +6527,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (netif_alloc_netdev_queues(dev)) goto free_all; -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS dev->num_rx_queues = rxqs; dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) @@ -6316,11 +6546,6 @@ free_all: free_pcpu: free_percpu(dev->pcpu_refcnt); - netif_free_tx_queues(dev); -#ifdef CONFIG_RPS - kfree(dev->_rx); -#endif - free_dev: netdev_freemem(dev); return NULL; @@ -6342,7 +6567,7 @@ void free_netdev(struct net_device *dev) release_net(dev_net(dev)); netif_free_tx_queues(dev); -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS kfree(dev->_rx); #endif @@ -6417,6 +6642,9 @@ EXPORT_SYMBOL(unregister_netdevice_queue); /** * unregister_netdevice_many - unregister many devices * @head: list of devices + * + * Note: As most callers use a stack allocated list_head, + * we force a list_del() to make sure stack wont be corrupted later. 
*/ void unregister_netdevice_many(struct list_head *head) { @@ -6426,6 +6654,7 @@ void unregister_netdevice_many(struct list_head *head) rollback_registered_many(head); list_for_each_entry(dev, head, unreg_list) net_set_todo(dev); + list_del(head); } } EXPORT_SYMBOL(unregister_netdevice_many); @@ -6612,11 +6841,11 @@ static int dev_cpu_callback(struct notifier_block *nfb, /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { - netif_rx(skb); + netif_rx_internal(skb); input_queue_head_incr(oldsd); } while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { - netif_rx(skb); + netif_rx_internal(skb); input_queue_head_incr(oldsd); } @@ -6881,7 +7110,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) } } unregister_netdevice_many(&dev_kill_list); - list_del(&dev_kill_list); rtnl_unlock(); } @@ -6929,28 +7157,18 @@ static int __init net_dev_init(void) for_each_possible_cpu(i) { struct softnet_data *sd = &per_cpu(softnet_data, i); - memset(sd, 0, sizeof(*sd)); skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); - sd->completion_queue = NULL; INIT_LIST_HEAD(&sd->poll_list); - sd->output_queue = NULL; sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS sd->csd.func = rps_trigger_softirq; sd->csd.info = sd; - sd->csd.flags = 0; sd->cpu = i; #endif sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; - sd->backlog.gro_list = NULL; - sd->backlog.gro_count = 0; - -#ifdef CONFIG_NET_FLOW_LIMIT - sd->flow_limit = NULL; -#endif } dev_boot_phase = 0; diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index ec40a849fc4..b6b230600b9 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -38,7 +38,7 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list, ha->type = addr_type; ha->refcount = 1; ha->global_use = global; - ha->synced = sync; + ha->synced = sync ? 
1 : 0; ha->sync_cnt = 0; list_add_tail_rcu(&ha->list, &list->list); list->count++; @@ -48,7 +48,8 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list, static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, - unsigned char addr_type, bool global, bool sync) + unsigned char addr_type, bool global, bool sync, + int sync_count) { struct netdev_hw_addr *ha; @@ -66,10 +67,10 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, ha->global_use = true; } if (sync) { - if (ha->synced) + if (ha->synced && sync_count) return -EEXIST; else - ha->synced = true; + ha->synced++; } ha->refcount++; return 0; @@ -84,7 +85,8 @@ static int __hw_addr_add(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type) { - return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false); + return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false, + 0); } static int __hw_addr_del_entry(struct netdev_hw_addr_list *list, @@ -101,7 +103,7 @@ static int __hw_addr_del_entry(struct netdev_hw_addr_list *list, ha->global_use = false; if (sync) - ha->synced = false; + ha->synced--; if (--ha->refcount) return 0; @@ -139,7 +141,7 @@ static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list, int err; err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type, - false, true); + false, true, ha->sync_cnt); if (err && err != -EEXIST) return err; @@ -186,47 +188,6 @@ static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, return err; } -int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len, unsigned char addr_type) -{ - int err; - struct netdev_hw_addr *ha, *ha2; - unsigned char type; - - list_for_each_entry(ha, &from_list->list, list) { - type = addr_type ? addr_type : ha->type; - err = __hw_addr_add(to_list, ha->addr, addr_len, type); - if (err) - goto unroll; - } - return 0; - -unroll: - list_for_each_entry(ha2, &from_list->list, list) { - if (ha2 == ha) - break; - type = addr_type ? addr_type : ha2->type; - __hw_addr_del(to_list, ha2->addr, addr_len, type); - } - return err; -} -EXPORT_SYMBOL(__hw_addr_add_multiple); - -void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len, unsigned char addr_type) -{ - struct netdev_hw_addr *ha; - unsigned char type; - - list_for_each_entry(ha, &from_list->list, list) { - type = addr_type ? addr_type : ha->type; - __hw_addr_del(to_list, ha->addr, addr_len, type); - } -} -EXPORT_SYMBOL(__hw_addr_del_multiple); - /* This function only works where there is a strict 1-1 relationship * between source and destionation of they synch. If you ever need to * sync addresses to more then 1 destination, you need to use @@ -264,7 +225,92 @@ void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, } EXPORT_SYMBOL(__hw_addr_unsync); -void __hw_addr_flush(struct netdev_hw_addr_list *list) +/** + * __hw_addr_sync_dev - Synchonize device's multicast list + * @list: address list to syncronize + * @dev: device to sync + * @sync: function to call if address should be added + * @unsync: function to call if address should be removed + * + * This funciton is intended to be called from the ndo_set_rx_mode + * function of devices that require explicit address add/remove + * notifications. 
The unsync function may be NULL in which case + * the addresses requiring removal will simply be removed without + * any notification to the device. + **/ +int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*sync)(struct net_device *, const unsigned char *), + int (*unsync)(struct net_device *, + const unsigned char *)) +{ + struct netdev_hw_addr *ha, *tmp; + int err; + + /* first go through and flush out any stale entries */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (!ha->sync_cnt || ha->refcount != 1) + continue; + + /* if unsync is defined and fails defer unsyncing address */ + if (unsync && unsync(dev, ha->addr)) + continue; + + ha->sync_cnt--; + __hw_addr_del_entry(list, ha, false, false); + } + + /* go through and sync new entries to the list */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (ha->sync_cnt) + continue; + + err = sync(dev, ha->addr); + if (err) + return err; + + ha->sync_cnt++; + ha->refcount++; + } + + return 0; +} +EXPORT_SYMBOL(__hw_addr_sync_dev); + +/** + * __hw_addr_unsync_dev - Remove synchonized addresses from device + * @list: address list to remove syncronized addresses from + * @dev: device to sync + * @unsync: function to call if address should be removed + * + * Remove all addresses that were added to the device by __hw_addr_sync_dev(). + * This function is intended to be called from the ndo_stop or ndo_open + * functions on devices that require explicit address add/remove + * notifications. If the unsync function pointer is NULL then this function + * can be used to just reset the sync_cnt for the addresses in the list. + **/ +void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*unsync)(struct net_device *, + const unsigned char *)) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (!ha->sync_cnt) + continue; + + /* if unsync is defined and fails defer unsyncing address */ + if (unsync && unsync(dev, ha->addr)) + continue; + + ha->sync_cnt--; + __hw_addr_del_entry(list, ha, false, false); + } +} +EXPORT_SYMBOL(__hw_addr_unsync_dev); + +static void __hw_addr_flush(struct netdev_hw_addr_list *list) { struct netdev_hw_addr *ha, *tmp; @@ -274,7 +320,6 @@ void __hw_addr_flush(struct netdev_hw_addr_list *list) } list->count = 0; } -EXPORT_SYMBOL(__hw_addr_flush); void __hw_addr_init(struct netdev_hw_addr_list *list) { @@ -400,59 +445,6 @@ int dev_addr_del(struct net_device *dev, const unsigned char *addr, } EXPORT_SYMBOL(dev_addr_del); -/** - * dev_addr_add_multiple - Add device addresses from another device - * @to_dev: device to which addresses will be added - * @from_dev: device from which addresses will be added - * @addr_type: address type - 0 means type will be used from from_dev - * - * Add device addresses of the one device to another. - ** - * The caller must hold the rtnl_mutex. 
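[Editor's note, not part of the patch] The kernel-doc above says __hw_addr_sync_dev() is meant to be called from ndo_set_rx_mode so that only the delta since the previous call reaches the hardware, and __hw_addr_unsync_dev() undoes it from ndo_stop/ndo_open. A minimal sketch under those assumptions; the my_hw_* filter helpers are hypothetical hardware accessors:

#include <linux/netdevice.h>

/* Hypothetical helpers that program/clear one hardware filter slot. */
static int my_hw_add_mc_filter(struct net_device *dev, const unsigned char *addr)
{
	return 0;	/* placeholder for a register write */
}

static void my_hw_del_mc_filter(struct net_device *dev, const unsigned char *addr)
{
}

static int my_mc_sync(struct net_device *dev, const unsigned char *addr)
{
	return my_hw_add_mc_filter(dev, addr);
}

static int my_mc_unsync(struct net_device *dev, const unsigned char *addr)
{
	my_hw_del_mc_filter(dev, addr);
	return 0;
}

/* ndo_set_rx_mode: only the delta since the previous call is pushed out. */
static void my_set_rx_mode(struct net_device *dev)
{
	__hw_addr_sync_dev(&dev->mc, dev, my_mc_sync, my_mc_unsync);
}

/* ndo_stop: drop every address that was synced to the device. */
static int my_stop(struct net_device *dev)
{
	__hw_addr_unsync_dev(&dev->mc, dev, my_mc_unsync);
	return 0;
}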
- */ -int dev_addr_add_multiple(struct net_device *to_dev, - struct net_device *from_dev, - unsigned char addr_type) -{ - int err; - - ASSERT_RTNL(); - - if (from_dev->addr_len != to_dev->addr_len) - return -EINVAL; - err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, - to_dev->addr_len, addr_type); - if (!err) - call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); - return err; -} -EXPORT_SYMBOL(dev_addr_add_multiple); - -/** - * dev_addr_del_multiple - Delete device addresses by another device - * @to_dev: device where the addresses will be deleted - * @from_dev: device supplying the addresses to be deleted - * @addr_type: address type - 0 means type will be used from from_dev - * - * Deletes addresses in to device by the list of addresses in from device. - * - * The caller must hold the rtnl_mutex. - */ -int dev_addr_del_multiple(struct net_device *to_dev, - struct net_device *from_dev, - unsigned char addr_type) -{ - ASSERT_RTNL(); - - if (from_dev->addr_len != to_dev->addr_len) - return -EINVAL; - __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, - to_dev->addr_len, addr_type); - call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); - return 0; -} -EXPORT_SYMBOL(dev_addr_del_multiple); - /* * Unicast list handling functions */ @@ -676,7 +668,7 @@ static int __dev_mc_add(struct net_device *dev, const unsigned char *addr, netif_addr_lock_bh(dev); err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_MULTICAST, global, false); + NETDEV_HW_ADDR_T_MULTICAST, global, false, 0); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 95897183226..e70301eb7a4 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -64,7 +64,6 @@ static struct genl_family net_drop_monitor_family = { .hdrsize = 0, .name = "NET_DM", .version = 2, - .maxattr = NET_DM_CMD_MAX, }; static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); diff --git a/net/core/dst.c b/net/core/dst.c index ca4231ec734..a028409ee43 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -142,12 +142,12 @@ loop: mutex_unlock(&dst_gc_mutex); } -int dst_discard(struct sk_buff *skb) +int dst_discard_sk(struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } -EXPORT_SYMBOL(dst_discard); +EXPORT_SYMBOL(dst_discard_sk); const u32 dst_default_metrics[RTAX_MAX + 1] = { /* This initializer is needed to force linker to place this variable @@ -184,7 +184,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst->xfrm = NULL; #endif dst->input = dst_discard; - dst->output = dst_discard; + dst->output = dst_discard_sk; dst->error = 0; dst->obsolete = initial_obsolete; dst->header_len = 0; @@ -209,8 +209,10 @@ static void ___dst_free(struct dst_entry *dst) /* The first case (dev==NULL) is required, when protocol module is unloaded. 
*/ - if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) - dst->input = dst->output = dst_discard; + if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { + dst->input = dst_discard; + dst->output = dst_discard_sk; + } dst->obsolete = DST_OBSOLETE_DEAD; } @@ -267,6 +269,15 @@ again: } EXPORT_SYMBOL(dst_destroy); +static void dst_destroy_rcu(struct rcu_head *head) +{ + struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + + dst = dst_destroy(dst); + if (dst) + __dst_free(dst); +} + void dst_release(struct dst_entry *dst) { if (dst) { @@ -274,11 +285,8 @@ void dst_release(struct dst_entry *dst) newrefcnt = atomic_dec_return(&dst->__refcnt); WARN_ON(newrefcnt < 0); - if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) { - dst = dst_destroy(dst); - if (dst) - __dst_free(dst); - } + if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) + call_rcu(&dst->rcu_head, dst_destroy_rcu); } } EXPORT_SYMBOL(dst_release); @@ -361,7 +369,8 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, return; if (!unregister) { - dst->input = dst->output = dst_discard; + dst->input = dst_discard; + dst->output = dst_discard_sk; } else { dst->dev = dev_net(dst->dev)->loopback_dev; dev_hold(dst->dev); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 30071dec287..17cb912793f 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -97,6 +97,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RXFCS_BIT] = "rx-fcs", [NETIF_F_RXALL_BIT] = "rx-all", [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", + [NETIF_F_BUSY_POLL_BIT] = "busy-poll", }; static int ethtool_get_features(struct net_device *dev, void __user *useraddr) @@ -556,6 +557,23 @@ err_out: return ret; } +static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, + struct ethtool_rxnfc *rx_rings, + u32 size) +{ + int i; + + if (copy_from_user(indir, useraddr, size * sizeof(indir[0]))) + return -EFAULT; + + /* Validate ring indices */ + for (i = 0; i < size; i++) + if (indir[i] >= rx_rings->data) + return -EINVAL; + + return 0; +} + static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, void __user *useraddr) { @@ -564,7 +582,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, int ret; if (!dev->ethtool_ops->get_rxfh_indir_size || - !dev->ethtool_ops->get_rxfh_indir) + !dev->ethtool_ops->get_rxfh) return -EOPNOTSUPP; dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); if (dev_size == 0) @@ -590,7 +608,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, if (!indir) return -ENOMEM; - ret = dev->ethtool_ops->get_rxfh_indir(dev, indir); + ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL); if (ret) goto out; @@ -612,8 +630,9 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, u32 *indir; const struct ethtool_ops *ops = dev->ethtool_ops; int ret; + u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]); - if (!ops->get_rxfh_indir_size || !ops->set_rxfh_indir || + if (!ops->get_rxfh_indir_size || !ops->set_rxfh || !ops->get_rxnfc) return -EOPNOTSUPP; @@ -642,28 +661,184 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, for (i = 0; i < dev_size; i++) indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); } else { - if (copy_from_user(indir, - useraddr + - offsetof(struct ethtool_rxfh_indir, - ring_index[0]), - dev_size * sizeof(indir[0]))) { + ret = ethtool_copy_validate_indir(indir, + useraddr + 
ringidx_offset, + &rx_rings, + dev_size); + if (ret) + goto out; + } + + ret = ops->set_rxfh(dev, indir, NULL); + +out: + kfree(indir); + return ret; +} + +static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, + void __user *useraddr) +{ + int ret; + const struct ethtool_ops *ops = dev->ethtool_ops; + u32 user_indir_size, user_key_size; + u32 dev_indir_size = 0, dev_key_size = 0; + struct ethtool_rxfh rxfh; + u32 total_size; + u32 indir_bytes; + u32 *indir = NULL; + u8 *hkey = NULL; + u8 *rss_config; + + if (!(dev->ethtool_ops->get_rxfh_indir_size || + dev->ethtool_ops->get_rxfh_key_size) || + !dev->ethtool_ops->get_rxfh) + return -EOPNOTSUPP; + + if (ops->get_rxfh_indir_size) + dev_indir_size = ops->get_rxfh_indir_size(dev); + if (ops->get_rxfh_key_size) + dev_key_size = ops->get_rxfh_key_size(dev); + + if ((dev_key_size + dev_indir_size) == 0) + return -EOPNOTSUPP; + + if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) + return -EFAULT; + user_indir_size = rxfh.indir_size; + user_key_size = rxfh.key_size; + + /* Check that reserved fields are 0 for now */ + if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1]) + return -EINVAL; + + rxfh.indir_size = dev_indir_size; + rxfh.key_size = dev_key_size; + if (copy_to_user(useraddr, &rxfh, sizeof(rxfh))) + return -EFAULT; + + /* If the user buffer size is 0, this is just a query for the + * device table size and key size. Otherwise, if the User size is + * not equal to device table size or key size it's an error. + */ + if (!user_indir_size && !user_key_size) + return 0; + + if ((user_indir_size && (user_indir_size != dev_indir_size)) || + (user_key_size && (user_key_size != dev_key_size))) + return -EINVAL; + + indir_bytes = user_indir_size * sizeof(indir[0]); + total_size = indir_bytes + user_key_size; + rss_config = kzalloc(total_size, GFP_USER); + if (!rss_config) + return -ENOMEM; + + if (user_indir_size) + indir = (u32 *)rss_config; + + if (user_key_size) + hkey = rss_config + indir_bytes; + + ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey); + if (!ret) { + if (copy_to_user(useraddr + + offsetof(struct ethtool_rxfh, rss_config[0]), + rss_config, total_size)) ret = -EFAULT; + } + + kfree(rss_config); + + return ret; +} + +static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, + void __user *useraddr) +{ + int ret; + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rxnfc rx_rings; + struct ethtool_rxfh rxfh; + u32 dev_indir_size = 0, dev_key_size = 0, i; + u32 *indir = NULL, indir_bytes = 0; + u8 *hkey = NULL; + u8 *rss_config; + u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); + + if (!(ops->get_rxfh_indir_size || ops->get_rxfh_key_size) || + !ops->get_rxnfc || !ops->set_rxfh) + return -EOPNOTSUPP; + + if (ops->get_rxfh_indir_size) + dev_indir_size = ops->get_rxfh_indir_size(dev); + if (ops->get_rxfh_key_size) + dev_key_size = dev->ethtool_ops->get_rxfh_key_size(dev); + if ((dev_key_size + dev_indir_size) == 0) + return -EOPNOTSUPP; + + if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) + return -EFAULT; + + /* Check that reserved fields are 0 for now */ + if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1]) + return -EINVAL; + + /* If either indir or hash key is valid, proceed further. + * It is not valid to request that both be unchanged. 
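[Editor's note, not part of the patch] The new ETHTOOL_GRSSH/ETHTOOL_SRSSH handlers above query the table and key sizes via get_rxfh_indir_size()/get_rxfh_key_size() and then call get_rxfh()/set_rxfh() with either pointer possibly NULL. A hedged sketch of the driver side of the get path, matching the call pattern used by the core here; sizes, priv layout and names are hypothetical:

#include <linux/ethtool.h>
#include <linux/netdevice.h>
#include <linux/string.h>

#define MY_RSS_INDIR_SIZE 128	/* hypothetical indirection table size */
#define MY_RSS_KEY_SIZE    40	/* hypothetical hash key size */

struct my_rss_priv {
	u32 indir[MY_RSS_INDIR_SIZE];
	u8  key[MY_RSS_KEY_SIZE];
};

static u32 my_get_rxfh_indir_size(struct net_device *dev)
{
	return MY_RSS_INDIR_SIZE;
}

static u32 my_get_rxfh_key_size(struct net_device *dev)
{
	return MY_RSS_KEY_SIZE;
}

/* Either pointer may be NULL when only the other half is requested. */
static int my_get_rxfh(struct net_device *dev, u32 *indir, u8 *key)
{
	struct my_rss_priv *priv = netdev_priv(dev);

	if (indir)
		memcpy(indir, priv->indir, sizeof(priv->indir));
	if (key)
		memcpy(key, priv->key, sizeof(priv->key));
	return 0;
}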
+ */ + if ((rxfh.indir_size && + rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE && + rxfh.indir_size != dev_indir_size) || + (rxfh.key_size && (rxfh.key_size != dev_key_size)) || + (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE && + rxfh.key_size == 0)) + return -EINVAL; + + if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) + indir_bytes = dev_indir_size * sizeof(indir[0]); + + rss_config = kzalloc(indir_bytes + rxfh.key_size, GFP_USER); + if (!rss_config) + return -ENOMEM; + + rx_rings.cmd = ETHTOOL_GRXRINGS; + ret = ops->get_rxnfc(dev, &rx_rings, NULL); + if (ret) + goto out; + + /* rxfh.indir_size == 0 means reset the indir table to default. + * rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged. + */ + if (rxfh.indir_size && + rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) { + indir = (u32 *)rss_config; + ret = ethtool_copy_validate_indir(indir, + useraddr + rss_cfg_offset, + &rx_rings, + rxfh.indir_size); + if (ret) goto out; - } + } else if (rxfh.indir_size == 0) { + indir = (u32 *)rss_config; + for (i = 0; i < dev_indir_size; i++) + indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); + } - /* Validate ring indices */ - for (i = 0; i < dev_size; i++) { - if (indir[i] >= rx_rings.data) { - ret = -EINVAL; - goto out; - } + if (rxfh.key_size) { + hkey = rss_config + indir_bytes; + if (copy_from_user(hkey, + useraddr + rss_cfg_offset + indir_bytes, + rxfh.key_size)) { + ret = -EFAULT; + goto out; } } - ret = ops->set_rxfh_indir(dev, indir); + ret = ops->set_rxfh(dev, indir, hkey); out: - kfree(indir); + kfree(rss_config); return ret; } @@ -1490,6 +1665,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: case ETHTOOL_GRXFHINDIR: + case ETHTOOL_GRSSH: case ETHTOOL_GFEATURES: case ETHTOOL_GCHANNELS: case ETHTOOL_GET_TS_INFO: @@ -1627,6 +1803,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_SRXFHINDIR: rc = ethtool_set_rxfh_indir(dev, useraddr); break; + case ETHTOOL_GRSSH: + rc = ethtool_get_rxfh(dev, useraddr); + break; + case ETHTOOL_SRSSH: + rc = ethtool_set_rxfh(dev, useraddr); + break; case ETHTOOL_GFEATURES: rc = ethtool_get_features(dev, useraddr); break; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index f409e0bd35c..185c341fafb 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -745,6 +745,13 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event, attach_rules(&ops->rules_list, dev); break; + case NETDEV_CHANGENAME: + list_for_each_entry(ops, &net->rules_ops, list) { + detach_rules(&ops->rules_list, dev); + attach_rules(&ops->rules_list, dev); + } + break; + case NETDEV_UNREGISTER: list_for_each_entry(ops, &net->rules_ops, list) detach_rules(&ops->rules_list, dev); diff --git a/net/core/filter.c b/net/core/filter.c index 01b780856db..1dbf6462f76 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1,11 +1,16 @@ /* * Linux Socket Filter - Kernel level socket filtering * - * Author: - * Jay Schulist <jschlst@samba.org> + * Based on the design of the Berkeley Packet Filter. 
The new + * internal format has been designed by PLUMgrid: * - * Based on the design of: - * - The Berkeley Packet Filter + * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com + * + * Authors: + * + * Jay Schulist <jschlst@samba.org> + * Alexei Starovoitov <ast@plumgrid.com> + * Daniel Borkmann <dborkman@redhat.com> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -36,11 +41,31 @@ #include <asm/uaccess.h> #include <asm/unaligned.h> #include <linux/filter.h> -#include <linux/reciprocal_div.h> #include <linux/ratelimit.h> #include <linux/seccomp.h> #include <linux/if_vlan.h> +/* Registers */ +#define BPF_R0 regs[BPF_REG_0] +#define BPF_R1 regs[BPF_REG_1] +#define BPF_R2 regs[BPF_REG_2] +#define BPF_R3 regs[BPF_REG_3] +#define BPF_R4 regs[BPF_REG_4] +#define BPF_R5 regs[BPF_REG_5] +#define BPF_R6 regs[BPF_REG_6] +#define BPF_R7 regs[BPF_REG_7] +#define BPF_R8 regs[BPF_REG_8] +#define BPF_R9 regs[BPF_REG_9] +#define BPF_R10 regs[BPF_REG_10] + +/* Named registers */ +#define DST regs[insn->dst_reg] +#define SRC regs[insn->src_reg] +#define FP regs[BPF_REG_FP] +#define ARG1 regs[BPF_REG_ARG1] +#define CTX regs[BPF_REG_CTX] +#define IMM insn->imm + /* No hurry in this branch * * Exported for the bpf jit load helper. @@ -53,9 +78,9 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns ptr = skb_network_header(skb) + k - SKF_NET_OFF; else if (k >= SKF_LL_OFF) ptr = skb_mac_header(skb) + k - SKF_LL_OFF; - if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) return ptr; + return NULL; } @@ -64,6 +89,7 @@ static inline void *load_pointer(const struct sk_buff *skb, int k, { if (k >= 0) return skb_header_pointer(skb, k, size, buffer); + return bpf_internal_load_pointer_neg_helper(skb, k, size); } @@ -109,304 +135,960 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sk_filter); +/* Base function for offset calculation. Needs to go into .text section, + * therefore keeping it non-static as well; will also be used by JITs + * anyway later on, so do not let the compiler omit it. + */ +noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return 0; +} + /** - * sk_run_filter - run a filter on a socket - * @skb: buffer to run the filter on - * @fentry: filter to apply + * __sk_run_filter - run a filter on a given context + * @ctx: buffer to run the filter on + * @insn: filter to apply * - * Decode and apply filter instructions to the skb->data. - * Return length to keep, 0 for none. @skb is the data we are - * filtering, @filter is the array of filter instructions. - * Because all jumps are guaranteed to be before last instruction, - * and last instruction guaranteed to be a RET, we dont need to check - * flen. (We used to pass to this function the length of filter) + * Decode and apply filter instructions to the skb->data. Return length to + * keep, 0 for none. @ctx is the data we are operating on, @insn is the + * array of filter instructions. */ -unsigned int sk_run_filter(const struct sk_buff *skb, - const struct sock_filter *fentry) +static unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn) { + u64 stack[MAX_BPF_STACK / sizeof(u64)]; + u64 regs[MAX_BPF_REG], tmp; + static const void *jumptable[256] = { + [0 ... 255] = &&default_label, + /* Now overwrite non-defaults ... 
*/ + /* 32 bit ALU operations */ + [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, + [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, + [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, + [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, + [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, + [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, + [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, + [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, + [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, + [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, + [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, + [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, + [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, + [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, + [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, + [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, + [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, + [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, + [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, + [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, + [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, + [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, + [BPF_ALU | BPF_NEG] = &&ALU_NEG, + [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, + [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, + /* 64 bit ALU operations */ + [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, + [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, + [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, + [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, + [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, + [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, + [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, + [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, + [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, + [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, + [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, + [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, + [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, + [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, + [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, + [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, + [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, + [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, + [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, + [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, + [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, + [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, + [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, + [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, + [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, + /* Call instruction */ + [BPF_JMP | BPF_CALL] = &&JMP_CALL, + /* Jumps */ + [BPF_JMP | BPF_JA] = &&JMP_JA, + [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, + [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, + [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, + [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, + [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, + [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, + [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, + [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, + [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, + [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, + [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, + [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, + [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, + [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, + /* Program return */ + [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, + /* Store instructions */ + [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, + [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, + [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, + [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, + [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, + [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, + [BPF_ST | BPF_MEM | BPF_B] = 
&&ST_MEM_B, + [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, + [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, + [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, + /* Load instructions */ + [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, + [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, + [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, + [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, + [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, + [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, + [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, + [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, + [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, + [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, + }; void *ptr; - u32 A = 0; /* Accumulator */ - u32 X = 0; /* Index Register */ - u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ - u32 tmp; - int k; - - /* - * Process array of filter instructions. - */ - for (;; fentry++) { -#if defined(CONFIG_X86_32) -#define K (fentry->k) -#else - const u32 K = fentry->k; -#endif - - switch (fentry->code) { - case BPF_S_ALU_ADD_X: - A += X; - continue; - case BPF_S_ALU_ADD_K: - A += K; - continue; - case BPF_S_ALU_SUB_X: - A -= X; - continue; - case BPF_S_ALU_SUB_K: - A -= K; - continue; - case BPF_S_ALU_MUL_X: - A *= X; - continue; - case BPF_S_ALU_MUL_K: - A *= K; - continue; - case BPF_S_ALU_DIV_X: - if (X == 0) - return 0; - A /= X; - continue; - case BPF_S_ALU_DIV_K: - A = reciprocal_divide(A, K); - continue; - case BPF_S_ALU_MOD_X: - if (X == 0) - return 0; - A %= X; - continue; - case BPF_S_ALU_MOD_K: - A %= K; - continue; - case BPF_S_ALU_AND_X: - A &= X; - continue; - case BPF_S_ALU_AND_K: - A &= K; - continue; - case BPF_S_ALU_OR_X: - A |= X; - continue; - case BPF_S_ALU_OR_K: - A |= K; - continue; - case BPF_S_ANC_ALU_XOR_X: - case BPF_S_ALU_XOR_X: - A ^= X; - continue; - case BPF_S_ALU_XOR_K: - A ^= K; - continue; - case BPF_S_ALU_LSH_X: - A <<= X; - continue; - case BPF_S_ALU_LSH_K: - A <<= K; - continue; - case BPF_S_ALU_RSH_X: - A >>= X; - continue; - case BPF_S_ALU_RSH_K: - A >>= K; - continue; - case BPF_S_ALU_NEG: - A = -A; - continue; - case BPF_S_JMP_JA: - fentry += K; - continue; - case BPF_S_JMP_JGT_K: - fentry += (A > K) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JGE_K: - fentry += (A >= K) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JEQ_K: - fentry += (A == K) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JSET_K: - fentry += (A & K) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JGT_X: - fentry += (A > X) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JGE_X: - fentry += (A >= X) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JEQ_X: - fentry += (A == X) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JSET_X: - fentry += (A & X) ? fentry->jt : fentry->jf; - continue; - case BPF_S_LD_W_ABS: - k = K; -load_w: - ptr = load_pointer(skb, k, 4, &tmp); - if (ptr != NULL) { - A = get_unaligned_be32(ptr); - continue; - } + int off; + +#define CONT ({ insn++; goto select_insn; }) +#define CONT_JMP ({ insn++; goto select_insn; }) + + FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; + ARG1 = (u64) (unsigned long) ctx; + + /* Registers used in classic BPF programs need to be reset first. 
*/ + regs[BPF_REG_A] = 0; + regs[BPF_REG_X] = 0; + +select_insn: + goto *jumptable[insn->code]; + + /* ALU */ +#define ALU(OPCODE, OP) \ + ALU64_##OPCODE##_X: \ + DST = DST OP SRC; \ + CONT; \ + ALU_##OPCODE##_X: \ + DST = (u32) DST OP (u32) SRC; \ + CONT; \ + ALU64_##OPCODE##_K: \ + DST = DST OP IMM; \ + CONT; \ + ALU_##OPCODE##_K: \ + DST = (u32) DST OP (u32) IMM; \ + CONT; + + ALU(ADD, +) + ALU(SUB, -) + ALU(AND, &) + ALU(OR, |) + ALU(LSH, <<) + ALU(RSH, >>) + ALU(XOR, ^) + ALU(MUL, *) +#undef ALU + ALU_NEG: + DST = (u32) -DST; + CONT; + ALU64_NEG: + DST = -DST; + CONT; + ALU_MOV_X: + DST = (u32) SRC; + CONT; + ALU_MOV_K: + DST = (u32) IMM; + CONT; + ALU64_MOV_X: + DST = SRC; + CONT; + ALU64_MOV_K: + DST = IMM; + CONT; + ALU64_ARSH_X: + (*(s64 *) &DST) >>= SRC; + CONT; + ALU64_ARSH_K: + (*(s64 *) &DST) >>= IMM; + CONT; + ALU64_MOD_X: + if (unlikely(SRC == 0)) return 0; - case BPF_S_LD_H_ABS: - k = K; -load_h: - ptr = load_pointer(skb, k, 2, &tmp); - if (ptr != NULL) { - A = get_unaligned_be16(ptr); - continue; - } + tmp = DST; + DST = do_div(tmp, SRC); + CONT; + ALU_MOD_X: + if (unlikely(SRC == 0)) return 0; - case BPF_S_LD_B_ABS: - k = K; -load_b: - ptr = load_pointer(skb, k, 1, &tmp); - if (ptr != NULL) { - A = *(u8 *)ptr; - continue; - } + tmp = (u32) DST; + DST = do_div(tmp, (u32) SRC); + CONT; + ALU64_MOD_K: + tmp = DST; + DST = do_div(tmp, IMM); + CONT; + ALU_MOD_K: + tmp = (u32) DST; + DST = do_div(tmp, (u32) IMM); + CONT; + ALU64_DIV_X: + if (unlikely(SRC == 0)) return 0; - case BPF_S_LD_W_LEN: - A = skb->len; - continue; - case BPF_S_LDX_W_LEN: - X = skb->len; - continue; - case BPF_S_LD_W_IND: - k = X + K; - goto load_w; - case BPF_S_LD_H_IND: - k = X + K; - goto load_h; - case BPF_S_LD_B_IND: - k = X + K; - goto load_b; - case BPF_S_LDX_B_MSH: - ptr = load_pointer(skb, K, 1, &tmp); - if (ptr != NULL) { - X = (*(u8 *)ptr & 0xf) << 2; - continue; - } + do_div(DST, SRC); + CONT; + ALU_DIV_X: + if (unlikely(SRC == 0)) return 0; - case BPF_S_LD_IMM: - A = K; - continue; - case BPF_S_LDX_IMM: - X = K; - continue; - case BPF_S_LD_MEM: - A = mem[K]; - continue; - case BPF_S_LDX_MEM: - X = mem[K]; - continue; - case BPF_S_MISC_TAX: - X = A; - continue; - case BPF_S_MISC_TXA: - A = X; - continue; - case BPF_S_RET_K: - return K; - case BPF_S_RET_A: - return A; - case BPF_S_ST: - mem[K] = A; - continue; - case BPF_S_STX: - mem[K] = X; - continue; - case BPF_S_ANC_PROTOCOL: - A = ntohs(skb->protocol); - continue; - case BPF_S_ANC_PKTTYPE: - A = skb->pkt_type; - continue; - case BPF_S_ANC_IFINDEX: - if (!skb->dev) - return 0; - A = skb->dev->ifindex; - continue; - case BPF_S_ANC_MARK: - A = skb->mark; - continue; - case BPF_S_ANC_QUEUE: - A = skb->queue_mapping; - continue; - case BPF_S_ANC_HATYPE: - if (!skb->dev) - return 0; - A = skb->dev->type; - continue; - case BPF_S_ANC_RXHASH: - A = skb->rxhash; - continue; - case BPF_S_ANC_CPU: - A = raw_smp_processor_id(); - continue; - case BPF_S_ANC_VLAN_TAG: - A = vlan_tx_tag_get(skb); - continue; - case BPF_S_ANC_VLAN_TAG_PRESENT: - A = !!vlan_tx_tag_present(skb); - continue; - case BPF_S_ANC_PAY_OFFSET: - A = __skb_get_poff(skb); - continue; - case BPF_S_ANC_NLATTR: { - struct nlattr *nla; - - if (skb_is_nonlinear(skb)) - return 0; - if (A > skb->len - sizeof(struct nlattr)) - return 0; - - nla = nla_find((struct nlattr *)&skb->data[A], - skb->len - A, X); - if (nla) - A = (void *)nla - (void *)skb->data; - else - A = 0; - continue; + tmp = (u32) DST; + do_div(tmp, (u32) SRC); + DST = (u32) tmp; + CONT; + ALU64_DIV_K: + do_div(DST, IMM); + 
CONT; + ALU_DIV_K: + tmp = (u32) DST; + do_div(tmp, (u32) IMM); + DST = (u32) tmp; + CONT; + ALU_END_TO_BE: + switch (IMM) { + case 16: + DST = (__force u16) cpu_to_be16(DST); + break; + case 32: + DST = (__force u32) cpu_to_be32(DST); + break; + case 64: + DST = (__force u64) cpu_to_be64(DST); + break; + } + CONT; + ALU_END_TO_LE: + switch (IMM) { + case 16: + DST = (__force u16) cpu_to_le16(DST); + break; + case 32: + DST = (__force u32) cpu_to_le32(DST); + break; + case 64: + DST = (__force u64) cpu_to_le64(DST); + break; + } + CONT; + + /* CALL */ + JMP_CALL: + /* Function call scratches BPF_R1-BPF_R5 registers, + * preserves BPF_R6-BPF_R9, and stores return value + * into BPF_R0. + */ + BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3, + BPF_R4, BPF_R5); + CONT; + + /* JMP */ + JMP_JA: + insn += insn->off; + CONT; + JMP_JEQ_X: + if (DST == SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JEQ_K: + if (DST == IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JNE_X: + if (DST != SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JNE_K: + if (DST != IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGT_X: + if (DST > SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGT_K: + if (DST > IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGE_X: + if (DST >= SRC) { + insn += insn->off; + CONT_JMP; } - case BPF_S_ANC_NLATTR_NEST: { - struct nlattr *nla; - - if (skb_is_nonlinear(skb)) - return 0; - if (A > skb->len - sizeof(struct nlattr)) - return 0; - - nla = (struct nlattr *)&skb->data[A]; - if (nla->nla_len > A - skb->len) - return 0; - - nla = nla_find_nested(nla, X); - if (nla) - A = (void *)nla - (void *)skb->data; - else - A = 0; - continue; + CONT; + JMP_JGE_K: + if (DST >= IMM) { + insn += insn->off; + CONT_JMP; } -#ifdef CONFIG_SECCOMP_FILTER - case BPF_S_ANC_SECCOMP_LD_W: - A = seccomp_bpf_load(fentry->k); - continue; + CONT; + JMP_JSGT_X: + if (((s64) DST) > ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGT_K: + if (((s64) DST) > ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGE_X: + if (((s64) DST) >= ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGE_K: + if (((s64) DST) >= ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSET_X: + if (DST & SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSET_K: + if (DST & IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_EXIT: + return BPF_R0; + + /* STX and ST and LDX*/ +#define LDST(SIZEOP, SIZE) \ + STX_MEM_##SIZEOP: \ + *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ + CONT; \ + ST_MEM_##SIZEOP: \ + *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \ + CONT; \ + LDX_MEM_##SIZEOP: \ + DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ + CONT; + + LDST(B, u8) + LDST(H, u16) + LDST(W, u32) + LDST(DW, u64) +#undef LDST + STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ + atomic_add((u32) SRC, (atomic_t *)(unsigned long) + (DST + insn->off)); + CONT; + STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ + atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) + (DST + insn->off)); + CONT; + LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ + off = IMM; +load_word: + /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are + * only appearing in the programs where ctx == + * skb. All programs keep 'ctx' in regs[BPF_REG_CTX] + * == BPF_R6, sk_convert_filter() saves it in BPF_R6, + * internal BPF verifier will check that BPF_R6 == + * ctx. 
+ * + * BPF_ABS and BPF_IND are wrappers of function calls, + * so they scratch BPF_R1-BPF_R5 registers, preserve + * BPF_R6-BPF_R9, and store return value into BPF_R0. + * + * Implicit input: + * ctx == skb == BPF_R6 == CTX + * + * Explicit input: + * SRC == any register + * IMM == 32-bit immediate + * + * Output: + * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness + */ + + ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); + if (likely(ptr != NULL)) { + BPF_R0 = get_unaligned_be32(ptr); + CONT; + } + + return 0; + LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ + off = IMM; +load_half: + ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); + if (likely(ptr != NULL)) { + BPF_R0 = get_unaligned_be16(ptr); + CONT; + } + + return 0; + LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ + off = IMM; +load_byte: + ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); + if (likely(ptr != NULL)) { + BPF_R0 = *(u8 *)ptr; + CONT; + } + + return 0; + LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ + off = IMM + SRC; + goto load_word; + LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ + off = IMM + SRC; + goto load_half; + LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ + off = IMM + SRC; + goto load_byte; + + default_label: + /* If we ever reach this, we have a bug somewhere. */ + WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); + return 0; +} + +/* Helper to find the offset of pkt_type in sk_buff structure. We want + * to make sure its still a 3bit field starting at a byte boundary; + * taken from arch/x86/net/bpf_jit_comp.c. + */ +#ifdef __BIG_ENDIAN_BITFIELD +#define PKT_TYPE_MAX (7 << 5) +#else +#define PKT_TYPE_MAX 7 +#endif +static unsigned int pkt_type_offset(void) +{ + struct sk_buff skb_probe = { .pkt_type = ~0, }; + u8 *ct = (u8 *) &skb_probe; + unsigned int off; + + for (off = 0; off < sizeof(struct sk_buff); off++) { + if (ct[off] == PKT_TYPE_MAX) + return off; + } + + pr_err_once("Please fix %s, as pkt_type couldn't be found!\n", __func__); + return -1; +} + +static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +{ + return __skb_get_poff((struct sk_buff *)(unsigned long) ctx); +} + +static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; + struct nlattr *nla; + + if (skb_is_nonlinear(skb)) + return 0; + + if (skb->len < sizeof(struct nlattr)) + return 0; + + if (a > skb->len - sizeof(struct nlattr)) + return 0; + + nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x); + if (nla) + return (void *) nla - (void *) skb->data; + + return 0; +} + +static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; + struct nlattr *nla; + + if (skb_is_nonlinear(skb)) + return 0; + + if (skb->len < sizeof(struct nlattr)) + return 0; + + if (a > skb->len - sizeof(struct nlattr)) + return 0; + + nla = (struct nlattr *) &skb->data[a]; + if (nla->nla_len > skb->len - a) + return 0; + + nla = nla_find_nested(nla, x); + if (nla) + return (void *) nla - (void *) skb->data; + + return 0; +} + +static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +{ + return raw_smp_processor_id(); +} + +/* note that this only generates 32-bit random numbers */ +static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +{ + return prandom_u32(); +} + +static bool 
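/* [Editor's note - illustrative, not part of the patch: pkt_type_offset()
 * above has to probe at run time because pkt_type is a C bitfield, so
 * offsetof() cannot name it.  The probe writes all-ones into the field of
 * an otherwise zeroed structure and scans for the marker byte; conceptually
 * (hypothetical struct and variable names):
 *
 *	struct s { unsigned long w; unsigned int f:3; };
 *	struct s probe = { .f = ~0 };
 *	unsigned char *p = (unsigned char *)&probe;
 *	size_t off;
 *
 *	for (off = 0; off < sizeof(probe); off++)
 *		if (p[off] == 7)
 *			break;
 *
 * with 7 standing in for PKT_TYPE_MAX, which becomes 7 << 5 on big-endian
 * bitfield layouts exactly as the #ifdef above handles.]
 */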
convert_bpf_extensions(struct sock_filter *fp, + struct sock_filter_int **insnp) +{ + struct sock_filter_int *insn = *insnp; + + switch (fp->k) { + case SKF_AD_OFF + SKF_AD_PROTOCOL: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); + + /* A = *(u16 *) (CTX + offsetof(protocol)) */ + *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, protocol)); + /* A = ntohs(A) [emitting a nop or swap16] */ + *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); + break; + + case SKF_AD_OFF + SKF_AD_PKTTYPE: + *insn = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, + pkt_type_offset()); + if (insn->off < 0) + return false; + insn++; + *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX); +#ifdef __BIG_ENDIAN_BITFIELD + insn++; + *insn = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 5); #endif + break; + + case SKF_AD_OFF + SKF_AD_IFINDEX: + case SKF_AD_OFF + SKF_AD_HATYPE: + BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); + BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); + BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0); + + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), + BPF_REG_TMP, BPF_REG_CTX, + offsetof(struct sk_buff, dev)); + /* if (tmp != 0) goto pc + 1 */ + *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1); + *insn++ = BPF_EXIT_INSN(); + if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) + *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP, + offsetof(struct net_device, ifindex)); + else + *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP, + offsetof(struct net_device, type)); + break; + + case SKF_AD_OFF + SKF_AD_MARK: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); + + *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, mark)); + break; + + case SKF_AD_OFF + SKF_AD_RXHASH: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); + + *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, hash)); + break; + + case SKF_AD_OFF + SKF_AD_QUEUE: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); + + *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, queue_mapping)); + break; + + case SKF_AD_OFF + SKF_AD_VLAN_TAG: + case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); + BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + + /* A = *(u16 *) (CTX + offsetof(vlan_tci)) */ + *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, vlan_tci)); + if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) { + *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, + ~VLAN_TAG_PRESENT); + } else { + /* A >>= 12 */ + *insn++ = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 12); + /* A &= 1 */ + *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 1); + } + break; + + case SKF_AD_OFF + SKF_AD_PAY_OFFSET: + case SKF_AD_OFF + SKF_AD_NLATTR: + case SKF_AD_OFF + SKF_AD_NLATTR_NEST: + case SKF_AD_OFF + SKF_AD_CPU: + case SKF_AD_OFF + SKF_AD_RANDOM: + /* arg1 = CTX */ + *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); + /* arg2 = A */ + *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A); + /* arg3 = X */ + *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X); + /* Emit call(arg1=CTX, arg2=A, arg3=X) */ + switch (fp->k) { + case SKF_AD_OFF + SKF_AD_PAY_OFFSET: + *insn = BPF_EMIT_CALL(__skb_get_pay_offset); + break; + case SKF_AD_OFF + SKF_AD_NLATTR: + *insn = BPF_EMIT_CALL(__skb_get_nlattr); + break; + case SKF_AD_OFF + SKF_AD_NLATTR_NEST: + *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest); + break; + case SKF_AD_OFF + SKF_AD_CPU: 
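			/* [Editor's note: together with the three argument
			 * moves emitted just above, each of these ancillary
			 * loads turns a single classic insn into "arg1 = CTX,
			 * arg2 = A, arg3 = X, call helper", using the internal
			 * calling convention described at JMP_CALL (arguments
			 * in R1-R5, return value in R0).]
			 */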
+ *insn = BPF_EMIT_CALL(__get_raw_cpu_id); + break; + case SKF_AD_OFF + SKF_AD_RANDOM: + *insn = BPF_EMIT_CALL(__get_random_u32); + break; + } + break; + + case SKF_AD_OFF + SKF_AD_ALU_XOR_X: + /* A ^= X */ + *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X); + break; + + default: + /* This is just a dummy call to avoid letting the compiler + * evict __bpf_call_base() as an optimization. Placed here + * where no-one bothers. + */ + BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0); + return false; + } + + *insnp = insn; + return true; +} + +/** + * sk_convert_filter - convert filter program + * @prog: the user passed filter program + * @len: the length of the user passed filter program + * @new_prog: buffer where converted program will be stored + * @new_len: pointer to store length of converted program + * + * Remap 'sock_filter' style BPF instruction set to 'sock_filter_ext' style. + * Conversion workflow: + * + * 1) First pass for calculating the new program length: + * sk_convert_filter(old_prog, old_len, NULL, &new_len) + * + * 2) 2nd pass to remap in two passes: 1st pass finds new + * jump offsets, 2nd pass remapping: + * new_prog = kmalloc(sizeof(struct sock_filter_int) * new_len); + * sk_convert_filter(old_prog, old_len, new_prog, &new_len); + * + * User BPF's register A is mapped to our BPF register 6, user BPF + * register X is mapped to BPF register 7; frame pointer is always + * register 10; Context 'void *ctx' is stored in register 1, that is, + * for socket filters: ctx == 'struct sk_buff *', for seccomp: + * ctx == 'struct seccomp_data *'. + */ +int sk_convert_filter(struct sock_filter *prog, int len, + struct sock_filter_int *new_prog, int *new_len) +{ + int new_flen = 0, pass = 0, target, i; + struct sock_filter_int *new_insn; + struct sock_filter *fp; + int *addrs = NULL; + u8 bpf_src; + + BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK); + BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); + + if (len <= 0 || len > BPF_MAXINSNS) + return -EINVAL; + + if (new_prog) { + addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL); + if (!addrs) + return -ENOMEM; + } + +do_pass: + new_insn = new_prog; + fp = prog; + + if (new_insn) + *new_insn = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); + new_insn++; + + for (i = 0; i < len; fp++, i++) { + struct sock_filter_int tmp_insns[6] = { }; + struct sock_filter_int *insn = tmp_insns; + + if (addrs) + addrs[i] = new_insn - new_prog; + + switch (fp->code) { + /* All arithmetic insns and skb loads map as-is. */ + case BPF_ALU | BPF_ADD | BPF_X: + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU | BPF_SUB | BPF_X: + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU | BPF_AND | BPF_X: + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU | BPF_OR | BPF_X: + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU | BPF_LSH | BPF_X: + case BPF_ALU | BPF_LSH | BPF_K: + case BPF_ALU | BPF_RSH | BPF_X: + case BPF_ALU | BPF_RSH | BPF_K: + case BPF_ALU | BPF_XOR | BPF_X: + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU | BPF_MUL | BPF_X: + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU | BPF_DIV | BPF_X: + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_MOD | BPF_X: + case BPF_ALU | BPF_MOD | BPF_K: + case BPF_ALU | BPF_NEG: + case BPF_LD | BPF_ABS | BPF_W: + case BPF_LD | BPF_ABS | BPF_H: + case BPF_LD | BPF_ABS | BPF_B: + case BPF_LD | BPF_IND | BPF_W: + case BPF_LD | BPF_IND | BPF_H: + case BPF_LD | BPF_IND | BPF_B: + /* Check for overloaded BPF extension and + * directly convert it if found, otherwise + * just move on with mapping. 
+ */ + if (BPF_CLASS(fp->code) == BPF_LD && + BPF_MODE(fp->code) == BPF_ABS && + convert_bpf_extensions(fp, &insn)) + break; + + *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); + break; + + /* Jump transformation cannot use BPF block macros + * everywhere as offset calculation and target updates + * require a bit more work than the rest, i.e. jump + * opcodes map as-is, but offsets need adjustment. + */ + +#define BPF_EMIT_JMP \ + do { \ + if (target >= len || target < 0) \ + goto err; \ + insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0; \ + /* Adjust pc relative offset for 2nd or 3rd insn. */ \ + insn->off -= insn - tmp_insns; \ + } while (0) + + case BPF_JMP | BPF_JA: + target = i + fp->k + 1; + insn->code = fp->code; + BPF_EMIT_JMP; + break; + + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) { + /* BPF immediates are signed, zero extend + * immediate into tmp register and use it + * in compare insn. + */ + *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k); + + insn->dst_reg = BPF_REG_A; + insn->src_reg = BPF_REG_TMP; + bpf_src = BPF_X; + } else { + insn->dst_reg = BPF_REG_A; + insn->src_reg = BPF_REG_X; + insn->imm = fp->k; + bpf_src = BPF_SRC(fp->code); + } + + /* Common case where 'jump_false' is next insn. */ + if (fp->jf == 0) { + insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; + target = i + fp->jt + 1; + BPF_EMIT_JMP; + break; + } + + /* Convert JEQ into JNE when 'jump_true' is next insn. */ + if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) { + insn->code = BPF_JMP | BPF_JNE | bpf_src; + target = i + fp->jf + 1; + BPF_EMIT_JMP; + break; + } + + /* Other jumps are mapped into two insns: Jxx and JA. */ + target = i + fp->jt + 1; + insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; + BPF_EMIT_JMP; + insn++; + + insn->code = BPF_JMP | BPF_JA; + target = i + fp->jf + 1; + BPF_EMIT_JMP; + break; + + /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ + case BPF_LDX | BPF_MSH | BPF_B: + /* tmp = A */ + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A); + /* A = BPF_R0 = *(u8 *) (skb->data + K) */ + *insn++ = BPF_LD_ABS(BPF_B, fp->k); + /* A &= 0xf */ + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); + /* A <<= 2 */ + *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); + /* X = A */ + *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); + /* A = tmp */ + *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); + break; + + /* RET_K, RET_A are remaped into 2 insns. */ + case BPF_RET | BPF_A: + case BPF_RET | BPF_K: + *insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ? + BPF_K : BPF_X, BPF_REG_0, + BPF_REG_A, fp->k); + *insn = BPF_EXIT_INSN(); + break; + + /* Store to stack. */ + case BPF_ST: + case BPF_STX: + *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == + BPF_ST ? BPF_REG_A : BPF_REG_X, + -(BPF_MEMWORDS - fp->k) * 4); + break; + + /* Load from stack. */ + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: + *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? + BPF_REG_A : BPF_REG_X, BPF_REG_FP, + -(BPF_MEMWORDS - fp->k) * 4); + break; + + /* A = K or X = K */ + case BPF_LD | BPF_IMM: + case BPF_LDX | BPF_IMM: + *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ? 
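				      /* [Editor's note: the jump remapping above keeps a
				       * conditional as one insn when its false branch falls
				       * through (jf == 0), turns JEQ into JNE when the true
				       * branch falls through (jt == 0), and only otherwise
				       * expands it into a Jxx to the jt target followed by a
				       * JA to the jf target, with pc-relative offsets taken
				       * from addrs[] via BPF_EMIT_JMP.]
				       */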
+ BPF_REG_A : BPF_REG_X, fp->k); + break; + + /* X = A */ + case BPF_MISC | BPF_TAX: + *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); + break; + + /* A = X */ + case BPF_MISC | BPF_TXA: + *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X); + break; + + /* A = skb->len or X = skb->len */ + case BPF_LD | BPF_W | BPF_LEN: + case BPF_LDX | BPF_W | BPF_LEN: + *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? + BPF_REG_A : BPF_REG_X, BPF_REG_CTX, + offsetof(struct sk_buff, len)); + break; + + /* Access seccomp_data fields. */ + case BPF_LDX | BPF_ABS | BPF_W: + /* A = *(u32 *) (ctx + K) */ + *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k); + break; + + /* Unkown instruction. */ default: - WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n", - fentry->code, fentry->jt, - fentry->jf, fentry->k); - return 0; + goto err; } + + insn++; + if (new_prog) + memcpy(new_insn, tmp_insns, + sizeof(*insn) * (insn - tmp_insns)); + new_insn += insn - tmp_insns; } + if (!new_prog) { + /* Only calculating new length. */ + *new_len = new_insn - new_prog; + return 0; + } + + pass++; + if (new_flen != new_insn - new_prog) { + new_flen = new_insn - new_prog; + if (pass > 2) + goto err; + goto do_pass; + } + + kfree(addrs); + BUG_ON(*new_len != new_flen); return 0; +err: + kfree(addrs); + return -EINVAL; } -EXPORT_SYMBOL(sk_run_filter); -/* - * Security : +/* Security: + * * A BPF program is able to use 16 cells of memory to store intermediate - * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()) + * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()). + * * As we dont want to clear mem[] array for each packet going through * sk_run_filter(), we check that filter loaded by user never try to read * a cell if not previously written, and we check all branches to be sure @@ -414,44 +1096,46 @@ EXPORT_SYMBOL(sk_run_filter); */ static int check_load_and_stores(struct sock_filter *filter, int flen) { - u16 *masks, memvalid = 0; /* one bit per cell, 16 cells */ + u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */ int pc, ret = 0; BUILD_BUG_ON(BPF_MEMWORDS > 16); - masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL); + + masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL); if (!masks) return -ENOMEM; + memset(masks, 0xff, flen * sizeof(*masks)); for (pc = 0; pc < flen; pc++) { memvalid &= masks[pc]; switch (filter[pc].code) { - case BPF_S_ST: - case BPF_S_STX: + case BPF_ST: + case BPF_STX: memvalid |= (1 << filter[pc].k); break; - case BPF_S_LD_MEM: - case BPF_S_LDX_MEM: + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: if (!(memvalid & (1 << filter[pc].k))) { ret = -EINVAL; goto error; } break; - case BPF_S_JMP_JA: - /* a jump must set masks on target */ + case BPF_JMP | BPF_JA: + /* A jump must set masks on target */ masks[pc + 1 + filter[pc].k] &= memvalid; memvalid = ~0; break; - case BPF_S_JMP_JEQ_K: - case BPF_S_JMP_JEQ_X: - case BPF_S_JMP_JGE_K: - case BPF_S_JMP_JGE_X: - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGT_X: - case BPF_S_JMP_JSET_X: - case BPF_S_JMP_JSET_K: - /* a jump must set masks on targets */ + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + /* A jump must set masks on targets */ masks[pc + 1 + filter[pc].jt] &= memvalid; masks[pc + 1 + filter[pc].jf] &= memvalid; memvalid = ~0; @@ -463,6 +1147,72 @@ error: return ret; } +static bool 
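/* [Editor's note - illustrative sketch, not part of the patch: the
 * check_load_and_stores() pass above tracks one validity bit per scratch
 * cell and intersects those bits across jump targets, e.g.
 *
 *	memvalid = 0;
 *	ST  M[3]	->  memvalid |= 1 << 3;
 *	LD  M[3]	->  accepted, bit 3 is set
 *	LDX M[5]	->  rejected, bit 5 was never written on this path
 *
 * so a classic filter can never read a scratch word it has not stored to
 * first, which is why the interpreter does not have to clear the scratch
 * memory for every packet.]
 */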
chk_code_allowed(u16 code_to_probe) +{ + static const bool codes[] = { + /* 32 bit ALU operations */ + [BPF_ALU | BPF_ADD | BPF_K] = true, + [BPF_ALU | BPF_ADD | BPF_X] = true, + [BPF_ALU | BPF_SUB | BPF_K] = true, + [BPF_ALU | BPF_SUB | BPF_X] = true, + [BPF_ALU | BPF_MUL | BPF_K] = true, + [BPF_ALU | BPF_MUL | BPF_X] = true, + [BPF_ALU | BPF_DIV | BPF_K] = true, + [BPF_ALU | BPF_DIV | BPF_X] = true, + [BPF_ALU | BPF_MOD | BPF_K] = true, + [BPF_ALU | BPF_MOD | BPF_X] = true, + [BPF_ALU | BPF_AND | BPF_K] = true, + [BPF_ALU | BPF_AND | BPF_X] = true, + [BPF_ALU | BPF_OR | BPF_K] = true, + [BPF_ALU | BPF_OR | BPF_X] = true, + [BPF_ALU | BPF_XOR | BPF_K] = true, + [BPF_ALU | BPF_XOR | BPF_X] = true, + [BPF_ALU | BPF_LSH | BPF_K] = true, + [BPF_ALU | BPF_LSH | BPF_X] = true, + [BPF_ALU | BPF_RSH | BPF_K] = true, + [BPF_ALU | BPF_RSH | BPF_X] = true, + [BPF_ALU | BPF_NEG] = true, + /* Load instructions */ + [BPF_LD | BPF_W | BPF_ABS] = true, + [BPF_LD | BPF_H | BPF_ABS] = true, + [BPF_LD | BPF_B | BPF_ABS] = true, + [BPF_LD | BPF_W | BPF_LEN] = true, + [BPF_LD | BPF_W | BPF_IND] = true, + [BPF_LD | BPF_H | BPF_IND] = true, + [BPF_LD | BPF_B | BPF_IND] = true, + [BPF_LD | BPF_IMM] = true, + [BPF_LD | BPF_MEM] = true, + [BPF_LDX | BPF_W | BPF_LEN] = true, + [BPF_LDX | BPF_B | BPF_MSH] = true, + [BPF_LDX | BPF_IMM] = true, + [BPF_LDX | BPF_MEM] = true, + /* Store instructions */ + [BPF_ST] = true, + [BPF_STX] = true, + /* Misc instructions */ + [BPF_MISC | BPF_TAX] = true, + [BPF_MISC | BPF_TXA] = true, + /* Return instructions */ + [BPF_RET | BPF_K] = true, + [BPF_RET | BPF_A] = true, + /* Jump instructions */ + [BPF_JMP | BPF_JA] = true, + [BPF_JMP | BPF_JEQ | BPF_K] = true, + [BPF_JMP | BPF_JEQ | BPF_X] = true, + [BPF_JMP | BPF_JGE | BPF_K] = true, + [BPF_JMP | BPF_JGE | BPF_X] = true, + [BPF_JMP | BPF_JGT | BPF_K] = true, + [BPF_JMP | BPF_JGT | BPF_X] = true, + [BPF_JMP | BPF_JSET | BPF_K] = true, + [BPF_JMP | BPF_JSET | BPF_X] = true, + }; + + if (code_to_probe >= ARRAY_SIZE(codes)) + return false; + + return codes[code_to_probe]; +} + /** * sk_chk_filter - verify socket filter code * @filter: filter to verify @@ -479,192 +1229,303 @@ error: */ int sk_chk_filter(struct sock_filter *filter, unsigned int flen) { - /* - * Valid instructions are initialized to non-0. - * Invalid instructions are initialized to 0. 
- */ - static const u8 codes[] = { - [BPF_ALU|BPF_ADD|BPF_K] = BPF_S_ALU_ADD_K, - [BPF_ALU|BPF_ADD|BPF_X] = BPF_S_ALU_ADD_X, - [BPF_ALU|BPF_SUB|BPF_K] = BPF_S_ALU_SUB_K, - [BPF_ALU|BPF_SUB|BPF_X] = BPF_S_ALU_SUB_X, - [BPF_ALU|BPF_MUL|BPF_K] = BPF_S_ALU_MUL_K, - [BPF_ALU|BPF_MUL|BPF_X] = BPF_S_ALU_MUL_X, - [BPF_ALU|BPF_DIV|BPF_X] = BPF_S_ALU_DIV_X, - [BPF_ALU|BPF_MOD|BPF_K] = BPF_S_ALU_MOD_K, - [BPF_ALU|BPF_MOD|BPF_X] = BPF_S_ALU_MOD_X, - [BPF_ALU|BPF_AND|BPF_K] = BPF_S_ALU_AND_K, - [BPF_ALU|BPF_AND|BPF_X] = BPF_S_ALU_AND_X, - [BPF_ALU|BPF_OR|BPF_K] = BPF_S_ALU_OR_K, - [BPF_ALU|BPF_OR|BPF_X] = BPF_S_ALU_OR_X, - [BPF_ALU|BPF_XOR|BPF_K] = BPF_S_ALU_XOR_K, - [BPF_ALU|BPF_XOR|BPF_X] = BPF_S_ALU_XOR_X, - [BPF_ALU|BPF_LSH|BPF_K] = BPF_S_ALU_LSH_K, - [BPF_ALU|BPF_LSH|BPF_X] = BPF_S_ALU_LSH_X, - [BPF_ALU|BPF_RSH|BPF_K] = BPF_S_ALU_RSH_K, - [BPF_ALU|BPF_RSH|BPF_X] = BPF_S_ALU_RSH_X, - [BPF_ALU|BPF_NEG] = BPF_S_ALU_NEG, - [BPF_LD|BPF_W|BPF_ABS] = BPF_S_LD_W_ABS, - [BPF_LD|BPF_H|BPF_ABS] = BPF_S_LD_H_ABS, - [BPF_LD|BPF_B|BPF_ABS] = BPF_S_LD_B_ABS, - [BPF_LD|BPF_W|BPF_LEN] = BPF_S_LD_W_LEN, - [BPF_LD|BPF_W|BPF_IND] = BPF_S_LD_W_IND, - [BPF_LD|BPF_H|BPF_IND] = BPF_S_LD_H_IND, - [BPF_LD|BPF_B|BPF_IND] = BPF_S_LD_B_IND, - [BPF_LD|BPF_IMM] = BPF_S_LD_IMM, - [BPF_LDX|BPF_W|BPF_LEN] = BPF_S_LDX_W_LEN, - [BPF_LDX|BPF_B|BPF_MSH] = BPF_S_LDX_B_MSH, - [BPF_LDX|BPF_IMM] = BPF_S_LDX_IMM, - [BPF_MISC|BPF_TAX] = BPF_S_MISC_TAX, - [BPF_MISC|BPF_TXA] = BPF_S_MISC_TXA, - [BPF_RET|BPF_K] = BPF_S_RET_K, - [BPF_RET|BPF_A] = BPF_S_RET_A, - [BPF_ALU|BPF_DIV|BPF_K] = BPF_S_ALU_DIV_K, - [BPF_LD|BPF_MEM] = BPF_S_LD_MEM, - [BPF_LDX|BPF_MEM] = BPF_S_LDX_MEM, - [BPF_ST] = BPF_S_ST, - [BPF_STX] = BPF_S_STX, - [BPF_JMP|BPF_JA] = BPF_S_JMP_JA, - [BPF_JMP|BPF_JEQ|BPF_K] = BPF_S_JMP_JEQ_K, - [BPF_JMP|BPF_JEQ|BPF_X] = BPF_S_JMP_JEQ_X, - [BPF_JMP|BPF_JGE|BPF_K] = BPF_S_JMP_JGE_K, - [BPF_JMP|BPF_JGE|BPF_X] = BPF_S_JMP_JGE_X, - [BPF_JMP|BPF_JGT|BPF_K] = BPF_S_JMP_JGT_K, - [BPF_JMP|BPF_JGT|BPF_X] = BPF_S_JMP_JGT_X, - [BPF_JMP|BPF_JSET|BPF_K] = BPF_S_JMP_JSET_K, - [BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X, - }; - int pc; bool anc_found; + int pc; if (flen == 0 || flen > BPF_MAXINSNS) return -EINVAL; - /* check the filter code now */ + /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { struct sock_filter *ftest = &filter[pc]; - u16 code = ftest->code; - if (code >= ARRAY_SIZE(codes)) - return -EINVAL; - code = codes[code]; - if (!code) + /* May we actually operate on this code? */ + if (!chk_code_allowed(ftest->code)) return -EINVAL; + /* Some instructions need special checks */ - switch (code) { - case BPF_S_ALU_DIV_K: - /* check for division by zero */ + switch (ftest->code) { + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_MOD | BPF_K: + /* Check for division by zero */ if (ftest->k == 0) return -EINVAL; - ftest->k = reciprocal_value(ftest->k); break; - case BPF_S_ALU_MOD_K: - /* check for division by zero */ - if (ftest->k == 0) - return -EINVAL; - break; - case BPF_S_LD_MEM: - case BPF_S_LDX_MEM: - case BPF_S_ST: - case BPF_S_STX: - /* check for invalid memory addresses */ + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: + case BPF_ST: + case BPF_STX: + /* Check for invalid memory addresses */ if (ftest->k >= BPF_MEMWORDS) return -EINVAL; break; - case BPF_S_JMP_JA: - /* - * Note, the large ftest->k might cause loops. + case BPF_JMP | BPF_JA: + /* Note, the large ftest->k might cause loops. * Compare this with conditional jumps below, * where offsets are limited. 
--ANK (981016) */ - if (ftest->k >= (unsigned int)(flen-pc-1)) + if (ftest->k >= (unsigned int)(flen - pc - 1)) return -EINVAL; break; - case BPF_S_JMP_JEQ_K: - case BPF_S_JMP_JEQ_X: - case BPF_S_JMP_JGE_K: - case BPF_S_JMP_JGE_X: - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGT_X: - case BPF_S_JMP_JSET_X: - case BPF_S_JMP_JSET_K: - /* for conditionals both must be safe */ + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + /* Both conditionals must be safe */ if (pc + ftest->jt + 1 >= flen || pc + ftest->jf + 1 >= flen) return -EINVAL; break; - case BPF_S_LD_W_ABS: - case BPF_S_LD_H_ABS: - case BPF_S_LD_B_ABS: + case BPF_LD | BPF_W | BPF_ABS: + case BPF_LD | BPF_H | BPF_ABS: + case BPF_LD | BPF_B | BPF_ABS: anc_found = false; -#define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \ - code = BPF_S_ANC_##CODE; \ - anc_found = true; \ - break - switch (ftest->k) { - ANCILLARY(PROTOCOL); - ANCILLARY(PKTTYPE); - ANCILLARY(IFINDEX); - ANCILLARY(NLATTR); - ANCILLARY(NLATTR_NEST); - ANCILLARY(MARK); - ANCILLARY(QUEUE); - ANCILLARY(HATYPE); - ANCILLARY(RXHASH); - ANCILLARY(CPU); - ANCILLARY(ALU_XOR_X); - ANCILLARY(VLAN_TAG); - ANCILLARY(VLAN_TAG_PRESENT); - ANCILLARY(PAY_OFFSET); - } - - /* ancillary operation unknown or unsupported */ + if (bpf_anc_helper(ftest) & BPF_ANC) + anc_found = true; + /* Ancillary operation unknown or unsupported */ if (anc_found == false && ftest->k >= SKF_AD_OFF) return -EINVAL; } - ftest->code = code; } - /* last instruction must be a RET code */ + /* Last instruction must be a RET code */ switch (filter[flen - 1].code) { - case BPF_S_RET_K: - case BPF_S_RET_A: + case BPF_RET | BPF_K: + case BPF_RET | BPF_A: return check_load_and_stores(filter, flen); } + return -EINVAL; } EXPORT_SYMBOL(sk_chk_filter); +static int sk_store_orig_filter(struct sk_filter *fp, + const struct sock_fprog *fprog) +{ + unsigned int fsize = sk_filter_proglen(fprog); + struct sock_fprog_kern *fkprog; + + fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL); + if (!fp->orig_prog) + return -ENOMEM; + + fkprog = fp->orig_prog; + fkprog->len = fprog->len; + fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL); + if (!fkprog->filter) { + kfree(fp->orig_prog); + return -ENOMEM; + } + + return 0; +} + +static void sk_release_orig_filter(struct sk_filter *fp) +{ + struct sock_fprog_kern *fprog = fp->orig_prog; + + if (fprog) { + kfree(fprog->filter); + kfree(fprog); + } +} + /** * sk_filter_release_rcu - Release a socket filter by rcu_head * @rcu: rcu_head that contains the sk_filter to free */ -void sk_filter_release_rcu(struct rcu_head *rcu) +static void sk_filter_release_rcu(struct rcu_head *rcu) { struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); + sk_release_orig_filter(fp); + sk_filter_free(fp); +} + +/** + * sk_filter_release - release a socket filter + * @fp: filter to remove + * + * Remove a filter from a socket and release its resources. 
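 *
 * [Editor's note: the release path below drops the last reference with
 * atomic_dec_and_test() and then frees through call_rcu(), so a reader
 * that fetched sk->sk_filter under rcu_read_lock() can keep executing
 * the program while it is being detached; sk_filter_uncharge() further
 * down pairs that release with returning the sk_omem_alloc charge.]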
+ */ +static void sk_filter_release(struct sk_filter *fp) +{ + if (atomic_dec_and_test(&fp->refcnt)) + call_rcu(&fp->rcu, sk_filter_release_rcu); +} + +void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) +{ + atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc); + sk_filter_release(fp); +} + +void sk_filter_charge(struct sock *sk, struct sk_filter *fp) +{ + atomic_inc(&fp->refcnt); + atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc); +} + +static struct sk_filter *__sk_migrate_realloc(struct sk_filter *fp, + struct sock *sk, + unsigned int len) +{ + struct sk_filter *fp_new; + + if (sk == NULL) + return krealloc(fp, len, GFP_KERNEL); + + fp_new = sock_kmalloc(sk, len, GFP_KERNEL); + if (fp_new) { + *fp_new = *fp; + /* As we're keeping orig_prog in fp_new along, + * we need to make sure we're not evicting it + * from the old fp. + */ + fp->orig_prog = NULL; + sk_filter_uncharge(sk, fp); + } + + return fp_new; +} + +static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, + struct sock *sk) +{ + struct sock_filter *old_prog; + struct sk_filter *old_fp; + int err, new_len, old_len = fp->len; + + /* We are free to overwrite insns et al right here as it + * won't be used at this point in time anymore internally + * after the migration to the internal BPF instruction + * representation. + */ + BUILD_BUG_ON(sizeof(struct sock_filter) != + sizeof(struct sock_filter_int)); + + /* Conversion cannot happen on overlapping memory areas, + * so we need to keep the user BPF around until the 2nd + * pass. At this time, the user BPF is stored in fp->insns. + */ + old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter), + GFP_KERNEL); + if (!old_prog) { + err = -ENOMEM; + goto out_err; + } + + /* 1st pass: calculate the new program length. */ + err = sk_convert_filter(old_prog, old_len, NULL, &new_len); + if (err) + goto out_err_free; + + /* Expand fp for appending the new filter representation. */ + old_fp = fp; + fp = __sk_migrate_realloc(old_fp, sk, sk_filter_size(new_len)); + if (!fp) { + /* The old_fp is still around in case we couldn't + * allocate new memory, so uncharge on that one. + */ + fp = old_fp; + err = -ENOMEM; + goto out_err_free; + } + + fp->len = new_len; + + /* 2nd pass: remap sock_filter insns into sock_filter_int insns. */ + err = sk_convert_filter(old_prog, old_len, fp->insnsi, &new_len); + if (err) + /* 2nd sk_convert_filter() can fail only if it fails + * to allocate memory, remapping must succeed. Note, + * that at this time old_fp has already been released + * by __sk_migrate_realloc(). + */ + goto out_err_free; + + sk_filter_select_runtime(fp); + + kfree(old_prog); + return fp; + +out_err_free: + kfree(old_prog); +out_err: + /* Rollback filter setup. 
*/ + if (sk != NULL) + sk_filter_uncharge(sk, fp); + else + kfree(fp); + return ERR_PTR(err); +} + +void __weak bpf_int_jit_compile(struct sk_filter *prog) +{ +} + +/** + * sk_filter_select_runtime - select execution runtime for BPF program + * @fp: sk_filter populated with internal BPF program + * + * try to JIT internal BPF program, if JIT is not available select interpreter + * BPF program will be executed via SK_RUN_FILTER() macro + */ +void sk_filter_select_runtime(struct sk_filter *fp) +{ + fp->bpf_func = (void *) __sk_run_filter; + + /* Probe if internal BPF can be JITed */ + bpf_int_jit_compile(fp); +} +EXPORT_SYMBOL_GPL(sk_filter_select_runtime); + +/* free internal BPF program */ +void sk_filter_free(struct sk_filter *fp) +{ bpf_jit_free(fp); } -EXPORT_SYMBOL(sk_filter_release_rcu); +EXPORT_SYMBOL_GPL(sk_filter_free); -static int __sk_prepare_filter(struct sk_filter *fp) +static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, + struct sock *sk) { int err; - fp->bpf_func = sk_run_filter; + fp->bpf_func = NULL; + fp->jited = 0; err = sk_chk_filter(fp->insns, fp->len); - if (err) - return err; + if (err) { + if (sk != NULL) + sk_filter_uncharge(sk, fp); + else + kfree(fp); + return ERR_PTR(err); + } + /* Probe if we can JIT compile the filter and if so, do + * the compilation of the filter. + */ bpf_jit_compile(fp); - return 0; + + /* JIT compiler couldn't process this filter, so do the + * internal BPF translation for the optimized interpreter. + */ + if (!fp->jited) + fp = __sk_migrate_filter(fp, sk); + + return fp; } /** * sk_unattached_filter_create - create an unattached filter - * @fprog: the filter program * @pfp: the unattached filter that is created + * @fprog: the filter program * * Create a filter independent of any socket. We first run some * sanity checks on it to make sure it does not explode on us later. @@ -672,11 +1533,10 @@ static int __sk_prepare_filter(struct sk_filter *fp) * a negative errno code is returned. On success the return is zero. */ int sk_unattached_filter_create(struct sk_filter **pfp, - struct sock_fprog *fprog) + struct sock_fprog_kern *fprog) { + unsigned int fsize = sk_filter_proglen(fprog); struct sk_filter *fp; - unsigned int fsize = sizeof(struct sock_filter) * fprog->len; - int err; /* Make sure new filter is there and in the right amounts. */ if (fprog->filter == NULL) @@ -685,20 +1545,26 @@ int sk_unattached_filter_create(struct sk_filter **pfp, fp = kmalloc(sk_filter_size(fprog->len), GFP_KERNEL); if (!fp) return -ENOMEM; + memcpy(fp->insns, fprog->filter, fsize); atomic_set(&fp->refcnt, 1); fp->len = fprog->len; + /* Since unattached filters are not copied back to user + * space through sk_get_filter(), we do not need to hold + * a copy here, and can spare us the work. + */ + fp->orig_prog = NULL; - err = __sk_prepare_filter(fp); - if (err) - goto free_mem; + /* __sk_prepare_filter() already takes care of uncharging + * memory in case something goes wrong. 
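	 *
	 * [Editor's note - illustrative usage only, caller-side names are
	 * hypothetical: an in-kernel user of this API typically does
	 *
	 *	struct sock_fprog_kern fprog = { .len = len, .filter = insns };
	 *	struct sk_filter *fp;
	 *	int err = sk_unattached_filter_create(&fp, &fprog);
	 *
	 *	if (err)
	 *		return err;
	 *	... SK_RUN_FILTER(fp, skb) ...
	 *	sk_unattached_filter_destroy(fp);
	 *
	 * matching the sock_fprog_kern signature this patch switches to.]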
+ */ + fp = __sk_prepare_filter(fp, NULL); + if (IS_ERR(fp)) + return PTR_ERR(fp); *pfp = fp; return 0; -free_mem: - kfree(fp); - return err; } EXPORT_SYMBOL_GPL(sk_unattached_filter_create); @@ -721,7 +1587,7 @@ EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct sk_filter *fp, *old_fp; - unsigned int fsize = sizeof(struct sock_filter) * fprog->len; + unsigned int fsize = sk_filter_proglen(fprog); unsigned int sk_fsize = sk_filter_size(fprog->len); int err; @@ -735,6 +1601,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) fp = sock_kmalloc(sk, sk_fsize, GFP_KERNEL); if (!fp) return -ENOMEM; + if (copy_from_user(fp->insns, fprog->filter, fsize)) { sock_kfree_s(sk, fp, sk_fsize); return -EFAULT; @@ -743,18 +1610,26 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) atomic_set(&fp->refcnt, 1); fp->len = fprog->len; - err = __sk_prepare_filter(fp); + err = sk_store_orig_filter(fp, fprog); if (err) { sk_filter_uncharge(sk, fp); - return err; + return -ENOMEM; } + /* __sk_prepare_filter() already takes care of uncharging + * memory in case something goes wrong. + */ + fp = __sk_prepare_filter(fp, sk); + if (IS_ERR(fp)) + return PTR_ERR(fp); + old_fp = rcu_dereference_protected(sk->sk_filter, sock_owned_by_user(sk)); rcu_assign_pointer(sk->sk_filter, fp); if (old_fp) sk_filter_uncharge(sk, old_fp); + return 0; } EXPORT_SYMBOL_GPL(sk_attach_filter); @@ -774,136 +1649,46 @@ int sk_detach_filter(struct sock *sk) sk_filter_uncharge(sk, filter); ret = 0; } + return ret; } EXPORT_SYMBOL_GPL(sk_detach_filter); -void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to) -{ - static const u16 decodes[] = { - [BPF_S_ALU_ADD_K] = BPF_ALU|BPF_ADD|BPF_K, - [BPF_S_ALU_ADD_X] = BPF_ALU|BPF_ADD|BPF_X, - [BPF_S_ALU_SUB_K] = BPF_ALU|BPF_SUB|BPF_K, - [BPF_S_ALU_SUB_X] = BPF_ALU|BPF_SUB|BPF_X, - [BPF_S_ALU_MUL_K] = BPF_ALU|BPF_MUL|BPF_K, - [BPF_S_ALU_MUL_X] = BPF_ALU|BPF_MUL|BPF_X, - [BPF_S_ALU_DIV_X] = BPF_ALU|BPF_DIV|BPF_X, - [BPF_S_ALU_MOD_K] = BPF_ALU|BPF_MOD|BPF_K, - [BPF_S_ALU_MOD_X] = BPF_ALU|BPF_MOD|BPF_X, - [BPF_S_ALU_AND_K] = BPF_ALU|BPF_AND|BPF_K, - [BPF_S_ALU_AND_X] = BPF_ALU|BPF_AND|BPF_X, - [BPF_S_ALU_OR_K] = BPF_ALU|BPF_OR|BPF_K, - [BPF_S_ALU_OR_X] = BPF_ALU|BPF_OR|BPF_X, - [BPF_S_ALU_XOR_K] = BPF_ALU|BPF_XOR|BPF_K, - [BPF_S_ALU_XOR_X] = BPF_ALU|BPF_XOR|BPF_X, - [BPF_S_ALU_LSH_K] = BPF_ALU|BPF_LSH|BPF_K, - [BPF_S_ALU_LSH_X] = BPF_ALU|BPF_LSH|BPF_X, - [BPF_S_ALU_RSH_K] = BPF_ALU|BPF_RSH|BPF_K, - [BPF_S_ALU_RSH_X] = BPF_ALU|BPF_RSH|BPF_X, - [BPF_S_ALU_NEG] = BPF_ALU|BPF_NEG, - [BPF_S_LD_W_ABS] = BPF_LD|BPF_W|BPF_ABS, - [BPF_S_LD_H_ABS] = BPF_LD|BPF_H|BPF_ABS, - [BPF_S_LD_B_ABS] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_PROTOCOL] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_PKTTYPE] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_IFINDEX] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_NLATTR] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_NLATTR_NEST] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_MARK] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_QUEUE] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_HATYPE] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_RXHASH] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_CPU] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_ALU_XOR_X] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_SECCOMP_LD_W] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_VLAN_TAG] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_PAY_OFFSET] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_LD_W_LEN] = BPF_LD|BPF_W|BPF_LEN, - [BPF_S_LD_W_IND] = BPF_LD|BPF_W|BPF_IND, - 
[BPF_S_LD_H_IND] = BPF_LD|BPF_H|BPF_IND, - [BPF_S_LD_B_IND] = BPF_LD|BPF_B|BPF_IND, - [BPF_S_LD_IMM] = BPF_LD|BPF_IMM, - [BPF_S_LDX_W_LEN] = BPF_LDX|BPF_W|BPF_LEN, - [BPF_S_LDX_B_MSH] = BPF_LDX|BPF_B|BPF_MSH, - [BPF_S_LDX_IMM] = BPF_LDX|BPF_IMM, - [BPF_S_MISC_TAX] = BPF_MISC|BPF_TAX, - [BPF_S_MISC_TXA] = BPF_MISC|BPF_TXA, - [BPF_S_RET_K] = BPF_RET|BPF_K, - [BPF_S_RET_A] = BPF_RET|BPF_A, - [BPF_S_ALU_DIV_K] = BPF_ALU|BPF_DIV|BPF_K, - [BPF_S_LD_MEM] = BPF_LD|BPF_MEM, - [BPF_S_LDX_MEM] = BPF_LDX|BPF_MEM, - [BPF_S_ST] = BPF_ST, - [BPF_S_STX] = BPF_STX, - [BPF_S_JMP_JA] = BPF_JMP|BPF_JA, - [BPF_S_JMP_JEQ_K] = BPF_JMP|BPF_JEQ|BPF_K, - [BPF_S_JMP_JEQ_X] = BPF_JMP|BPF_JEQ|BPF_X, - [BPF_S_JMP_JGE_K] = BPF_JMP|BPF_JGE|BPF_K, - [BPF_S_JMP_JGE_X] = BPF_JMP|BPF_JGE|BPF_X, - [BPF_S_JMP_JGT_K] = BPF_JMP|BPF_JGT|BPF_K, - [BPF_S_JMP_JGT_X] = BPF_JMP|BPF_JGT|BPF_X, - [BPF_S_JMP_JSET_K] = BPF_JMP|BPF_JSET|BPF_K, - [BPF_S_JMP_JSET_X] = BPF_JMP|BPF_JSET|BPF_X, - }; - u16 code; - - code = filt->code; - - to->code = decodes[code]; - to->jt = filt->jt; - to->jf = filt->jf; - - if (code == BPF_S_ALU_DIV_K) { - /* - * When loaded this rule user gave us X, which was - * translated into R = r(X). Now we calculate the - * RR = r(R) and report it back. If next time this - * value is loaded and RRR = r(RR) is calculated - * then the R == RRR will be true. - * - * One exception. X == 1 translates into R == 0 and - * we can't calculate RR out of it with r(). - */ - - if (filt->k == 0) - to->k = 1; - else - to->k = reciprocal_value(filt->k); - - BUG_ON(reciprocal_value(to->k) != filt->k); - } else - to->k = filt->k; -} - -int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int len) +int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, + unsigned int len) { + struct sock_fprog_kern *fprog; struct sk_filter *filter; - int i, ret; + int ret = 0; lock_sock(sk); filter = rcu_dereference_protected(sk->sk_filter, - sock_owned_by_user(sk)); - ret = 0; + sock_owned_by_user(sk)); if (!filter) goto out; - ret = filter->len; + + /* We're copying the filter that has been originally attached, + * so no conversion/decode needed anymore. + */ + fprog = filter->orig_prog; + + ret = fprog->len; if (!len) + /* User space only enquires number of filter blocks. */ goto out; + ret = -EINVAL; - if (len < filter->len) + if (len < fprog->len) goto out; ret = -EFAULT; - for (i = 0; i < filter->len; i++) { - struct sock_filter fb; - - sk_decode_filter(&filter->insns[i], &fb); - if (copy_to_user(&ubuf[i], &fb, sizeof(fb))) - goto out; - } + if (copy_to_user(ubuf, fprog->filter, sk_filter_proglen(fprog))) + goto out; - ret = filter->len; + /* Instead of bytes, the API requests to return the number + * of filter blocks. 
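	 *
	 * [Editor's note: because the classic program is now kept verbatim
	 * in filter->orig_prog, SO_GET_FILTER can copy it straight back to
	 * user space; the old sk_decode_filter() path removed above, with
	 * its special case for undoing the reciprocal_value() rewrite of
	 * constant divisors, is no longer needed.]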
+ */ + ret = fprog->len; out: release_sock(sk); return ret; diff --git a/net/core/flow.c b/net/core/flow.c index dfa602ceb8c..a0348fde1fd 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -24,6 +24,7 @@ #include <net/flow.h> #include <linux/atomic.h> #include <linux/security.h> +#include <net/net_namespace.h> struct flow_cache_entry { union { @@ -38,37 +39,14 @@ struct flow_cache_entry { struct flow_cache_object *object; }; -struct flow_cache_percpu { - struct hlist_head *hash_table; - int hash_count; - u32 hash_rnd; - int hash_rnd_recalc; - struct tasklet_struct flush_tasklet; -}; - struct flow_flush_info { struct flow_cache *cache; atomic_t cpuleft; struct completion completion; }; -struct flow_cache { - u32 hash_shift; - struct flow_cache_percpu __percpu *percpu; - struct notifier_block hotcpu_notifier; - int low_watermark; - int high_watermark; - struct timer_list rnd_timer; -}; - -atomic_t flow_cache_genid = ATOMIC_INIT(0); -EXPORT_SYMBOL(flow_cache_genid); -static struct flow_cache flow_cache_global; static struct kmem_cache *flow_cachep __read_mostly; -static DEFINE_SPINLOCK(flow_cache_gc_lock); -static LIST_HEAD(flow_cache_gc_list); - #define flow_cache_hash_size(cache) (1 << (cache)->hash_shift) #define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) @@ -84,16 +62,18 @@ static void flow_cache_new_hashrnd(unsigned long arg) add_timer(&fc->rnd_timer); } -static int flow_entry_valid(struct flow_cache_entry *fle) +static int flow_entry_valid(struct flow_cache_entry *fle, + struct netns_xfrm *xfrm) { - if (atomic_read(&flow_cache_genid) != fle->genid) + if (atomic_read(&xfrm->flow_cache_genid) != fle->genid) return 0; if (fle->object && !fle->object->ops->check(fle->object)) return 0; return 1; } -static void flow_entry_kill(struct flow_cache_entry *fle) +static void flow_entry_kill(struct flow_cache_entry *fle, + struct netns_xfrm *xfrm) { if (fle->object) fle->object->ops->delete(fle->object); @@ -104,26 +84,28 @@ static void flow_cache_gc_task(struct work_struct *work) { struct list_head gc_list; struct flow_cache_entry *fce, *n; + struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm, + flow_cache_gc_work); INIT_LIST_HEAD(&gc_list); - spin_lock_bh(&flow_cache_gc_lock); - list_splice_tail_init(&flow_cache_gc_list, &gc_list); - spin_unlock_bh(&flow_cache_gc_lock); + spin_lock_bh(&xfrm->flow_cache_gc_lock); + list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list); + spin_unlock_bh(&xfrm->flow_cache_gc_lock); list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) - flow_entry_kill(fce); + flow_entry_kill(fce, xfrm); } -static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task); static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, - int deleted, struct list_head *gc_list) + int deleted, struct list_head *gc_list, + struct netns_xfrm *xfrm) { if (deleted) { fcp->hash_count -= deleted; - spin_lock_bh(&flow_cache_gc_lock); - list_splice_tail(gc_list, &flow_cache_gc_list); - spin_unlock_bh(&flow_cache_gc_lock); - schedule_work(&flow_cache_gc_work); + spin_lock_bh(&xfrm->flow_cache_gc_lock); + list_splice_tail(gc_list, &xfrm->flow_cache_gc_list); + spin_unlock_bh(&xfrm->flow_cache_gc_lock); + schedule_work(&xfrm->flow_cache_gc_work); } } @@ -135,6 +117,8 @@ static void __flow_cache_shrink(struct flow_cache *fc, struct hlist_node *tmp; LIST_HEAD(gc_list); int i, deleted = 0; + struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm, + flow_cache_global); for (i = 0; i < flow_cache_hash_size(fc); i++) { int saved = 0; @@ -142,7 +126,7 @@ static void 
__flow_cache_shrink(struct flow_cache *fc, hlist_for_each_entry_safe(fle, tmp, &fcp->hash_table[i], u.hlist) { if (saved < shrink_to && - flow_entry_valid(fle)) { + flow_entry_valid(fle, xfrm)) { saved++; } else { deleted++; @@ -152,7 +136,7 @@ static void __flow_cache_shrink(struct flow_cache *fc, } } - flow_cache_queue_garbage(fcp, deleted, &gc_list); + flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm); } static void flow_cache_shrink(struct flow_cache *fc, @@ -208,7 +192,7 @@ struct flow_cache_object * flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, flow_resolve_t resolver, void *ctx) { - struct flow_cache *fc = &flow_cache_global; + struct flow_cache *fc = &net->xfrm.flow_cache_global; struct flow_cache_percpu *fcp; struct flow_cache_entry *fle, *tfle; struct flow_cache_object *flo; @@ -258,7 +242,7 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]); fcp->hash_count++; } - } else if (likely(fle->genid == atomic_read(&flow_cache_genid))) { + } else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) { flo = fle->object; if (!flo) goto ret_object; @@ -279,7 +263,7 @@ nocache: } flo = resolver(net, key, family, dir, flo, ctx); if (fle) { - fle->genid = atomic_read(&flow_cache_genid); + fle->genid = atomic_read(&net->xfrm.flow_cache_genid); if (!IS_ERR(flo)) fle->object = flo; else @@ -303,12 +287,14 @@ static void flow_cache_flush_tasklet(unsigned long data) struct hlist_node *tmp; LIST_HEAD(gc_list); int i, deleted = 0; + struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm, + flow_cache_global); fcp = this_cpu_ptr(fc->percpu); for (i = 0; i < flow_cache_hash_size(fc); i++) { hlist_for_each_entry_safe(fle, tmp, &fcp->hash_table[i], u.hlist) { - if (flow_entry_valid(fle)) + if (flow_entry_valid(fle, xfrm)) continue; deleted++; @@ -317,7 +303,7 @@ static void flow_cache_flush_tasklet(unsigned long data) } } - flow_cache_queue_garbage(fcp, deleted, &gc_list); + flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm); if (atomic_dec_and_test(&info->cpuleft)) complete(&info->completion); @@ -351,10 +337,9 @@ static void flow_cache_flush_per_cpu(void *data) tasklet_schedule(tasklet); } -void flow_cache_flush(void) +void flow_cache_flush(struct net *net) { struct flow_flush_info info; - static DEFINE_MUTEX(flow_flush_sem); cpumask_var_t mask; int i, self; @@ -365,8 +350,8 @@ void flow_cache_flush(void) /* Don't want cpus going down or up during this. 
*/ get_online_cpus(); - mutex_lock(&flow_flush_sem); - info.cache = &flow_cache_global; + mutex_lock(&net->xfrm.flow_flush_sem); + info.cache = &net->xfrm.flow_cache_global; for_each_online_cpu(i) if (!flow_cache_percpu_empty(info.cache, i)) cpumask_set_cpu(i, mask); @@ -386,21 +371,23 @@ void flow_cache_flush(void) wait_for_completion(&info.completion); done: - mutex_unlock(&flow_flush_sem); + mutex_unlock(&net->xfrm.flow_flush_sem); put_online_cpus(); free_cpumask_var(mask); } static void flow_cache_flush_task(struct work_struct *work) { - flow_cache_flush(); -} + struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm, + flow_cache_gc_work); + struct net *net = container_of(xfrm, struct net, xfrm); -static DECLARE_WORK(flow_cache_flush_work, flow_cache_flush_task); + flow_cache_flush(net); +} -void flow_cache_flush_deferred(void) +void flow_cache_flush_deferred(struct net *net) { - schedule_work(&flow_cache_flush_work); + schedule_work(&net->xfrm.flow_cache_flush_work); } static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu) @@ -425,7 +412,8 @@ static int flow_cache_cpu(struct notifier_block *nfb, unsigned long action, void *hcpu) { - struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier); + struct flow_cache *fc = container_of(nfb, struct flow_cache, + hotcpu_notifier); int res, cpu = (unsigned long) hcpu; struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); @@ -444,9 +432,20 @@ static int flow_cache_cpu(struct notifier_block *nfb, return NOTIFY_OK; } -static int __init flow_cache_init(struct flow_cache *fc) +int flow_cache_init(struct net *net) { int i; + struct flow_cache *fc = &net->xfrm.flow_cache_global; + + if (!flow_cachep) + flow_cachep = kmem_cache_create("flow_cache", + sizeof(struct flow_cache_entry), + 0, SLAB_PANIC, NULL); + spin_lock_init(&net->xfrm.flow_cache_gc_lock); + INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list); + INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task); + INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task); + mutex_init(&net->xfrm.flow_flush_sem); fc->hash_shift = 10; fc->low_watermark = 2 * flow_cache_hash_size(fc); @@ -456,6 +455,8 @@ static int __init flow_cache_init(struct flow_cache *fc) if (!fc->percpu) return -ENOMEM; + cpu_notifier_register_begin(); + for_each_online_cpu(i) { if (flow_cache_cpu_prepare(fc, i)) goto err; @@ -463,7 +464,9 @@ static int __init flow_cache_init(struct flow_cache *fc) fc->hotcpu_notifier = (struct notifier_block){ .notifier_call = flow_cache_cpu, }; - register_hotcpu_notifier(&fc->hotcpu_notifier); + __register_hotcpu_notifier(&fc->hotcpu_notifier); + + cpu_notifier_register_done(); setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, (unsigned long) fc); @@ -479,19 +482,30 @@ err: fcp->hash_table = NULL; } + cpu_notifier_register_done(); + free_percpu(fc->percpu); fc->percpu = NULL; return -ENOMEM; } +EXPORT_SYMBOL(flow_cache_init); -static int __init flow_cache_init_global(void) +void flow_cache_fini(struct net *net) { - flow_cachep = kmem_cache_create("flow_cache", - sizeof(struct flow_cache_entry), - 0, SLAB_PANIC, NULL); + int i; + struct flow_cache *fc = &net->xfrm.flow_cache_global; - return flow_cache_init(&flow_cache_global); -} + del_timer_sync(&fc->rnd_timer); + unregister_hotcpu_notifier(&fc->hotcpu_notifier); + + for_each_possible_cpu(i) { + struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i); + kfree(fcp->hash_table); + fcp->hash_table = NULL; + } -module_init(flow_cache_init_global); + free_percpu(fc->percpu); + fc->percpu 
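	/* [Editor's note: flow_cache_init() above and flow_cache_fini() here
	 * replace the old boot-time global flow cache - the hash tables, GC
	 * list, work items and flush mutex now live in struct netns_xfrm, so
	 * every network namespace owns and tears down its own xfrm flow
	 * cache via net->xfrm.flow_cache_global.]
	 */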
= NULL; +} +EXPORT_SYMBOL(flow_cache_fini); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index d6ef1732250..107ed12a532 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -61,7 +61,7 @@ bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) again: switch (proto) { - case __constant_htons(ETH_P_IP): { + case htons(ETH_P_IP): { const struct iphdr *iph; struct iphdr _iph; ip: @@ -77,7 +77,7 @@ ip: iph_to_flow_copy_addrs(flow, iph); break; } - case __constant_htons(ETH_P_IPV6): { + case htons(ETH_P_IPV6): { const struct ipv6hdr *iph; struct ipv6hdr _iph; ipv6: @@ -91,8 +91,8 @@ ipv6: nhoff += sizeof(struct ipv6hdr); break; } - case __constant_htons(ETH_P_8021AD): - case __constant_htons(ETH_P_8021Q): { + case htons(ETH_P_8021AD): + case htons(ETH_P_8021Q): { const struct vlan_hdr *vlan; struct vlan_hdr _vlan; @@ -104,7 +104,7 @@ ipv6: nhoff += sizeof(*vlan); goto again; } - case __constant_htons(ETH_P_PPP_SES): { + case htons(ETH_P_PPP_SES): { struct { struct pppoe_hdr hdr; __be16 proto; @@ -115,9 +115,9 @@ ipv6: proto = hdr->proto; nhoff += PPPOE_SES_HLEN; switch (proto) { - case __constant_htons(PPP_IP): + case htons(PPP_IP): goto ip; - case __constant_htons(PPP_IPV6): + case htons(PPP_IPV6): goto ipv6; default: return false; @@ -202,12 +202,12 @@ static __always_inline u32 __flow_hash_1word(u32 a) } /* - * __skb_get_rxhash: calculate a flow hash based on src/dst addresses - * and src/dst port numbers. Sets rxhash in skb to non-zero hash value - * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb + * __skb_get_hash: calculate a flow hash based on src/dst addresses + * and src/dst port numbers. Sets hash in skb to non-zero hash value + * on success, zero indicates no valid hash. Also, sets l4_hash in skb * if hash is a canonical 4-tuple hash over transport ports. 
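 *
 * [Editor's note: the body below first puts the flow into a canonical
 * order - the address and port pairs are swapped when the destination
 * side compares lower - so both directions of a connection produce the
 * same hash, and a result of zero is bumped to 1 because zero is the
 * "no valid hash" marker.]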
*/ -void __skb_get_rxhash(struct sk_buff *skb) +void __skb_get_hash(struct sk_buff *skb) { struct flow_keys keys; u32 hash; @@ -216,7 +216,7 @@ void __skb_get_rxhash(struct sk_buff *skb) return; if (keys.ports) - skb->l4_rxhash = 1; + skb->l4_hash = 1; /* get a consistent hash (same value on both flow directions) */ if (((__force u32)keys.dst < (__force u32)keys.src) || @@ -232,9 +232,9 @@ void __skb_get_rxhash(struct sk_buff *skb) if (!hash) hash = 1; - skb->rxhash = hash; + skb->hash = hash; } -EXPORT_SYMBOL(__skb_get_rxhash); +EXPORT_SYMBOL(__skb_get_hash); /* * Returns a Tx hash based on the given packet descriptor a Tx queues' number @@ -323,17 +323,6 @@ u32 __skb_get_poff(const struct sk_buff *skb) return poff; } -static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) -{ - if (unlikely(queue_index >= dev->real_num_tx_queues)) { - net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n", - dev->name, queue_index, - dev->real_num_tx_queues); - return 0; - } - return queue_index; -} - static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_XPS @@ -355,7 +344,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) hash = skb->sk->sk_hash; else hash = (__force u16) skb->protocol ^ - skb->rxhash; + skb->hash; hash = __flow_hash_1word(hash); queue_index = map->queues[ ((u64)hash * map->len) >> 32]; @@ -372,7 +361,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) #endif } -u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) +static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) { struct sock *sk = skb->sk; int queue_index = sk_tx_queue_get(sk); @@ -392,20 +381,23 @@ u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) return queue_index; } -EXPORT_SYMBOL(__netdev_pick_tx); struct netdev_queue *netdev_pick_tx(struct net_device *dev, - struct sk_buff *skb) + struct sk_buff *skb, + void *accel_priv) { int queue_index = 0; if (dev->real_num_tx_queues != 1) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_select_queue) - queue_index = ops->ndo_select_queue(dev, skb); + queue_index = ops->ndo_select_queue(dev, skb, accel_priv, + __netdev_pick_tx); else queue_index = __netdev_pick_tx(dev, skb); - queue_index = dev_cap_txqueue(dev, queue_index); + + if (!accel_priv) + queue_index = netdev_cap_txqueue(dev, queue_index); } skb_set_queue_mapping(skb, queue_index); diff --git a/net/core/iovec.c b/net/core/iovec.c index b61869429f4..e1ec45ab1e6 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -39,7 +39,7 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a { int size, ct, err; - if (m->msg_namelen) { + if (m->msg_name && m->msg_namelen) { if (mode == VERIFY_READ) { void __user *namep; namep = (void __user __force *) m->msg_name; @@ -48,10 +48,10 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a if (err < 0) return err; } - if (m->msg_name) - m->msg_name = address; + m->msg_name = address; } else { m->msg_name = NULL; + m->msg_namelen = 0; } size = m->msg_iovlen * sizeof(struct iovec); @@ -75,61 +75,6 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a } /* - * Copy kernel to iovec. Returns -EFAULT on error. 
- */ - -int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, - int offset, int len) -{ - int copy; - for (; len > 0; ++iov) { - /* Skip over the finished iovecs */ - if (unlikely(offset >= iov->iov_len)) { - offset -= iov->iov_len; - continue; - } - copy = min_t(unsigned int, iov->iov_len - offset, len); - if (copy_to_user(iov->iov_base + offset, kdata, copy)) - return -EFAULT; - offset = 0; - kdata += copy; - len -= copy; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_toiovecend); - -/* - * Copy iovec to kernel. Returns -EFAULT on error. - */ - -int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, - int offset, int len) -{ - /* Skip over the finished iovecs */ - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - } - - while (len > 0) { - u8 __user *base = iov->iov_base + offset; - int copy = min_t(unsigned int, len, iov->iov_len - offset); - - offset = 0; - if (copy_from_user(kdata, base, copy)) - return -EFAULT; - len -= copy; - kdata += copy; - iov++; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_fromiovecend); - -/* * And now for the all-in-one: copy and checksum from a user iovec * directly to a datagram * Calls to csum_partial but the last must be in 32 bit chunks diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 9c3a839322b..bd0767e6b2b 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -147,7 +147,7 @@ static void linkwatch_do_dev(struct net_device *dev) * Make sure the above read is complete since it can be * rewritten as soon as we clear the bit below. */ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); /* We are about to handle this device, * so new events can be accepted diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ce2b77515a9..ef31fef25e5 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -117,7 +117,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh) unsigned long neigh_rand_reach_time(unsigned long base) { - return base ? (net_random() % base) + (base >> 1) : 0; + return base ? (prandom_u32() % base) + (base >> 1) : 0; } EXPORT_SYMBOL(neigh_rand_reach_time); @@ -766,9 +766,6 @@ static void neigh_periodic_work(struct work_struct *work) nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); - if (atomic_read(&tbl->entries) < tbl->gc_thresh1) - goto out; - /* * periodically recompute ReachableTime from random function */ @@ -781,6 +778,9 @@ static void neigh_periodic_work(struct work_struct *work) neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); } + if (atomic_read(&tbl->entries) < tbl->gc_thresh1) + goto out; + for (i = 0 ; i < (1 << nht->hash_shift); i++) { np = &nht->hash_buckets[i]; @@ -828,7 +828,7 @@ out: * ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2 * BASE_REACHABLE_TIME. */ - schedule_delayed_work(&tbl->gc_work, + queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1); write_unlock_bh(&tbl->lock); } @@ -836,10 +836,10 @@ out: static __inline__ int neigh_max_probes(struct neighbour *n) { struct neigh_parms *p = n->parms; - return (n->nud_state & NUD_PROBE) ? 
- NEIGH_VAR(p, UCAST_PROBES) : - NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES) + - NEIGH_VAR(p, MCAST_PROBES); + int max_probes = NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES); + if (!(n->nud_state & NUD_PROBE)) + max_probes += NEIGH_VAR(p, MCAST_PROBES); + return max_probes; } static void neigh_invalidate(struct neighbour *neigh) @@ -945,6 +945,7 @@ static void neigh_timer_handler(unsigned long arg) neigh->nud_state = NUD_FAILED; notify = 1; neigh_invalidate(neigh); + goto out; } if (neigh->nud_state & NUD_IN_TIMER) { @@ -1169,6 +1170,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, neigh->parms->reachable_time : 0))); neigh->nud_state = new; + notify = 1; } if (lladdr != neigh->ha) { @@ -1238,6 +1240,21 @@ out: } EXPORT_SYMBOL(neigh_update); +/* Update the neigh to listen temporarily for probe responses, even if it is + * in a NUD_FAILED state. The caller has to hold neigh->lock for writing. + */ +void __neigh_set_probe_once(struct neighbour *neigh) +{ + neigh->updated = jiffies; + if (!(neigh->nud_state & NUD_FAILED)) + return; + neigh->nud_state = NUD_INCOMPLETE; + atomic_set(&neigh->probes, neigh_max_probes(neigh)); + neigh_add_timer(neigh, + jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME)); +} +EXPORT_SYMBOL(__neigh_set_probe_once); + struct neighbour *neigh_event_ns(struct neigh_table *tbl, u8 *lladdr, void *saddr, struct net_device *dev) @@ -1282,7 +1299,7 @@ int neigh_compat_output(struct neighbour *neigh, struct sk_buff *skb) if (dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, skb->len) < 0 && - dev->header_ops->rebuild(skb)) + dev_rebuild_header(skb)) return 0; return dev_queue_xmit(skb); @@ -1399,7 +1416,8 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb) { unsigned long now = jiffies; - unsigned long sched_next = now + (net_random() % + + unsigned long sched_next = now + (prandom_u32() % NEIGH_VAR(p, PROXY_DELAY)); if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) { @@ -1548,7 +1566,8 @@ static void neigh_table_init_no_netlink(struct neigh_table *tbl) rwlock_init(&tbl->lock); INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); - schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time); + queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, + tbl->parms.reachable_time); setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl); skb_queue_head_init_class(&tbl->proxy_queue, &neigh_table_proxy_queue_class); @@ -2070,13 +2089,16 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) nla_get_msecs(tbp[i])); break; case NDTPA_ANYCAST_DELAY: - NEIGH_VAR_SET(p, ANYCAST_DELAY, nla_get_msecs(tbp[i])); + NEIGH_VAR_SET(p, ANYCAST_DELAY, + nla_get_msecs(tbp[i])); break; case NDTPA_PROXY_DELAY: - NEIGH_VAR_SET(p, PROXY_DELAY, nla_get_msecs(tbp[i])); + NEIGH_VAR_SET(p, PROXY_DELAY, + nla_get_msecs(tbp[i])); break; case NDTPA_LOCKTIME: - NEIGH_VAR_SET(p, LOCKTIME, nla_get_msecs(tbp[i])); + NEIGH_VAR_SET(p, LOCKTIME, + nla_get_msecs(tbp[i])); break; } } @@ -2227,7 +2249,7 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = pn->flags | NTF_PROXY; - ndm->ndm_type = NDA_DST; + ndm->ndm_type = RTN_UNICAST; ndm->ndm_ifindex = pn->dev->ifindex; ndm->ndm_state = NUD_NONE; @@ -2850,7 +2872,7 @@ static void neigh_proc_update(struct ctl_table *ctl, int write) { struct net_device *dev = ctl->extra1; struct neigh_parms *p = ctl->extra2; - struct net *net = p->net; + struct net *net 
= neigh_parms_net(p); int index = (int *) ctl->data - p->data; if (!write) @@ -3025,7 +3047,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, if (!t) goto err; - for (i = 0; i < ARRAY_SIZE(t->neigh_vars); i++) { + for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) { t->neigh_vars[i].data += (long) p; t->neigh_vars[i].extra1 = dev; t->neigh_vars[i].extra2 = p; @@ -3037,11 +3059,12 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL])); } else { + struct neigh_table *tbl = p->tbl; dev_name_source = "default"; - t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = (int *)(p + 1); - t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = (int *)(p + 1) + 1; - t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = (int *)(p + 1) + 2; - t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = (int *)(p + 1) + 3; + t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval; + t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1; + t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2; + t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3; } if (handler) { diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index f3edf9635e0..1cac29ebb05 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -104,6 +104,7 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, } NETDEVICE_SHOW_RO(dev_id, fmt_hex); +NETDEVICE_SHOW_RO(dev_port, fmt_dec); NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec); NETDEVICE_SHOW_RO(addr_len, fmt_dec); NETDEVICE_SHOW_RO(iflink, fmt_dec); @@ -252,6 +253,16 @@ static ssize_t operstate_show(struct device *dev, } static DEVICE_ATTR_RO(operstate); +static ssize_t carrier_changes_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + return sprintf(buf, fmt_dec, + atomic_read(&netdev->carrier_changes)); +} +static DEVICE_ATTR_RO(carrier_changes); + /* read-write attributes */ static int change_mtu(struct net_device *net, unsigned long new_mtu) @@ -373,6 +384,7 @@ static struct attribute *net_class_attrs[] = { &dev_attr_netdev_group.attr, &dev_attr_type.attr, &dev_attr_dev_id.attr, + &dev_attr_dev_port.attr, &dev_attr_iflink.attr, &dev_attr_ifindex.attr, &dev_attr_addr_assign_type.attr, @@ -384,6 +396,7 @@ static struct attribute *net_class_attrs[] = { &dev_attr_duplex.attr, &dev_attr_dormant.attr, &dev_attr_operstate.attr, + &dev_attr_carrier_changes.attr, &dev_attr_ifalias.attr, &dev_attr_carrier.attr, &dev_attr_mtu.attr, @@ -498,17 +511,7 @@ static struct attribute_group wireless_group = { #define net_class_groups NULL #endif /* CONFIG_SYSFS */ -#ifdef CONFIG_RPS -/* - * RX queue sysfs structures and functions. 
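/*
 * Editor's illustrative sketch, not part of this patch: the new per-netdevice
 * "carrier_changes" attribute added above is read from user space like any
 * other sysfs file; the interface name used here is only an example.
 */
#include <stdio.h>

int main(void)
{
	unsigned long changes;
	FILE *f = fopen("/sys/class/net/eth0/carrier_changes", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%lu", &changes) == 1)
		printf("eth0: carrier changed %lu times\n", changes);
	fclose(f);
	return 0;
}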
- */ -struct rx_queue_attribute { - struct attribute attr; - ssize_t (*show)(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attr, char *buf); - ssize_t (*store)(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attr, const char *buf, size_t len); -}; +#ifdef CONFIG_SYSFS #define to_rx_queue_attr(_attr) container_of(_attr, \ struct rx_queue_attribute, attr) @@ -543,6 +546,7 @@ static const struct sysfs_ops rx_queue_sysfs_ops = { .store = rx_queue_attr_store, }; +#ifdef CONFIG_RPS static ssize_t show_rps_map(struct netdev_rx_queue *queue, struct rx_queue_attribute *attribute, char *buf) { @@ -676,8 +680,8 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, while ((mask | (mask >> 1)) != mask) mask |= (mask >> 1); /* On 64 bit arches, must check mask fits in table->mask (u32), - * and on 32bit arches, must check RPS_DEV_FLOW_TABLE_SIZE(mask + 1) - * doesnt overflow. + * and on 32bit arches, must check + * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow. */ #if BITS_PER_LONG > 32 if (mask > (unsigned long)(u32)mask) @@ -718,16 +722,20 @@ static struct rx_queue_attribute rps_cpus_attribute = static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute = __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR, show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt); +#endif /* CONFIG_RPS */ static struct attribute *rx_queue_default_attrs[] = { +#ifdef CONFIG_RPS &rps_cpus_attribute.attr, &rps_dev_flow_table_cnt_attribute.attr, +#endif NULL }; static void rx_queue_release(struct kobject *kobj) { struct netdev_rx_queue *queue = to_rx_queue(kobj); +#ifdef CONFIG_RPS struct rps_map *map; struct rps_dev_flow_table *flow_table; @@ -743,15 +751,29 @@ static void rx_queue_release(struct kobject *kobj) RCU_INIT_POINTER(queue->rps_flow_table, NULL); call_rcu(&flow_table->rcu, rps_dev_flow_table_release); } +#endif memset(kobj, 0, sizeof(*kobj)); dev_put(queue->dev); } +static const void *rx_queue_namespace(struct kobject *kobj) +{ + struct netdev_rx_queue *queue = to_rx_queue(kobj); + struct device *dev = &queue->dev->dev; + const void *ns = NULL; + + if (dev->class && dev->class->ns_type) + ns = dev->class->namespace(dev); + + return ns; +} + static struct kobj_type rx_queue_ktype = { .sysfs_ops = &rx_queue_sysfs_ops, .release = rx_queue_release, .default_attrs = rx_queue_default_attrs, + .namespace = rx_queue_namespace }; static int rx_queue_add_kobject(struct net_device *net, int index) @@ -763,25 +785,36 @@ static int rx_queue_add_kobject(struct net_device *net, int index) kobj->kset = net->queues_kset; error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, "rx-%u", index); - if (error) { - kobject_put(kobj); - return error; + if (error) + goto exit; + + if (net->sysfs_rx_queue_group) { + error = sysfs_create_group(kobj, net->sysfs_rx_queue_group); + if (error) + goto exit; } kobject_uevent(kobj, KOBJ_ADD); dev_hold(queue->dev); return error; +exit: + kobject_put(kobj); + return error; } -#endif /* CONFIG_RPS */ +#endif /* CONFIG_SYSFS */ int net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) { -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS int i; int error = 0; +#ifndef CONFIG_RPS + if (!net->sysfs_rx_queue_group) + return 0; +#endif for (i = old_num; i < new_num; i++) { error = rx_queue_add_kobject(net, i); if (error) { @@ -790,8 +823,12 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) } } - while (--i >= new_num) + while (--i >= new_num) { + if (net->sysfs_rx_queue_group) + 
sysfs_remove_group(&net->_rx[i].kobj, + net->sysfs_rx_queue_group); kobject_put(&net->_rx[i].kobj); + } return error; #else @@ -972,15 +1009,12 @@ static struct attribute_group dql_group = { #endif /* CONFIG_BQL */ #ifdef CONFIG_XPS -static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) +static unsigned int get_netdev_queue_index(struct netdev_queue *queue) { struct net_device *dev = queue->dev; - int i; - - for (i = 0; i < dev->num_tx_queues; i++) - if (queue == &dev->_tx[i]) - break; + unsigned int i; + i = queue - dev->_tx; BUG_ON(i >= dev->num_tx_queues); return i; @@ -1082,10 +1116,23 @@ static void netdev_queue_release(struct kobject *kobj) dev_put(queue->dev); } +static const void *netdev_queue_namespace(struct kobject *kobj) +{ + struct netdev_queue *queue = to_netdev_queue(kobj); + struct device *dev = &queue->dev->dev; + const void *ns = NULL; + + if (dev->class && dev->class->ns_type) + ns = dev->class->namespace(dev); + + return ns; +} + static struct kobj_type netdev_queue_ktype = { .sysfs_ops = &netdev_queue_sysfs_ops, .release = netdev_queue_release, .default_attrs = netdev_queue_default_attrs, + .namespace = netdev_queue_namespace, }; static int netdev_queue_add_kobject(struct net_device *net, int index) @@ -1155,9 +1202,6 @@ static int register_queue_kobjects(struct net_device *net) NULL, &net->dev.kobj); if (!net->queues_kset) return -ENOMEM; -#endif - -#ifdef CONFIG_RPS real_rx = net->real_num_rx_queues; #endif real_tx = net->real_num_tx_queues; @@ -1184,7 +1228,7 @@ static void remove_queue_kobjects(struct net_device *net) { int real_rx = 0, real_tx = 0; -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS real_rx = net->real_num_rx_queues; #endif real_tx = net->real_num_tx_queues; @@ -1358,7 +1402,7 @@ void netdev_class_remove_file_ns(struct class_attribute *class_attr, } EXPORT_SYMBOL(netdev_class_remove_file_ns); -int netdev_kobject_init(void) +int __init netdev_kobject_init(void) { kobj_ns_type_register(&net_ns_type_operations); return class_register(&net_class); diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h index bd7751ec1c4..2745a1b51e0 100644 --- a/net/core/net-sysfs.h +++ b/net/core/net-sysfs.h @@ -1,7 +1,7 @@ #ifndef __NET_SYSFS_H__ #define __NET_SYSFS_H__ -int netdev_kobject_init(void); +int __init netdev_kobject_init(void); int netdev_register_kobject(struct net_device *); void netdev_unregister_kobject(struct net_device *); int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 81d3a9a0845..85b62691f4f 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -24,7 +24,7 @@ static LIST_HEAD(pernet_list); static struct list_head *first_device = &pernet_list; -static DEFINE_MUTEX(net_mutex); +DEFINE_MUTEX(net_mutex); LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); @@ -273,7 +273,7 @@ static void cleanup_net(struct work_struct *work) { const struct pernet_operations *ops; struct net *net, *tmp; - LIST_HEAD(net_kill_list); + struct list_head net_kill_list; LIST_HEAD(net_exit_list); /* Atomically snapshot the list of namespaces to cleanup */ diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c new file mode 100644 index 00000000000..30d903b19c6 --- /dev/null +++ b/net/core/netclassid_cgroup.c @@ -0,0 +1,111 @@ +/* + * net/core/netclassid_cgroup.c Classid Cgroupfs Handling + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General 
Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf <tgraf@suug.ch> + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/cgroup.h> +#include <linux/fdtable.h> +#include <net/cls_cgroup.h> +#include <net/sock.h> + +static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct cgroup_cls_state, css) : NULL; +} + +struct cgroup_cls_state *task_cls_state(struct task_struct *p) +{ + return css_cls_state(task_css(p, net_cls_cgrp_id)); +} +EXPORT_SYMBOL_GPL(task_cls_state); + +static struct cgroup_subsys_state * +cgrp_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct cgroup_cls_state *cs; + + cs = kzalloc(sizeof(*cs), GFP_KERNEL); + if (!cs) + return ERR_PTR(-ENOMEM); + + return &cs->css; +} + +static int cgrp_css_online(struct cgroup_subsys_state *css) +{ + struct cgroup_cls_state *cs = css_cls_state(css); + struct cgroup_cls_state *parent = css_cls_state(css->parent); + + if (parent) + cs->classid = parent->classid; + + return 0; +} + +static void cgrp_css_free(struct cgroup_subsys_state *css) +{ + kfree(css_cls_state(css)); +} + +static int update_classid(const void *v, struct file *file, unsigned n) +{ + int err; + struct socket *sock = sock_from_file(file, &err); + + if (sock) + sock->sk->sk_classid = (u32)(unsigned long)v; + + return 0; +} + +static void cgrp_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + struct cgroup_cls_state *cs = css_cls_state(css); + void *v = (void *)(unsigned long)cs->classid; + struct task_struct *p; + + cgroup_taskset_for_each(p, tset) { + task_lock(p); + iterate_fd(p->files, 0, update_classid, v); + task_unlock(p); + } +} + +static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return css_cls_state(css)->classid; +} + +static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, + u64 value) +{ + css_cls_state(css)->classid = (u32) value; + + return 0; +} + +static struct cftype ss_files[] = { + { + .name = "classid", + .read_u64 = read_classid, + .write_u64 = write_classid, + }, + { } /* terminate */ +}; + +struct cgroup_subsys net_cls_cgrp_subsys = { + .css_alloc = cgrp_css_alloc, + .css_online = cgrp_css_online, + .css_free = cgrp_css_free, + .attach = cgrp_attach, + .base_cftypes = ss_files, +}; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 8f971990677..e33937fb32a 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -46,13 +46,9 @@ static struct sk_buff_head skb_pool; -static atomic_t trapped; - DEFINE_STATIC_SRCU(netpoll_srcu); #define USEC_PER_POLL 50 -#define NETPOLL_RX_ENABLED 1 -#define NETPOLL_RX_DROP 2 #define MAX_SKB_SIZE \ (sizeof(struct ethhdr) + \ @@ -61,7 +57,6 @@ DEFINE_STATIC_SRCU(netpoll_srcu); MAX_UDP_CHUNK) static void zap_completion_queue(void); -static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo); static void netpoll_async_cleanup(struct work_struct *work); static unsigned int carrier_timeout = 4; @@ -74,6 +69,37 @@ module_param(carrier_timeout, uint, 0644); #define np_notice(np, fmt, ...) 
\ pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) +static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int status = NETDEV_TX_OK; + netdev_features_t features; + + features = netif_skb_features(skb); + + if (vlan_tx_tag_present(skb) && + !vlan_hw_offload_capable(features, skb->vlan_proto)) { + skb = __vlan_put_tag(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); + if (unlikely(!skb)) { + /* This is actually a packet drop, but we + * don't want the code that calls this + * function to try and operate on a NULL skb. + */ + goto out; + } + skb->vlan_tci = 0; + } + + status = ops->ndo_start_xmit(skb, dev); + if (status == NETDEV_TX_OK) + txq_trans_update(txq); + +out: + return status; +} + static void queue_process(struct work_struct *work) { struct netpoll_info *npinfo = @@ -83,51 +109,31 @@ static void queue_process(struct work_struct *work) while ((skb = skb_dequeue(&npinfo->txq))) { struct net_device *dev = skb->dev; - const struct net_device_ops *ops = dev->netdev_ops; struct netdev_queue *txq; if (!netif_device_present(dev) || !netif_running(dev)) { - __kfree_skb(skb); + kfree_skb(skb); continue; } txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); local_irq_save(flags); - __netif_tx_lock(txq, smp_processor_id()); + HARD_TX_LOCK(dev, txq, smp_processor_id()); if (netif_xmit_frozen_or_stopped(txq) || - ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) { + netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) { skb_queue_head(&npinfo->txq, skb); - __netif_tx_unlock(txq); + HARD_TX_UNLOCK(dev, txq); local_irq_restore(flags); schedule_delayed_work(&npinfo->tx_work, HZ/10); return; } - __netif_tx_unlock(txq); + HARD_TX_UNLOCK(dev, txq); local_irq_restore(flags); } } -static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh, - unsigned short ulen, __be32 saddr, __be32 daddr) -{ - __wsum psum; - - if (uh->check == 0 || skb_csum_unnecessary(skb)) - return 0; - - psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); - - if (skb->ip_summed == CHECKSUM_COMPLETE && - !csum_fold(csum_add(psum, skb->csum))) - return 0; - - skb->csum = psum; - - return __skb_checksum_complete(skb); -} - /* * Check whether delayed processing was scheduled for our NIC. If so, * we attempt to grab the poll lock and use ->poll() to pump the card. @@ -138,14 +144,8 @@ static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh, * trylock here and interrupts are already disabled in the softirq * case. Further, we test the poll_owner to avoid recursion on UP * systems where the lock doesn't exist. - * - * In cases where there is bi-directional communications, reading only - * one message at a time can lead to packets being dropped by the - * network adapter, forcing superfluous retries and possibly timeouts. - * Thus, we set our budget to greater than 1. 
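/*
 * Editor's illustrative sketch, not part of this patch: the WARN_ONCE added to
 * poll_one_napi() below relies on the usual NAPI contract, i.e. a driver's
 * poll callback never reports more completed work than the budget it was
 * handed. mydrv_rx_one() is a hypothetical per-packet receive helper.
 */
#include <linux/netdevice.h>

static bool mydrv_rx_one(struct napi_struct *napi);	/* hypothetical */

static int mydrv_napi_poll(struct napi_struct *napi, int budget)
{
	int work = 0;

	while (work < budget && mydrv_rx_one(napi))
		work++;

	if (work < budget)	/* ring drained, re-enable interrupts */
		napi_complete(napi);

	return work;		/* never more than budget */
}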
*/ -static int poll_one_napi(struct netpoll_info *npinfo, - struct napi_struct *napi, int budget) +static int poll_one_napi(struct napi_struct *napi, int budget) { int work; @@ -156,52 +156,35 @@ static int poll_one_napi(struct netpoll_info *npinfo, if (!test_bit(NAPI_STATE_SCHED, &napi->state)) return budget; - npinfo->rx_flags |= NETPOLL_RX_DROP; - atomic_inc(&trapped); set_bit(NAPI_STATE_NPSVC, &napi->state); work = napi->poll(napi, budget); + WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll); trace_napi_poll(napi); clear_bit(NAPI_STATE_NPSVC, &napi->state); - atomic_dec(&trapped); - npinfo->rx_flags &= ~NETPOLL_RX_DROP; return budget - work; } -static void poll_napi(struct net_device *dev) +static void poll_napi(struct net_device *dev, int budget) { struct napi_struct *napi; - int budget = 16; list_for_each_entry(napi, &dev->napi_list, dev_list) { if (napi->poll_owner != smp_processor_id() && spin_trylock(&napi->poll_lock)) { - budget = poll_one_napi(rcu_dereference_bh(dev->npinfo), - napi, budget); + budget = poll_one_napi(napi, budget); spin_unlock(&napi->poll_lock); - - if (!budget) - break; } } } -static void service_neigh_queue(struct netpoll_info *npi) -{ - if (npi) { - struct sk_buff *skb; - - while ((skb = skb_dequeue(&npi->neigh_tx))) - netpoll_neigh_reply(skb, npi); - } -} - static void netpoll_poll_dev(struct net_device *dev) { const struct net_device_ops *ops; struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); + int budget = 0; /* Don't do any rx activity if the dev_lock mutex is held * the dev_open/close paths use this to block netpoll activity @@ -224,31 +207,14 @@ static void netpoll_poll_dev(struct net_device *dev) /* Process pending work on NIC */ ops->ndo_poll_controller(dev); - poll_napi(dev); + poll_napi(dev, budget); up(&ni->dev_lock); - if (dev->flags & IFF_SLAVE) { - if (ni) { - struct net_device *bond_dev; - struct sk_buff *skb; - struct netpoll_info *bond_ni; - - bond_dev = netdev_master_upper_dev_get_rcu(dev); - bond_ni = rcu_dereference_bh(bond_dev->npinfo); - while ((skb = skb_dequeue(&ni->neigh_tx))) { - skb->dev = bond_dev; - skb_queue_tail(&bond_ni->neigh_tx, skb); - } - } - } - - service_neigh_queue(ni); - zap_completion_queue(); } -void netpoll_rx_disable(struct net_device *dev) +void netpoll_poll_disable(struct net_device *dev) { struct netpoll_info *ni; int idx; @@ -259,9 +225,9 @@ void netpoll_rx_disable(struct net_device *dev) down(&ni->dev_lock); srcu_read_unlock(&netpoll_srcu, idx); } -EXPORT_SYMBOL(netpoll_rx_disable); +EXPORT_SYMBOL(netpoll_poll_disable); -void netpoll_rx_enable(struct net_device *dev) +void netpoll_poll_enable(struct net_device *dev) { struct netpoll_info *ni; rcu_read_lock(); @@ -270,7 +236,7 @@ void netpoll_rx_enable(struct net_device *dev) up(&ni->dev_lock); rcu_read_unlock(); } -EXPORT_SYMBOL(netpoll_rx_enable); +EXPORT_SYMBOL(netpoll_poll_enable); static void refill_skbs(void) { @@ -304,7 +270,7 @@ static void zap_completion_queue(void) while (clist != NULL) { struct sk_buff *skb = clist; clist = clist->next; - if (skb->destructor) { + if (!skb_irq_freeable(skb)) { atomic_inc(&skb->users); dev_kfree_skb_any(skb); /* put this one back */ } else { @@ -359,7 +325,6 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, { int status = NETDEV_TX_BUSY; unsigned long tries; - const struct net_device_ops *ops = dev->netdev_ops; /* It is up to the caller to keep npinfo alive. 
*/ struct netpoll_info *npinfo; @@ -367,7 +332,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, npinfo = rcu_dereference_bh(np->dev->npinfo); if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { - __kfree_skb(skb); + dev_kfree_skb_irq(skb); return; } @@ -375,27 +340,16 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { struct netdev_queue *txq; - txq = netdev_pick_tx(dev, skb); + txq = netdev_pick_tx(dev, skb, NULL); /* try until next clock tick */ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { - if (__netif_tx_trylock(txq)) { - if (!netif_xmit_stopped(txq)) { - if (vlan_tx_tag_present(skb) && - !vlan_hw_offload_capable(netif_skb_features(skb), - skb->vlan_proto)) { - skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb)); - if (unlikely(!skb)) - break; - skb->vlan_tci = 0; - } - - status = ops->ndo_start_xmit(skb, dev); - if (status == NETDEV_TX_OK) - txq_trans_update(txq); - } - __netif_tx_unlock(txq); + if (HARD_TX_TRYLOCK(dev, txq)) { + if (!netif_xmit_stopped(txq)) + status = netpoll_start_xmit(skb, dev, txq); + + HARD_TX_UNLOCK(dev, txq); if (status == NETDEV_TX_OK) break; @@ -410,7 +364,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, WARN_ONCE(!irqs_disabled(), "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n", - dev->name, ops->ndo_start_xmit); + dev->name, dev->netdev_ops->ndo_start_xmit); } @@ -513,8 +467,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) skb->protocol = eth->h_proto = htons(ETH_P_IP); } - memcpy(eth->h_source, np->dev->dev_addr, ETH_ALEN); - memcpy(eth->h_dest, np->remote_mac, ETH_ALEN); + ether_addr_copy(eth->h_source, np->dev->dev_addr); + ether_addr_copy(eth->h_dest, np->remote_mac); skb->dev = np->dev; @@ -522,384 +476,6 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) } EXPORT_SYMBOL(netpoll_send_udp); -static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo) -{ - int size, type = ARPOP_REPLY; - __be32 sip, tip; - unsigned char *sha; - struct sk_buff *send_skb; - struct netpoll *np, *tmp; - unsigned long flags; - int hlen, tlen; - int hits = 0, proto; - - if (list_empty(&npinfo->rx_np)) - return; - - /* Before checking the packet, we do some early - inspection whether this is interesting at all */ - spin_lock_irqsave(&npinfo->rx_lock, flags); - list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { - if (np->dev == skb->dev) - hits++; - } - spin_unlock_irqrestore(&npinfo->rx_lock, flags); - - /* No netpoll struct is using this dev */ - if (!hits) - return; - - proto = ntohs(eth_hdr(skb)->h_proto); - if (proto == ETH_P_ARP) { - struct arphdr *arp; - unsigned char *arp_ptr; - /* No arp on this interface */ - if (skb->dev->flags & IFF_NOARP) - return; - - if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) - return; - - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - arp = arp_hdr(skb); - - if ((arp->ar_hrd != htons(ARPHRD_ETHER) && - arp->ar_hrd != htons(ARPHRD_IEEE802)) || - arp->ar_pro != htons(ETH_P_IP) || - arp->ar_op != htons(ARPOP_REQUEST)) - return; - - arp_ptr = (unsigned char *)(arp+1); - /* save the location of the src hw addr */ - sha = arp_ptr; - arp_ptr += skb->dev->addr_len; - memcpy(&sip, arp_ptr, 4); - arp_ptr += 4; - /* If we actually cared about dst hw addr, - it would get copied here */ - arp_ptr += skb->dev->addr_len; - memcpy(&tip, arp_ptr, 4); - - 
/* Should we ignore arp? */ - if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) - return; - - size = arp_hdr_len(skb->dev); - - spin_lock_irqsave(&npinfo->rx_lock, flags); - list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { - if (tip != np->local_ip.ip) - continue; - - hlen = LL_RESERVED_SPACE(np->dev); - tlen = np->dev->needed_tailroom; - send_skb = find_skb(np, size + hlen + tlen, hlen); - if (!send_skb) - continue; - - skb_reset_network_header(send_skb); - arp = (struct arphdr *) skb_put(send_skb, size); - send_skb->dev = skb->dev; - send_skb->protocol = htons(ETH_P_ARP); - - /* Fill the device header for the ARP frame */ - if (dev_hard_header(send_skb, skb->dev, ETH_P_ARP, - sha, np->dev->dev_addr, - send_skb->len) < 0) { - kfree_skb(send_skb); - continue; - } - - /* - * Fill out the arp protocol part. - * - * we only support ethernet device type, - * which (according to RFC 1390) should - * always equal 1 (Ethernet). - */ - - arp->ar_hrd = htons(np->dev->type); - arp->ar_pro = htons(ETH_P_IP); - arp->ar_hln = np->dev->addr_len; - arp->ar_pln = 4; - arp->ar_op = htons(type); - - arp_ptr = (unsigned char *)(arp + 1); - memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); - arp_ptr += np->dev->addr_len; - memcpy(arp_ptr, &tip, 4); - arp_ptr += 4; - memcpy(arp_ptr, sha, np->dev->addr_len); - arp_ptr += np->dev->addr_len; - memcpy(arp_ptr, &sip, 4); - - netpoll_send_skb(np, send_skb); - - /* If there are several rx_skb_hooks for the same - * address we're fine by sending a single reply - */ - break; - } - spin_unlock_irqrestore(&npinfo->rx_lock, flags); - } else if( proto == ETH_P_IPV6) { -#if IS_ENABLED(CONFIG_IPV6) - struct nd_msg *msg; - u8 *lladdr = NULL; - struct ipv6hdr *hdr; - struct icmp6hdr *icmp6h; - const struct in6_addr *saddr; - const struct in6_addr *daddr; - struct inet6_dev *in6_dev = NULL; - struct in6_addr *target; - - in6_dev = in6_dev_get(skb->dev); - if (!in6_dev || !in6_dev->cnf.accept_ra) - return; - - if (!pskb_may_pull(skb, skb->len)) - return; - - msg = (struct nd_msg *)skb_transport_header(skb); - - __skb_push(skb, skb->data - skb_transport_header(skb)); - - if (ipv6_hdr(skb)->hop_limit != 255) - return; - if (msg->icmph.icmp6_code != 0) - return; - if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION) - return; - - saddr = &ipv6_hdr(skb)->saddr; - daddr = &ipv6_hdr(skb)->daddr; - - size = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); - - spin_lock_irqsave(&npinfo->rx_lock, flags); - list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { - if (!ipv6_addr_equal(daddr, &np->local_ip.in6)) - continue; - - hlen = LL_RESERVED_SPACE(np->dev); - tlen = np->dev->needed_tailroom; - send_skb = find_skb(np, size + hlen + tlen, hlen); - if (!send_skb) - continue; - - send_skb->protocol = htons(ETH_P_IPV6); - send_skb->dev = skb->dev; - - skb_reset_network_header(send_skb); - hdr = (struct ipv6hdr *) skb_put(send_skb, sizeof(struct ipv6hdr)); - *(__be32*)hdr = htonl(0x60000000); - hdr->payload_len = htons(size); - hdr->nexthdr = IPPROTO_ICMPV6; - hdr->hop_limit = 255; - hdr->saddr = *saddr; - hdr->daddr = *daddr; - - icmp6h = (struct icmp6hdr *) skb_put(send_skb, sizeof(struct icmp6hdr)); - icmp6h->icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; - icmp6h->icmp6_router = 0; - icmp6h->icmp6_solicited = 1; - - target = (struct in6_addr *) skb_put(send_skb, sizeof(struct in6_addr)); - *target = msg->target; - icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, size, - IPPROTO_ICMPV6, - csum_partial(icmp6h, - size, 0)); - - if (dev_hard_header(send_skb, skb->dev, 
ETH_P_IPV6, - lladdr, np->dev->dev_addr, - send_skb->len) < 0) { - kfree_skb(send_skb); - continue; - } - - netpoll_send_skb(np, send_skb); - - /* If there are several rx_skb_hooks for the same - * address, we're fine by sending a single reply - */ - break; - } - spin_unlock_irqrestore(&npinfo->rx_lock, flags); -#endif - } -} - -static bool pkt_is_ns(struct sk_buff *skb) -{ - struct nd_msg *msg; - struct ipv6hdr *hdr; - - if (skb->protocol != htons(ETH_P_ARP)) - return false; - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg))) - return false; - - msg = (struct nd_msg *)skb_transport_header(skb); - __skb_push(skb, skb->data - skb_transport_header(skb)); - hdr = ipv6_hdr(skb); - - if (hdr->nexthdr != IPPROTO_ICMPV6) - return false; - if (hdr->hop_limit != 255) - return false; - if (msg->icmph.icmp6_code != 0) - return false; - if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION) - return false; - - return true; -} - -int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo) -{ - int proto, len, ulen, data_len; - int hits = 0, offset; - const struct iphdr *iph; - struct udphdr *uh; - struct netpoll *np, *tmp; - uint16_t source; - - if (list_empty(&npinfo->rx_np)) - goto out; - - if (skb->dev->type != ARPHRD_ETHER) - goto out; - - /* check if netpoll clients need ARP */ - if (skb->protocol == htons(ETH_P_ARP) && atomic_read(&trapped)) { - skb_queue_tail(&npinfo->neigh_tx, skb); - return 1; - } else if (pkt_is_ns(skb) && atomic_read(&trapped)) { - skb_queue_tail(&npinfo->neigh_tx, skb); - return 1; - } - - if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { - skb = vlan_untag(skb); - if (unlikely(!skb)) - goto out; - } - - proto = ntohs(eth_hdr(skb)->h_proto); - if (proto != ETH_P_IP && proto != ETH_P_IPV6) - goto out; - if (skb->pkt_type == PACKET_OTHERHOST) - goto out; - if (skb_shared(skb)) - goto out; - - if (proto == ETH_P_IP) { - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - goto out; - iph = (struct iphdr *)skb->data; - if (iph->ihl < 5 || iph->version != 4) - goto out; - if (!pskb_may_pull(skb, iph->ihl*4)) - goto out; - iph = (struct iphdr *)skb->data; - if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) - goto out; - - len = ntohs(iph->tot_len); - if (skb->len < len || len < iph->ihl*4) - goto out; - - /* - * Our transport medium may have padded the buffer out. - * Now We trim to the true length of the frame. 
- */ - if (pskb_trim_rcsum(skb, len)) - goto out; - - iph = (struct iphdr *)skb->data; - if (iph->protocol != IPPROTO_UDP) - goto out; - - len -= iph->ihl*4; - uh = (struct udphdr *)(((char *)iph) + iph->ihl*4); - offset = (unsigned char *)(uh + 1) - skb->data; - ulen = ntohs(uh->len); - data_len = skb->len - offset; - source = ntohs(uh->source); - - if (ulen != len) - goto out; - if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr)) - goto out; - list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { - if (np->local_ip.ip && np->local_ip.ip != iph->daddr) - continue; - if (np->remote_ip.ip && np->remote_ip.ip != iph->saddr) - continue; - if (np->local_port && np->local_port != ntohs(uh->dest)) - continue; - - np->rx_skb_hook(np, source, skb, offset, data_len); - hits++; - } - } else { -#if IS_ENABLED(CONFIG_IPV6) - const struct ipv6hdr *ip6h; - - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) - goto out; - ip6h = (struct ipv6hdr *)skb->data; - if (ip6h->version != 6) - goto out; - len = ntohs(ip6h->payload_len); - if (!len) - goto out; - if (len + sizeof(struct ipv6hdr) > skb->len) - goto out; - if (pskb_trim_rcsum(skb, len + sizeof(struct ipv6hdr))) - goto out; - ip6h = ipv6_hdr(skb); - if (!pskb_may_pull(skb, sizeof(struct udphdr))) - goto out; - uh = udp_hdr(skb); - offset = (unsigned char *)(uh + 1) - skb->data; - ulen = ntohs(uh->len); - data_len = skb->len - offset; - source = ntohs(uh->source); - if (ulen != skb->len) - goto out; - if (udp6_csum_init(skb, uh, IPPROTO_UDP)) - goto out; - list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { - if (!ipv6_addr_equal(&np->local_ip.in6, &ip6h->daddr)) - continue; - if (!ipv6_addr_equal(&np->remote_ip.in6, &ip6h->saddr)) - continue; - if (np->local_port && np->local_port != ntohs(uh->dest)) - continue; - - np->rx_skb_hook(np, source, skb, offset, data_len); - hits++; - } -#endif - } - - if (!hits) - goto out; - - kfree_skb(skb); - return 1; - -out: - if (atomic_read(&trapped)) { - kfree_skb(skb); - return 1; - } - - return 0; -} - void netpoll_print_options(struct netpoll *np) { np_info(np, "local port %d\n", np->local_port); @@ -941,6 +517,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt) { char *cur=opt, *delim; int ipv6; + bool ipversion_set = false; if (*cur != '@') { if ((delim = strchr(cur, '@')) == NULL) @@ -953,6 +530,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt) cur++; if (*cur != '/') { + ipversion_set = true; if ((delim = strchr(cur, '/')) == NULL) goto parse_failed; *delim = 0; @@ -995,7 +573,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt) ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip); if (ipv6 < 0) goto parse_failed; - else if (np->ipv6 != (bool)ipv6) + else if (ipversion_set && np->ipv6 != (bool)ipv6) goto parse_failed; else np->ipv6 = (bool)ipv6; @@ -1017,11 +595,10 @@ int netpoll_parse_options(struct netpoll *np, char *opt) } EXPORT_SYMBOL(netpoll_parse_options); -int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp) +int __netpoll_setup(struct netpoll *np, struct net_device *ndev) { struct netpoll_info *npinfo; const struct net_device_ops *ops; - unsigned long flags; int err; np->dev = ndev; @@ -1037,18 +614,13 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp) } if (!ndev->npinfo) { - npinfo = kmalloc(sizeof(*npinfo), gfp); + npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); if (!npinfo) { err = -ENOMEM; goto out; } - npinfo->rx_flags = 0; - INIT_LIST_HEAD(&npinfo->rx_np); - - spin_lock_init(&npinfo->rx_lock); 
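/*
 * Editor's illustrative sketch, not part of this patch: with the RX hooks and
 * neigh_tx queue gone, a netpoll client in the netconsole style only parses a
 * config string, binds to the device and transmits UDP messages. Addresses,
 * ports and the interface name below are examples only.
 */
#include <linux/netpoll.h>

static struct netpoll example_np = {
	.name = "example",
};

static int example_netpoll_start(void)
{
	char opt[] = "6665@10.0.0.1/eth0,6666@10.0.0.2/00:11:22:33:44:55";
	int err;

	err = netpoll_parse_options(&example_np, opt);
	if (err)
		return err;

	err = netpoll_setup(&example_np);	/* grabs the device, fills the skb pool */
	if (err)
		return err;

	netpoll_send_udp(&example_np, "hello\n", 6);
	return 0;
}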
sema_init(&npinfo->dev_lock, 1); - skb_queue_head_init(&npinfo->neigh_tx); skb_queue_head_init(&npinfo->txq); INIT_DELAYED_WORK(&npinfo->tx_work, queue_process); @@ -1056,7 +628,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp) ops = np->dev->netdev_ops; if (ops->ndo_netpoll_setup) { - err = ops->ndo_netpoll_setup(ndev, npinfo, gfp); + err = ops->ndo_netpoll_setup(ndev, npinfo); if (err) goto free_npinfo; } @@ -1067,13 +639,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp) npinfo->netpoll = np; - if (np->rx_skb_hook) { - spin_lock_irqsave(&npinfo->rx_lock, flags); - npinfo->rx_flags |= NETPOLL_RX_ENABLED; - list_add_tail(&np->rx, &npinfo->rx_np); - spin_unlock_irqrestore(&npinfo->rx_lock, flags); - } - /* last thing to do is link it to the net device structure */ rcu_assign_pointer(ndev->npinfo, npinfo); @@ -1195,7 +760,7 @@ int netpoll_setup(struct netpoll *np) /* fill up the skb queue */ refill_skbs(); - err = __netpoll_setup(np, ndev, GFP_KERNEL); + err = __netpoll_setup(np, ndev); if (err) goto put; @@ -1222,7 +787,6 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) struct netpoll_info *npinfo = container_of(rcu_head, struct netpoll_info, rcu); - skb_queue_purge(&npinfo->neigh_tx); skb_queue_purge(&npinfo->txq); /* we can't call cancel_delayed_work_sync here, as we are in softirq */ @@ -1238,7 +802,6 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) void __netpoll_cleanup(struct netpoll *np) { struct netpoll_info *npinfo; - unsigned long flags; /* rtnl_dereference would be preferable here but * rcu_cleanup_netpoll path can put us in here safely without @@ -1248,14 +811,6 @@ void __netpoll_cleanup(struct netpoll *np) if (!npinfo) return; - if (!list_empty(&npinfo->rx_np)) { - spin_lock_irqsave(&npinfo->rx_lock, flags); - list_del(&np->rx); - if (list_empty(&npinfo->rx_np)) - npinfo->rx_flags &= ~NETPOLL_RX_ENABLED; - spin_unlock_irqrestore(&npinfo->rx_lock, flags); - } - synchronize_srcu(&netpoll_srcu); if (atomic_dec_and_test(&npinfo->refcnt)) { @@ -1265,7 +820,7 @@ void __netpoll_cleanup(struct netpoll *np) if (ops->ndo_netpoll_cleanup) ops->ndo_netpoll_cleanup(np->dev); - rcu_assign_pointer(np->dev->npinfo, NULL); + RCU_INIT_POINTER(np->dev->npinfo, NULL); call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info); } } @@ -1299,18 +854,3 @@ out: rtnl_unlock(); } EXPORT_SYMBOL(netpoll_cleanup); - -int netpoll_trap(void) -{ - return atomic_read(&trapped); -} -EXPORT_SYMBOL(netpoll_trap); - -void netpoll_set_trap(int trap) -{ - if (trap) - atomic_inc(&trapped); - else - atomic_dec(&trapped); -} -EXPORT_SYMBOL(netpoll_set_trap); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 9b7cf6c85f8..2f385b9bccc 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -30,7 +30,7 @@ #define PRIOMAP_MIN_SZ 128 /* - * Extend @dev->priomap so that it's large enough to accomodate + * Extend @dev->priomap so that it's large enough to accommodate * @target_idx. @dev->priomap.priomap_len > @target_idx after successful * return. Must be called under rtnl lock. 
*/ @@ -140,7 +140,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css) static int cgrp_css_online(struct cgroup_subsys_state *css) { - struct cgroup_subsys_state *parent_css = css_parent(css); + struct cgroup_subsys_state *parent_css = css->parent; struct net_device *dev; int ret = 0; @@ -173,27 +173,27 @@ static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) return css->cgroup->id; } -static int read_priomap(struct cgroup_subsys_state *css, struct cftype *cft, - struct cgroup_map_cb *cb) +static int read_priomap(struct seq_file *sf, void *v) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) - cb->fill(cb, dev->name, netprio_prio(css, dev)); + seq_printf(sf, "%s %u\n", dev->name, + netprio_prio(seq_css(sf), dev)); rcu_read_unlock(); return 0; } -static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buffer) +static ssize_t write_priomap(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { char devname[IFNAMSIZ + 1]; struct net_device *dev; u32 prio; int ret; - if (sscanf(buffer, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) + if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) return -EINVAL; dev = dev_get_by_name(&init_net, devname); @@ -202,11 +202,11 @@ static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft, rtnl_lock(); - ret = netprio_set_prio(css, dev, prio); + ret = netprio_set_prio(of_css(of), dev, prio); rtnl_unlock(); dev_put(dev); - return ret; + return ret ?: nbytes; } static int update_netprio(const void *v, struct file *file, unsigned n) @@ -224,7 +224,7 @@ static void net_prio_attach(struct cgroup_subsys_state *css, struct task_struct *p; void *v = (void *)(unsigned long)css->cgroup->id; - cgroup_taskset_for_each(p, css, tset) { + cgroup_taskset_for_each(p, tset) { task_lock(p); iterate_fd(p->files, 0, update_netprio, v); task_unlock(p); @@ -238,21 +238,18 @@ static struct cftype ss_files[] = { }, { .name = "ifpriomap", - .read_map = read_priomap, - .write_string = write_priomap, + .seq_show = read_priomap, + .write = write_priomap, }, { } /* terminate */ }; -struct cgroup_subsys net_prio_subsys = { - .name = "net_prio", +struct cgroup_subsys net_prio_cgrp_subsys = { .css_alloc = cgrp_css_alloc, .css_online = cgrp_css_online, .css_free = cgrp_css_free, .attach = net_prio_attach, - .subsys_id = net_prio_subsys_id, .base_cftypes = ss_files, - .module = THIS_MODULE, }; static int netprio_device_event(struct notifier_block *unused, @@ -283,37 +280,9 @@ static struct notifier_block netprio_device_notifier = { static int __init init_cgroup_netprio(void) { - int ret; - - ret = cgroup_load_subsys(&net_prio_subsys); - if (ret) - goto out; - register_netdevice_notifier(&netprio_device_notifier); - -out: - return ret; -} - -static void __exit exit_cgroup_netprio(void) -{ - struct netprio_map *old; - struct net_device *dev; - - unregister_netdevice_notifier(&netprio_device_notifier); - - cgroup_unload_subsys(&net_prio_subsys); - - rtnl_lock(); - for_each_netdev(&init_net, dev) { - old = rtnl_dereference(dev->priomap); - RCU_INIT_POINTER(dev->priomap, NULL); - if (old) - kfree_rcu(old, rcu); - } - rtnl_unlock(); + return 0; } -module_init(init_cgroup_netprio); -module_exit(exit_cgroup_netprio); +subsys_initcall(init_cgroup_netprio); MODULE_LICENSE("GPL v2"); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index a797fff7f22..fc17a9d309a 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -389,6 +389,9 @@ struct 
pktgen_dev { #ifdef CONFIG_XFRM __u8 ipsmode; /* IPSEC mode (config) */ __u8 ipsproto; /* IPSEC type (config) */ + __u32 spi; + struct dst_entry dst; + struct dst_ops dstops; #endif char result[512]; }; @@ -473,23 +476,22 @@ static int pgctrl_show(struct seq_file *seq, void *v) static ssize_t pgctrl_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - int err = 0; char data[128]; struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id); - if (!capable(CAP_NET_ADMIN)) { - err = -EPERM; - goto out; - } + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (count == 0) + return -EINVAL; if (count > sizeof(data)) count = sizeof(data); - if (copy_from_user(data, buf, count)) { - err = -EFAULT; - goto out; - } - data[count - 1] = 0; /* Make string */ + if (copy_from_user(data, buf, count)) + return -EFAULT; + + data[count - 1] = 0; /* Strip trailing '\n' and terminate string */ if (!strcmp(data, "stop")) pktgen_stop_all_threads_ifs(pn); @@ -503,10 +505,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf, else pr_warning("Unknown command: %s\n", data); - err = count; - -out: - return err; + return count; } static int pgctrl_open(struct inode *inode, struct file *file) @@ -574,7 +573,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) is_zero_ether_addr(pkt_dev->src_mac) ? pkt_dev->odev->dev_addr : pkt_dev->src_mac); - seq_printf(seq, "dst_mac: "); + seq_puts(seq, "dst_mac: "); seq_printf(seq, "%pM\n", pkt_dev->dst_mac); seq_printf(seq, @@ -589,7 +588,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->nr_labels) { unsigned int i; - seq_printf(seq, " mpls: "); + seq_puts(seq, " mpls: "); for (i = 0; i < pkt_dev->nr_labels; i++) seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]), i == pkt_dev->nr_labels-1 ? 
"\n" : ", "); @@ -614,64 +613,67 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->node >= 0) seq_printf(seq, " node: %d\n", pkt_dev->node); - seq_printf(seq, " Flags: "); + seq_puts(seq, " Flags: "); if (pkt_dev->flags & F_IPV6) - seq_printf(seq, "IPV6 "); + seq_puts(seq, "IPV6 "); if (pkt_dev->flags & F_IPSRC_RND) - seq_printf(seq, "IPSRC_RND "); + seq_puts(seq, "IPSRC_RND "); if (pkt_dev->flags & F_IPDST_RND) - seq_printf(seq, "IPDST_RND "); + seq_puts(seq, "IPDST_RND "); if (pkt_dev->flags & F_TXSIZE_RND) - seq_printf(seq, "TXSIZE_RND "); + seq_puts(seq, "TXSIZE_RND "); if (pkt_dev->flags & F_UDPSRC_RND) - seq_printf(seq, "UDPSRC_RND "); + seq_puts(seq, "UDPSRC_RND "); if (pkt_dev->flags & F_UDPDST_RND) - seq_printf(seq, "UDPDST_RND "); + seq_puts(seq, "UDPDST_RND "); if (pkt_dev->flags & F_UDPCSUM) - seq_printf(seq, "UDPCSUM "); + seq_puts(seq, "UDPCSUM "); if (pkt_dev->flags & F_MPLS_RND) - seq_printf(seq, "MPLS_RND "); + seq_puts(seq, "MPLS_RND "); if (pkt_dev->flags & F_QUEUE_MAP_RND) - seq_printf(seq, "QUEUE_MAP_RND "); + seq_puts(seq, "QUEUE_MAP_RND "); if (pkt_dev->flags & F_QUEUE_MAP_CPU) - seq_printf(seq, "QUEUE_MAP_CPU "); + seq_puts(seq, "QUEUE_MAP_CPU "); if (pkt_dev->cflows) { if (pkt_dev->flags & F_FLOW_SEQ) - seq_printf(seq, "FLOW_SEQ "); /*in sequence flows*/ + seq_puts(seq, "FLOW_SEQ "); /*in sequence flows*/ else - seq_printf(seq, "FLOW_RND "); + seq_puts(seq, "FLOW_RND "); } #ifdef CONFIG_XFRM - if (pkt_dev->flags & F_IPSEC_ON) - seq_printf(seq, "IPSEC "); + if (pkt_dev->flags & F_IPSEC_ON) { + seq_puts(seq, "IPSEC "); + if (pkt_dev->spi) + seq_printf(seq, "spi:%u", pkt_dev->spi); + } #endif if (pkt_dev->flags & F_MACSRC_RND) - seq_printf(seq, "MACSRC_RND "); + seq_puts(seq, "MACSRC_RND "); if (pkt_dev->flags & F_MACDST_RND) - seq_printf(seq, "MACDST_RND "); + seq_puts(seq, "MACDST_RND "); if (pkt_dev->flags & F_VID_RND) - seq_printf(seq, "VID_RND "); + seq_puts(seq, "VID_RND "); if (pkt_dev->flags & F_SVID_RND) - seq_printf(seq, "SVID_RND "); + seq_puts(seq, "SVID_RND "); if (pkt_dev->flags & F_NODE) - seq_printf(seq, "NODE_ALLOC "); + seq_puts(seq, "NODE_ALLOC "); seq_puts(seq, "\n"); @@ -714,7 +716,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->result[0]) seq_printf(seq, "Result: %s\n", pkt_dev->result); else - seq_printf(seq, "Result: Idle\n"); + seq_puts(seq, "Result: Idle\n"); return 0; } @@ -1245,7 +1247,13 @@ static ssize_t pktgen_if_write(struct file *file, "Flag -:%s:- unknown\nAvailable flags, (prepend ! 
to un-set flag):\n%s", f, "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, " - "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC, NODE_ALLOC\n"); + "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, " + "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, " + "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, " +#ifdef CONFIG_XFRM + "IPSEC, " +#endif + "NODE_ALLOC\n"); return count; } sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags); @@ -1434,7 +1442,7 @@ static ssize_t pktgen_if_write(struct file *file, if (!mac_pton(valstr, pkt_dev->dst_mac)) return -EINVAL; /* Set up Dest MAC */ - memcpy(&pkt_dev->hh[0], pkt_dev->dst_mac, ETH_ALEN); + ether_addr_copy(&pkt_dev->hh[0], pkt_dev->dst_mac); sprintf(pg_result, "OK: dstmac %pM", pkt_dev->dst_mac); return count; @@ -1451,7 +1459,7 @@ static ssize_t pktgen_if_write(struct file *file, if (!mac_pton(valstr, pkt_dev->src_mac)) return -EINVAL; /* Set up Src MAC */ - memcpy(&pkt_dev->hh[6], pkt_dev->src_mac, ETH_ALEN); + ether_addr_copy(&pkt_dev->hh[6], pkt_dev->src_mac); sprintf(pg_result, "OK: srcmac %pM", pkt_dev->src_mac); return count; @@ -1476,7 +1484,18 @@ static ssize_t pktgen_if_write(struct file *file, sprintf(pg_result, "OK: flows=%u", pkt_dev->cflows); return count; } +#ifdef CONFIG_XFRM + if (!strcmp(name, "spi")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + i += len; + pkt_dev->spi = value; + sprintf(pg_result, "OK: spi=%u", pkt_dev->spi); + return count; + } +#endif if (!strcmp(name, "flowlen")) { len = num_arg(&user_buffer[i], 10, &value); if (len < 0) @@ -1716,14 +1735,14 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) BUG_ON(!t); - seq_printf(seq, "Running: "); + seq_puts(seq, "Running: "); if_lock(t); list_for_each_entry(pkt_dev, &t->if_list, list) if (pkt_dev->running) seq_printf(seq, "%s ", pkt_dev->odevname); - seq_printf(seq, "\nStopped: "); + seq_puts(seq, "\nStopped: "); list_for_each_entry(pkt_dev, &t->if_list, list) if (!pkt_dev->running) @@ -1732,7 +1751,7 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) if (t->result[0]) seq_printf(seq, "\nResult: %s\n", t->result); else - seq_printf(seq, "\nResult: NA\n"); + seq_puts(seq, "\nResult: NA\n"); if_unlock(t); @@ -2043,10 +2062,10 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) /* Default to the interface's mac if not explicitly set. */ if (is_zero_ether_addr(pkt_dev->src_mac)) - memcpy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr, ETH_ALEN); + ether_addr_copy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr); /* Set up Dest MAC */ - memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN); + ether_addr_copy(&(pkt_dev->hh[0]), pkt_dev->dst_mac); if (pkt_dev->flags & F_IPV6) { int i, set = 0, err = 1; @@ -2233,13 +2252,21 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) struct xfrm_state *x = pkt_dev->flows[flow].x; struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id); if (!x) { - /*slow path: we dont already have xfrm_state*/ - x = xfrm_stateonly_find(pn->net, DUMMY_MARK, - (xfrm_address_t *)&pkt_dev->cur_daddr, - (xfrm_address_t *)&pkt_dev->cur_saddr, - AF_INET, - pkt_dev->ipsmode, - pkt_dev->ipsproto, 0); + + if (pkt_dev->spi) { + /* We need as quick as possible to find the right SA + * Searching with minimum criteria to archieve this. 
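 *
 * (Editor's note, not part of this patch: the new knob is driven through the
 *  usual pktgen proc interface; a session binding a flow to one SA could do,
 *  for example,
 *
 *      echo "flag IPSEC" > /proc/net/pktgen/eth0
 *      echo "spi 4660"   > /proc/net/pktgen/eth0
 *
 *  where the SPI must match a state installed beforehand, e.g. with
 *  "ip xfrm state add ... spi 0x1234 ...". Device name and values are
 *  examples only.)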
+ */ + x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET); + } else { + /* slow path: we don't already have xfrm_state */ + x = xfrm_stateonly_find(pn->net, DUMMY_MARK, + (xfrm_address_t *)&pkt_dev->cur_daddr, + (xfrm_address_t *)&pkt_dev->cur_saddr, + AF_INET, + pkt_dev->ipsmode, + pkt_dev->ipsproto, 0); + } if (x) { pkt_dev->flows[flow].x = x; set_pkt_overhead(pkt_dev); @@ -2475,31 +2502,47 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) #ifdef CONFIG_XFRM +static u32 pktgen_dst_metrics[RTAX_MAX + 1] = { + + [RTAX_HOPLIMIT] = 0x5, /* Set a static hoplimit */ +}; + static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) { struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; int err = 0; + struct net *net = dev_net(pkt_dev->odev); if (!x) return 0; /* XXX: we don't support tunnel mode for now until * we resolve the dst issue */ - if (x->props.mode != XFRM_MODE_TRANSPORT) + if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0)) return 0; - spin_lock(&x->lock); + /* But when the user specifies a valid SPI, the transformation + * supports both transport/tunnel mode and ESP/AH type. + */ + if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0)) + skb->_skb_refdst = (unsigned long)&pkt_dev->dst | SKB_DST_NOREF; + rcu_read_lock_bh(); err = x->outer_mode->output(x, skb); - if (err) + rcu_read_unlock_bh(); + if (err) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR); goto error; + } err = x->type->output(x, skb); - if (err) + if (err) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEPROTOERROR); goto error; - + } + spin_lock_bh(&x->lock); x->curlft.bytes += skb->len; x->curlft.packets++; + spin_unlock_bh(&x->lock); error: - spin_unlock(&x->lock); return err; } @@ -3295,9 +3338,11 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) queue_map = skb_get_queue_mapping(pkt_dev->skb); txq = netdev_get_tx_queue(odev, queue_map); - __netif_tx_lock_bh(txq); + local_bh_disable(); + + HARD_TX_LOCK(odev, txq, smp_processor_id()); - if (unlikely(netif_xmit_frozen_or_stopped(txq))) { + if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) { ret = NETDEV_TX_BUSY; pkt_dev->last_ok = 0; goto unlock; @@ -3331,7 +3376,9 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->last_ok = 0; } unlock: - __netif_tx_unlock_bh(txq); + HARD_TX_UNLOCK(odev, txq); + + local_bh_enable(); /* If pkt_dev->count is zero, then run forever */ if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { @@ -3542,6 +3589,17 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) #ifdef CONFIG_XFRM pkt_dev->ipsmode = XFRM_MODE_TRANSPORT; pkt_dev->ipsproto = IPPROTO_ESP; + + /* xfrm tunnel mode needs an additional dst to extract the outer + * IP header protocol/ttl/id fields; create a phony one here + * instead of looking up a valid rt, which would definitely hurt + * performance in such a case. 
+ */ + pkt_dev->dstops.family = AF_INET; + pkt_dev->dst.dev = pkt_dev->odev; + dst_init_metrics(&pkt_dev->dst, pktgen_dst_metrics, false); + pkt_dev->dst.child = &pkt_dev->dst; + pkt_dev->dst.ops = &pkt_dev->dstops; #endif return add_dev_to_thread(t, pkt_dev); diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c new file mode 100644 index 00000000000..d3027a73fd4 --- /dev/null +++ b/net/core/ptp_classifier.c @@ -0,0 +1,141 @@ +/* PTP classifier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +/* The below program is the bpf_asm (tools/net/) representation of + * the opcode array in the ptp_filter structure. + * + * For convenience, this can easily be altered and reviewed with + * bpf_asm and bpf_dbg, e.g. `./bpf_asm -c prog` where prog is a + * simple file containing the below program: + * + * ldh [12] ; load ethertype + * + * ; PTP over UDP over IPv4 over Ethernet + * test_ipv4: + * jneq #0x800, test_ipv6 ; ETH_P_IP ? + * ldb [23] ; load proto + * jneq #17, drop_ipv4 ; IPPROTO_UDP ? + * ldh [20] ; load frag offset field + * jset #0x1fff, drop_ipv4 ; don't allow fragments + * ldxb 4*([14]&0xf) ; load IP header len + * ldh [x + 16] ; load UDP dst port + * jneq #319, drop_ipv4 ; is port PTP_EV_PORT ? + * ldh [x + 22] ; load payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x10 ; PTP_CLASS_IPV4 + * ret a ; return PTP class + * drop_ipv4: ret #0x0 ; PTP_CLASS_NONE + * + * ; PTP over UDP over IPv6 over Ethernet + * test_ipv6: + * jneq #0x86dd, test_8021q ; ETH_P_IPV6 ? + * ldb [20] ; load proto + * jneq #17, drop_ipv6 ; IPPROTO_UDP ? + * ldh [56] ; load UDP dst port + * jneq #319, drop_ipv6 ; is port PTP_EV_PORT ? + * ldh [62] ; load payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x20 ; PTP_CLASS_IPV6 + * ret a ; return PTP class + * drop_ipv6: ret #0x0 ; PTP_CLASS_NONE + * + * ; PTP over 802.1Q over Ethernet + * test_8021q: + * jneq #0x8100, test_ieee1588 ; ETH_P_8021Q ? + * ldh [16] ; load inner type + * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ? + * ldb [18] ; load payload + * and #0x8 ; as we don't have ports here, test + * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these + * ldh [18] ; reload payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x40 ; PTP_CLASS_V2_VLAN + * ret a ; return PTP class + * + * ; PTP over Ethernet + * test_ieee1588: + * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ? 
+ * ldb [14] ; load payload + * and #0x8 ; as we don't have ports here, test + * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these + * ldh [14] ; reload payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x30 ; PTP_CLASS_L2 + * ret a ; return PTP class + * drop_ieee1588: ret #0x0 ; PTP_CLASS_NONE + */ + +#include <linux/skbuff.h> +#include <linux/filter.h> +#include <linux/ptp_classify.h> + +static struct sk_filter *ptp_insns __read_mostly; + +unsigned int ptp_classify_raw(const struct sk_buff *skb) +{ + return SK_RUN_FILTER(ptp_insns, skb); +} +EXPORT_SYMBOL_GPL(ptp_classify_raw); + +void __init ptp_classifier_init(void) +{ + static struct sock_filter ptp_filter[] __initdata = { + { 0x28, 0, 0, 0x0000000c }, + { 0x15, 0, 12, 0x00000800 }, + { 0x30, 0, 0, 0x00000017 }, + { 0x15, 0, 9, 0x00000011 }, + { 0x28, 0, 0, 0x00000014 }, + { 0x45, 7, 0, 0x00001fff }, + { 0xb1, 0, 0, 0x0000000e }, + { 0x48, 0, 0, 0x00000010 }, + { 0x15, 0, 4, 0x0000013f }, + { 0x48, 0, 0, 0x00000016 }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000010 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, + { 0x15, 0, 9, 0x000086dd }, + { 0x30, 0, 0, 0x00000014 }, + { 0x15, 0, 6, 0x00000011 }, + { 0x28, 0, 0, 0x00000038 }, + { 0x15, 0, 4, 0x0000013f }, + { 0x28, 0, 0, 0x0000003e }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000020 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, + { 0x15, 0, 9, 0x00008100 }, + { 0x28, 0, 0, 0x00000010 }, + { 0x15, 0, 15, 0x000088f7 }, + { 0x30, 0, 0, 0x00000012 }, + { 0x54, 0, 0, 0x00000008 }, + { 0x15, 0, 12, 0x00000000 }, + { 0x28, 0, 0, 0x00000012 }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000040 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x15, 0, 7, 0x000088f7 }, + { 0x30, 0, 0, 0x0000000e }, + { 0x54, 0, 0, 0x00000008 }, + { 0x15, 0, 4, 0x00000000 }, + { 0x28, 0, 0, 0x0000000e }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000030 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, + }; + struct sock_fprog_kern ptp_prog = { + .len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter, + }; + + BUG_ON(sk_unattached_filter_create(&ptp_insns, &ptp_prog)); +} diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 4425148d2b5..467f326126e 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -221,5 +221,4 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, out: spin_unlock_bh(&fastopenq->lock); sock_put(lsk); - return; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index cf67144d3e3..1063996f831 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -353,18 +353,65 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops) } EXPORT_SYMBOL_GPL(__rtnl_link_unregister); +/* Return with the rtnl_lock held when there are no network + * devices unregistering in any network namespace. + */ +static void rtnl_lock_unregistering_all(void) +{ + struct net *net; + bool unregistering; + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&netdev_unregistering_wq, &wait, + TASK_UNINTERRUPTIBLE); + unregistering = false; + rtnl_lock(); + for_each_net(net) { + if (net->dev_unreg_count > 0) { + unregistering = true; + break; + } + } + if (!unregistering) + break; + __rtnl_unlock(); + schedule(); + } + finish_wait(&netdev_unregistering_wq, &wait); +} + /** * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. 
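The slave-info plumbing added to rtnetlink.c below (IFLA_INFO_SLAVE_KIND, IFLA_INFO_SLAVE_DATA, rtnl_link_get_slave_info_data_size(), rtnl_link_slave_info_fill()) is driven by new rtnl_link_ops hooks on the master device. A minimal sketch, loosely modelled on how a bonding-style master could wire them up; the "example" kind, attribute and handlers are illustrative assumptions, not part of the patch.

/* Sketch only: a master device exporting one u8 of per-slave state. */
enum {
	IFLA_EXAMPLE_SLAVE_UNSPEC,
	IFLA_EXAMPLE_SLAVE_STATE,
	__IFLA_EXAMPLE_SLAVE_MAX,
};
#define IFLA_EXAMPLE_SLAVE_MAX (__IFLA_EXAMPLE_SLAVE_MAX - 1)

static const struct nla_policy example_slave_policy[IFLA_EXAMPLE_SLAVE_MAX + 1] = {
	[IFLA_EXAMPLE_SLAVE_STATE] = { .type = NLA_U8 },
};

static size_t example_get_slave_size(const struct net_device *master,
				     const struct net_device *slave)
{
	return nla_total_size(sizeof(u8));	/* IFLA_EXAMPLE_SLAVE_STATE */
}

static int example_fill_slave_info(struct sk_buff *skb,
				   const struct net_device *master,
				   const struct net_device *slave)
{
	return nla_put_u8(skb, IFLA_EXAMPLE_SLAVE_STATE, 0) ? -EMSGSIZE : 0;
}

static struct rtnl_link_ops example_link_ops __read_mostly = {
	.kind		 = "example",
	.slave_maxtype	 = IFLA_EXAMPLE_SLAVE_MAX,
	.slave_policy	 = example_slave_policy,
	.get_slave_size	 = example_get_slave_size,
	.fill_slave_info = example_fill_slave_info,
	/* .slave_changelink would handle IFLA_INFO_SLAVE_DATA on RTM_SETLINK */
};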
* @ops: struct rtnl_link_ops * to unregister */ void rtnl_link_unregister(struct rtnl_link_ops *ops) { - rtnl_lock(); + /* Close the race with cleanup_net() */ + mutex_lock(&net_mutex); + rtnl_lock_unregistering_all(); __rtnl_link_unregister(ops); rtnl_unlock(); + mutex_unlock(&net_mutex); } EXPORT_SYMBOL_GPL(rtnl_link_unregister); +static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev) +{ + struct net_device *master_dev; + const struct rtnl_link_ops *ops; + + master_dev = netdev_master_upper_dev_get((struct net_device *) dev); + if (!master_dev) + return 0; + ops = master_dev->rtnl_link_ops; + if (!ops || !ops->get_slave_size) + return 0; + /* IFLA_INFO_SLAVE_DATA + nested data */ + return nla_total_size(sizeof(struct nlattr)) + + ops->get_slave_size(master_dev, dev); +} + static size_t rtnl_link_get_size(const struct net_device *dev) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; @@ -385,6 +432,8 @@ static size_t rtnl_link_get_size(const struct net_device *dev) /* IFLA_INFO_XSTATS */ size += nla_total_size(ops->get_xstats_size(dev)); + size += rtnl_link_get_slave_info_data_size(dev); + return size; } @@ -403,34 +452,16 @@ static const struct rtnl_af_ops *rtnl_af_lookup(const int family) } /** - * __rtnl_af_register - Register rtnl_af_ops with rtnetlink. - * @ops: struct rtnl_af_ops * to register - * - * The caller must hold the rtnl_mutex. - * - * Returns 0 on success or a negative error code. - */ -int __rtnl_af_register(struct rtnl_af_ops *ops) -{ - list_add_tail(&ops->list, &rtnl_af_ops); - return 0; -} -EXPORT_SYMBOL_GPL(__rtnl_af_register); - -/** * rtnl_af_register - Register rtnl_af_ops with rtnetlink. * @ops: struct rtnl_af_ops * to register * * Returns 0 on success or a negative error code. */ -int rtnl_af_register(struct rtnl_af_ops *ops) +void rtnl_af_register(struct rtnl_af_ops *ops) { - int err; - rtnl_lock(); - err = __rtnl_af_register(ops); + list_add_tail(&ops->list, &rtnl_af_ops); rtnl_unlock(); - return err; } EXPORT_SYMBOL_GPL(rtnl_af_register); @@ -477,40 +508,100 @@ static size_t rtnl_link_get_af_size(const struct net_device *dev) return size; } -static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) +static bool rtnl_have_link_slave_info(const struct net_device *dev) { - const struct rtnl_link_ops *ops = dev->rtnl_link_ops; - struct nlattr *linkinfo, *data; - int err = -EMSGSIZE; + struct net_device *master_dev; - linkinfo = nla_nest_start(skb, IFLA_LINKINFO); - if (linkinfo == NULL) - goto out; + master_dev = netdev_master_upper_dev_get((struct net_device *) dev); + if (master_dev && master_dev->rtnl_link_ops) + return true; + return false; +} + +static int rtnl_link_slave_info_fill(struct sk_buff *skb, + const struct net_device *dev) +{ + struct net_device *master_dev; + const struct rtnl_link_ops *ops; + struct nlattr *slave_data; + int err; + + master_dev = netdev_master_upper_dev_get((struct net_device *) dev); + if (!master_dev) + return 0; + ops = master_dev->rtnl_link_ops; + if (!ops) + return 0; + if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, ops->kind) < 0) + return -EMSGSIZE; + if (ops->fill_slave_info) { + slave_data = nla_nest_start(skb, IFLA_INFO_SLAVE_DATA); + if (!slave_data) + return -EMSGSIZE; + err = ops->fill_slave_info(skb, master_dev, dev); + if (err < 0) + goto err_cancel_slave_data; + nla_nest_end(skb, slave_data); + } + return 0; + +err_cancel_slave_data: + nla_nest_cancel(skb, slave_data); + return err; +} + +static int rtnl_link_info_fill(struct sk_buff *skb, + const struct net_device 
*dev) +{ + const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + struct nlattr *data; + int err; + if (!ops) + return 0; if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0) - goto err_cancel_link; + return -EMSGSIZE; if (ops->fill_xstats) { err = ops->fill_xstats(skb, dev); if (err < 0) - goto err_cancel_link; + return err; } if (ops->fill_info) { data = nla_nest_start(skb, IFLA_INFO_DATA); - if (data == NULL) { - err = -EMSGSIZE; - goto err_cancel_link; - } + if (data == NULL) + return -EMSGSIZE; err = ops->fill_info(skb, dev); if (err < 0) goto err_cancel_data; nla_nest_end(skb, data); } - - nla_nest_end(skb, linkinfo); return 0; err_cancel_data: nla_nest_cancel(skb, data); + return err; +} + +static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) +{ + struct nlattr *linkinfo; + int err = -EMSGSIZE; + + linkinfo = nla_nest_start(skb, IFLA_LINKINFO); + if (linkinfo == NULL) + goto out; + + err = rtnl_link_info_fill(skb, dev); + if (err < 0) + goto err_cancel_link; + + err = rtnl_link_slave_info_fill(skb, dev); + if (err < 0) + goto err_cancel_link; + + nla_nest_end(skb, linkinfo); + return 0; + err_cancel_link: nla_nest_cancel(skb, linkinfo); out: @@ -707,14 +798,15 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, size += num_vfs * (nla_total_size(sizeof(struct ifla_vf_mac)) + nla_total_size(sizeof(struct ifla_vf_vlan)) + - nla_total_size(sizeof(struct ifla_vf_tx_rate)) + - nla_total_size(sizeof(struct ifla_vf_spoofchk))); + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + + nla_total_size(sizeof(struct ifla_vf_rate))); return size; } else return 0; } -static size_t rtnl_port_size(const struct net_device *dev) +static size_t rtnl_port_size(const struct net_device *dev, + u32 ext_filter_mask) { size_t port_size = nla_total_size(4) /* PORT_VF */ + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */ @@ -730,7 +822,8 @@ static size_t rtnl_port_size(const struct net_device *dev) size_t port_self_size = nla_total_size(sizeof(struct nlattr)) + port_size; - if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent) + if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || + !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; if (dev_num_vf(dev->dev.parent)) return port_self_size + vf_ports_size + @@ -762,10 +855,11 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ + nla_total_size(ext_filter_mask & RTEXT_FILTER_VF ? 
4 : 0) /* IFLA_NUM_VF */ + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ - + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */ + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */ @@ -827,11 +921,13 @@ static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev) return 0; } -static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev) +static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev, + u32 ext_filter_mask) { int err; - if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent) + if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || + !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; err = rtnl_port_self_fill(skb, dev); @@ -910,7 +1006,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, (dev->qdisc && nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || (dev->ifalias && - nla_put_string(skb, IFLA_IFALIAS, dev->ifalias))) + nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) || + nla_put_u32(skb, IFLA_CARRIER_CHANGES, + atomic_read(&dev->carrier_changes))) goto nla_put_failure; if (1) { @@ -967,6 +1065,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct ifla_vf_info ivi; struct ifla_vf_mac vf_mac; struct ifla_vf_vlan vf_vlan; + struct ifla_vf_rate vf_rate; struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_spoofchk vf_spoofchk; struct ifla_vf_link_state vf_linkstate; @@ -987,6 +1086,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, break; vf_mac.vf = vf_vlan.vf = + vf_rate.vf = vf_tx_rate.vf = vf_spoofchk.vf = vf_linkstate.vf = ivi.vf; @@ -994,7 +1094,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; - vf_tx_rate.rate = ivi.tx_rate; + vf_tx_rate.rate = ivi.max_tx_rate; + vf_rate.min_tx_rate = ivi.min_tx_rate; + vf_rate.max_tx_rate = ivi.max_tx_rate; vf_spoofchk.setting = ivi.spoofchk; vf_linkstate.link_state = ivi.linkstate; vf = nla_nest_start(skb, IFLA_VF_INFO); @@ -1004,6 +1106,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, } if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || + nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), + &vf_rate) || nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate) || nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), @@ -1016,10 +1120,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_nest_end(skb, vfinfo); } - if (rtnl_port_fill(skb, dev)) + if (rtnl_port_fill(skb, dev, ext_filter_mask)) goto nla_put_failure; - if (dev->rtnl_link_ops) { + if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) { if (rtnl_link_fill(skb, dev) < 0) goto nla_put_failure; } @@ -1061,56 +1165,7 @@ nla_put_failure: return -EMSGSIZE; } -static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct net *net = sock_net(skb->sk); - int h, s_h; - int idx = 0, s_idx; - struct net_device *dev; - struct hlist_head *head; - struct nlattr *tb[IFLA_MAX+1]; - u32 ext_filter_mask = 0; - - s_h = cb->args[0]; - s_idx = cb->args[1]; - - rcu_read_lock(); - cb->seq = net->dev_base_seq; - - if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, - ifla_policy) >= 0) { - - if 
(tb[IFLA_EXT_MASK]) - ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); - } - - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &net->dev_index_head[h]; - hlist_for_each_entry_rcu(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, - NLM_F_MULTI, - ext_filter_mask) <= 0) - goto out; - - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); -cont: - idx++; - } - } -out: - rcu_read_unlock(); - cb->args[1] = idx; - cb->args[0] = h; - - return skb->len; -} - -const struct nla_policy ifla_policy[IFLA_MAX+1] = { +static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, @@ -1136,12 +1191,14 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN }, + [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ }; -EXPORT_SYMBOL(ifla_policy); static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { [IFLA_INFO_KIND] = { .type = NLA_STRING }, [IFLA_INFO_DATA] = { .type = NLA_NESTED }, + [IFLA_INFO_SLAVE_KIND] = { .type = NLA_STRING }, + [IFLA_INFO_SLAVE_DATA] = { .type = NLA_NESTED }, }; static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = { @@ -1157,6 +1214,10 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { .len = sizeof(struct ifla_vf_tx_rate) }, [IFLA_VF_SPOOFCHK] = { .type = NLA_BINARY, .len = sizeof(struct ifla_vf_spoofchk) }, + [IFLA_VF_RATE] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_rate) }, + [IFLA_VF_LINK_STATE] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_link_state) }, }; static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { @@ -1173,6 +1234,78 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, }; +static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int h, s_h; + int idx = 0, s_idx; + struct net_device *dev; + struct hlist_head *head; + struct nlattr *tb[IFLA_MAX+1]; + u32 ext_filter_mask = 0; + int err; + int hdrlen; + + s_h = cb->args[0]; + s_idx = cb->args[1]; + + rcu_read_lock(); + cb->seq = net->dev_base_seq; + + /* A hack to preserve kernel<->userspace interface. + * The correct header is ifinfomsg. It is consistent with rtnl_getlink. + * However, before Linux v3.9 the code here assumed rtgenmsg and that's + * what iproute2 < v3.9.0 used. + * We can detect the old iproute2. Even including the IFLA_EXT_MASK + * attribute, its netlink message is shorter than struct ifinfomsg. + */ + hdrlen = nlmsg_len(cb->nlh) < sizeof(struct ifinfomsg) ? 
+ sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); + + if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) { + + if (tb[IFLA_EXT_MASK]) + ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + } + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + hlist_for_each_entry_rcu(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, 0, + NLM_F_MULTI, + ext_filter_mask); + /* If we ran out of room on the first message, + * we're in trouble + */ + WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); + + if (err <= 0) + goto out; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + } +out: + rcu_read_unlock(); + cb->args[1] = idx; + cb->args[0] = h; + + return skb->len; +} + +int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len) +{ + return nla_parse(tb, IFLA_MAX, head, len, ifla_policy); +} +EXPORT_SYMBOL(rtnl_nla_parse_ifla); + struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) { struct net *net; @@ -1254,11 +1387,29 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr) } case IFLA_VF_TX_RATE: { struct ifla_vf_tx_rate *ivt; + struct ifla_vf_info ivf; + ivt = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_get_vf_config) + err = ops->ndo_get_vf_config(dev, ivt->vf, + &ivf); + if (err) + break; + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_rate) + err = ops->ndo_set_vf_rate(dev, ivt->vf, + ivf.min_tx_rate, + ivt->rate); + break; + } + case IFLA_VF_RATE: { + struct ifla_vf_rate *ivt; ivt = nla_data(vf); err = -EOPNOTSUPP; - if (ops->ndo_set_vf_tx_rate) - err = ops->ndo_set_vf_tx_rate(dev, ivt->vf, - ivt->rate); + if (ops->ndo_set_vf_rate) + err = ops->ndo_set_vf_rate(dev, ivt->vf, + ivt->min_tx_rate, + ivt->max_tx_rate); break; } case IFLA_VF_SPOOFCHK: { @@ -1324,7 +1475,8 @@ static int do_set_master(struct net_device *dev, int ifindex) return 0; } -static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, +static int do_setlink(const struct sk_buff *skb, + struct net_device *dev, struct ifinfomsg *ifm, struct nlattr **tb, char *ifname, int modified) { const struct net_device_ops *ops = dev->netdev_ops; @@ -1336,7 +1488,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, err = PTR_ERR(net); goto errout; } - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) { + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { err = -EPERM; goto errout; } @@ -1590,7 +1742,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) if (err < 0) goto errout; - err = do_setlink(dev, ifm, tb, ifname, 0); + err = do_setlink(skb, dev, ifm, tb, ifname, 0); errout: return err; } @@ -1630,7 +1782,6 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) ops->dellink(dev, &list_kill); unregister_netdevice_many(&list_kill); - list_del(&list_kill); return 0; } @@ -1707,7 +1858,8 @@ err: } EXPORT_SYMBOL(rtnl_create_link); -static int rtnl_group_changelink(struct net *net, int group, +static int rtnl_group_changelink(const struct sk_buff *skb, + struct net *net, int group, struct ifinfomsg *ifm, struct nlattr **tb) { @@ -1716,7 +1868,7 @@ static int rtnl_group_changelink(struct net *net, int group, for_each_netdev(net, dev) { if (dev->group == group) { - err = do_setlink(dev, ifm, tb, NULL, 0); + err = do_setlink(skb, dev, ifm, tb, NULL, 0); if (err < 0) return err; } @@ -1729,7 +1881,9 @@ static int rtnl_newlink(struct 
sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; + const struct rtnl_link_ops *m_ops = NULL; struct net_device *dev; + struct net_device *master_dev = NULL; struct ifinfomsg *ifm; char kind[MODULE_NAME_LEN]; char ifname[IFNAMSIZ]; @@ -1759,6 +1913,12 @@ replay: dev = NULL; } + if (dev) { + master_dev = netdev_master_upper_dev_get(dev); + if (master_dev) + m_ops = master_dev->rtnl_link_ops; + } + err = validate_linkmsg(dev, tb); if (err < 0) return err; @@ -1780,7 +1940,10 @@ replay: } if (1) { - struct nlattr *attr[ops ? ops->maxtype + 1 : 0], **data = NULL; + struct nlattr *attr[ops ? ops->maxtype + 1 : 0]; + struct nlattr *slave_attr[m_ops ? m_ops->slave_maxtype + 1 : 0]; + struct nlattr **data = NULL; + struct nlattr **slave_data = NULL; struct net *dest_net; if (ops) { @@ -1799,6 +1962,24 @@ replay: } } + if (m_ops) { + if (m_ops->slave_maxtype && + linkinfo[IFLA_INFO_SLAVE_DATA]) { + err = nla_parse_nested(slave_attr, + m_ops->slave_maxtype, + linkinfo[IFLA_INFO_SLAVE_DATA], + m_ops->slave_policy); + if (err < 0) + return err; + slave_data = slave_attr; + } + if (m_ops->slave_validate) { + err = m_ops->slave_validate(tb, slave_data); + if (err < 0) + return err; + } + } + if (dev) { int modified = 0; @@ -1818,12 +1999,23 @@ replay: modified = 1; } - return do_setlink(dev, ifm, tb, ifname, modified); + if (linkinfo[IFLA_INFO_SLAVE_DATA]) { + if (!m_ops || !m_ops->slave_changelink) + return -EOPNOTSUPP; + + err = m_ops->slave_changelink(master_dev, dev, + tb, slave_data); + if (err < 0) + return err; + modified = 1; + } + + return do_setlink(skb, dev, ifm, tb, ifname, modified); } if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) - return rtnl_group_changelink(net, + return rtnl_group_changelink(skb, net, nla_get_u32(tb[IFLA_GROUP]), ifm, tb); return -ENODEV; @@ -1861,16 +2053,25 @@ replay: dev->ifindex = ifm->ifi_index; - if (ops->newlink) + if (ops->newlink) { err = ops->newlink(net, dev, tb, data); - else + /* Drivers should call free_netdev() in ->destructor + * and unregister it on failure after registration + * so that device could be finally freed in rtnl_unlock. + */ + if (err < 0) { + /* If device is not registered at all, free it now */ + if (dev->reg_state == NETREG_UNINITIALIZED) + free_netdev(dev); + goto out; + } + } else { err = register_netdevice(dev); - - if (err < 0) { - free_netdev(dev); - goto out; + if (err < 0) { + free_netdev(dev); + goto out; + } } - err = rtnl_configure_link(dev, ifm); if (err < 0) unregister_netdevice(dev); @@ -1935,9 +2136,13 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) struct nlattr *tb[IFLA_MAX+1]; u32 ext_filter_mask = 0; u16 min_ifinfo_dump_size = 0; + int hdrlen; + + /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */ + hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? 
+ sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); - if (nlmsg_parse(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, - ifla_policy) >= 0) { + if (nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) { if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); } @@ -2014,12 +2219,13 @@ EXPORT_SYMBOL(rtmsg_ifinfo); static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, u8 *addr, u32 pid, u32 seq, - int type, unsigned int flags) + int type, unsigned int flags, + int nlflags) { struct nlmsghdr *nlh; struct ndmsg *ndm; - nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), NLM_F_MULTI); + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), nlflags); if (!nlh) return -EMSGSIZE; @@ -2057,7 +2263,7 @@ static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, int type) if (!skb) goto errout; - err = nlmsg_populate_fdb_fill(skb, dev, addr, 0, 0, type, NTF_SELF); + err = nlmsg_populate_fdb_fill(skb, dev, addr, 0, 0, type, NTF_SELF, 0); if (err < 0) { kfree_skb(skb); goto errout; @@ -2204,7 +2410,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) int err = -EINVAL; __u8 *addr; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); @@ -2282,7 +2488,8 @@ static int nlmsg_populate_fdb(struct sk_buff *skb, err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, portid, seq, - RTM_NEWNEIGH, NTF_SELF); + RTM_NEWNEIGH, NTF_SELF, + NLM_F_MULTI); if (err < 0) return err; skip: @@ -2655,7 +2862,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) sz_idx = type>>2; kind = type&3; - if (kind != 2 && !ns_capable(net->user_ns, CAP_NET_ADMIN)) + if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 897da56f3af..ba71212f025 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -85,31 +85,6 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #endif #ifdef CONFIG_INET -__u32 secure_ip_id(__be32 daddr) -{ - u32 hash[MD5_DIGEST_WORDS]; - - net_secret_init(); - hash[0] = (__force __u32) daddr; - hash[1] = net_secret[13]; - hash[2] = net_secret[14]; - hash[3] = net_secret[15]; - - md5_transform(hash, net_secret); - - return hash[0]; -} - -__u32 secure_ipv6_id(const __be32 daddr[4]) -{ - __u32 hash[4]; - - net_secret_init(); - memcpy(hash, daddr, 16); - md5_transform(hash, net_secret); - - return hash[0]; -} __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 06e72d3cdf6..c1a33033cbe 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -47,6 +47,8 @@ #include <linux/in.h> #include <linux/inet.h> #include <linux/slab.h> +#include <linux/tcp.h> +#include <linux/udp.h> #include <linux/netdevice.h> #ifdef CONFIG_NET_CLS_ACT #include <net/pkt_sched.h> @@ -65,6 +67,7 @@ #include <net/dst.h> #include <net/sock.h> #include <net/checksum.h> +#include <net/ip6_checksum.h> #include <net/xfrm.h> #include <asm/uaccess.h> @@ -74,36 +77,6 @@ struct kmem_cache *skbuff_head_cache __read_mostly; static struct kmem_cache *skbuff_fclone_cache __read_mostly; -static void sock_pipe_buf_release(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - put_page(buf->page); -} - -static void sock_pipe_buf_get(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - get_page(buf->page); -} - -static int sock_pipe_buf_steal(struct 
pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - return 1; -} - - -/* Pipe buffer operations for a socket. */ -static const struct pipe_buf_operations sock_pipe_buf_ops = { - .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, - .confirm = generic_pipe_buf_confirm, - .release = sock_pipe_buf_release, - .steal = sock_pipe_buf_steal, - .get = sock_pipe_buf_get, -}; - /** * skb_panic - private function for out-of-line support * @skb: buffer @@ -712,17 +685,19 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->inner_network_header = old->inner_network_header; new->inner_mac_header = old->inner_mac_header; skb_dst_copy(new, old); - new->rxhash = old->rxhash; + skb_copy_hash(new, old); new->ooo_okay = old->ooo_okay; - new->l4_rxhash = old->l4_rxhash; new->no_fcs = old->no_fcs; new->encapsulation = old->encapsulation; + new->encap_hdr_csum = old->encap_hdr_csum; + new->csum_valid = old->csum_valid; + new->csum_complete_sw = old->csum_complete_sw; #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); #endif memcpy(new->cb, old->cb, sizeof(old->cb)); new->csum = old->csum; - new->local_df = old->local_df; + new->ignore_df = old->ignore_df; new->pkt_type = old->pkt_type; new->ip_summed = old->ip_summed; skb_copy_queue_mapping(new, old); @@ -735,9 +710,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->mark = old->mark; new->skb_iif = old->skb_iif; __nf_copy(new, old); -#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) - new->nf_trace = old->nf_trace; -#endif #ifdef CONFIG_NET_SCHED new->tc_index = old->tc_index; #ifdef CONFIG_NET_CLS_ACT @@ -982,10 +954,13 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) EXPORT_SYMBOL(skb_copy); /** - * __pskb_copy - create copy of an sk_buff with private head. + * __pskb_copy_fclone - create copy of an sk_buff with private head. * @skb: buffer to copy * @headroom: headroom of new skb * @gfp_mask: allocation priority + * @fclone: if true allocate the copy of the skb from the fclone + * cache instead of the head cache; it is recommended to set this + * to true for the cases where the copy will likely be cloned * * Make a copy of both an &sk_buff and part of its data, located * in header. Fragmented data remain shared. This is used when @@ -995,11 +970,12 @@ EXPORT_SYMBOL(skb_copy); * The returned buffer has a reference count of 1. */ -struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) +struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, + gfp_t gfp_mask, bool fclone) { unsigned int size = skb_headlen(skb) + headroom; - struct sk_buff *n = __alloc_skb(size, gfp_mask, - skb_alloc_rx_flag(skb), NUMA_NO_NODE); + int flags = skb_alloc_rx_flag(skb) | (fclone ? 
SKB_ALLOC_FCLONE : 0); + struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE); if (!n) goto out; @@ -1039,7 +1015,7 @@ struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) out: return n; } -EXPORT_SYMBOL(__pskb_copy); +EXPORT_SYMBOL(__pskb_copy_fclone); /** * pskb_expand_head - reallocate header of &sk_buff @@ -1830,7 +1806,7 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, .partial = partial, .nr_pages_max = MAX_SKB_FRAGS, .flags = flags, - .ops = &sock_pipe_buf_ops, + .ops = &nosteal_pipe_buf_ops, .spd_release = sock_spd_release, }; struct sk_buff *frag_iter; @@ -2122,6 +2098,104 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, } EXPORT_SYMBOL(skb_copy_and_csum_bits); + /** + * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() + * @from: source buffer + * + * Calculates the amount of linear headroom needed in the 'to' skb passed + * into skb_zerocopy(). + */ +unsigned int +skb_zerocopy_headlen(const struct sk_buff *from) +{ + unsigned int hlen = 0; + + if (!from->head_frag || + skb_headlen(from) < L1_CACHE_BYTES || + skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) + hlen = skb_headlen(from); + + if (skb_has_frag_list(from)) + hlen = from->len; + + return hlen; +} +EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); + +/** + * skb_zerocopy - Zero copy skb to skb + * @to: destination buffer + * @from: source buffer + * @len: number of bytes to copy from source buffer + * @hlen: size of linear headroom in destination buffer + * + * Copies up to `len` bytes from `from` to `to` by creating references + * to the frags in the source buffer. + * + * The `hlen` as calculated by skb_zerocopy_headlen() specifies the + * headroom in the `to` buffer. + * + * Return value: + * 0: everything is OK + * -ENOMEM: couldn't orphan frags of @from due to lack of memory + * -EFAULT: skb_copy_bits() found some problem with skb geometry + */ +int +skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) +{ + int i, j = 0; + int plen = 0; /* length of skb->head fragment */ + int ret; + struct page *page; + unsigned int offset; + + BUG_ON(!from->head_frag && !hlen); + + /* dont bother with small payloads */ + if (len <= skb_tailroom(to)) + return skb_copy_bits(from, 0, skb_put(to, len), len); + + if (hlen) { + ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); + if (unlikely(ret)) + return ret; + len -= hlen; + } else { + plen = min_t(int, skb_headlen(from), len); + if (plen) { + page = virt_to_head_page(from->head); + offset = from->data - (unsigned char *)page_address(page); + __skb_fill_page_desc(to, 0, page, offset, plen); + get_page(page); + j = 1; + len -= plen; + } + } + + to->truesize += len + plen; + to->len += len + plen; + to->data_len += len + plen; + + if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { + skb_tx_error(from); + return -ENOMEM; + } + + for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { + if (!len) + break; + skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; + skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len); + len -= skb_shinfo(to)->frags[j].size; + skb_frag_ref(to, j); + j++; + } + skb_shinfo(to)->nr_frags = j; + + return 0; +} +EXPORT_SYMBOL_GPL(skb_zerocopy); + void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) { __wsum csum; @@ -2784,81 +2858,87 @@ EXPORT_SYMBOL_GPL(skb_pull_rcsum); /** * skb_segment - Perform protocol segmentation on skb. 
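skb_zerocopy() and skb_zerocopy_headlen() above are meant to be used as a pair; the following caller sketch loosely follows the pattern of their in-tree users, with the allocation size and GFP flags being assumptions of the sketch rather than part of the patch.

/* Sketch only: build a new skb that shares @from's payload pages. */
static struct sk_buff *example_zerocopy_clone(struct sk_buff *from, int len)
{
	unsigned int hlen = skb_zerocopy_headlen(from);
	struct sk_buff *to;

	to = alloc_skb(hlen, GFP_ATOMIC);
	if (!to)
		return NULL;

	/* If @len fits in @to's tailroom, skb_zerocopy() just copies the
	 * bytes; otherwise it linearises @hlen bytes and takes page
	 * references for the rest.  It returns 0, -ENOMEM or -EFAULT.
	 */
	if (skb_zerocopy(to, from, len, hlen)) {
		kfree_skb(to);
		return NULL;
	}
	return to;
}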
- * @skb: buffer to segment + * @head_skb: buffer to segment * @features: features for the output path (see dev->features) * * This function performs segmentation on the given skb. It returns * a pointer to the first in a list of new skbs for the segments. * In case of error it returns ERR_PTR(err). */ -struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) +struct sk_buff *skb_segment(struct sk_buff *head_skb, + netdev_features_t features) { struct sk_buff *segs = NULL; struct sk_buff *tail = NULL; - struct sk_buff *fskb = skb_shinfo(skb)->frag_list; - skb_frag_t *skb_frag = skb_shinfo(skb)->frags; - unsigned int mss = skb_shinfo(skb)->gso_size; - unsigned int doffset = skb->data - skb_mac_header(skb); + struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; + skb_frag_t *frag = skb_shinfo(head_skb)->frags; + unsigned int mss = skb_shinfo(head_skb)->gso_size; + unsigned int doffset = head_skb->data - skb_mac_header(head_skb); + struct sk_buff *frag_skb = head_skb; unsigned int offset = doffset; - unsigned int tnl_hlen = skb_tnl_header_len(skb); + unsigned int tnl_hlen = skb_tnl_header_len(head_skb); unsigned int headroom; unsigned int len; __be16 proto; bool csum; int sg = !!(features & NETIF_F_SG); - int nfrags = skb_shinfo(skb)->nr_frags; + int nfrags = skb_shinfo(head_skb)->nr_frags; int err = -ENOMEM; int i = 0; int pos; + int dummy; - proto = skb_network_protocol(skb); + __skb_push(head_skb, doffset); + proto = skb_network_protocol(head_skb, &dummy); if (unlikely(!proto)) return ERR_PTR(-EINVAL); - csum = !!can_checksum_protocol(features, proto); - __skb_push(skb, doffset); - headroom = skb_headroom(skb); - pos = skb_headlen(skb); + csum = !head_skb->encap_hdr_csum && + !!can_checksum_protocol(features, proto); + + headroom = skb_headroom(head_skb); + pos = skb_headlen(head_skb); do { struct sk_buff *nskb; - skb_frag_t *frag; + skb_frag_t *nskb_frag; int hsize; int size; - len = skb->len - offset; + len = head_skb->len - offset; if (len > mss) len = mss; - hsize = skb_headlen(skb) - offset; + hsize = skb_headlen(head_skb) - offset; if (hsize < 0) hsize = 0; if (hsize > len || !sg) hsize = len; - if (!hsize && i >= nfrags && skb_headlen(fskb) && - (skb_headlen(fskb) == len || sg)) { - BUG_ON(skb_headlen(fskb) > len); + if (!hsize && i >= nfrags && skb_headlen(list_skb) && + (skb_headlen(list_skb) == len || sg)) { + BUG_ON(skb_headlen(list_skb) > len); i = 0; - nfrags = skb_shinfo(fskb)->nr_frags; - skb_frag = skb_shinfo(fskb)->frags; - pos += skb_headlen(fskb); + nfrags = skb_shinfo(list_skb)->nr_frags; + frag = skb_shinfo(list_skb)->frags; + frag_skb = list_skb; + pos += skb_headlen(list_skb); while (pos < offset + len) { BUG_ON(i >= nfrags); - size = skb_frag_size(skb_frag); + size = skb_frag_size(frag); if (pos + size > offset + len) break; i++; pos += size; - skb_frag++; + frag++; } - nskb = skb_clone(fskb, GFP_ATOMIC); - fskb = fskb->next; + nskb = skb_clone(list_skb, GFP_ATOMIC); + list_skb = list_skb->next; if (unlikely(!nskb)) goto err; @@ -2879,7 +2959,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) __skb_push(nskb, doffset); } else { nskb = __alloc_skb(hsize + doffset + headroom, - GFP_ATOMIC, skb_alloc_rx_flag(skb), + GFP_ATOMIC, skb_alloc_rx_flag(head_skb), NUMA_NO_NODE); if (unlikely(!nskb)) @@ -2895,12 +2975,12 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) segs = nskb; tail = nskb; - __copy_skb_header(nskb, skb); - nskb->mac_len = skb->mac_len; + __copy_skb_header(nskb, 
head_skb); + nskb->mac_len = head_skb->mac_len; skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); - skb_copy_from_linear_data_offset(skb, -tnl_hlen, + skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, nskb->data - tnl_hlen, doffset + tnl_hlen); @@ -2909,30 +2989,34 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) if (!sg) { nskb->ip_summed = CHECKSUM_NONE; - nskb->csum = skb_copy_and_csum_bits(skb, offset, + nskb->csum = skb_copy_and_csum_bits(head_skb, offset, skb_put(nskb, len), len, 0); + SKB_GSO_CB(nskb)->csum_start = + skb_headroom(nskb) + doffset; continue; } - frag = skb_shinfo(nskb)->frags; + nskb_frag = skb_shinfo(nskb)->frags; - skb_copy_from_linear_data_offset(skb, offset, + skb_copy_from_linear_data_offset(head_skb, offset, skb_put(nskb, hsize), hsize); - skb_shinfo(nskb)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; + skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags & + SKBTX_SHARED_FRAG; while (pos < offset + len) { if (i >= nfrags) { - BUG_ON(skb_headlen(fskb)); + BUG_ON(skb_headlen(list_skb)); i = 0; - nfrags = skb_shinfo(fskb)->nr_frags; - skb_frag = skb_shinfo(fskb)->frags; + nfrags = skb_shinfo(list_skb)->nr_frags; + frag = skb_shinfo(list_skb)->frags; + frag_skb = list_skb; BUG_ON(!nfrags); - fskb = fskb->next; + list_skb = list_skb->next; } if (unlikely(skb_shinfo(nskb)->nr_frags >= @@ -2943,27 +3027,30 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) goto err; } - *frag = *skb_frag; - __skb_frag_ref(frag); - size = skb_frag_size(frag); + if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC))) + goto err; + + *nskb_frag = *frag; + __skb_frag_ref(nskb_frag); + size = skb_frag_size(nskb_frag); if (pos < offset) { - frag->page_offset += offset - pos; - skb_frag_size_sub(frag, offset - pos); + nskb_frag->page_offset += offset - pos; + skb_frag_size_sub(nskb_frag, offset - pos); } skb_shinfo(nskb)->nr_frags++; if (pos + size <= offset + len) { i++; - skb_frag++; + frag++; pos += size; } else { - skb_frag_size_sub(frag, pos + size - (offset + len)); + skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); goto skip_fraglist; } - frag++; + nskb_frag++; } skip_fraglist: @@ -2976,16 +3063,15 @@ perform_csum_check: nskb->csum = skb_checksum(nskb, doffset, nskb->len - doffset, 0); nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum_start = + skb_headroom(nskb) + doffset; } - } while ((offset += len) < skb->len); + } while ((offset += len) < head_skb->len); return segs; err: - while ((skb = segs)) { - segs = skb->next; - kfree_skb(skb); - } + kfree_skb_list(segs); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(skb_segment); @@ -3003,7 +3089,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) if (unlikely(p->len + len >= 65536)) return -E2BIG; - lp = NAPI_GRO_CB(p)->last ?: p; + lp = NAPI_GRO_CB(p)->last; pinfo = skb_shinfo(lp); if (headlen <= offset) { @@ -3119,7 +3205,7 @@ merge: __skb_pull(skb, offset); - if (!NAPI_GRO_CB(p)->last) + if (NAPI_GRO_CB(p)->last == p) skb_shinfo(p)->frag_list = skb; else NAPI_GRO_CB(p)->last->next = skb; @@ -3227,6 +3313,32 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) return elt; } +/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given + * sglist without mark the sg which contain last skb data as the end. + * So the caller can mannipulate sg list as will when padding new data after + * the first call without calling sg_unmark_end to expend sg list. 
+ * + * Scenario to use skb_to_sgvec_nomark: + * 1. sg_init_table + * 2. skb_to_sgvec_nomark(payload1) + * 3. skb_to_sgvec_nomark(payload2) + * + * This is equivalent to: + * 1. sg_init_table + * 2. skb_to_sgvec(payload1) + * 3. sg_unmark_end + * 4. skb_to_sgvec(payload2) + * + * When mapping mutilple payload conditionally, skb_to_sgvec_nomark + * is more preferable. + */ +int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, + int offset, int len) +{ + return __skb_to_sgvec(skb, sg, offset, len); +} +EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); + int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) { int nsg = __skb_to_sgvec(skb, sg, offset, len); @@ -3359,8 +3471,6 @@ static void sock_rmem_free(struct sk_buff *skb) */ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) { - int len = skb->len; - if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned int)sk->sk_rcvbuf) return -ENOMEM; @@ -3375,7 +3485,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) skb_queue_tail(&sk->sk_error_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, len); + sk->sk_data_ready(sk); return 0; } EXPORT_SYMBOL(sock_queue_err_skb); @@ -3468,6 +3578,238 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) } EXPORT_SYMBOL_GPL(skb_partial_csum_set); +static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, + unsigned int max) +{ + if (skb_headlen(skb) >= len) + return 0; + + /* If we need to pullup then pullup to the max, so we + * won't need to do it again. + */ + if (max > skb->len) + max = skb->len; + + if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) + return -ENOMEM; + + if (skb_headlen(skb) < len) + return -EPROTO; + + return 0; +} + +#define MAX_TCP_HDR_LEN (15 * 4) + +static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, + typeof(IPPROTO_IP) proto, + unsigned int off) +{ + switch (proto) { + int err; + + case IPPROTO_TCP: + err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), + off + MAX_TCP_HDR_LEN); + if (!err && !skb_partial_csum_set(skb, off, + offsetof(struct tcphdr, + check))) + err = -EPROTO; + return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; + + case IPPROTO_UDP: + err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), + off + sizeof(struct udphdr)); + if (!err && !skb_partial_csum_set(skb, off, + offsetof(struct udphdr, + check))) + err = -EPROTO; + return err ? ERR_PTR(err) : &udp_hdr(skb)->check; + } + + return ERR_PTR(-EPROTO); +} + +/* This value should be large enough to cover a tagged ethernet header plus + * maximally sized IP and TCP or UDP headers. + */ +#define MAX_IP_HDR_LEN 128 + +static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) +{ + unsigned int off; + bool fragment; + __sum16 *csum; + int err; + + fragment = false; + + err = skb_maybe_pull_tail(skb, + sizeof(struct iphdr), + MAX_IP_HDR_LEN); + if (err < 0) + goto out; + + if (ip_hdr(skb)->frag_off & htons(IP_OFFSET | IP_MF)) + fragment = true; + + off = ip_hdrlen(skb); + + err = -EPROTO; + + if (fragment) + goto out; + + csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); + if (IS_ERR(csum)) + return PTR_ERR(csum); + + if (recalculate) + *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - off, + ip_hdr(skb)->protocol, 0); + err = 0; + +out: + return err; +} + +/* This value should be large enough to cover a tagged ethernet header plus + * an IPv6 header, all options, and a maximal TCP or UDP header. 
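The per-protocol helpers above (and the IPv6 variant below) are driven by skb_checksum_setup(), defined at the end of this hunk. A hedged sketch of how a virtual-NIC style receive path might call it; the wrapper name here is hypothetical.

/* Sketch only: fix up a CHECKSUM_PARTIAL packet handed in by a guest. */
static int example_fixup_guest_csum(struct sk_buff *skb)
{
	if (skb->ip_summed != CHECKSUM_PARTIAL)
		return 0;

	/* 'true' asks the helper to also recompute the pseudo-header sum */
	return skb_checksum_setup(skb, true);
}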
+ */ +#define MAX_IPV6_HDR_LEN 256 + +#define OPT_HDR(type, skb, off) \ + (type *)(skb_network_header(skb) + (off)) + +static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) +{ + int err; + u8 nexthdr; + unsigned int off; + unsigned int len; + bool fragment; + bool done; + __sum16 *csum; + + fragment = false; + done = false; + + off = sizeof(struct ipv6hdr); + + err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + nexthdr = ipv6_hdr(skb)->nexthdr; + + len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); + while (off <= len && !done) { + switch (nexthdr) { + case IPPROTO_DSTOPTS: + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: { + struct ipv6_opt_hdr *hp; + + err = skb_maybe_pull_tail(skb, + off + + sizeof(struct ipv6_opt_hdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); + nexthdr = hp->nexthdr; + off += ipv6_optlen(hp); + break; + } + case IPPROTO_AH: { + struct ip_auth_hdr *hp; + + err = skb_maybe_pull_tail(skb, + off + + sizeof(struct ip_auth_hdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + hp = OPT_HDR(struct ip_auth_hdr, skb, off); + nexthdr = hp->nexthdr; + off += ipv6_authlen(hp); + break; + } + case IPPROTO_FRAGMENT: { + struct frag_hdr *hp; + + err = skb_maybe_pull_tail(skb, + off + + sizeof(struct frag_hdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + hp = OPT_HDR(struct frag_hdr, skb, off); + + if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) + fragment = true; + + nexthdr = hp->nexthdr; + off += sizeof(struct frag_hdr); + break; + } + default: + done = true; + break; + } + } + + err = -EPROTO; + + if (!done || fragment) + goto out; + + csum = skb_checksum_setup_ip(skb, nexthdr, off); + if (IS_ERR(csum)) + return PTR_ERR(csum); + + if (recalculate) + *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - off, nexthdr, 0); + err = 0; + +out: + return err; +} + +/** + * skb_checksum_setup - set up partial checksum offset + * @skb: the skb to set up + * @recalculate: if true the pseudo-header checksum will be recalculated + */ +int skb_checksum_setup(struct sk_buff *skb, bool recalculate) +{ + int err; + + switch (skb->protocol) { + case htons(ETH_P_IP): + err = skb_checksum_setup_ipv4(skb, recalculate); + break; + + case htons(ETH_P_IPV6): + err = skb_checksum_setup_ipv6(skb, recalculate); + break; + + default: + err = -EPROTO; + break; + } + + return err; +} +EXPORT_SYMBOL(skb_checksum_setup); + void __skb_warn_lro_forwarding(const struct sk_buff *skb) { net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", @@ -3584,7 +3926,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->tstamp.tv64 = 0; skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; - skb->local_df = 0; + skb->ignore_df = 0; skb_dst_drop(skb); skb->mark = 0; secpath_reset(skb); @@ -3592,3 +3934,28 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) nf_reset_trace(skb); } EXPORT_SYMBOL_GPL(skb_scrub_packet); + +/** + * skb_gso_transport_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_transport_seglen is used to determine the real size of the + * individual segments, including Layer4 headers (TCP/UDP). + * + * The MAC/L2 or network (IP, IPv6) headers are not accounted for. 
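Because skb_gso_transport_seglen() deliberately excludes the network header, callers that need the on-wire size of each segment add it back themselves. A small sketch of that pattern (the helper name is illustrative), e.g. for checking a GSO packet against a destination MTU:

/* Sketch only: full per-segment size = L3 header + L4 header + gso_size. */
static bool example_gso_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
{
	unsigned int hdr_len = skb_transport_header(skb) - skb_network_header(skb);

	return skb_gso_transport_seglen(skb) + hdr_len > mtu;
}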
+ */ +unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) + return tcp_hdrlen(skb) + shinfo->gso_size; + + /* UFO sets gso_size to the size of the fragmentation + * payload, i.e. the size of the L4 (UDP) header is already + * accounted for. + */ + return shinfo->gso_size; +} +EXPORT_SYMBOL_GPL(skb_gso_transport_seglen); diff --git a/net/core/sock.c b/net/core/sock.c index ab20ed9b0f3..026e01f7027 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -145,6 +145,55 @@ static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); +/** + * sk_ns_capable - General socket capability test + * @sk: Socket to use a capability on or through + * @user_ns: The user namespace of the capability to use + * @cap: The capability to use + * + * Test to see if the opener of the socket had when the socket was + * created and the current process has the capability @cap in the user + * namespace @user_ns. + */ +bool sk_ns_capable(const struct sock *sk, + struct user_namespace *user_ns, int cap) +{ + return file_ns_capable(sk->sk_socket->file, user_ns, cap) && + ns_capable(user_ns, cap); +} +EXPORT_SYMBOL(sk_ns_capable); + +/** + * sk_capable - Socket global capability test + * @sk: Socket to use a capability on or through + * @cap: The global capbility to use + * + * Test to see if the opener of the socket had when the socket was + * created and the current process has the capability @cap in all user + * namespaces. + */ +bool sk_capable(const struct sock *sk, int cap) +{ + return sk_ns_capable(sk, &init_user_ns, cap); +} +EXPORT_SYMBOL(sk_capable); + +/** + * sk_net_capable - Network namespace socket capability test + * @sk: Socket to use a capability on or through + * @cap: The capability to use + * + * Test to see if the opener of the socket had when the socke was created + * and the current process has the capability @cap over the network namespace + * the socket is a member of. 
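sk_ns_capable() and its wrappers test the credentials recorded when the socket was opened rather than those of the current task; a brief sketch of the intended call pattern (the handler itself is hypothetical):

/* Sketch only: gate a privileged socket operation on the opener's caps. */
static int example_privileged_setsockopt(struct sock *sk, int val)
{
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;

	/* ... apply the privileged setting ... */
	return 0;
}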
+ */ +bool sk_net_capable(const struct sock *sk, int cap) +{ + return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); +} +EXPORT_SYMBOL(sk_net_capable); + + #ifdef CONFIG_MEMCG_KMEM int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { @@ -428,7 +477,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) spin_unlock_irqrestore(&list->lock, flags); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb_len); + sk->sk_data_ready(sk); return 0; } EXPORT_SYMBOL(sock_queue_rcv_skb); @@ -735,7 +784,7 @@ set_rcvbuf: break; case SO_NO_CHECK: - sk->sk_no_check = valbool; + sk->sk_no_check_tx = valbool; break; case SO_PRIORITY: @@ -882,7 +931,7 @@ set_rcvbuf: case SO_PEEK_OFF: if (sock->ops->set_peek_off) - sock->ops->set_peek_off(sk, val); + ret = sock->ops->set_peek_off(sk, val); else ret = -EOPNOTSUPP; break; @@ -925,8 +974,8 @@ set_rcvbuf: EXPORT_SYMBOL(sock_setsockopt); -void cred_to_ucred(struct pid *pid, const struct cred *cred, - struct ucred *ucred) +static void cred_to_ucred(struct pid *pid, const struct cred *cred, + struct ucred *ucred) { ucred->pid = pid_vnr(pid); ucred->uid = ucred->gid = -1; @@ -937,7 +986,6 @@ void cred_to_ucred(struct pid *pid, const struct cred *cred, ucred->gid = from_kgid_munged(current_ns, cred->egid); } } -EXPORT_SYMBOL_GPL(cred_to_ucred); int sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) @@ -1016,7 +1064,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_NO_CHECK: - v.val = sk->sk_no_check; + v.val = sk->sk_no_check_tx; break; case SO_PRIORITY: @@ -1168,6 +1216,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sock_flag(sk, SOCK_FILTER_LOCKED); break; + case SO_BPF_EXTENSIONS: + v.val = bpf_tell_extensions(); + break; + case SO_SELECT_ERR_QUEUE: v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); break; @@ -1308,19 +1360,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) module_put(owner); } -#if IS_ENABLED(CONFIG_NET_CLS_CGROUP) -void sock_update_classid(struct sock *sk) -{ - u32 classid; - - classid = task_cls_classid(current); - if (classid != sk->sk_classid) - sk->sk_classid = classid; -} -EXPORT_SYMBOL(sock_update_classid); -#endif - -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) void sock_update_netprioidx(struct sock *sk) { if (in_interrupt()) @@ -1666,22 +1706,6 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, EXPORT_SYMBOL(sock_wmalloc); /* - * Allocate a skb from the socket's receive buffer. - */ -struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, - gfp_t priority) -{ - if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { - struct sk_buff *skb = alloc_skb(size, priority); - if (skb) { - skb_set_owner_r(skb, sk); - return skb; - } - } - return NULL; -} - -/* * Allocate a memory block from the socket's option memory buffer. 
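The new SO_BPF_EXTENSIONS getsockopt (backed by bpf_tell_extensions() above) lets userspace probe which classic-BPF ancillary extensions the running kernel offers. A hedged userspace sketch, assuming libc headers that already define SO_BPF_EXTENSIONS:

/* Sketch only: query the kernel's supported BPF extension level. */
#include <sys/socket.h>

static int query_bpf_extensions(int fd)
{
	int val = 0;
	socklen_t len = sizeof(val);

	if (getsockopt(fd, SOL_SOCKET, SO_BPF_EXTENSIONS, &val, &len) < 0)
		return -1;

	return val;
}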
*/ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) @@ -1800,7 +1824,9 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, while (order) { if (npages >= 1 << order) { page = alloc_pages(sk->sk_allocation | - __GFP_COMP | __GFP_NOWARN, + __GFP_COMP | + __GFP_NOWARN | + __GFP_NORETRY, order); if (page) goto fill_page; @@ -1865,14 +1891,12 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio) put_page(pfrag->page); } - /* We restrict high order allocations to users that can afford to wait */ - order = (prio & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0; - + order = SKB_FRAG_PAGE_ORDER; do { gfp_t gfp = prio; if (order) - gfp |= __GFP_COMP | __GFP_NOWARN; + gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY; pfrag->page = alloc_pages(gfp, order); if (likely(pfrag->page)) { pfrag->offset = 0; @@ -2221,7 +2245,7 @@ static void sock_def_error_report(struct sock *sk) rcu_read_unlock(); } -static void sock_def_readable(struct sock *sk, int len) +static void sock_def_readable(struct sock *sk) { struct socket_wq *wq; @@ -2382,10 +2406,13 @@ void release_sock(struct sock *sk) if (sk->sk_backlog.tail) __release_sock(sk); + /* Warning : release_cb() might need to release sk ownership, + * ie call sock_release_ownership(sk) before us. + */ if (sk->sk_prot->release_cb) sk->sk_prot->release_cb(sk); - sk->sk_lock.owned = 0; + sock_release_ownership(sk); if (waitqueue_active(&sk->sk_lock.wq)) wake_up(&sk->sk_lock.wq); spin_unlock_bh(&sk->sk_lock.slock); diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index a0e9cf6379d..a4216a4c957 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -49,38 +49,35 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) } EXPORT_SYMBOL_GPL(sock_diag_put_meminfo); -int sock_diag_put_filterinfo(struct user_namespace *user_ns, struct sock *sk, +int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk, struct sk_buff *skb, int attrtype) { - struct nlattr *attr; + struct sock_fprog_kern *fprog; struct sk_filter *filter; - unsigned int len; + struct nlattr *attr; + unsigned int flen; int err = 0; - if (!ns_capable(user_ns, CAP_NET_ADMIN)) { + if (!may_report_filterinfo) { nla_reserve(skb, attrtype, 0); return 0; } rcu_read_lock(); - filter = rcu_dereference(sk->sk_filter); - len = filter ? 
filter->len * sizeof(struct sock_filter) : 0; + if (!filter) + goto out; - attr = nla_reserve(skb, attrtype, len); + fprog = filter->orig_prog; + flen = sk_filter_proglen(fprog); + + attr = nla_reserve(skb, attrtype, flen); if (attr == NULL) { err = -EMSGSIZE; goto out; } - if (filter) { - struct sock_filter *fb = (struct sock_filter *)nla_data(attr); - int i; - - for (i = 0; i < filter->len; i++, fb++) - sk_decode_filter(&filter->insns[i], fb); - } - + memcpy(nla_data(attr), fprog->filter, flen); out: rcu_read_unlock(); return err; diff --git a/net/core/stream.c b/net/core/stream.c index 512f0a24269..301c05f2606 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -122,7 +122,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) DEFINE_WAIT(wait); if (sk_stream_memory_free(sk)) - current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2; + current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2; while (1) { set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index cca44419090..cf9cd13509a 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -122,7 +122,8 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, synchronize_rcu(); kfree(cur); } else if (!cur && cpumask_test_cpu(i, mask)) { - cur = kzalloc(len, GFP_KERNEL); + cur = kzalloc_node(len, GFP_KERNEL, + cpu_to_node(i)); if (!cur) { /* not unwinding previous changes */ ret = -ENOMEM; diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 661b5a40ec1..6521dfd8b7c 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -23,16 +23,11 @@ #include <linux/skbuff.h> #include <linux/export.h> -static struct sock_filter ptp_filter[] = { - PTP_FILTER -}; - static unsigned int classify(const struct sk_buff *skb) { - if (likely(skb->dev && - skb->dev->phydev && + if (likely(skb->dev && skb->dev->phydev && skb->dev->phydev->drv)) - return sk_run_filter(skb, ptp_filter); + return ptp_classify_raw(skb); else return PTP_CLASS_NONE; } @@ -60,11 +55,13 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) if (likely(phydev->drv->txtstamp)) { if (!atomic_inc_not_zero(&sk->sk_refcnt)) return; + clone = skb_clone(skb, GFP_ATOMIC); if (!clone) { sock_put(sk); return; } + clone->sk = sk; phydev->drv->txtstamp(phydev, clone, type); } @@ -89,12 +86,15 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, } *skb_hwtstamps(skb) = *hwtstamps; + serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; skb->sk = NULL; + err = sock_queue_err_skb(sk, skb); + sock_put(sk); if (err) kfree_skb(skb); @@ -132,8 +132,3 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb) return false; } EXPORT_SYMBOL_GPL(skb_defer_rx_timestamp); - -void __init skb_timestamping_init(void) -{ - BUG_ON(sk_chk_filter(ptp_filter, ARRAY_SIZE(ptp_filter))); -} diff --git a/net/core/tso.c b/net/core/tso.c new file mode 100644 index 00000000000..8c3203c585b --- /dev/null +++ b/net/core/tso.c @@ -0,0 +1,77 @@ +#include <linux/export.h> +#include <net/ip.h> +#include <net/tso.h> + +/* Calculate expected number of TX descriptors */ +int tso_count_descs(struct sk_buff *skb) +{ + /* The Marvell Way */ + return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags; +} +EXPORT_SYMBOL(tso_count_descs); + +void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso, + int size, bool is_last) +{ + struct iphdr *iph; + struct tcphdr *tcph; + int hdr_len = 
skb_transport_offset(skb) + tcp_hdrlen(skb); + int mac_hdr_len = skb_network_offset(skb); + + memcpy(hdr, skb->data, hdr_len); + iph = (struct iphdr *)(hdr + mac_hdr_len); + iph->id = htons(tso->ip_id); + iph->tot_len = htons(size + hdr_len - mac_hdr_len); + tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb)); + tcph->seq = htonl(tso->tcp_seq); + tso->ip_id++; + + if (!is_last) { + /* Clear all special flags for not last packet */ + tcph->psh = 0; + tcph->fin = 0; + tcph->rst = 0; + } +} +EXPORT_SYMBOL(tso_build_hdr); + +void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size) +{ + tso->tcp_seq += size; + tso->size -= size; + tso->data += size; + + if ((tso->size == 0) && + (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx]; + + /* Move to next segment */ + tso->size = frag->size; + tso->data = page_address(frag->page.p) + frag->page_offset; + tso->next_frag_idx++; + } +} +EXPORT_SYMBOL(tso_build_data); + +void tso_start(struct sk_buff *skb, struct tso_t *tso) +{ + int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + + tso->ip_id = ntohs(ip_hdr(skb)->id); + tso->tcp_seq = ntohl(tcp_hdr(skb)->seq); + tso->next_frag_idx = 0; + + /* Build first data */ + tso->size = skb_headlen(skb) - hdr_len; + tso->data = skb->data + hdr_len; + if ((tso->size == 0) && + (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx]; + + /* Move to next segment */ + tso->size = frag->size; + tso->data = page_address(frag->page.p) + frag->page_offset; + tso->next_frag_idx++; + } +} +EXPORT_SYMBOL(tso_start); diff --git a/net/core/utils.c b/net/core/utils.c index 2f737bf90b3..eed34338736 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -348,8 +348,8 @@ static void __net_random_once_deferred(struct work_struct *w) { struct __net_random_once_work *work = container_of(w, struct __net_random_once_work, work); - if (!static_key_enabled(work->key)) - static_key_slow_inc(work->key); + BUG_ON(!static_key_enabled(work->key)); + static_key_slow_dec(work->key); kfree(work); } @@ -367,7 +367,7 @@ static void __net_random_once_disable_jump(struct static_key *key) } bool __net_get_random_once(void *buf, int nbytes, bool *done, - struct static_key *done_key) + struct static_key *once_key) { static DEFINE_SPINLOCK(lock); unsigned long flags; @@ -382,7 +382,7 @@ bool __net_get_random_once(void *buf, int nbytes, bool *done, *done = true; spin_unlock_irqrestore(&lock, flags); - __net_random_once_disable_jump(done_key); + __net_random_once_disable_jump(once_key); return true; } diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 66fbe1948fb..f8b98d89c28 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1669,7 +1669,7 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh) struct nlmsghdr *reply_nlh = NULL; const struct reply_func *fn; - if ((nlh->nlmsg_type == RTM_SETDCB) && !capable(CAP_NET_ADMIN)) + if ((nlh->nlmsg_type == RTM_SETDCB) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; ret = nlmsg_parse(nlh, sizeof(*dcb), tb, DCB_ATTR_MAX, @@ -1688,21 +1688,17 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh) if (!tb[DCB_ATTR_IFNAME]) return -EINVAL; - netdev = dev_get_by_name(net, nla_data(tb[DCB_ATTR_IFNAME])); + netdev = __dev_get_by_name(net, nla_data(tb[DCB_ATTR_IFNAME])); if (!netdev) return -ENODEV; - if (!netdev->dcbnl_ops) { - ret = -EOPNOTSUPP; - goto out; - } + if (!netdev->dcbnl_ops) + return -EOPNOTSUPP; reply_skb = 
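/* Illustrative sketch only, not part of the patch: roughly how a NIC driver's
 * xmit path might consume the tso_* helpers added in net/core/tso.c above.
 * The hw_put_hdr()/hw_put_data() calls in the comments stand for "write one
 * TX descriptor" and are hypothetical, as is example_xmit_tso() itself.
 */
static void example_xmit_tso(struct sk_buff *skb)
{
	int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
	int mss = skb_shinfo(skb)->gso_size;
	int total = skb->len - hdr_len;
	char hdr[256];		/* room for the rebuilt MAC + IP + TCP headers */
	struct tso_t tso;

	/* tso_count_descs(skb) tells the driver how many descriptors to reserve */
	tso_start(skb, &tso);
	while (total > 0) {
		int data_left = min(mss, total);

		total -= data_left;
		tso_build_hdr(skb, hdr, &tso, data_left, total == 0);
		/* hw_put_hdr(hdr, hdr_len); */

		while (data_left > 0) {
			int size = min_t(int, data_left, tso.size);

			/* hw_put_data(tso.data, size); */
			tso_build_data(skb, &tso, size);
			data_left -= size;
		}
	}
}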
dcbnl_newmsg(fn->type, dcb->cmd, portid, nlh->nlmsg_seq, nlh->nlmsg_flags, &reply_nlh); - if (!reply_skb) { - ret = -ENOBUFS; - goto out; - } + if (!reply_skb) + return -ENOBUFS; ret = fn->cb(netdev, nlh, nlh->nlmsg_seq, tb, reply_skb); if (ret < 0) { @@ -1714,7 +1710,6 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh) ret = rtnl_unicast(reply_skb, net, portid); out: - dev_put(netdev); return ret; } diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 30948784dd5..c67816647cc 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -479,7 +479,6 @@ void dccp_feat_list_purge(struct list_head *fn_list); int dccp_insert_options(struct sock *sk, struct sk_buff *skb); int dccp_insert_options_rsk(struct dccp_request_sock *, struct sk_buff *); -int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed); u32 dccp_timestamp(void); void dccp_timestamping_init(void); int dccp_insert_option(struct sk_buff *skb, unsigned char option, diff --git a/net/dccp/input.c b/net/dccp/input.c index 14cdafad7a9..3c8ec7d4a34 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -28,7 +28,7 @@ static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb) __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4); __skb_queue_tail(&sk->sk_receive_queue, skb); skb_set_owner_r(skb, sk); - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } static void dccp_fin(struct sock *sk, struct sk_buff *skb) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index d9f65fc66db..6ca645c4b48 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -75,7 +75,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, IPPROTO_DCCP, - orig_sport, orig_dport, sk, true); + orig_sport, orig_dport, sk); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -989,6 +989,7 @@ static const struct net_protocol dccp_v4_protocol = { .err_handler = dccp_v4_err, .no_policy = 1, .netns_ok = 1, + .icmp_strict_tag_validation = 1, }; static const struct proto_ops inet_dccp_ops = { @@ -1023,7 +1024,6 @@ static struct inet_protosw dccp_v4_protosw = { .protocol = IPPROTO_DCCP, .prot = &dccp_v4_prot, .ops = &inet_dccp_ops, - .no_check = 0, .flags = INET_PROTOSW_ICSK, }; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 4ac71ff7c2e..4db3c2a1679 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -141,6 +141,9 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type == ICMPV6_PKT_TOOBIG) { struct dst_entry *dst = NULL; + if (!ip6_sk_accept_pmtu(sk)) + goto out; + if (sock_owned_by_user(sk)) goto out; if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED)) @@ -237,7 +240,7 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req) final_p = fl6_update_dst(&fl6, np->opt, &final); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false); + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; @@ -301,7 +304,7 @@ static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) security_skb_classify_flow(rxskb, flowi6_to_flowi(&fl6)); /* sk = NULL, but it is safe for now. RST socket required. 
*/ - dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL, false); + dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(skb, dst); ip6_xmit(ctl_sk, skb, &fl6, NULL, 0); @@ -512,7 +515,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, fl6.fl6_sport = htons(ireq->ir_num); security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false); + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) goto out; } @@ -851,7 +854,6 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - usin->sin6_addr = flowlabel->dst; fl6_sock_release(flowlabel); } } @@ -932,7 +934,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, final_p = fl6_update_dst(&fl6, np->opt, &final); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true); + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto failure; diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 9e2f78bc155..c69eb9c4fbb 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -237,7 +237,7 @@ int dccp_child_process(struct sock *parent, struct sock *child, /* Wakeup parent, send SIGIO */ if (state == DCCP_RESPOND && child->sk_state != state) - parent->sk_data_ready(parent, 0); + parent->sk_data_ready(parent); } else { /* Alas, it is possible again, because we do lookup * in main socket hash table and lock on listening diff --git a/net/dccp/options.c b/net/dccp/options.c index a58e0b63405..9bce31886bd 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -343,38 +343,6 @@ static inline int dccp_elapsed_time_len(const u32 elapsed_time) return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 
2 : 4; } -/* FIXME: This function is currently not used anywhere */ -int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed_time) -{ - const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time); - const int len = 2 + elapsed_time_len; - unsigned char *to; - - if (elapsed_time_len == 0) - return 0; - - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) - return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len += len; - - to = skb_push(skb, len); - *to++ = DCCPO_ELAPSED_TIME; - *to++ = len; - - if (elapsed_time_len == 2) { - const __be16 var16 = htons((u16)elapsed_time); - memcpy(to, &var16, 2); - } else { - const __be32 var32 = htonl(elapsed_time); - memcpy(to, &var32, 4); - } - - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time); - static int dccp_insert_option_timestamp(struct sk_buff *skb) { __be32 now = htonl(dccp_timestamp()); diff --git a/net/dccp/output.c b/net/dccp/output.c index 8876078859d..0248e8a3460 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -138,7 +138,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); return net_xmit_eval(err); } return -ENOBUFS; diff --git a/net/dccp/probe.c b/net/dccp/probe.c index 4c6bdf97a65..595ddf0459d 100644 --- a/net/dccp/probe.c +++ b/net/dccp/probe.c @@ -152,17 +152,6 @@ static const struct file_operations dccpprobe_fops = { .llseek = noop_llseek, }; -static __init int setup_jprobe(void) -{ - int ret = register_jprobe(&dccp_send_probe); - - if (ret) { - request_module("dccp"); - ret = register_jprobe(&dccp_send_probe); - } - return ret; -} - static __init int dccpprobe_init(void) { int ret = -ENOMEM; @@ -174,7 +163,13 @@ static __init int dccpprobe_init(void) if (!proc_create(procname, S_IRUSR, init_net.proc_net, &dccpprobe_fops)) goto err0; - ret = setup_jprobe(); + ret = register_jprobe(&dccp_send_probe); + if (ret) { + ret = request_module("dccp"); + if (!ret) + ret = register_jprobe(&dccp_send_probe); + } + if (ret) goto err1; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index eb892b4f481..de2c1e71930 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1084,14 +1084,15 @@ EXPORT_SYMBOL_GPL(dccp_shutdown); static inline int dccp_mib_init(void) { - return snmp_mib_init((void __percpu **)dccp_statistics, - sizeof(struct dccp_mib), - __alignof__(struct dccp_mib)); + dccp_statistics = alloc_percpu(struct dccp_mib); + if (!dccp_statistics) + return -ENOMEM; + return 0; } static inline void dccp_mib_exit(void) { - snmp_mib_free((void __percpu **)dccp_statistics); + free_percpu(dccp_statistics); } static int thash_entries; diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index 607ab71b5a0..53731e45403 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -20,6 +20,7 @@ /* Boundary values */ static int zero = 0, + one = 1, u8_max = 0xFF; static unsigned long seqw_min = DCCPF_SEQ_WMIN, seqw_max = 0xFFFFFFFF; /* maximum on 32 bit */ @@ -58,7 +59,7 @@ static struct ctl_table dccp_default_table[] = { .maxlen = sizeof(sysctl_dccp_request_retries), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = &one, .extra2 = &u8_max, }, { diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 16f0b223102..1cd46a345cb 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -280,7 +280,7 @@ static ktime_t dccp_timestamp_seed; */ u32 dccp_timestamp(void) { - s64 delta = 
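/* Illustrative sketch only, not part of the patch: the alloc_percpu() pattern
 * that replaces snmp_mib_init()/snmp_mib_free() in dccp_mib_init() above.
 * struct example_mib and the helpers are hypothetical.
 */
struct example_mib {
	unsigned long	mibs[8];
};

static struct example_mib __percpu *example_statistics;

static int example_mib_init(void)
{
	example_statistics = alloc_percpu(struct example_mib);
	return example_statistics ? 0 : -ENOMEM;
}

static void example_mib_exit(void)
{
	free_percpu(example_statistics);
}

static void example_mib_inc(int field)
{
	this_cpu_inc(example_statistics->mibs[field]);
}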
ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); + u64 delta = (u64)ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); do_div(delta, 10); return delta; diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index dd4d506ef92..ae011b46c07 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -481,7 +481,7 @@ static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gf sk->sk_backlog_rcv = dn_nsp_backlog_rcv; sk->sk_destruct = dn_destruct; - sk->sk_no_check = 1; + sk->sk_no_check_tx = 1; sk->sk_family = PF_DECnet; sk->sk_protocol = 0; sk->sk_allocation = gfp; @@ -1808,6 +1808,7 @@ out: rv = (flags & MSG_PEEK) ? -sk->sk_err : sock_error(sk); if ((rv >= 0) && msg->msg_name) { + __sockaddr_check_size(sizeof(struct sockaddr_dn)); memcpy(msg->msg_name, &scp->peer, sizeof(struct sockaddr_dn)); msg->msg_namelen = sizeof(struct sockaddr_dn); } @@ -1914,7 +1915,7 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock, int err = 0; size_t sent = 0; int addr_len = msg->msg_namelen; - struct sockaddr_dn *addr = (struct sockaddr_dn *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_dn *, addr, msg->msg_name); struct sk_buff *skb = NULL; struct dn_skb_cb *cb; size_t len; @@ -2103,8 +2104,6 @@ static struct notifier_block dn_dev_notifier = { .notifier_call = dn_device_event, }; -extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); - static struct packet_type dn_dix_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_DNA_RT), .func = dn_route_rcv, @@ -2352,9 +2351,6 @@ static const struct proto_ops dn_proto_ops = { .sendpage = sock_no_sendpage, }; -void dn_register_sysctl(void); -void dn_unregister_sysctl(void); - MODULE_DESCRIPTION("The Linux DECnet Network Protocol"); MODULE_AUTHOR("Linux DECnet Project Team"); MODULE_LICENSE("GPL"); diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index dd0dfb25f4b..3b726f31c64 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -561,6 +561,7 @@ static const struct nla_policy dn_ifa_policy[IFA_MAX+1] = { [IFA_LOCAL] = { .type = NLA_U16 }, [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, + [IFA_FLAGS] = { .type = NLA_U32 }, }; static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) @@ -573,7 +574,7 @@ static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) struct dn_ifaddr __rcu **ifap; int err = -EINVAL; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!net_eq(net, &init_net)) @@ -617,7 +618,7 @@ static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) struct dn_ifaddr *ifa; int err; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!net_eq(net, &init_net)) @@ -648,7 +649,8 @@ static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) ifa->ifa_local = nla_get_le16(tb[IFA_LOCAL]); ifa->ifa_address = nla_get_le16(tb[IFA_ADDRESS]); - ifa->ifa_flags = ifm->ifa_flags; + ifa->ifa_flags = tb[IFA_FLAGS] ? 
nla_get_u32(tb[IFA_FLAGS]) : + ifm->ifa_flags; ifa->ifa_scope = ifm->ifa_scope; ifa->ifa_dev = dn_db; @@ -669,7 +671,8 @@ static inline size_t dn_ifaddr_nlmsg_size(void) return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ + nla_total_size(2) /* IFA_ADDRESS */ - + nla_total_size(2); /* IFA_LOCAL */ + + nla_total_size(2) /* IFA_LOCAL */ + + nla_total_size(4); /* IFA_FLAGS */ } static int dn_nl_fill_ifaddr(struct sk_buff *skb, struct dn_ifaddr *ifa, @@ -677,6 +680,7 @@ static int dn_nl_fill_ifaddr(struct sk_buff *skb, struct dn_ifaddr *ifa, { struct ifaddrmsg *ifm; struct nlmsghdr *nlh; + u32 ifa_flags = ifa->ifa_flags | IFA_F_PERMANENT; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags); if (nlh == NULL) @@ -685,7 +689,7 @@ static int dn_nl_fill_ifaddr(struct sk_buff *skb, struct dn_ifaddr *ifa, ifm = nlmsg_data(nlh); ifm->ifa_family = AF_DECnet; ifm->ifa_prefixlen = 16; - ifm->ifa_flags = ifa->ifa_flags | IFA_F_PERMANENT; + ifm->ifa_flags = ifa_flags; ifm->ifa_scope = ifa->ifa_scope; ifm->ifa_index = ifa->ifa_dev->dev->ifindex; @@ -694,7 +698,8 @@ static int dn_nl_fill_ifaddr(struct sk_buff *skb, struct dn_ifaddr *ifa, (ifa->ifa_local && nla_put_le16(skb, IFA_LOCAL, ifa->ifa_local)) || (ifa->ifa_label[0] && - nla_put_string(skb, IFA_LABEL, ifa->ifa_label))) + nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || + nla_put_u32(skb, IFA_FLAGS, ifa_flags)) goto nla_put_failure; return nlmsg_end(skb, nlh); diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index 57dc159245e..d332aefb084 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -505,7 +505,7 @@ static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) struct nlattr *attrs[RTA_MAX+1]; int err; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!net_eq(net, &init_net)) @@ -530,7 +530,7 @@ static int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) struct nlattr *attrs[RTA_MAX+1]; int err; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!net_eq(net, &init_net)) diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index c344163e6ac..fe5f01485d3 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -585,7 +585,6 @@ out: static __inline__ int dn_queue_skb(struct sock *sk, struct sk_buff *skb, int sig, struct sk_buff_head *queue) { int err; - int skb_len; /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces number of warnings when compiling with -W --ANK @@ -600,12 +599,11 @@ static __inline__ int dn_queue_skb(struct sock *sk, struct sk_buff *skb, int sig if (err) goto out; - skb_len = skb->len; skb_set_owner_r(skb, sk); skb_queue_tail(queue, skb); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb_len); + sk->sk_data_ready(sk); out: return err; } diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index fe32388ea24..daccc4a36d8 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -752,7 +752,7 @@ static int dn_to_neigh_output(struct sk_buff *skb) return n->output(n, skb); } -static int dn_output(struct sk_buff *skb) +static int dn_output(struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct dn_route *rt = (struct dn_route *)dst; @@ -838,6 +838,18 @@ drop: * Used to catch bugs. This should never normally get * called. 
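/* Illustrative sketch only, not part of the patch: the recurring
 * capable() -> netlink_capable() conversion seen in the netlink handlers
 * here.  netlink_capable(skb, cap) also takes the credentials of the process
 * that opened the netlink socket into account, so a privileged task that is
 * tricked into relaying an unprivileged request no longer passes the check.
 * example_doit() is a hypothetical handler.
 */
static int example_doit(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	if (!netlink_capable(skb, CAP_NET_ADMIN))
		return -EPERM;

	/* ... privileged request handling ... */
	return 0;
}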
*/ +static int dn_rt_bug_sk(struct sock *sk, struct sk_buff *skb) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + + net_dbg_ratelimited("dn_rt_bug: skb from:%04x to:%04x\n", + le16_to_cpu(cb->src), le16_to_cpu(cb->dst)); + + kfree_skb(skb); + + return NET_RX_DROP; +} + static int dn_rt_bug(struct sk_buff *skb) { struct dn_skb_cb *cb = DN_SKB_CB(skb); @@ -1288,8 +1300,6 @@ int dn_route_output_sock(struct dst_entry __rcu **pprt, struct flowidn *fl, stru err = __dn_route_output_key(pprt, fl, flags & MSG_TRYHARD); if (err == 0 && fl->flowidn_proto) { - if (!(flags & MSG_DONTWAIT)) - fl->flowidn_flags |= FLOWI_FLAG_CAN_SLEEP; *pprt = xfrm_lookup(&init_net, *pprt, flowidn_to_flowi(fl), sk, 0); if (IS_ERR(*pprt)) { @@ -1465,7 +1475,7 @@ make_route: rt->n = neigh; rt->dst.lastuse = jiffies; - rt->dst.output = dn_rt_bug; + rt->dst.output = dn_rt_bug_sk; switch (res.type) { case RTN_UNICAST: rt->dst.input = dn_forward; @@ -1668,12 +1678,8 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) if (fld.flowidn_iif) { struct net_device *dev; - if ((dev = dev_get_by_index(&init_net, fld.flowidn_iif)) == NULL) { - kfree_skb(skb); - return -ENODEV; - } - if (!dev->dn_ptr) { - dev_put(dev); + dev = __dev_get_by_index(&init_net, fld.flowidn_iif); + if (!dev || !dev->dn_ptr) { kfree_skb(skb); return -ENODEV; } @@ -1695,8 +1701,6 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) err = dn_route_output_key((struct dst_entry **)&rt, &fld, 0); } - if (skb->dev) - dev_put(skb->dev); skb->dev = NULL; if (err) goto out_free; diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index e83015cecfa..e4d9560a910 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -107,7 +107,7 @@ static inline void dnrmg_receive_user_skb(struct sk_buff *skb) if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) return; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); /* Eventually we might send routing messages too */ diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index e7b6d53eef8..dd8696a3dbe 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -93,8 +93,8 @@ int dns_query(const char *type, const char *name, size_t namelen, } if (!namelen) - namelen = strlen(name); - if (namelen < 3) + namelen = strnlen(name, 256); + if (namelen < 3 || namelen > 255) return -EINVAL; desclen += namelen + 1; @@ -149,7 +149,9 @@ int dns_query(const char *type, const char *name, size_t namelen, if (!*_result) goto put; - memcpy(*_result, upayload->data, len + 1); + memcpy(*_result, upayload->data, len); + (*_result)[len] = '\0'; + if (_expiry) *_expiry = rkey->expiry; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 0eb5d5e76df..5db37cef50a 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -406,8 +406,9 @@ static int dsa_of_probe(struct platform_device *pdev) goto out_free; } - chip_index = 0; + chip_index = -1; for_each_available_child_of_node(np, child) { + chip_index++; cd = &pd->chip[chip_index]; cd->mii_bus = &mdio_bus->dev; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 29d684ebca6..64c5af0a10d 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -156,7 +156,7 @@ static int dsa_slave_set_mac_address(struct net_device *dev, void *a) dev_uc_del(master, dev->dev_addr); out: - memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); + ether_addr_copy(dev->dev_addr, addr->sa_data); return 0; } @@ -346,7 +346,7 @@ dsa_slave_create(struct 
dsa_switch *ds, struct device *parent, return slave_dev; slave_dev->features = master->vlan_features; - SET_ETHTOOL_OPS(slave_dev, &dsa_slave_ethtool_ops); + slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; eth_hw_addr_inherit(slave_dev, master); slave_dev->tx_queue_len = 0; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 8f032bae60a..5dc638cad2e 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -156,7 +156,9 @@ EXPORT_SYMBOL(eth_rebuild_header); */ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) { - struct ethhdr *eth; + unsigned short _service_access_point; + const unsigned short *sap; + const struct ethhdr *eth; skb->dev = dev; skb_reset_mac_header(skb); @@ -194,7 +196,8 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This * won't work for fault tolerant netware but does for the rest. */ - if (unlikely(skb->len >= 2 && *(unsigned short *)(skb->data) == 0xFFFF)) + sap = skb_header_pointer(skb, 0, sizeof(*sap), &_service_access_point); + if (sap && *sap == 0xFFFF) return htons(ETH_P_802_3); /* diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index cac505f166d..e5302b7f7ca 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -209,7 +209,7 @@ static int slave_xmit(struct sk_buff *skb, struct hsr_priv *hsr_priv, /* Address substitution (IEC62439-3 pp 26, 50): replace mac * address of outgoing frame with that of the outgoing slave's. */ - memcpy(hsr_ethhdr->ethhdr.h_source, skb->dev->dev_addr, ETH_ALEN); + ether_addr_copy(hsr_ethhdr->ethhdr.h_source, skb->dev->dev_addr); return dev_queue_xmit(skb); } @@ -346,7 +346,7 @@ static void send_hsr_supervision_frame(struct net_device *hsr_dev, u8 type) /* Payload: MacAddressA */ hsr_sp = (typeof(hsr_sp)) skb_put(skb, sizeof(*hsr_sp)); - memcpy(hsr_sp->MacAddressA, hsr_dev->dev_addr, ETH_ALEN); + ether_addr_copy(hsr_sp->MacAddressA, hsr_dev->dev_addr); dev_queue_xmit(skb); return; @@ -493,7 +493,7 @@ static int check_slave_ok(struct net_device *dev) /* Default multicast address for HSR Supervision frames */ -static const unsigned char def_multicast_addr[ETH_ALEN] = { +static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = { 0x01, 0x15, 0x4e, 0x00, 0x01, 0x00 }; @@ -519,7 +519,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], hsr_priv->announce_timer.function = hsr_announce; hsr_priv->announce_timer.data = (unsigned long) hsr_priv; - memcpy(hsr_priv->sup_multicast_addr, def_multicast_addr, ETH_ALEN); + ether_addr_copy(hsr_priv->sup_multicast_addr, def_multicast_addr); hsr_priv->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec; /* FIXME: should I modify the value of these? 
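/* Note, not part of the patch: ether_addr_copy() copies the address
 * word-at-a-time (u16/u32 accesses), so both source and destination must be
 * at least 2-byte aligned; that is why def_multicast_addr above gains
 * __aligned(2) when the memcpy() calls are converted.  A minimal hypothetical
 * example:
 */
static const unsigned char example_mcast[ETH_ALEN] __aligned(2) = {
	0x01, 0x15, 0x4e, 0x00, 0x01, 0x00
};
static unsigned char example_dest[ETH_ALEN] __aligned(2);

static void example_copy_addr(void)
{
	ether_addr_copy(example_dest, example_mcast);
}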
@@ -547,7 +547,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], hsr_dev->features |= NETIF_F_VLAN_CHALLENGED; /* Set hsr_dev's MAC address to that of mac_slave1 */ - memcpy(hsr_dev->dev_addr, hsr_priv->slave[0]->dev_addr, ETH_ALEN); + ether_addr_copy(hsr_dev->dev_addr, hsr_priv->slave[0]->dev_addr); /* Set required header length */ for (i = 0; i < HSR_MAX_SLAVE; i++) { diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 4bdab152187..83e58449366 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -108,8 +108,8 @@ int hsr_create_self_node(struct list_head *self_node_db, if (!node) return -ENOMEM; - memcpy(node->MacAddressA, addr_a, ETH_ALEN); - memcpy(node->MacAddressB, addr_b, ETH_ALEN); + ether_addr_copy(node->MacAddressA, addr_a); + ether_addr_copy(node->MacAddressB, addr_b); rcu_read_lock(); oldnode = list_first_or_null_rcu(self_node_db, @@ -127,11 +127,6 @@ int hsr_create_self_node(struct list_head *self_node_db, return 0; } -static void node_entry_reclaim(struct rcu_head *rh) -{ - kfree(container_of(rh, struct node_entry, rcu_head)); -} - /* Add/merge node to the database of nodes. 'skb' must contain an HSR * supervision frame. @@ -175,7 +170,7 @@ struct node_entry *hsr_merge_node(struct hsr_priv *hsr_priv, if (node && !ether_addr_equal(node->MacAddressA, hsr_sp->MacAddressA)) { /* Node has changed its AddrA, frame was received from SlaveB */ list_del_rcu(&node->mac_list); - call_rcu(&node->rcu_head, node_entry_reclaim); + kfree_rcu(node, rcu_head); node = NULL; } @@ -183,7 +178,7 @@ struct node_entry *hsr_merge_node(struct hsr_priv *hsr_priv, !ether_addr_equal(node->MacAddressB, hsr_ethsup->ethhdr.h_source)) { /* Cables have been swapped */ list_del_rcu(&node->mac_list); - call_rcu(&node->rcu_head, node_entry_reclaim); + kfree_rcu(node, rcu_head); node = NULL; } @@ -192,7 +187,7 @@ struct node_entry *hsr_merge_node(struct hsr_priv *hsr_priv, !ether_addr_equal(node->MacAddressA, hsr_ethsup->ethhdr.h_source)) { /* Cables have been swapped */ list_del_rcu(&node->mac_list); - call_rcu(&node->rcu_head, node_entry_reclaim); + kfree_rcu(node, rcu_head); node = NULL; } @@ -204,7 +199,7 @@ struct node_entry *hsr_merge_node(struct hsr_priv *hsr_priv, /* Node is known, but frame was received from an unknown * address. Node is PICS_SUBS capable; merge its AddrB. 
*/ - memcpy(node->MacAddressB, hsr_ethsup->ethhdr.h_source, ETH_ALEN); + ether_addr_copy(node->MacAddressB, hsr_ethsup->ethhdr.h_source); node->AddrB_if = dev_idx; return node; } @@ -213,8 +208,8 @@ struct node_entry *hsr_merge_node(struct hsr_priv *hsr_priv, if (!node) return NULL; - memcpy(node->MacAddressA, hsr_sp->MacAddressA, ETH_ALEN); - memcpy(node->MacAddressB, hsr_ethsup->ethhdr.h_source, ETH_ALEN); + ether_addr_copy(node->MacAddressA, hsr_sp->MacAddressA); + ether_addr_copy(node->MacAddressB, hsr_ethsup->ethhdr.h_source); if (!ether_addr_equal(hsr_sp->MacAddressA, hsr_ethsup->ethhdr.h_source)) node->AddrB_if = dev_idx; else @@ -255,7 +250,7 @@ void hsr_addr_subst_source(struct hsr_priv *hsr_priv, struct sk_buff *skb) rcu_read_lock(); node = find_node_by_AddrB(&hsr_priv->node_db, ethhdr->h_source); if (node) - memcpy(ethhdr->h_source, node->MacAddressA, ETH_ALEN); + ether_addr_copy(ethhdr->h_source, node->MacAddressA); rcu_read_unlock(); } @@ -277,7 +272,7 @@ void hsr_addr_subst_dest(struct hsr_priv *hsr_priv, struct ethhdr *ethhdr, rcu_read_lock(); node = find_node_by_AddrA(&hsr_priv->node_db, ethhdr->h_dest); if (node && (node->AddrB_if == dev_idx)) - memcpy(ethhdr->h_dest, node->MacAddressB, ETH_ALEN); + ether_addr_copy(ethhdr->h_dest, node->MacAddressB); rcu_read_unlock(); } @@ -302,7 +297,7 @@ static bool seq_nr_after(u16 a, u16 b) void hsr_register_frame_in(struct node_entry *node, enum hsr_dev_idx dev_idx) { - if ((dev_idx < 0) || (dev_idx >= HSR_MAX_DEV)) { + if ((dev_idx < 0) || (dev_idx >= HSR_MAX_SLAVE)) { WARN_ONCE(1, "%s: Invalid dev_idx (%d)\n", __func__, dev_idx); return; } @@ -417,7 +412,7 @@ void hsr_prune_nodes(struct hsr_priv *hsr_priv) hsr_nl_nodedown(hsr_priv, node->MacAddressA); list_del_rcu(&node->mac_list); /* Note that we need to free this entry later: */ - call_rcu(&node->rcu_head, node_entry_reclaim); + kfree_rcu(node, rcu_head); } } rcu_read_unlock(); @@ -433,13 +428,13 @@ void *hsr_get_next_node(struct hsr_priv *hsr_priv, void *_pos, node = list_first_or_null_rcu(&hsr_priv->node_db, struct node_entry, mac_list); if (node) - memcpy(addr, node->MacAddressA, ETH_ALEN); + ether_addr_copy(addr, node->MacAddressA); return node; } node = _pos; list_for_each_entry_continue_rcu(node, &hsr_priv->node_db, mac_list) { - memcpy(addr, node->MacAddressA, ETH_ALEN); + ether_addr_copy(addr, node->MacAddressA); return node; } @@ -467,7 +462,7 @@ int hsr_get_node_data(struct hsr_priv *hsr_priv, return -ENOENT; /* No such entry */ } - memcpy(addr_b, node->MacAddressB, ETH_ALEN); + ether_addr_copy(addr_b, node->MacAddressB); tdiff = jiffies - node->time_in[HSR_DEV_SLAVE_A]; if (node->time_in_stale[HSR_DEV_SLAVE_A]) diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index af68dd83a4e..3fee5218a69 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -138,8 +138,8 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, break; if (dev == hsr_priv->slave[0]) - memcpy(hsr_priv->dev->dev_addr, - hsr_priv->slave[0]->dev_addr, ETH_ALEN); + ether_addr_copy(hsr_priv->dev->dev_addr, + hsr_priv->slave[0]->dev_addr); /* Make sure we recognize frames from ourselves in hsr_rcv() */ res = hsr_create_self_node(&hsr_priv->self_node_db, @@ -459,7 +459,7 @@ static int __init hsr_init(void) static void __exit hsr_exit(void) { unregister_netdevice_notifier(&hsr_nb); - del_timer(&prune_timer); + del_timer_sync(&prune_timer); hsr_netlink_exit(); dev_remove_pack(&hsr_pt); } diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c deleted file mode 100644 
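/* Illustrative sketch only, not part of the patch: when an RCU callback does
 * nothing but kfree() the enclosing object, kfree_rcu() can replace the
 * call_rcu() + callback pair, which is what the hsr_framereg.c hunks above do
 * for struct node_entry.  struct example_entry is hypothetical.
 */
struct example_entry {
	struct list_head	mac_list;
	struct rcu_head		rcu_head;
};

static void example_remove_entry(struct example_entry *e)
{
	list_del_rcu(&e->mac_list);
	kfree_rcu(e, rcu_head);		/* freed after an RCU grace period */
}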
index 459e200c08a..00000000000 --- a/net/ieee802154/6lowpan.c +++ /dev/null @@ -1,1513 +0,0 @@ -/* - * Copyright 2011, Siemens AG - * written by Alexander Smirnov <alex.bluesman.smirnov@gmail.com> - */ - -/* - * Based on patches from Jon Smirl <jonsmirl@gmail.com> - * Copyright (c) 2011 Jon Smirl <jonsmirl@gmail.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - */ - -/* Jon's code is based on 6lowpan implementation for Contiki which is: - * Copyright (c) 2008, Swedish Institute of Computer Science. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the Institute nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#include <linux/bitops.h> -#include <linux/if_arp.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/netdevice.h> -#include <net/af_ieee802154.h> -#include <net/ieee802154.h> -#include <net/ieee802154_netdev.h> -#include <net/ipv6.h> - -#include "6lowpan.h" - -/* TTL uncompression values */ -static const u8 lowpan_ttl_values[] = {0, 1, 64, 255}; - -static LIST_HEAD(lowpan_devices); - -/* private device info */ -struct lowpan_dev_info { - struct net_device *real_dev; /* real WPAN device ptr */ - struct mutex dev_list_mtx; /* mutex for list ops */ - unsigned short fragment_tag; -}; - -struct lowpan_dev_record { - struct net_device *ldev; - struct list_head list; -}; - -struct lowpan_fragment { - struct sk_buff *skb; /* skb to be assembled */ - u16 length; /* length to be assemled */ - u32 bytes_rcv; /* bytes received */ - u16 tag; /* current fragment tag */ - struct timer_list timer; /* assembling timer */ - struct list_head list; /* fragments list */ -}; - -static LIST_HEAD(lowpan_fragments); -static DEFINE_SPINLOCK(flist_lock); - -static inline struct -lowpan_dev_info *lowpan_dev_info(const struct net_device *dev) -{ - return netdev_priv(dev); -} - -static inline void lowpan_address_flip(u8 *src, u8 *dest) -{ - int i; - for (i = 0; i < IEEE802154_ADDR_LEN; i++) - (dest)[IEEE802154_ADDR_LEN - i - 1] = (src)[i]; -} - -/* list of all 6lowpan devices, uses for package delivering */ -/* print data in line */ -static inline void lowpan_raw_dump_inline(const char *caller, char *msg, - unsigned char *buf, int len) -{ -#ifdef DEBUG - if (msg) - pr_debug("(%s) %s: ", caller, msg); - print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_NONE, - 16, 1, buf, len, false); -#endif /* DEBUG */ -} - -/* - * print data in a table format: - * - * addr: xx xx xx xx xx xx - * addr: xx xx xx xx xx xx - * ... - */ -static inline void lowpan_raw_dump_table(const char *caller, char *msg, - unsigned char *buf, int len) -{ -#ifdef DEBUG - if (msg) - pr_debug("(%s) %s:\n", caller, msg); - print_hex_dump(KERN_DEBUG, "\t", DUMP_PREFIX_OFFSET, - 16, 1, buf, len, false); -#endif /* DEBUG */ -} - -static u8 -lowpan_compress_addr_64(u8 **hc06_ptr, u8 shift, const struct in6_addr *ipaddr, - const unsigned char *lladdr) -{ - u8 val = 0; - - if (is_addr_mac_addr_based(ipaddr, lladdr)) - val = 3; /* 0-bits */ - else if (lowpan_is_iid_16_bit_compressable(ipaddr)) { - /* compress IID to 16 bits xxxx::XXXX */ - memcpy(*hc06_ptr, &ipaddr->s6_addr16[7], 2); - *hc06_ptr += 2; - val = 2; /* 16-bits */ - } else { - /* do not compress IID => xxxx::IID */ - memcpy(*hc06_ptr, &ipaddr->s6_addr16[4], 8); - *hc06_ptr += 8; - val = 1; /* 64-bits */ - } - - return rol8(val, shift); -} - -/* - * Uncompress address function for source and - * destination address(non-multicast). - * - * address_mode is sam value or dam value. 
- */ -static int -lowpan_uncompress_addr(struct sk_buff *skb, - struct in6_addr *ipaddr, - const u8 address_mode, - const struct ieee802154_addr *lladdr) -{ - bool fail; - - switch (address_mode) { - case LOWPAN_IPHC_ADDR_00: - /* for global link addresses */ - fail = lowpan_fetch_skb(skb, ipaddr->s6_addr, 16); - break; - case LOWPAN_IPHC_ADDR_01: - /* fe:80::XXXX:XXXX:XXXX:XXXX */ - ipaddr->s6_addr[0] = 0xFE; - ipaddr->s6_addr[1] = 0x80; - fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[8], 8); - break; - case LOWPAN_IPHC_ADDR_02: - /* fe:80::ff:fe00:XXXX */ - ipaddr->s6_addr[0] = 0xFE; - ipaddr->s6_addr[1] = 0x80; - ipaddr->s6_addr[11] = 0xFF; - ipaddr->s6_addr[12] = 0xFE; - fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[14], 2); - break; - case LOWPAN_IPHC_ADDR_03: - fail = false; - switch (lladdr->addr_type) { - case IEEE802154_ADDR_LONG: - /* fe:80::XXXX:XXXX:XXXX:XXXX - * \_________________/ - * hwaddr - */ - ipaddr->s6_addr[0] = 0xFE; - ipaddr->s6_addr[1] = 0x80; - memcpy(&ipaddr->s6_addr[8], lladdr->hwaddr, - IEEE802154_ADDR_LEN); - /* second bit-flip (Universe/Local) - * is done according RFC2464 - */ - ipaddr->s6_addr[8] ^= 0x02; - break; - case IEEE802154_ADDR_SHORT: - /* fe:80::ff:fe00:XXXX - * \__/ - * short_addr - * - * Universe/Local bit is zero. - */ - ipaddr->s6_addr[0] = 0xFE; - ipaddr->s6_addr[1] = 0x80; - ipaddr->s6_addr[11] = 0xFF; - ipaddr->s6_addr[12] = 0xFE; - ipaddr->s6_addr16[7] = htons(lladdr->short_addr); - break; - default: - pr_debug("Invalid addr_type set\n"); - return -EINVAL; - } - break; - default: - pr_debug("Invalid address mode value: 0x%x\n", address_mode); - return -EINVAL; - } - - if (fail) { - pr_debug("Failed to fetch skb data\n"); - return -EIO; - } - - lowpan_raw_dump_inline(NULL, "Reconstructed ipv6 addr is:\n", - ipaddr->s6_addr, 16); - - return 0; -} - -/* Uncompress address function for source context - * based address(non-multicast). - */ -static int -lowpan_uncompress_context_based_src_addr(struct sk_buff *skb, - struct in6_addr *ipaddr, - const u8 sam) -{ - switch (sam) { - case LOWPAN_IPHC_ADDR_00: - /* unspec address :: - * Do nothing, address is already :: - */ - break; - case LOWPAN_IPHC_ADDR_01: - /* TODO */ - case LOWPAN_IPHC_ADDR_02: - /* TODO */ - case LOWPAN_IPHC_ADDR_03: - /* TODO */ - netdev_warn(skb->dev, "SAM value 0x%x not supported\n", sam); - return -EINVAL; - default: - pr_debug("Invalid sam value: 0x%x\n", sam); - return -EINVAL; - } - - lowpan_raw_dump_inline(NULL, - "Reconstructed context based ipv6 src addr is:\n", - ipaddr->s6_addr, 16); - - return 0; -} - -/* Uncompress function for multicast destination address, - * when M bit is set. - */ -static int -lowpan_uncompress_multicast_daddr(struct sk_buff *skb, - struct in6_addr *ipaddr, - const u8 dam) -{ - bool fail; - - switch (dam) { - case LOWPAN_IPHC_DAM_00: - /* 00: 128 bits. The full address - * is carried in-line. - */ - fail = lowpan_fetch_skb(skb, ipaddr->s6_addr, 16); - break; - case LOWPAN_IPHC_DAM_01: - /* 01: 48 bits. The address takes - * the form ffXX::00XX:XXXX:XXXX. - */ - ipaddr->s6_addr[0] = 0xFF; - fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[1], 1); - fail |= lowpan_fetch_skb(skb, &ipaddr->s6_addr[11], 5); - break; - case LOWPAN_IPHC_DAM_10: - /* 10: 32 bits. The address takes - * the form ffXX::00XX:XXXX. - */ - ipaddr->s6_addr[0] = 0xFF; - fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[1], 1); - fail |= lowpan_fetch_skb(skb, &ipaddr->s6_addr[13], 3); - break; - case LOWPAN_IPHC_DAM_11: - /* 11: 8 bits. The address takes - * the form ff02::00XX. 
- */ - ipaddr->s6_addr[0] = 0xFF; - ipaddr->s6_addr[1] = 0x02; - fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[15], 1); - break; - default: - pr_debug("DAM value has a wrong value: 0x%x\n", dam); - return -EINVAL; - } - - if (fail) { - pr_debug("Failed to fetch skb data\n"); - return -EIO; - } - - lowpan_raw_dump_inline(NULL, "Reconstructed ipv6 multicast addr is:\n", - ipaddr->s6_addr, 16); - - return 0; -} - -static void -lowpan_compress_udp_header(u8 **hc06_ptr, struct sk_buff *skb) -{ - struct udphdr *uh = udp_hdr(skb); - - if (((uh->source & LOWPAN_NHC_UDP_4BIT_MASK) == - LOWPAN_NHC_UDP_4BIT_PORT) && - ((uh->dest & LOWPAN_NHC_UDP_4BIT_MASK) == - LOWPAN_NHC_UDP_4BIT_PORT)) { - pr_debug("UDP header: both ports compression to 4 bits\n"); - **hc06_ptr = LOWPAN_NHC_UDP_CS_P_11; - **(hc06_ptr + 1) = /* subtraction is faster */ - (u8)((uh->dest - LOWPAN_NHC_UDP_4BIT_PORT) + - ((uh->source & LOWPAN_NHC_UDP_4BIT_PORT) << 4)); - *hc06_ptr += 2; - } else if ((uh->dest & LOWPAN_NHC_UDP_8BIT_MASK) == - LOWPAN_NHC_UDP_8BIT_PORT) { - pr_debug("UDP header: remove 8 bits of dest\n"); - **hc06_ptr = LOWPAN_NHC_UDP_CS_P_01; - memcpy(*hc06_ptr + 1, &uh->source, 2); - **(hc06_ptr + 3) = (u8)(uh->dest - LOWPAN_NHC_UDP_8BIT_PORT); - *hc06_ptr += 4; - } else if ((uh->source & LOWPAN_NHC_UDP_8BIT_MASK) == - LOWPAN_NHC_UDP_8BIT_PORT) { - pr_debug("UDP header: remove 8 bits of source\n"); - **hc06_ptr = LOWPAN_NHC_UDP_CS_P_10; - memcpy(*hc06_ptr + 1, &uh->dest, 2); - **(hc06_ptr + 3) = (u8)(uh->source - LOWPAN_NHC_UDP_8BIT_PORT); - *hc06_ptr += 4; - } else { - pr_debug("UDP header: can't compress\n"); - **hc06_ptr = LOWPAN_NHC_UDP_CS_P_00; - memcpy(*hc06_ptr + 1, &uh->source, 2); - memcpy(*hc06_ptr + 3, &uh->dest, 2); - *hc06_ptr += 5; - } - - /* checksum is always inline */ - memcpy(*hc06_ptr, &uh->check, 2); - *hc06_ptr += 2; - - /* skip the UDP header */ - skb_pull(skb, sizeof(struct udphdr)); -} - -static inline int lowpan_fetch_skb_u8(struct sk_buff *skb, u8 *val) -{ - if (unlikely(!pskb_may_pull(skb, 1))) - return -EINVAL; - - *val = skb->data[0]; - skb_pull(skb, 1); - - return 0; -} - -static inline int lowpan_fetch_skb_u16(struct sk_buff *skb, u16 *val) -{ - if (unlikely(!pskb_may_pull(skb, 2))) - return -EINVAL; - - *val = (skb->data[0] << 8) | skb->data[1]; - skb_pull(skb, 2); - - return 0; -} - -static int -lowpan_uncompress_udp_header(struct sk_buff *skb, struct udphdr *uh) -{ - u8 tmp; - - if (!uh) - goto err; - - if (lowpan_fetch_skb_u8(skb, &tmp)) - goto err; - - if ((tmp & LOWPAN_NHC_UDP_MASK) == LOWPAN_NHC_UDP_ID) { - pr_debug("UDP header uncompression\n"); - switch (tmp & LOWPAN_NHC_UDP_CS_P_11) { - case LOWPAN_NHC_UDP_CS_P_00: - memcpy(&uh->source, &skb->data[0], 2); - memcpy(&uh->dest, &skb->data[2], 2); - skb_pull(skb, 4); - break; - case LOWPAN_NHC_UDP_CS_P_01: - memcpy(&uh->source, &skb->data[0], 2); - uh->dest = - skb->data[2] + LOWPAN_NHC_UDP_8BIT_PORT; - skb_pull(skb, 3); - break; - case LOWPAN_NHC_UDP_CS_P_10: - uh->source = skb->data[0] + LOWPAN_NHC_UDP_8BIT_PORT; - memcpy(&uh->dest, &skb->data[1], 2); - skb_pull(skb, 3); - break; - case LOWPAN_NHC_UDP_CS_P_11: - uh->source = - LOWPAN_NHC_UDP_4BIT_PORT + (skb->data[0] >> 4); - uh->dest = - LOWPAN_NHC_UDP_4BIT_PORT + (skb->data[0] & 0x0f); - skb_pull(skb, 1); - break; - default: - pr_debug("ERROR: unknown UDP format\n"); - goto err; - } - - pr_debug("uncompressed UDP ports: src = %d, dst = %d\n", - uh->source, uh->dest); - - /* copy checksum */ - memcpy(&uh->check, &skb->data[0], 2); - skb_pull(skb, 2); - - /* - * UDP lenght needs to 
be infered from the lower layers - * here, we obtain the hint from the remaining size of the - * frame - */ - uh->len = htons(skb->len + sizeof(struct udphdr)); - pr_debug("uncompressed UDP length: src = %d", uh->len); - } else { - pr_debug("ERROR: unsupported NH format\n"); - goto err; - } - - return 0; -err: - return -EINVAL; -} - -static int lowpan_header_create(struct sk_buff *skb, - struct net_device *dev, - unsigned short type, const void *_daddr, - const void *_saddr, unsigned int len) -{ - u8 tmp, iphc0, iphc1, *hc06_ptr; - struct ipv6hdr *hdr; - const u8 *saddr = _saddr; - const u8 *daddr = _daddr; - u8 head[100]; - struct ieee802154_addr sa, da; - - /* TODO: - * if this package isn't ipv6 one, where should it be routed? - */ - if (type != ETH_P_IPV6) - return 0; - - hdr = ipv6_hdr(skb); - hc06_ptr = head + 2; - - pr_debug("IPv6 header dump:\n\tversion = %d\n\tlength = %d\n" - "\tnexthdr = 0x%02x\n\thop_lim = %d\n", hdr->version, - ntohs(hdr->payload_len), hdr->nexthdr, hdr->hop_limit); - - lowpan_raw_dump_table(__func__, "raw skb network header dump", - skb_network_header(skb), sizeof(struct ipv6hdr)); - - if (!saddr) - saddr = dev->dev_addr; - - lowpan_raw_dump_inline(__func__, "saddr", (unsigned char *)saddr, 8); - - /* - * As we copy some bit-length fields, in the IPHC encoding bytes, - * we sometimes use |= - * If the field is 0, and the current bit value in memory is 1, - * this does not work. We therefore reset the IPHC encoding here - */ - iphc0 = LOWPAN_DISPATCH_IPHC; - iphc1 = 0; - - /* TODO: context lookup */ - - lowpan_raw_dump_inline(__func__, "daddr", (unsigned char *)daddr, 8); - - /* - * Traffic class, flow label - * If flow label is 0, compress it. If traffic class is 0, compress it - * We have to process both in the same time as the offset of traffic - * class depends on the presence of version and flow label - */ - - /* hc06 format of TC is ECN | DSCP , original one is DSCP | ECN */ - tmp = (hdr->priority << 4) | (hdr->flow_lbl[0] >> 4); - tmp = ((tmp & 0x03) << 6) | (tmp >> 2); - - if (((hdr->flow_lbl[0] & 0x0F) == 0) && - (hdr->flow_lbl[1] == 0) && (hdr->flow_lbl[2] == 0)) { - /* flow label can be compressed */ - iphc0 |= LOWPAN_IPHC_FL_C; - if ((hdr->priority == 0) && - ((hdr->flow_lbl[0] & 0xF0) == 0)) { - /* compress (elide) all */ - iphc0 |= LOWPAN_IPHC_TC_C; - } else { - /* compress only the flow label */ - *hc06_ptr = tmp; - hc06_ptr += 1; - } - } else { - /* Flow label cannot be compressed */ - if ((hdr->priority == 0) && - ((hdr->flow_lbl[0] & 0xF0) == 0)) { - /* compress only traffic class */ - iphc0 |= LOWPAN_IPHC_TC_C; - *hc06_ptr = (tmp & 0xc0) | (hdr->flow_lbl[0] & 0x0F); - memcpy(hc06_ptr + 1, &hdr->flow_lbl[1], 2); - hc06_ptr += 3; - } else { - /* compress nothing */ - memcpy(hc06_ptr, &hdr, 4); - /* replace the top byte with new ECN | DSCP format */ - *hc06_ptr = tmp; - hc06_ptr += 4; - } - } - - /* NOTE: payload length is always compressed */ - - /* Next Header is compress if UDP */ - if (hdr->nexthdr == UIP_PROTO_UDP) - iphc0 |= LOWPAN_IPHC_NH_C; - - if ((iphc0 & LOWPAN_IPHC_NH_C) == 0) { - *hc06_ptr = hdr->nexthdr; - hc06_ptr += 1; - } - - /* - * Hop limit - * if 1: compress, encoding is 01 - * if 64: compress, encoding is 10 - * if 255: compress, encoding is 11 - * else do not compress - */ - switch (hdr->hop_limit) { - case 1: - iphc0 |= LOWPAN_IPHC_TTL_1; - break; - case 64: - iphc0 |= LOWPAN_IPHC_TTL_64; - break; - case 255: - iphc0 |= LOWPAN_IPHC_TTL_255; - break; - default: - *hc06_ptr = hdr->hop_limit; - hc06_ptr += 1; - break; - } - 
- /* source address compression */ - if (is_addr_unspecified(&hdr->saddr)) { - pr_debug("source address is unspecified, setting SAC\n"); - iphc1 |= LOWPAN_IPHC_SAC; - /* TODO: context lookup */ - } else if (is_addr_link_local(&hdr->saddr)) { - pr_debug("source address is link-local\n"); - iphc1 |= lowpan_compress_addr_64(&hc06_ptr, - LOWPAN_IPHC_SAM_BIT, &hdr->saddr, saddr); - } else { - pr_debug("send the full source address\n"); - memcpy(hc06_ptr, &hdr->saddr.s6_addr16[0], 16); - hc06_ptr += 16; - } - - /* destination address compression */ - if (is_addr_mcast(&hdr->daddr)) { - pr_debug("destination address is multicast: "); - iphc1 |= LOWPAN_IPHC_M; - if (lowpan_is_mcast_addr_compressable8(&hdr->daddr)) { - pr_debug("compressed to 1 octet\n"); - iphc1 |= LOWPAN_IPHC_DAM_11; - /* use last byte */ - *hc06_ptr = hdr->daddr.s6_addr[15]; - hc06_ptr += 1; - } else if (lowpan_is_mcast_addr_compressable32(&hdr->daddr)) { - pr_debug("compressed to 4 octets\n"); - iphc1 |= LOWPAN_IPHC_DAM_10; - /* second byte + the last three */ - *hc06_ptr = hdr->daddr.s6_addr[1]; - memcpy(hc06_ptr + 1, &hdr->daddr.s6_addr[13], 3); - hc06_ptr += 4; - } else if (lowpan_is_mcast_addr_compressable48(&hdr->daddr)) { - pr_debug("compressed to 6 octets\n"); - iphc1 |= LOWPAN_IPHC_DAM_01; - /* second byte + the last five */ - *hc06_ptr = hdr->daddr.s6_addr[1]; - memcpy(hc06_ptr + 1, &hdr->daddr.s6_addr[11], 5); - hc06_ptr += 6; - } else { - pr_debug("using full address\n"); - iphc1 |= LOWPAN_IPHC_DAM_00; - memcpy(hc06_ptr, &hdr->daddr.s6_addr[0], 16); - hc06_ptr += 16; - } - } else { - /* TODO: context lookup */ - if (is_addr_link_local(&hdr->daddr)) { - pr_debug("dest address is unicast and link-local\n"); - iphc1 |= lowpan_compress_addr_64(&hc06_ptr, - LOWPAN_IPHC_DAM_BIT, &hdr->daddr, daddr); - } else { - pr_debug("dest address is unicast: using full one\n"); - memcpy(hc06_ptr, &hdr->daddr.s6_addr16[0], 16); - hc06_ptr += 16; - } - } - - /* UDP header compression */ - if (hdr->nexthdr == UIP_PROTO_UDP) - lowpan_compress_udp_header(&hc06_ptr, skb); - - head[0] = iphc0; - head[1] = iphc1; - - skb_pull(skb, sizeof(struct ipv6hdr)); - skb_reset_transport_header(skb); - memcpy(skb_push(skb, hc06_ptr - head), head, hc06_ptr - head); - skb_reset_network_header(skb); - - lowpan_raw_dump_table(__func__, "raw skb data dump", skb->data, - skb->len); - - /* - * NOTE1: I'm still unsure about the fact that compression and WPAN - * header are created here and not later in the xmit. So wait for - * an opinion of net maintainers. 
- */ - /* - * NOTE2: to be absolutely correct, we must derive PANid information - * from MAC subif of the 'dev' and 'real_dev' network devices, but - * this isn't implemented in mainline yet, so currently we assign 0xff - */ - { - mac_cb(skb)->flags = IEEE802154_FC_TYPE_DATA; - mac_cb(skb)->seq = ieee802154_mlme_ops(dev)->get_dsn(dev); - - /* prepare wpan address data */ - sa.addr_type = IEEE802154_ADDR_LONG; - sa.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); - - memcpy(&(sa.hwaddr), saddr, 8); - /* intra-PAN communications */ - da.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); - - /* - * if the destination address is the broadcast address, use the - * corresponding short address - */ - if (lowpan_is_addr_broadcast(daddr)) { - da.addr_type = IEEE802154_ADDR_SHORT; - da.short_addr = IEEE802154_ADDR_BROADCAST; - } else { - da.addr_type = IEEE802154_ADDR_LONG; - memcpy(&(da.hwaddr), daddr, IEEE802154_ADDR_LEN); - - /* request acknowledgment */ - mac_cb(skb)->flags |= MAC_CB_FLAG_ACKREQ; - } - - return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev, - type, (void *)&da, (void *)&sa, skb->len); - } -} - -static int lowpan_give_skb_to_devices(struct sk_buff *skb) -{ - struct lowpan_dev_record *entry; - struct sk_buff *skb_cp; - int stat = NET_RX_SUCCESS; - - rcu_read_lock(); - list_for_each_entry_rcu(entry, &lowpan_devices, list) - if (lowpan_dev_info(entry->ldev)->real_dev == skb->dev) { - skb_cp = skb_copy(skb, GFP_ATOMIC); - if (!skb_cp) { - stat = -ENOMEM; - break; - } - - skb_cp->dev = entry->ldev; - stat = netif_rx(skb_cp); - } - rcu_read_unlock(); - - return stat; -} - -static int lowpan_skb_deliver(struct sk_buff *skb, struct ipv6hdr *hdr) -{ - struct sk_buff *new; - int stat = NET_RX_SUCCESS; - - new = skb_copy_expand(skb, sizeof(struct ipv6hdr), skb_tailroom(skb), - GFP_ATOMIC); - kfree_skb(skb); - - if (!new) - return -ENOMEM; - - skb_push(new, sizeof(struct ipv6hdr)); - skb_copy_to_linear_data(new, hdr, sizeof(struct ipv6hdr)); - - new->protocol = htons(ETH_P_IPV6); - new->pkt_type = PACKET_HOST; - - stat = lowpan_give_skb_to_devices(new); - - kfree_skb(new); - - return stat; -} - -static void lowpan_fragment_timer_expired(unsigned long entry_addr) -{ - struct lowpan_fragment *entry = (struct lowpan_fragment *)entry_addr; - - pr_debug("timer expired for frame with tag %d\n", entry->tag); - - list_del(&entry->list); - dev_kfree_skb(entry->skb); - kfree(entry); -} - -static struct lowpan_fragment * -lowpan_alloc_new_frame(struct sk_buff *skb, u16 len, u16 tag) -{ - struct lowpan_fragment *frame; - - frame = kzalloc(sizeof(struct lowpan_fragment), - GFP_ATOMIC); - if (!frame) - goto frame_err; - - INIT_LIST_HEAD(&frame->list); - - frame->length = len; - frame->tag = tag; - - /* allocate buffer for frame assembling */ - frame->skb = netdev_alloc_skb_ip_align(skb->dev, frame->length + - sizeof(struct ipv6hdr)); - - if (!frame->skb) - goto skb_err; - - frame->skb->priority = skb->priority; - - /* reserve headroom for uncompressed ipv6 header */ - skb_reserve(frame->skb, sizeof(struct ipv6hdr)); - skb_put(frame->skb, frame->length); - - /* copy the first control block to keep a - * trace of the link-layer addresses in case - * of a link-local compressed address - */ - memcpy(frame->skb->cb, skb->cb, sizeof(skb->cb)); - - init_timer(&frame->timer); - /* time out is the same as for ipv6 - 60 sec */ - frame->timer.expires = jiffies + LOWPAN_FRAG_TIMEOUT; - frame->timer.data = (unsigned long)frame; - frame->timer.function = lowpan_fragment_timer_expired; - - 
add_timer(&frame->timer); - - list_add_tail(&frame->list, &lowpan_fragments); - - return frame; - -skb_err: - kfree(frame); -frame_err: - return NULL; -} - -static int -lowpan_process_data(struct sk_buff *skb) -{ - struct ipv6hdr hdr = {}; - u8 tmp, iphc0, iphc1, num_context = 0; - const struct ieee802154_addr *_saddr, *_daddr; - int err; - - lowpan_raw_dump_table(__func__, "raw skb data dump", skb->data, - skb->len); - /* at least two bytes will be used for the encoding */ - if (skb->len < 2) - goto drop; - - if (lowpan_fetch_skb_u8(skb, &iphc0)) - goto drop; - - /* fragments assembling */ - switch (iphc0 & LOWPAN_DISPATCH_MASK) { - case LOWPAN_DISPATCH_FRAG1: - case LOWPAN_DISPATCH_FRAGN: - { - struct lowpan_fragment *frame; - /* slen stores the rightmost 8 bits of the 11 bits length */ - u8 slen, offset = 0; - u16 len, tag; - bool found = false; - - if (lowpan_fetch_skb_u8(skb, &slen) || /* frame length */ - lowpan_fetch_skb_u16(skb, &tag)) /* fragment tag */ - goto drop; - - /* adds the 3 MSB to the 8 LSB to retrieve the 11 bits length */ - len = ((iphc0 & 7) << 8) | slen; - - if ((iphc0 & LOWPAN_DISPATCH_MASK) == LOWPAN_DISPATCH_FRAG1) { - pr_debug("%s received a FRAG1 packet (tag: %d, " - "size of the entire IP packet: %d)", - __func__, tag, len); - } else { /* FRAGN */ - if (lowpan_fetch_skb_u8(skb, &offset)) - goto unlock_and_drop; - pr_debug("%s received a FRAGN packet (tag: %d, " - "size of the entire IP packet: %d, " - "offset: %d)", __func__, tag, len, offset * 8); - } - - /* - * check if frame assembling with the same tag is - * already in progress - */ - spin_lock_bh(&flist_lock); - - list_for_each_entry(frame, &lowpan_fragments, list) - if (frame->tag == tag) { - found = true; - break; - } - - /* alloc new frame structure */ - if (!found) { - pr_debug("%s first fragment received for tag %d, " - "begin packet reassembly", __func__, tag); - frame = lowpan_alloc_new_frame(skb, len, tag); - if (!frame) - goto unlock_and_drop; - } - - /* if payload fits buffer, copy it */ - if (likely((offset * 8 + skb->len) <= frame->length)) - skb_copy_to_linear_data_offset(frame->skb, offset * 8, - skb->data, skb->len); - else - goto unlock_and_drop; - - frame->bytes_rcv += skb->len; - - /* frame assembling complete */ - if ((frame->bytes_rcv == frame->length) && - frame->timer.expires > jiffies) { - /* if timer haven't expired - first of all delete it */ - del_timer_sync(&frame->timer); - list_del(&frame->list); - spin_unlock_bh(&flist_lock); - - pr_debug("%s successfully reassembled fragment " - "(tag %d)", __func__, tag); - - dev_kfree_skb(skb); - skb = frame->skb; - kfree(frame); - - if (lowpan_fetch_skb_u8(skb, &iphc0)) - goto drop; - - break; - } - spin_unlock_bh(&flist_lock); - - return kfree_skb(skb), 0; - } - default: - break; - } - - if (lowpan_fetch_skb_u8(skb, &iphc1)) - goto drop; - - _saddr = &mac_cb(skb)->sa; - _daddr = &mac_cb(skb)->da; - - pr_debug("iphc0 = %02x, iphc1 = %02x\n", iphc0, iphc1); - - /* another if the CID flag is set */ - if (iphc1 & LOWPAN_IPHC_CID) { - pr_debug("CID flag is set, increase header with one\n"); - if (lowpan_fetch_skb_u8(skb, &num_context)) - goto drop; - } - - hdr.version = 6; - - /* Traffic Class and Flow Label */ - switch ((iphc0 & LOWPAN_IPHC_TF) >> 3) { - /* - * Traffic Class and FLow Label carried in-line - * ECN + DSCP + 4-bit Pad + Flow Label (4 bytes) - */ - case 0: /* 00b */ - if (lowpan_fetch_skb_u8(skb, &tmp)) - goto drop; - - memcpy(&hdr.flow_lbl, &skb->data[0], 3); - skb_pull(skb, 3); - hdr.priority = ((tmp >> 2) & 0x0f); - 
hdr.flow_lbl[0] = ((tmp >> 2) & 0x30) | (tmp << 6) | - (hdr.flow_lbl[0] & 0x0f); - break; - /* - * Traffic class carried in-line - * ECN + DSCP (1 byte), Flow Label is elided - */ - case 2: /* 10b */ - if (lowpan_fetch_skb_u8(skb, &tmp)) - goto drop; - - hdr.priority = ((tmp >> 2) & 0x0f); - hdr.flow_lbl[0] = ((tmp << 6) & 0xC0) | ((tmp >> 2) & 0x30); - break; - /* - * Flow Label carried in-line - * ECN + 2-bit Pad + Flow Label (3 bytes), DSCP is elided - */ - case 1: /* 01b */ - if (lowpan_fetch_skb_u8(skb, &tmp)) - goto drop; - - hdr.flow_lbl[0] = (skb->data[0] & 0x0F) | ((tmp >> 2) & 0x30); - memcpy(&hdr.flow_lbl[1], &skb->data[0], 2); - skb_pull(skb, 2); - break; - /* Traffic Class and Flow Label are elided */ - case 3: /* 11b */ - break; - default: - break; - } - - /* Next Header */ - if ((iphc0 & LOWPAN_IPHC_NH_C) == 0) { - /* Next header is carried inline */ - if (lowpan_fetch_skb_u8(skb, &(hdr.nexthdr))) - goto drop; - - pr_debug("NH flag is set, next header carried inline: %02x\n", - hdr.nexthdr); - } - - /* Hop Limit */ - if ((iphc0 & 0x03) != LOWPAN_IPHC_TTL_I) - hdr.hop_limit = lowpan_ttl_values[iphc0 & 0x03]; - else { - if (lowpan_fetch_skb_u8(skb, &(hdr.hop_limit))) - goto drop; - } - - /* Extract SAM to the tmp variable */ - tmp = ((iphc1 & LOWPAN_IPHC_SAM) >> LOWPAN_IPHC_SAM_BIT) & 0x03; - - if (iphc1 & LOWPAN_IPHC_SAC) { - /* Source address context based uncompression */ - pr_debug("SAC bit is set. Handle context based source address.\n"); - err = lowpan_uncompress_context_based_src_addr( - skb, &hdr.saddr, tmp); - } else { - /* Source address uncompression */ - pr_debug("source address stateless compression\n"); - err = lowpan_uncompress_addr(skb, &hdr.saddr, tmp, _saddr); - } - - /* Check on error of previous branch */ - if (err) - goto drop; - - /* Extract DAM to the tmp variable */ - tmp = ((iphc1 & LOWPAN_IPHC_DAM_11) >> LOWPAN_IPHC_DAM_BIT) & 0x03; - - /* check for Multicast Compression */ - if (iphc1 & LOWPAN_IPHC_M) { - if (iphc1 & LOWPAN_IPHC_DAC) { - pr_debug("dest: context-based mcast compression\n"); - /* TODO: implement this */ - } else { - err = lowpan_uncompress_multicast_daddr( - skb, &hdr.daddr, tmp); - if (err) - goto drop; - } - } else { - pr_debug("dest: stateless compression\n"); - err = lowpan_uncompress_addr(skb, &hdr.daddr, tmp, _daddr); - if (err) - goto drop; - } - - /* UDP data uncompression */ - if (iphc0 & LOWPAN_IPHC_NH_C) { - struct udphdr uh; - struct sk_buff *new; - if (lowpan_uncompress_udp_header(skb, &uh)) - goto drop; - - /* - * replace the compressed UDP head by the uncompressed UDP - * header - */ - new = skb_copy_expand(skb, sizeof(struct udphdr), - skb_tailroom(skb), GFP_ATOMIC); - kfree_skb(skb); - - if (!new) - return -ENOMEM; - - skb = new; - - skb_push(skb, sizeof(struct udphdr)); - skb_copy_to_linear_data(skb, &uh, sizeof(struct udphdr)); - - lowpan_raw_dump_table(__func__, "raw UDP header dump", - (u8 *)&uh, sizeof(uh)); - - hdr.nexthdr = UIP_PROTO_UDP; - } - - /* Not fragmented package */ - hdr.payload_len = htons(skb->len); - - pr_debug("skb headroom size = %d, data length = %d\n", - skb_headroom(skb), skb->len); - - pr_debug("IPv6 header dump:\n\tversion = %d\n\tlength = %d\n\t" - "nexthdr = 0x%02x\n\thop_lim = %d\n", hdr.version, - ntohs(hdr.payload_len), hdr.nexthdr, hdr.hop_limit); - - lowpan_raw_dump_table(__func__, "raw header dump", (u8 *)&hdr, - sizeof(hdr)); - return lowpan_skb_deliver(skb, &hdr); - -unlock_and_drop: - spin_unlock_bh(&flist_lock); -drop: - kfree_skb(skb); - return -EINVAL; -} - -static int 
lowpan_set_address(struct net_device *dev, void *p) -{ - struct sockaddr *sa = p; - - if (netif_running(dev)) - return -EBUSY; - - /* TODO: validate addr */ - memcpy(dev->dev_addr, sa->sa_data, dev->addr_len); - - return 0; -} - -static int -lowpan_fragment_xmit(struct sk_buff *skb, u8 *head, - int mlen, int plen, int offset, int type) -{ - struct sk_buff *frag; - int hlen; - - hlen = (type == LOWPAN_DISPATCH_FRAG1) ? - LOWPAN_FRAG1_HEAD_SIZE : LOWPAN_FRAGN_HEAD_SIZE; - - lowpan_raw_dump_inline(__func__, "6lowpan fragment header", head, hlen); - - frag = netdev_alloc_skb(skb->dev, - hlen + mlen + plen + IEEE802154_MFR_SIZE); - if (!frag) - return -ENOMEM; - - frag->priority = skb->priority; - - /* copy header, MFR and payload */ - skb_put(frag, mlen); - skb_copy_to_linear_data(frag, skb_mac_header(skb), mlen); - - skb_put(frag, hlen); - skb_copy_to_linear_data_offset(frag, mlen, head, hlen); - - skb_put(frag, plen); - skb_copy_to_linear_data_offset(frag, mlen + hlen, - skb_network_header(skb) + offset, plen); - - lowpan_raw_dump_table(__func__, " raw fragment dump", frag->data, - frag->len); - - return dev_queue_xmit(frag); -} - -static int -lowpan_skb_fragmentation(struct sk_buff *skb, struct net_device *dev) -{ - int err, header_length, payload_length, tag, offset = 0; - u8 head[5]; - - header_length = skb->mac_len; - payload_length = skb->len - header_length; - tag = lowpan_dev_info(dev)->fragment_tag++; - - /* first fragment header */ - head[0] = LOWPAN_DISPATCH_FRAG1 | ((payload_length >> 8) & 0x7); - head[1] = payload_length & 0xff; - head[2] = tag >> 8; - head[3] = tag & 0xff; - - err = lowpan_fragment_xmit(skb, head, header_length, LOWPAN_FRAG_SIZE, - 0, LOWPAN_DISPATCH_FRAG1); - - if (err) { - pr_debug("%s unable to send FRAG1 packet (tag: %d)", - __func__, tag); - goto exit; - } - - offset = LOWPAN_FRAG_SIZE; - - /* next fragment header */ - head[0] &= ~LOWPAN_DISPATCH_FRAG1; - head[0] |= LOWPAN_DISPATCH_FRAGN; - - while (payload_length - offset > 0) { - int len = LOWPAN_FRAG_SIZE; - - head[4] = offset / 8; - - if (payload_length - offset < len) - len = payload_length - offset; - - err = lowpan_fragment_xmit(skb, head, header_length, - len, offset, LOWPAN_DISPATCH_FRAGN); - if (err) { - pr_debug("%s unable to send a subsequent FRAGN packet " - "(tag: %d, offset: %d", __func__, tag, offset); - goto exit; - } - - offset += len; - } - -exit: - return err; -} - -static netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev) -{ - int err = -1; - - pr_debug("package xmit\n"); - - skb->dev = lowpan_dev_info(dev)->real_dev; - if (skb->dev == NULL) { - pr_debug("ERROR: no real wpan device found\n"); - goto error; - } - - /* Send directly if less than the MTU minus the 2 checksum bytes. */ - if (skb->len <= IEEE802154_MTU - IEEE802154_MFR_SIZE) { - err = dev_queue_xmit(skb); - goto out; - } - - pr_debug("frame is too big, fragmentation is needed\n"); - err = lowpan_skb_fragmentation(skb, dev); -error: - dev_kfree_skb(skb); -out: - if (err) - pr_debug("ERROR: xmit failed\n"); - - return (err < 0) ? 
NET_XMIT_DROP : err; -} - -static struct wpan_phy *lowpan_get_phy(const struct net_device *dev) -{ - struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; - return ieee802154_mlme_ops(real_dev)->get_phy(real_dev); -} - -static u16 lowpan_get_pan_id(const struct net_device *dev) -{ - struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; - return ieee802154_mlme_ops(real_dev)->get_pan_id(real_dev); -} - -static u16 lowpan_get_short_addr(const struct net_device *dev) -{ - struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; - return ieee802154_mlme_ops(real_dev)->get_short_addr(real_dev); -} - -static u8 lowpan_get_dsn(const struct net_device *dev) -{ - struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; - return ieee802154_mlme_ops(real_dev)->get_dsn(real_dev); -} - -static struct header_ops lowpan_header_ops = { - .create = lowpan_header_create, -}; - -static const struct net_device_ops lowpan_netdev_ops = { - .ndo_start_xmit = lowpan_xmit, - .ndo_set_mac_address = lowpan_set_address, -}; - -static struct ieee802154_mlme_ops lowpan_mlme = { - .get_pan_id = lowpan_get_pan_id, - .get_phy = lowpan_get_phy, - .get_short_addr = lowpan_get_short_addr, - .get_dsn = lowpan_get_dsn, -}; - -static void lowpan_setup(struct net_device *dev) -{ - dev->addr_len = IEEE802154_ADDR_LEN; - memset(dev->broadcast, 0xff, IEEE802154_ADDR_LEN); - dev->type = ARPHRD_IEEE802154; - /* Frame Control + Sequence Number + Address fields + Security Header */ - dev->hard_header_len = 2 + 1 + 20 + 14; - dev->needed_tailroom = 2; /* FCS */ - dev->mtu = 1281; - dev->tx_queue_len = 0; - dev->flags = IFF_BROADCAST | IFF_MULTICAST; - dev->watchdog_timeo = 0; - - dev->netdev_ops = &lowpan_netdev_ops; - dev->header_ops = &lowpan_header_ops; - dev->ml_priv = &lowpan_mlme; - dev->destructor = free_netdev; -} - -static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[]) -{ - if (tb[IFLA_ADDRESS]) { - if (nla_len(tb[IFLA_ADDRESS]) != IEEE802154_ADDR_LEN) - return -EINVAL; - } - return 0; -} - -static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) -{ - struct sk_buff *local_skb; - - if (!netif_running(dev)) - goto drop; - - if (dev->type != ARPHRD_IEEE802154) - goto drop; - - /* check that it's our buffer */ - if (skb->data[0] == LOWPAN_DISPATCH_IPV6) { - /* Copy the packet so that the IPv6 header is - * properly aligned. - */ - local_skb = skb_copy_expand(skb, NET_SKB_PAD - 1, - skb_tailroom(skb), GFP_ATOMIC); - if (!local_skb) - goto drop; - - local_skb->protocol = htons(ETH_P_IPV6); - local_skb->pkt_type = PACKET_HOST; - - /* Pull off the 1-byte of 6lowpan header. 
*/ - skb_pull(local_skb, 1); - - lowpan_give_skb_to_devices(local_skb); - - kfree_skb(local_skb); - kfree_skb(skb); - } else { - switch (skb->data[0] & 0xe0) { - case LOWPAN_DISPATCH_IPHC: /* ipv6 datagram */ - case LOWPAN_DISPATCH_FRAG1: /* first fragment header */ - case LOWPAN_DISPATCH_FRAGN: /* next fragments headers */ - local_skb = skb_clone(skb, GFP_ATOMIC); - if (!local_skb) - goto drop; - lowpan_process_data(local_skb); - - kfree_skb(skb); - break; - default: - break; - } - } - - return NET_RX_SUCCESS; - -drop: - kfree_skb(skb); - return NET_RX_DROP; -} - -static int lowpan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) -{ - struct net_device *real_dev; - struct lowpan_dev_record *entry; - - pr_debug("adding new link\n"); - - if (!tb[IFLA_LINK]) - return -EINVAL; - /* find and hold real wpan device */ - real_dev = dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); - if (!real_dev) - return -ENODEV; - if (real_dev->type != ARPHRD_IEEE802154) { - dev_put(real_dev); - return -EINVAL; - } - - lowpan_dev_info(dev)->real_dev = real_dev; - lowpan_dev_info(dev)->fragment_tag = 0; - mutex_init(&lowpan_dev_info(dev)->dev_list_mtx); - - entry = kzalloc(sizeof(struct lowpan_dev_record), GFP_KERNEL); - if (!entry) { - dev_put(real_dev); - lowpan_dev_info(dev)->real_dev = NULL; - return -ENOMEM; - } - - entry->ldev = dev; - - /* Set the lowpan harware address to the wpan hardware address. */ - memcpy(dev->dev_addr, real_dev->dev_addr, IEEE802154_ADDR_LEN); - - mutex_lock(&lowpan_dev_info(dev)->dev_list_mtx); - INIT_LIST_HEAD(&entry->list); - list_add_tail(&entry->list, &lowpan_devices); - mutex_unlock(&lowpan_dev_info(dev)->dev_list_mtx); - - register_netdevice(dev); - - return 0; -} - -static void lowpan_dellink(struct net_device *dev, struct list_head *head) -{ - struct lowpan_dev_info *lowpan_dev = lowpan_dev_info(dev); - struct net_device *real_dev = lowpan_dev->real_dev; - struct lowpan_dev_record *entry, *tmp; - - ASSERT_RTNL(); - - mutex_lock(&lowpan_dev_info(dev)->dev_list_mtx); - list_for_each_entry_safe(entry, tmp, &lowpan_devices, list) { - if (entry->ldev == dev) { - list_del(&entry->list); - kfree(entry); - } - } - mutex_unlock(&lowpan_dev_info(dev)->dev_list_mtx); - - mutex_destroy(&lowpan_dev_info(dev)->dev_list_mtx); - - unregister_netdevice_queue(dev, head); - - dev_put(real_dev); -} - -static struct rtnl_link_ops lowpan_link_ops __read_mostly = { - .kind = "lowpan", - .priv_size = sizeof(struct lowpan_dev_info), - .setup = lowpan_setup, - .newlink = lowpan_newlink, - .dellink = lowpan_dellink, - .validate = lowpan_validate, -}; - -static inline int __init lowpan_netlink_init(void) -{ - return rtnl_link_register(&lowpan_link_ops); -} - -static inline void lowpan_netlink_fini(void) -{ - rtnl_link_unregister(&lowpan_link_ops); -} - -static int lowpan_device_event(struct notifier_block *unused, - unsigned long event, void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - LIST_HEAD(del_list); - struct lowpan_dev_record *entry, *tmp; - - if (dev->type != ARPHRD_IEEE802154) - goto out; - - if (event == NETDEV_UNREGISTER) { - list_for_each_entry_safe(entry, tmp, &lowpan_devices, list) { - if (lowpan_dev_info(entry->ldev)->real_dev == dev) - lowpan_dellink(entry->ldev, &del_list); - } - - unregister_netdevice_many(&del_list); - } - -out: - return NOTIFY_DONE; -} - -static struct notifier_block lowpan_dev_notifier = { - .notifier_call = lowpan_device_event, -}; - -static struct packet_type lowpan_packet_type = 
{ - .type = __constant_htons(ETH_P_IEEE802154), - .func = lowpan_rcv, -}; - -static int __init lowpan_init_module(void) -{ - int err = 0; - - err = lowpan_netlink_init(); - if (err < 0) - goto out; - - dev_add_pack(&lowpan_packet_type); - - err = register_netdevice_notifier(&lowpan_dev_notifier); - if (err < 0) { - dev_remove_pack(&lowpan_packet_type); - lowpan_netlink_fini(); - } -out: - return err; -} - -static void __exit lowpan_cleanup_module(void) -{ - struct lowpan_fragment *frame, *tframe; - - lowpan_netlink_fini(); - - dev_remove_pack(&lowpan_packet_type); - - unregister_netdevice_notifier(&lowpan_dev_notifier); - - /* Now 6lowpan packet_type is removed, so no new fragments are - * expected on RX, therefore that's the time to clean incomplete - * fragments. - */ - spin_lock_bh(&flist_lock); - list_for_each_entry_safe(frame, tframe, &lowpan_fragments, list) { - del_timer_sync(&frame->timer); - list_del(&frame->list); - dev_kfree_skb(frame->skb); - kfree(frame); - } - spin_unlock_bh(&flist_lock); -} - -module_init(lowpan_init_module); -module_exit(lowpan_cleanup_module); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_RTNL_LINK("lowpan"); diff --git a/net/ieee802154/6lowpan.h b/net/ieee802154/6lowpan.h deleted file mode 100644 index 2869c0526da..00000000000 --- a/net/ieee802154/6lowpan.h +++ /dev/null @@ -1,247 +0,0 @@ -/* - * Copyright 2011, Siemens AG - * written by Alexander Smirnov <alex.bluesman.smirnov@gmail.com> - */ - -/* - * Based on patches from Jon Smirl <jonsmirl@gmail.com> - * Copyright (c) 2011 Jon Smirl <jonsmirl@gmail.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - */ - -/* Jon's code is based on 6lowpan implementation for Contiki which is: - * Copyright (c) 2008, Swedish Institute of Computer Science. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the Institute nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#ifndef __6LOWPAN_H__ -#define __6LOWPAN_H__ - -#define UIP_802154_SHORTADDR_LEN 2 /* compressed ipv6 address length */ -#define UIP_IPH_LEN 40 /* ipv6 fixed header size */ -#define UIP_PROTO_UDP 17 /* ipv6 next header value for UDP */ -#define UIP_FRAGH_LEN 8 /* ipv6 fragment header size */ - -/* - * ipv6 address based on mac - * second bit-flip (Universe/Local) is done according RFC2464 - */ -#define is_addr_mac_addr_based(a, m) \ - ((((a)->s6_addr[8]) == (((m)[0]) ^ 0x02)) && \ - (((a)->s6_addr[9]) == (m)[1]) && \ - (((a)->s6_addr[10]) == (m)[2]) && \ - (((a)->s6_addr[11]) == (m)[3]) && \ - (((a)->s6_addr[12]) == (m)[4]) && \ - (((a)->s6_addr[13]) == (m)[5]) && \ - (((a)->s6_addr[14]) == (m)[6]) && \ - (((a)->s6_addr[15]) == (m)[7])) - -/* ipv6 address is unspecified */ -#define is_addr_unspecified(a) \ - ((((a)->s6_addr32[0]) == 0) && \ - (((a)->s6_addr32[1]) == 0) && \ - (((a)->s6_addr32[2]) == 0) && \ - (((a)->s6_addr32[3]) == 0)) - -/* compare ipv6 addresses prefixes */ -#define ipaddr_prefixcmp(addr1, addr2, length) \ - (memcmp(addr1, addr2, length >> 3) == 0) - -/* local link, i.e. FE80::/10 */ -#define is_addr_link_local(a) (((a)->s6_addr16[0]) == htons(0xFE80)) - -/* - * check whether we can compress the IID to 16 bits, - * it's possible for unicast adresses with first 49 bits are zero only. - */ -#define lowpan_is_iid_16_bit_compressable(a) \ - ((((a)->s6_addr16[4]) == 0) && \ - (((a)->s6_addr[10]) == 0) && \ - (((a)->s6_addr[11]) == 0xff) && \ - (((a)->s6_addr[12]) == 0xfe) && \ - (((a)->s6_addr[13]) == 0)) - -/* multicast address */ -#define is_addr_mcast(a) (((a)->s6_addr[0]) == 0xFF) - -/* check whether the 112-bit gid of the multicast address is mappable to: */ - -/* 9 bits, for FF02::1 (all nodes) and FF02::2 (all routers) addresses only. 
*/ -#define lowpan_is_mcast_addr_compressable(a) \ - ((((a)->s6_addr16[1]) == 0) && \ - (((a)->s6_addr16[2]) == 0) && \ - (((a)->s6_addr16[3]) == 0) && \ - (((a)->s6_addr16[4]) == 0) && \ - (((a)->s6_addr16[5]) == 0) && \ - (((a)->s6_addr16[6]) == 0) && \ - (((a)->s6_addr[14]) == 0) && \ - ((((a)->s6_addr[15]) == 1) || (((a)->s6_addr[15]) == 2))) - -/* 48 bits, FFXX::00XX:XXXX:XXXX */ -#define lowpan_is_mcast_addr_compressable48(a) \ - ((((a)->s6_addr16[1]) == 0) && \ - (((a)->s6_addr16[2]) == 0) && \ - (((a)->s6_addr16[3]) == 0) && \ - (((a)->s6_addr16[4]) == 0) && \ - (((a)->s6_addr[10]) == 0)) - -/* 32 bits, FFXX::00XX:XXXX */ -#define lowpan_is_mcast_addr_compressable32(a) \ - ((((a)->s6_addr16[1]) == 0) && \ - (((a)->s6_addr16[2]) == 0) && \ - (((a)->s6_addr16[3]) == 0) && \ - (((a)->s6_addr16[4]) == 0) && \ - (((a)->s6_addr16[5]) == 0) && \ - (((a)->s6_addr[12]) == 0)) - -/* 8 bits, FF02::00XX */ -#define lowpan_is_mcast_addr_compressable8(a) \ - ((((a)->s6_addr[1]) == 2) && \ - (((a)->s6_addr16[1]) == 0) && \ - (((a)->s6_addr16[2]) == 0) && \ - (((a)->s6_addr16[3]) == 0) && \ - (((a)->s6_addr16[4]) == 0) && \ - (((a)->s6_addr16[5]) == 0) && \ - (((a)->s6_addr16[6]) == 0) && \ - (((a)->s6_addr[14]) == 0)) - -#define lowpan_is_addr_broadcast(a) \ - ((((a)[0]) == 0xFF) && \ - (((a)[1]) == 0xFF) && \ - (((a)[2]) == 0xFF) && \ - (((a)[3]) == 0xFF) && \ - (((a)[4]) == 0xFF) && \ - (((a)[5]) == 0xFF) && \ - (((a)[6]) == 0xFF) && \ - (((a)[7]) == 0xFF)) - -#define LOWPAN_DISPATCH_IPV6 0x41 /* 01000001 = 65 */ -#define LOWPAN_DISPATCH_HC1 0x42 /* 01000010 = 66 */ -#define LOWPAN_DISPATCH_IPHC 0x60 /* 011xxxxx = ... */ -#define LOWPAN_DISPATCH_FRAG1 0xc0 /* 11000xxx */ -#define LOWPAN_DISPATCH_FRAGN 0xe0 /* 11100xxx */ - -#define LOWPAN_DISPATCH_MASK 0xf8 /* 11111000 */ - -#define LOWPAN_FRAG_TIMEOUT (HZ * 60) /* time-out 60 sec */ - -#define LOWPAN_FRAG1_HEAD_SIZE 0x4 -#define LOWPAN_FRAGN_HEAD_SIZE 0x5 - -/* - * According IEEE802.15.4 standard: - * - MTU is 127 octets - * - maximum MHR size is 37 octets - * - MFR size is 2 octets - * - * so minimal payload size that we may guarantee is: - * MTU - MHR - MFR = 88 octets - */ -#define LOWPAN_FRAG_SIZE 88 - -/* - * Values of fields within the IPHC encoding first byte - * (C stands for compressed and I for inline) - */ -#define LOWPAN_IPHC_TF 0x18 - -#define LOWPAN_IPHC_FL_C 0x10 -#define LOWPAN_IPHC_TC_C 0x08 -#define LOWPAN_IPHC_NH_C 0x04 -#define LOWPAN_IPHC_TTL_1 0x01 -#define LOWPAN_IPHC_TTL_64 0x02 -#define LOWPAN_IPHC_TTL_255 0x03 -#define LOWPAN_IPHC_TTL_I 0x00 - - -/* Values of fields within the IPHC encoding second byte */ -#define LOWPAN_IPHC_CID 0x80 - -#define LOWPAN_IPHC_ADDR_00 0x00 -#define LOWPAN_IPHC_ADDR_01 0x01 -#define LOWPAN_IPHC_ADDR_02 0x02 -#define LOWPAN_IPHC_ADDR_03 0x03 - -#define LOWPAN_IPHC_SAC 0x40 -#define LOWPAN_IPHC_SAM 0x30 - -#define LOWPAN_IPHC_SAM_BIT 4 - -#define LOWPAN_IPHC_M 0x08 -#define LOWPAN_IPHC_DAC 0x04 -#define LOWPAN_IPHC_DAM_00 0x00 -#define LOWPAN_IPHC_DAM_01 0x01 -#define LOWPAN_IPHC_DAM_10 0x02 -#define LOWPAN_IPHC_DAM_11 0x03 - -#define LOWPAN_IPHC_DAM_BIT 0 -/* - * LOWPAN_UDP encoding (works together with IPHC) - */ -#define LOWPAN_NHC_UDP_MASK 0xF8 -#define LOWPAN_NHC_UDP_ID 0xF0 -#define LOWPAN_NHC_UDP_CHECKSUMC 0x04 -#define LOWPAN_NHC_UDP_CHECKSUMI 0x00 - -#define LOWPAN_NHC_UDP_4BIT_PORT 0xF0B0 -#define LOWPAN_NHC_UDP_4BIT_MASK 0xFFF0 -#define LOWPAN_NHC_UDP_8BIT_PORT 0xF000 -#define LOWPAN_NHC_UDP_8BIT_MASK 0xFF00 - -/* values for port compression, _with checksum_ ie bit 5 set to 0 */ 
-#define LOWPAN_NHC_UDP_CS_P_00 0xF0 /* all inline */ -#define LOWPAN_NHC_UDP_CS_P_01 0xF1 /* source 16bit inline, - dest = 0xF0 + 8 bit inline */ -#define LOWPAN_NHC_UDP_CS_P_10 0xF2 /* source = 0xF0 + 8bit inline, - dest = 16 bit inline */ -#define LOWPAN_NHC_UDP_CS_P_11 0xF3 /* source & dest = 0xF0B + 4bit inline */ - -static inline bool lowpan_fetch_skb(struct sk_buff *skb, - void *data, const unsigned int len) -{ - if (unlikely(!pskb_may_pull(skb, len))) - return true; - - skb_copy_from_linear_data(skb, data, len); - skb_pull(skb, len); - - return false; -} - -#endif /* __6LOWPAN_H__ */ diff --git a/net/ieee802154/6lowpan_iphc.c b/net/ieee802154/6lowpan_iphc.c new file mode 100644 index 00000000000..211b5686d71 --- /dev/null +++ b/net/ieee802154/6lowpan_iphc.c @@ -0,0 +1,801 @@ +/* + * Copyright 2011, Siemens AG + * written by Alexander Smirnov <alex.bluesman.smirnov@gmail.com> + */ + +/* + * Based on patches from Jon Smirl <jonsmirl@gmail.com> + * Copyright (c) 2011 Jon Smirl <jonsmirl@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* Jon's code is based on 6lowpan implementation for Contiki which is: + * Copyright (c) 2008, Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Institute nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <linux/bitops.h> +#include <linux/if_arp.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <net/6lowpan.h> +#include <net/ipv6.h> +#include <net/af_ieee802154.h> + +/* + * Uncompress address function for source and + * destination address(non-multicast). + * + * address_mode is sam value or dam value. + */ +static int uncompress_addr(struct sk_buff *skb, + struct in6_addr *ipaddr, const u8 address_mode, + const u8 *lladdr, const u8 addr_type, + const u8 addr_len) +{ + bool fail; + + switch (address_mode) { + case LOWPAN_IPHC_ADDR_00: + /* for global link addresses */ + fail = lowpan_fetch_skb(skb, ipaddr->s6_addr, 16); + break; + case LOWPAN_IPHC_ADDR_01: + /* fe:80::XXXX:XXXX:XXXX:XXXX */ + ipaddr->s6_addr[0] = 0xFE; + ipaddr->s6_addr[1] = 0x80; + fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[8], 8); + break; + case LOWPAN_IPHC_ADDR_02: + /* fe:80::ff:fe00:XXXX */ + ipaddr->s6_addr[0] = 0xFE; + ipaddr->s6_addr[1] = 0x80; + ipaddr->s6_addr[11] = 0xFF; + ipaddr->s6_addr[12] = 0xFE; + fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[14], 2); + break; + case LOWPAN_IPHC_ADDR_03: + fail = false; + switch (addr_type) { + case IEEE802154_ADDR_LONG: + /* fe:80::XXXX:XXXX:XXXX:XXXX + * \_________________/ + * hwaddr + */ + ipaddr->s6_addr[0] = 0xFE; + ipaddr->s6_addr[1] = 0x80; + memcpy(&ipaddr->s6_addr[8], lladdr, addr_len); + /* second bit-flip (Universe/Local) + * is done according RFC2464 + */ + ipaddr->s6_addr[8] ^= 0x02; + break; + case IEEE802154_ADDR_SHORT: + /* fe:80::ff:fe00:XXXX + * \__/ + * short_addr + * + * Universe/Local bit is zero. + */ + ipaddr->s6_addr[0] = 0xFE; + ipaddr->s6_addr[1] = 0x80; + ipaddr->s6_addr[11] = 0xFF; + ipaddr->s6_addr[12] = 0xFE; + ipaddr->s6_addr16[7] = htons(*((u16 *)lladdr)); + break; + default: + pr_debug("Invalid addr_type set\n"); + return -EINVAL; + } + break; + default: + pr_debug("Invalid address mode value: 0x%x\n", address_mode); + return -EINVAL; + } + + if (fail) { + pr_debug("Failed to fetch skb data\n"); + return -EIO; + } + + raw_dump_inline(NULL, "Reconstructed ipv6 addr is", + ipaddr->s6_addr, 16); + + return 0; +} + +/* + * Uncompress address function for source context + * based address(non-multicast). 
+ */ +static int uncompress_context_based_src_addr(struct sk_buff *skb, + struct in6_addr *ipaddr, + const u8 sam) +{ + switch (sam) { + case LOWPAN_IPHC_ADDR_00: + /* unspec address :: + * Do nothing, address is already :: + */ + break; + case LOWPAN_IPHC_ADDR_01: + /* TODO */ + case LOWPAN_IPHC_ADDR_02: + /* TODO */ + case LOWPAN_IPHC_ADDR_03: + /* TODO */ + netdev_warn(skb->dev, "SAM value 0x%x not supported\n", sam); + return -EINVAL; + default: + pr_debug("Invalid sam value: 0x%x\n", sam); + return -EINVAL; + } + + raw_dump_inline(NULL, + "Reconstructed context based ipv6 src addr is", + ipaddr->s6_addr, 16); + + return 0; +} + +static int skb_deliver(struct sk_buff *skb, struct ipv6hdr *hdr, + struct net_device *dev, skb_delivery_cb deliver_skb) +{ + struct sk_buff *new; + int stat; + + new = skb_copy_expand(skb, sizeof(struct ipv6hdr), skb_tailroom(skb), + GFP_ATOMIC); + kfree_skb(skb); + + if (!new) + return -ENOMEM; + + skb_push(new, sizeof(struct ipv6hdr)); + skb_reset_network_header(new); + skb_copy_to_linear_data(new, hdr, sizeof(struct ipv6hdr)); + + new->protocol = htons(ETH_P_IPV6); + new->pkt_type = PACKET_HOST; + new->dev = dev; + + raw_dump_table(__func__, "raw skb data dump before receiving", + new->data, new->len); + + stat = deliver_skb(new, dev); + + kfree_skb(new); + + return stat; +} + +/* Uncompress function for multicast destination address, + * when M bit is set. + */ +static int +lowpan_uncompress_multicast_daddr(struct sk_buff *skb, + struct in6_addr *ipaddr, + const u8 dam) +{ + bool fail; + + switch (dam) { + case LOWPAN_IPHC_DAM_00: + /* 00: 128 bits. The full address + * is carried in-line. + */ + fail = lowpan_fetch_skb(skb, ipaddr->s6_addr, 16); + break; + case LOWPAN_IPHC_DAM_01: + /* 01: 48 bits. The address takes + * the form ffXX::00XX:XXXX:XXXX. + */ + ipaddr->s6_addr[0] = 0xFF; + fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[1], 1); + fail |= lowpan_fetch_skb(skb, &ipaddr->s6_addr[11], 5); + break; + case LOWPAN_IPHC_DAM_10: + /* 10: 32 bits. The address takes + * the form ffXX::00XX:XXXX. + */ + ipaddr->s6_addr[0] = 0xFF; + fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[1], 1); + fail |= lowpan_fetch_skb(skb, &ipaddr->s6_addr[13], 3); + break; + case LOWPAN_IPHC_DAM_11: + /* 11: 8 bits. The address takes + * the form ff02::00XX. 
+ */ + ipaddr->s6_addr[0] = 0xFF; + ipaddr->s6_addr[1] = 0x02; + fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[15], 1); + break; + default: + pr_debug("DAM value has a wrong value: 0x%x\n", dam); + return -EINVAL; + } + + if (fail) { + pr_debug("Failed to fetch skb data\n"); + return -EIO; + } + + raw_dump_inline(NULL, "Reconstructed ipv6 multicast addr is", + ipaddr->s6_addr, 16); + + return 0; +} + +static int +uncompress_udp_header(struct sk_buff *skb, struct udphdr *uh) +{ + bool fail; + u8 tmp = 0, val = 0; + + if (!uh) + goto err; + + fail = lowpan_fetch_skb(skb, &tmp, 1); + + if ((tmp & LOWPAN_NHC_UDP_MASK) == LOWPAN_NHC_UDP_ID) { + pr_debug("UDP header uncompression\n"); + switch (tmp & LOWPAN_NHC_UDP_CS_P_11) { + case LOWPAN_NHC_UDP_CS_P_00: + fail |= lowpan_fetch_skb(skb, &uh->source, 2); + fail |= lowpan_fetch_skb(skb, &uh->dest, 2); + break; + case LOWPAN_NHC_UDP_CS_P_01: + fail |= lowpan_fetch_skb(skb, &uh->source, 2); + fail |= lowpan_fetch_skb(skb, &val, 1); + uh->dest = htons(val + LOWPAN_NHC_UDP_8BIT_PORT); + break; + case LOWPAN_NHC_UDP_CS_P_10: + fail |= lowpan_fetch_skb(skb, &val, 1); + uh->source = htons(val + LOWPAN_NHC_UDP_8BIT_PORT); + fail |= lowpan_fetch_skb(skb, &uh->dest, 2); + break; + case LOWPAN_NHC_UDP_CS_P_11: + fail |= lowpan_fetch_skb(skb, &val, 1); + uh->source = htons(LOWPAN_NHC_UDP_4BIT_PORT + + (val >> 4)); + uh->dest = htons(LOWPAN_NHC_UDP_4BIT_PORT + + (val & 0x0f)); + break; + default: + pr_debug("ERROR: unknown UDP format\n"); + goto err; + break; + } + + pr_debug("uncompressed UDP ports: src = %d, dst = %d\n", + ntohs(uh->source), ntohs(uh->dest)); + + /* checksum */ + if (tmp & LOWPAN_NHC_UDP_CS_C) { + pr_debug_ratelimited("checksum elided currently not supported\n"); + goto err; + } else { + fail |= lowpan_fetch_skb(skb, &uh->check, 2); + } + + /* + * UDP lenght needs to be infered from the lower layers + * here, we obtain the hint from the remaining size of the + * frame + */ + uh->len = htons(skb->len + sizeof(struct udphdr)); + pr_debug("uncompressed UDP length: src = %d", ntohs(uh->len)); + } else { + pr_debug("ERROR: unsupported NH format\n"); + goto err; + } + + if (fail) + goto err; + + return 0; +err: + return -EINVAL; +} + +/* TTL uncompression values */ +static const u8 lowpan_ttl_values[] = { 0, 1, 64, 255 }; + +int lowpan_process_data(struct sk_buff *skb, struct net_device *dev, + const u8 *saddr, const u8 saddr_type, const u8 saddr_len, + const u8 *daddr, const u8 daddr_type, const u8 daddr_len, + u8 iphc0, u8 iphc1, skb_delivery_cb deliver_skb) +{ + struct ipv6hdr hdr = {}; + u8 tmp, num_context = 0; + int err; + + raw_dump_table(__func__, "raw skb data dump uncompressed", + skb->data, skb->len); + + /* another if the CID flag is set */ + if (iphc1 & LOWPAN_IPHC_CID) { + pr_debug("CID flag is set, increase header with one\n"); + if (lowpan_fetch_skb_u8(skb, &num_context)) + goto drop; + } + + hdr.version = 6; + + /* Traffic Class and Flow Label */ + switch ((iphc0 & LOWPAN_IPHC_TF) >> 3) { + /* + * Traffic Class and FLow Label carried in-line + * ECN + DSCP + 4-bit Pad + Flow Label (4 bytes) + */ + case 0: /* 00b */ + if (lowpan_fetch_skb_u8(skb, &tmp)) + goto drop; + + memcpy(&hdr.flow_lbl, &skb->data[0], 3); + skb_pull(skb, 3); + hdr.priority = ((tmp >> 2) & 0x0f); + hdr.flow_lbl[0] = ((tmp >> 2) & 0x30) | (tmp << 6) | + (hdr.flow_lbl[0] & 0x0f); + break; + /* + * Traffic class carried in-line + * ECN + DSCP (1 byte), Flow Label is elided + */ + case 2: /* 10b */ + if (lowpan_fetch_skb_u8(skb, &tmp)) + goto drop; + + 
hdr.priority = ((tmp >> 2) & 0x0f); + hdr.flow_lbl[0] = ((tmp << 6) & 0xC0) | ((tmp >> 2) & 0x30); + break; + /* + * Flow Label carried in-line + * ECN + 2-bit Pad + Flow Label (3 bytes), DSCP is elided + */ + case 1: /* 01b */ + if (lowpan_fetch_skb_u8(skb, &tmp)) + goto drop; + + hdr.flow_lbl[0] = (skb->data[0] & 0x0F) | ((tmp >> 2) & 0x30); + memcpy(&hdr.flow_lbl[1], &skb->data[0], 2); + skb_pull(skb, 2); + break; + /* Traffic Class and Flow Label are elided */ + case 3: /* 11b */ + break; + default: + break; + } + + /* Next Header */ + if ((iphc0 & LOWPAN_IPHC_NH_C) == 0) { + /* Next header is carried inline */ + if (lowpan_fetch_skb_u8(skb, &(hdr.nexthdr))) + goto drop; + + pr_debug("NH flag is set, next header carried inline: %02x\n", + hdr.nexthdr); + } + + /* Hop Limit */ + if ((iphc0 & 0x03) != LOWPAN_IPHC_TTL_I) + hdr.hop_limit = lowpan_ttl_values[iphc0 & 0x03]; + else { + if (lowpan_fetch_skb_u8(skb, &(hdr.hop_limit))) + goto drop; + } + + /* Extract SAM to the tmp variable */ + tmp = ((iphc1 & LOWPAN_IPHC_SAM) >> LOWPAN_IPHC_SAM_BIT) & 0x03; + + if (iphc1 & LOWPAN_IPHC_SAC) { + /* Source address context based uncompression */ + pr_debug("SAC bit is set. Handle context based source address.\n"); + err = uncompress_context_based_src_addr( + skb, &hdr.saddr, tmp); + } else { + /* Source address uncompression */ + pr_debug("source address stateless compression\n"); + err = uncompress_addr(skb, &hdr.saddr, tmp, saddr, + saddr_type, saddr_len); + } + + /* Check on error of previous branch */ + if (err) + goto drop; + + /* Extract DAM to the tmp variable */ + tmp = ((iphc1 & LOWPAN_IPHC_DAM_11) >> LOWPAN_IPHC_DAM_BIT) & 0x03; + + /* check for Multicast Compression */ + if (iphc1 & LOWPAN_IPHC_M) { + if (iphc1 & LOWPAN_IPHC_DAC) { + pr_debug("dest: context-based mcast compression\n"); + /* TODO: implement this */ + } else { + err = lowpan_uncompress_multicast_daddr( + skb, &hdr.daddr, tmp); + if (err) + goto drop; + } + } else { + err = uncompress_addr(skb, &hdr.daddr, tmp, daddr, + daddr_type, daddr_len); + pr_debug("dest: stateless compression mode %d dest %pI6c\n", + tmp, &hdr.daddr); + if (err) + goto drop; + } + + /* UDP data uncompression */ + if (iphc0 & LOWPAN_IPHC_NH_C) { + struct udphdr uh; + struct sk_buff *new; + if (uncompress_udp_header(skb, &uh)) + goto drop; + + /* + * replace the compressed UDP head by the uncompressed UDP + * header + */ + new = skb_copy_expand(skb, sizeof(struct udphdr), + skb_tailroom(skb), GFP_ATOMIC); + kfree_skb(skb); + + if (!new) + return -ENOMEM; + + skb = new; + + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); + skb_copy_to_linear_data(skb, &uh, sizeof(struct udphdr)); + + raw_dump_table(__func__, "raw UDP header dump", + (u8 *)&uh, sizeof(uh)); + + hdr.nexthdr = UIP_PROTO_UDP; + } + + hdr.payload_len = htons(skb->len); + + pr_debug("skb headroom size = %d, data length = %d\n", + skb_headroom(skb), skb->len); + + pr_debug("IPv6 header dump:\n\tversion = %d\n\tlength = %d\n\t" + "nexthdr = 0x%02x\n\thop_lim = %d\n\tdest = %pI6c\n", + hdr.version, ntohs(hdr.payload_len), hdr.nexthdr, + hdr.hop_limit, &hdr.daddr); + + raw_dump_table(__func__, "raw header dump", (u8 *)&hdr, + sizeof(hdr)); + + return skb_deliver(skb, &hdr, dev, deliver_skb); + +drop: + kfree_skb(skb); + return -EINVAL; +} +EXPORT_SYMBOL_GPL(lowpan_process_data); + +static u8 lowpan_compress_addr_64(u8 **hc06_ptr, u8 shift, + const struct in6_addr *ipaddr, + const unsigned char *lladdr) +{ + u8 val = 0; + + if (is_addr_mac_addr_based(ipaddr, lladdr)) { + 
val = 3; /* 0-bits */ + pr_debug("address compression 0 bits\n"); + } else if (lowpan_is_iid_16_bit_compressable(ipaddr)) { + /* compress IID to 16 bits xxxx::XXXX */ + memcpy(*hc06_ptr, &ipaddr->s6_addr16[7], 2); + *hc06_ptr += 2; + val = 2; /* 16-bits */ + raw_dump_inline(NULL, "Compressed ipv6 addr is (16 bits)", + *hc06_ptr - 2, 2); + } else { + /* do not compress IID => xxxx::IID */ + memcpy(*hc06_ptr, &ipaddr->s6_addr16[4], 8); + *hc06_ptr += 8; + val = 1; /* 64-bits */ + raw_dump_inline(NULL, "Compressed ipv6 addr is (64 bits)", + *hc06_ptr - 8, 8); + } + + return rol8(val, shift); +} + +static void compress_udp_header(u8 **hc06_ptr, struct sk_buff *skb) +{ + struct udphdr *uh = udp_hdr(skb); + u8 tmp; + + if (((ntohs(uh->source) & LOWPAN_NHC_UDP_4BIT_MASK) == + LOWPAN_NHC_UDP_4BIT_PORT) && + ((ntohs(uh->dest) & LOWPAN_NHC_UDP_4BIT_MASK) == + LOWPAN_NHC_UDP_4BIT_PORT)) { + pr_debug("UDP header: both ports compression to 4 bits\n"); + /* compression value */ + tmp = LOWPAN_NHC_UDP_CS_P_11; + lowpan_push_hc_data(hc06_ptr, &tmp, sizeof(tmp)); + /* source and destination port */ + tmp = ntohs(uh->dest) - LOWPAN_NHC_UDP_4BIT_PORT + + ((ntohs(uh->source) - LOWPAN_NHC_UDP_4BIT_PORT) << 4); + lowpan_push_hc_data(hc06_ptr, &tmp, sizeof(tmp)); + } else if ((ntohs(uh->dest) & LOWPAN_NHC_UDP_8BIT_MASK) == + LOWPAN_NHC_UDP_8BIT_PORT) { + pr_debug("UDP header: remove 8 bits of dest\n"); + /* compression value */ + tmp = LOWPAN_NHC_UDP_CS_P_01; + lowpan_push_hc_data(hc06_ptr, &tmp, sizeof(tmp)); + /* source port */ + lowpan_push_hc_data(hc06_ptr, &uh->source, sizeof(uh->source)); + /* destination port */ + tmp = ntohs(uh->dest) - LOWPAN_NHC_UDP_8BIT_PORT; + lowpan_push_hc_data(hc06_ptr, &tmp, sizeof(tmp)); + } else if ((ntohs(uh->source) & LOWPAN_NHC_UDP_8BIT_MASK) == + LOWPAN_NHC_UDP_8BIT_PORT) { + pr_debug("UDP header: remove 8 bits of source\n"); + /* compression value */ + tmp = LOWPAN_NHC_UDP_CS_P_10; + lowpan_push_hc_data(hc06_ptr, &tmp, sizeof(tmp)); + /* source port */ + tmp = ntohs(uh->source) - LOWPAN_NHC_UDP_8BIT_PORT; + lowpan_push_hc_data(hc06_ptr, &tmp, sizeof(tmp)); + /* destination port */ + lowpan_push_hc_data(hc06_ptr, &uh->dest, sizeof(uh->dest)); + } else { + pr_debug("UDP header: can't compress\n"); + /* compression value */ + tmp = LOWPAN_NHC_UDP_CS_P_00; + lowpan_push_hc_data(hc06_ptr, &tmp, sizeof(tmp)); + /* source port */ + lowpan_push_hc_data(hc06_ptr, &uh->source, sizeof(uh->source)); + /* destination port */ + lowpan_push_hc_data(hc06_ptr, &uh->dest, sizeof(uh->dest)); + } + + /* checksum is always inline */ + lowpan_push_hc_data(hc06_ptr, &uh->check, sizeof(uh->check)); + + /* skip the UDP header */ + skb_pull(skb, sizeof(struct udphdr)); +} + +int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, + unsigned short type, const void *_daddr, + const void *_saddr, unsigned int len) +{ + u8 tmp, iphc0, iphc1, *hc06_ptr; + struct ipv6hdr *hdr; + u8 head[100] = {}; + + if (type != ETH_P_IPV6) + return -EINVAL; + + hdr = ipv6_hdr(skb); + hc06_ptr = head + 2; + + pr_debug("IPv6 header dump:\n\tversion = %d\n\tlength = %d\n" + "\tnexthdr = 0x%02x\n\thop_lim = %d\n\tdest = %pI6c\n", + hdr->version, ntohs(hdr->payload_len), hdr->nexthdr, + hdr->hop_limit, &hdr->daddr); + + raw_dump_table(__func__, "raw skb network header dump", + skb_network_header(skb), sizeof(struct ipv6hdr)); + + /* + * As we copy some bit-length fields, in the IPHC encoding bytes, + * we sometimes use |= + * If the field is 0, and the current bit value in memory is 1, + * this does not 
work. We therefore reset the IPHC encoding here + */ + iphc0 = LOWPAN_DISPATCH_IPHC; + iphc1 = 0; + + /* TODO: context lookup */ + + raw_dump_inline(__func__, "saddr", + (unsigned char *)_saddr, IEEE802154_ADDR_LEN); + raw_dump_inline(__func__, "daddr", + (unsigned char *)_daddr, IEEE802154_ADDR_LEN); + + raw_dump_table(__func__, + "sending raw skb network uncompressed packet", + skb->data, skb->len); + + /* + * Traffic class, flow label + * If flow label is 0, compress it. If traffic class is 0, compress it + * We have to process both in the same time as the offset of traffic + * class depends on the presence of version and flow label + */ + + /* hc06 format of TC is ECN | DSCP , original one is DSCP | ECN */ + tmp = (hdr->priority << 4) | (hdr->flow_lbl[0] >> 4); + tmp = ((tmp & 0x03) << 6) | (tmp >> 2); + + if (((hdr->flow_lbl[0] & 0x0F) == 0) && + (hdr->flow_lbl[1] == 0) && (hdr->flow_lbl[2] == 0)) { + /* flow label can be compressed */ + iphc0 |= LOWPAN_IPHC_FL_C; + if ((hdr->priority == 0) && + ((hdr->flow_lbl[0] & 0xF0) == 0)) { + /* compress (elide) all */ + iphc0 |= LOWPAN_IPHC_TC_C; + } else { + /* compress only the flow label */ + *hc06_ptr = tmp; + hc06_ptr += 1; + } + } else { + /* Flow label cannot be compressed */ + if ((hdr->priority == 0) && + ((hdr->flow_lbl[0] & 0xF0) == 0)) { + /* compress only traffic class */ + iphc0 |= LOWPAN_IPHC_TC_C; + *hc06_ptr = (tmp & 0xc0) | (hdr->flow_lbl[0] & 0x0F); + memcpy(hc06_ptr + 1, &hdr->flow_lbl[1], 2); + hc06_ptr += 3; + } else { + /* compress nothing */ + memcpy(hc06_ptr, hdr, 4); + /* replace the top byte with new ECN | DSCP format */ + *hc06_ptr = tmp; + hc06_ptr += 4; + } + } + + /* NOTE: payload length is always compressed */ + + /* Next Header is compress if UDP */ + if (hdr->nexthdr == UIP_PROTO_UDP) + iphc0 |= LOWPAN_IPHC_NH_C; + + if ((iphc0 & LOWPAN_IPHC_NH_C) == 0) { + *hc06_ptr = hdr->nexthdr; + hc06_ptr += 1; + } + + /* + * Hop limit + * if 1: compress, encoding is 01 + * if 64: compress, encoding is 10 + * if 255: compress, encoding is 11 + * else do not compress + */ + switch (hdr->hop_limit) { + case 1: + iphc0 |= LOWPAN_IPHC_TTL_1; + break; + case 64: + iphc0 |= LOWPAN_IPHC_TTL_64; + break; + case 255: + iphc0 |= LOWPAN_IPHC_TTL_255; + break; + default: + *hc06_ptr = hdr->hop_limit; + hc06_ptr += 1; + break; + } + + /* source address compression */ + if (is_addr_unspecified(&hdr->saddr)) { + pr_debug("source address is unspecified, setting SAC\n"); + iphc1 |= LOWPAN_IPHC_SAC; + /* TODO: context lookup */ + } else if (is_addr_link_local(&hdr->saddr)) { + iphc1 |= lowpan_compress_addr_64(&hc06_ptr, + LOWPAN_IPHC_SAM_BIT, &hdr->saddr, _saddr); + pr_debug("source address unicast link-local %pI6c " + "iphc1 0x%02x\n", &hdr->saddr, iphc1); + } else { + pr_debug("send the full source address\n"); + memcpy(hc06_ptr, &hdr->saddr.s6_addr16[0], 16); + hc06_ptr += 16; + } + + /* destination address compression */ + if (is_addr_mcast(&hdr->daddr)) { + pr_debug("destination address is multicast: "); + iphc1 |= LOWPAN_IPHC_M; + if (lowpan_is_mcast_addr_compressable8(&hdr->daddr)) { + pr_debug("compressed to 1 octet\n"); + iphc1 |= LOWPAN_IPHC_DAM_11; + /* use last byte */ + *hc06_ptr = hdr->daddr.s6_addr[15]; + hc06_ptr += 1; + } else if (lowpan_is_mcast_addr_compressable32(&hdr->daddr)) { + pr_debug("compressed to 4 octets\n"); + iphc1 |= LOWPAN_IPHC_DAM_10; + /* second byte + the last three */ + *hc06_ptr = hdr->daddr.s6_addr[1]; + memcpy(hc06_ptr + 1, &hdr->daddr.s6_addr[13], 3); + hc06_ptr += 4; + } else if 
(lowpan_is_mcast_addr_compressable48(&hdr->daddr)) { + pr_debug("compressed to 6 octets\n"); + iphc1 |= LOWPAN_IPHC_DAM_01; + /* second byte + the last five */ + *hc06_ptr = hdr->daddr.s6_addr[1]; + memcpy(hc06_ptr + 1, &hdr->daddr.s6_addr[11], 5); + hc06_ptr += 6; + } else { + pr_debug("using full address\n"); + iphc1 |= LOWPAN_IPHC_DAM_00; + memcpy(hc06_ptr, &hdr->daddr.s6_addr[0], 16); + hc06_ptr += 16; + } + } else { + /* TODO: context lookup */ + if (is_addr_link_local(&hdr->daddr)) { + iphc1 |= lowpan_compress_addr_64(&hc06_ptr, + LOWPAN_IPHC_DAM_BIT, &hdr->daddr, _daddr); + pr_debug("dest address unicast link-local %pI6c " + "iphc1 0x%02x\n", &hdr->daddr, iphc1); + } else { + pr_debug("dest address unicast %pI6c\n", &hdr->daddr); + memcpy(hc06_ptr, &hdr->daddr.s6_addr16[0], 16); + hc06_ptr += 16; + } + } + + /* UDP header compression */ + if (hdr->nexthdr == UIP_PROTO_UDP) + compress_udp_header(&hc06_ptr, skb); + + head[0] = iphc0; + head[1] = iphc1; + + skb_pull(skb, sizeof(struct ipv6hdr)); + skb_reset_transport_header(skb); + memcpy(skb_push(skb, hc06_ptr - head), head, hc06_ptr - head); + skb_reset_network_header(skb); + + pr_debug("header len %d skb %u\n", (int)(hc06_ptr - head), skb->len); + + raw_dump_table(__func__, "raw skb data dump compressed", + skb->data, skb->len); + return 0; +} +EXPORT_SYMBOL_GPL(lowpan_header_compress); + +MODULE_LICENSE("GPL"); diff --git a/net/ieee802154/6lowpan_rtnl.c b/net/ieee802154/6lowpan_rtnl.c new file mode 100644 index 00000000000..fe6bd7a7108 --- /dev/null +++ b/net/ieee802154/6lowpan_rtnl.c @@ -0,0 +1,683 @@ +/* Copyright 2011, Siemens AG + * written by Alexander Smirnov <alex.bluesman.smirnov@gmail.com> + */ + +/* Based on patches from Jon Smirl <jonsmirl@gmail.com> + * Copyright (c) 2011 Jon Smirl <jonsmirl@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +/* Jon's code is based on 6lowpan implementation for Contiki which is: + * Copyright (c) 2008, Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Institute nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <linux/bitops.h> +#include <linux/if_arp.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/netdevice.h> +#include <net/af_ieee802154.h> +#include <net/ieee802154.h> +#include <net/ieee802154_netdev.h> +#include <net/6lowpan.h> +#include <net/ipv6.h> + +#include "reassembly.h" + +static LIST_HEAD(lowpan_devices); + +/* private device info */ +struct lowpan_dev_info { + struct net_device *real_dev; /* real WPAN device ptr */ + struct mutex dev_list_mtx; /* mutex for list ops */ + __be16 fragment_tag; +}; + +struct lowpan_dev_record { + struct net_device *ldev; + struct list_head list; +}; + +static inline struct +lowpan_dev_info *lowpan_dev_info(const struct net_device *dev) +{ + return netdev_priv(dev); +} + +static inline void lowpan_address_flip(u8 *src, u8 *dest) +{ + int i; + for (i = 0; i < IEEE802154_ADDR_LEN; i++) + (dest)[IEEE802154_ADDR_LEN - i - 1] = (src)[i]; +} + +static int lowpan_header_create(struct sk_buff *skb, + struct net_device *dev, + unsigned short type, const void *_daddr, + const void *_saddr, unsigned int len) +{ + const u8 *saddr = _saddr; + const u8 *daddr = _daddr; + struct ieee802154_addr sa, da; + struct ieee802154_mac_cb *cb = mac_cb_init(skb); + + /* TODO: + * if this package isn't ipv6 one, where should it be routed? + */ + if (type != ETH_P_IPV6) + return 0; + + if (!saddr) + saddr = dev->dev_addr; + + raw_dump_inline(__func__, "saddr", (unsigned char *)saddr, 8); + raw_dump_inline(__func__, "daddr", (unsigned char *)daddr, 8); + + lowpan_header_compress(skb, dev, type, daddr, saddr, len); + + /* NOTE1: I'm still unsure about the fact that compression and WPAN + * header are created here and not later in the xmit. So wait for + * an opinion of net maintainers. 
+ */ + /* NOTE2: to be absolutely correct, we must derive PANid information + * from MAC subif of the 'dev' and 'real_dev' network devices, but + * this isn't implemented in mainline yet, so currently we assign 0xff + */ + cb->type = IEEE802154_FC_TYPE_DATA; + + /* prepare wpan address data */ + sa.mode = IEEE802154_ADDR_LONG; + sa.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); + sa.extended_addr = ieee802154_devaddr_from_raw(saddr); + + /* intra-PAN communications */ + da.pan_id = sa.pan_id; + + /* if the destination address is the broadcast address, use the + * corresponding short address + */ + if (lowpan_is_addr_broadcast(daddr)) { + da.mode = IEEE802154_ADDR_SHORT; + da.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); + } else { + da.mode = IEEE802154_ADDR_LONG; + da.extended_addr = ieee802154_devaddr_from_raw(daddr); + } + + cb->ackreq = !lowpan_is_addr_broadcast(daddr); + + return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev, + type, (void *)&da, (void *)&sa, 0); +} + +static int lowpan_give_skb_to_devices(struct sk_buff *skb, + struct net_device *dev) +{ + struct lowpan_dev_record *entry; + struct sk_buff *skb_cp; + int stat = NET_RX_SUCCESS; + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &lowpan_devices, list) + if (lowpan_dev_info(entry->ldev)->real_dev == skb->dev) { + skb_cp = skb_copy(skb, GFP_ATOMIC); + if (!skb_cp) { + stat = -ENOMEM; + break; + } + + skb_cp->dev = entry->ldev; + stat = netif_rx(skb_cp); + } + rcu_read_unlock(); + + return stat; +} + +static int process_data(struct sk_buff *skb, const struct ieee802154_hdr *hdr) +{ + u8 iphc0, iphc1; + struct ieee802154_addr_sa sa, da; + void *sap, *dap; + + raw_dump_table(__func__, "raw skb data dump", skb->data, skb->len); + /* at least two bytes will be used for the encoding */ + if (skb->len < 2) + goto drop; + + if (lowpan_fetch_skb_u8(skb, &iphc0)) + goto drop; + + if (lowpan_fetch_skb_u8(skb, &iphc1)) + goto drop; + + ieee802154_addr_to_sa(&sa, &hdr->source); + ieee802154_addr_to_sa(&da, &hdr->dest); + + if (sa.addr_type == IEEE802154_ADDR_SHORT) + sap = &sa.short_addr; + else + sap = &sa.hwaddr; + + if (da.addr_type == IEEE802154_ADDR_SHORT) + dap = &da.short_addr; + else + dap = &da.hwaddr; + + return lowpan_process_data(skb, skb->dev, sap, sa.addr_type, + IEEE802154_ADDR_LEN, dap, da.addr_type, + IEEE802154_ADDR_LEN, iphc0, iphc1, + lowpan_give_skb_to_devices); + +drop: + kfree_skb(skb); + return -EINVAL; +} + +static int lowpan_set_address(struct net_device *dev, void *p) +{ + struct sockaddr *sa = p; + + if (netif_running(dev)) + return -EBUSY; + + /* TODO: validate addr */ + memcpy(dev->dev_addr, sa->sa_data, dev->addr_len); + + return 0; +} + +static struct sk_buff* +lowpan_alloc_frag(struct sk_buff *skb, int size, + const struct ieee802154_hdr *master_hdr) +{ + struct net_device *real_dev = lowpan_dev_info(skb->dev)->real_dev; + struct sk_buff *frag; + int rc; + + frag = alloc_skb(real_dev->hard_header_len + + real_dev->needed_tailroom + size, + GFP_ATOMIC); + + if (likely(frag)) { + frag->dev = real_dev; + frag->priority = skb->priority; + skb_reserve(frag, real_dev->hard_header_len); + skb_reset_network_header(frag); + *mac_cb(frag) = *mac_cb(skb); + + rc = dev_hard_header(frag, real_dev, 0, &master_hdr->dest, + &master_hdr->source, size); + if (rc < 0) { + kfree_skb(frag); + return ERR_PTR(-rc); + } + } else { + frag = ERR_PTR(ENOMEM); + } + + return frag; +} + +static int +lowpan_xmit_fragment(struct sk_buff *skb, const struct ieee802154_hdr *wpan_hdr, + u8 *frag_hdr, int 
frag_hdrlen, + int offset, int len) +{ + struct sk_buff *frag; + + raw_dump_inline(__func__, " fragment header", frag_hdr, frag_hdrlen); + + frag = lowpan_alloc_frag(skb, frag_hdrlen + len, wpan_hdr); + if (IS_ERR(frag)) + return -PTR_ERR(frag); + + memcpy(skb_put(frag, frag_hdrlen), frag_hdr, frag_hdrlen); + memcpy(skb_put(frag, len), skb_network_header(skb) + offset, len); + + raw_dump_table(__func__, " fragment dump", frag->data, frag->len); + + return dev_queue_xmit(frag); +} + +static int +lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *dev, + const struct ieee802154_hdr *wpan_hdr) +{ + u16 dgram_size, dgram_offset; + __be16 frag_tag; + u8 frag_hdr[5]; + int frag_cap, frag_len, payload_cap, rc; + int skb_unprocessed, skb_offset; + + dgram_size = lowpan_uncompress_size(skb, &dgram_offset) - + skb->mac_len; + frag_tag = lowpan_dev_info(dev)->fragment_tag++; + + frag_hdr[0] = LOWPAN_DISPATCH_FRAG1 | ((dgram_size >> 8) & 0x07); + frag_hdr[1] = dgram_size & 0xff; + memcpy(frag_hdr + 2, &frag_tag, sizeof(frag_tag)); + + payload_cap = ieee802154_max_payload(wpan_hdr); + + frag_len = round_down(payload_cap - LOWPAN_FRAG1_HEAD_SIZE - + skb_network_header_len(skb), 8); + + skb_offset = skb_network_header_len(skb); + skb_unprocessed = skb->len - skb->mac_len - skb_offset; + + rc = lowpan_xmit_fragment(skb, wpan_hdr, frag_hdr, + LOWPAN_FRAG1_HEAD_SIZE, 0, + frag_len + skb_network_header_len(skb)); + if (rc) { + pr_debug("%s unable to send FRAG1 packet (tag: %d)", + __func__, frag_tag); + goto err; + } + + frag_hdr[0] &= ~LOWPAN_DISPATCH_FRAG1; + frag_hdr[0] |= LOWPAN_DISPATCH_FRAGN; + frag_cap = round_down(payload_cap - LOWPAN_FRAGN_HEAD_SIZE, 8); + + do { + dgram_offset += frag_len; + skb_offset += frag_len; + skb_unprocessed -= frag_len; + frag_len = min(frag_cap, skb_unprocessed); + + frag_hdr[4] = dgram_offset >> 3; + + rc = lowpan_xmit_fragment(skb, wpan_hdr, frag_hdr, + LOWPAN_FRAGN_HEAD_SIZE, skb_offset, + frag_len); + if (rc) { + pr_debug("%s unable to send a FRAGN packet. (tag: %d, offset: %d)\n", + __func__, frag_tag, skb_offset); + goto err; + } + } while (skb_unprocessed > frag_cap); + + consume_skb(skb); + return NET_XMIT_SUCCESS; + +err: + kfree_skb(skb); + return rc; +} + +static netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ieee802154_hdr wpan_hdr; + int max_single; + + pr_debug("package xmit\n"); + + if (ieee802154_hdr_peek(skb, &wpan_hdr) < 0) { + kfree_skb(skb); + return NET_XMIT_DROP; + } + + max_single = ieee802154_max_payload(&wpan_hdr); + + if (skb_tail_pointer(skb) - skb_network_header(skb) <= max_single) { + skb->dev = lowpan_dev_info(dev)->real_dev; + return dev_queue_xmit(skb); + } else { + netdev_tx_t rc; + + pr_debug("frame is too big, fragmentation is needed\n"); + rc = lowpan_xmit_fragmented(skb, dev, &wpan_hdr); + + return rc < 0 ? 
NET_XMIT_DROP : rc; + } +} + +static struct wpan_phy *lowpan_get_phy(const struct net_device *dev) +{ + struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; + return ieee802154_mlme_ops(real_dev)->get_phy(real_dev); +} + +static __le16 lowpan_get_pan_id(const struct net_device *dev) +{ + struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; + return ieee802154_mlme_ops(real_dev)->get_pan_id(real_dev); +} + +static __le16 lowpan_get_short_addr(const struct net_device *dev) +{ + struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; + return ieee802154_mlme_ops(real_dev)->get_short_addr(real_dev); +} + +static u8 lowpan_get_dsn(const struct net_device *dev) +{ + struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; + return ieee802154_mlme_ops(real_dev)->get_dsn(real_dev); +} + +static struct header_ops lowpan_header_ops = { + .create = lowpan_header_create, +}; + +static struct lock_class_key lowpan_tx_busylock; +static struct lock_class_key lowpan_netdev_xmit_lock_key; + +static void lowpan_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, + &lowpan_netdev_xmit_lock_key); +} + + +static int lowpan_dev_init(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, lowpan_set_lockdep_class_one, NULL); + dev->qdisc_tx_busylock = &lowpan_tx_busylock; + return 0; +} + +static const struct net_device_ops lowpan_netdev_ops = { + .ndo_init = lowpan_dev_init, + .ndo_start_xmit = lowpan_xmit, + .ndo_set_mac_address = lowpan_set_address, +}; + +static struct ieee802154_mlme_ops lowpan_mlme = { + .get_pan_id = lowpan_get_pan_id, + .get_phy = lowpan_get_phy, + .get_short_addr = lowpan_get_short_addr, + .get_dsn = lowpan_get_dsn, +}; + +static void lowpan_setup(struct net_device *dev) +{ + dev->addr_len = IEEE802154_ADDR_LEN; + memset(dev->broadcast, 0xff, IEEE802154_ADDR_LEN); + dev->type = ARPHRD_IEEE802154; + /* Frame Control + Sequence Number + Address fields + Security Header */ + dev->hard_header_len = 2 + 1 + 20 + 14; + dev->needed_tailroom = 2; /* FCS */ + dev->mtu = 1281; + dev->tx_queue_len = 0; + dev->flags = IFF_BROADCAST | IFF_MULTICAST; + dev->watchdog_timeo = 0; + + dev->netdev_ops = &lowpan_netdev_ops; + dev->header_ops = &lowpan_header_ops; + dev->ml_priv = &lowpan_mlme; + dev->destructor = free_netdev; +} + +static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + if (tb[IFLA_ADDRESS]) { + if (nla_len(tb[IFLA_ADDRESS]) != IEEE802154_ADDR_LEN) + return -EINVAL; + } + return 0; +} + +static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct ieee802154_hdr hdr; + int ret; + + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + goto drop; + + if (!netif_running(dev)) + goto drop_skb; + + if (dev->type != ARPHRD_IEEE802154) + goto drop_skb; + + if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) + goto drop_skb; + + /* check that it's our buffer */ + if (skb->data[0] == LOWPAN_DISPATCH_IPV6) { + skb->protocol = htons(ETH_P_IPV6); + skb->pkt_type = PACKET_HOST; + + /* Pull off the 1-byte of 6lowpan header. 
*/ + skb_pull(skb, 1); + + ret = lowpan_give_skb_to_devices(skb, NULL); + if (ret == NET_RX_DROP) + goto drop; + } else { + switch (skb->data[0] & 0xe0) { + case LOWPAN_DISPATCH_IPHC: /* ipv6 datagram */ + ret = process_data(skb, &hdr); + if (ret == NET_RX_DROP) + goto drop; + break; + case LOWPAN_DISPATCH_FRAG1: /* first fragment header */ + ret = lowpan_frag_rcv(skb, LOWPAN_DISPATCH_FRAG1); + if (ret == 1) { + ret = process_data(skb, &hdr); + if (ret == NET_RX_DROP) + goto drop; + } + break; + case LOWPAN_DISPATCH_FRAGN: /* next fragments headers */ + ret = lowpan_frag_rcv(skb, LOWPAN_DISPATCH_FRAGN); + if (ret == 1) { + ret = process_data(skb, &hdr); + if (ret == NET_RX_DROP) + goto drop; + } + break; + default: + break; + } + } + + return NET_RX_SUCCESS; +drop_skb: + kfree_skb(skb); +drop: + return NET_RX_DROP; +} + +static int lowpan_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct net_device *real_dev; + struct lowpan_dev_record *entry; + + pr_debug("adding new link\n"); + + if (!tb[IFLA_LINK]) + return -EINVAL; + /* find and hold real wpan device */ + real_dev = dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); + if (!real_dev) + return -ENODEV; + if (real_dev->type != ARPHRD_IEEE802154) { + dev_put(real_dev); + return -EINVAL; + } + + lowpan_dev_info(dev)->real_dev = real_dev; + mutex_init(&lowpan_dev_info(dev)->dev_list_mtx); + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + dev_put(real_dev); + lowpan_dev_info(dev)->real_dev = NULL; + return -ENOMEM; + } + + entry->ldev = dev; + + /* Set the lowpan harware address to the wpan hardware address. */ + memcpy(dev->dev_addr, real_dev->dev_addr, IEEE802154_ADDR_LEN); + + mutex_lock(&lowpan_dev_info(dev)->dev_list_mtx); + INIT_LIST_HEAD(&entry->list); + list_add_tail(&entry->list, &lowpan_devices); + mutex_unlock(&lowpan_dev_info(dev)->dev_list_mtx); + + register_netdevice(dev); + + return 0; +} + +static void lowpan_dellink(struct net_device *dev, struct list_head *head) +{ + struct lowpan_dev_info *lowpan_dev = lowpan_dev_info(dev); + struct net_device *real_dev = lowpan_dev->real_dev; + struct lowpan_dev_record *entry, *tmp; + + ASSERT_RTNL(); + + mutex_lock(&lowpan_dev_info(dev)->dev_list_mtx); + list_for_each_entry_safe(entry, tmp, &lowpan_devices, list) { + if (entry->ldev == dev) { + list_del(&entry->list); + kfree(entry); + } + } + mutex_unlock(&lowpan_dev_info(dev)->dev_list_mtx); + + mutex_destroy(&lowpan_dev_info(dev)->dev_list_mtx); + + unregister_netdevice_queue(dev, head); + + dev_put(real_dev); +} + +static struct rtnl_link_ops lowpan_link_ops __read_mostly = { + .kind = "lowpan", + .priv_size = sizeof(struct lowpan_dev_info), + .setup = lowpan_setup, + .newlink = lowpan_newlink, + .dellink = lowpan_dellink, + .validate = lowpan_validate, +}; + +static inline int __init lowpan_netlink_init(void) +{ + return rtnl_link_register(&lowpan_link_ops); +} + +static inline void lowpan_netlink_fini(void) +{ + rtnl_link_unregister(&lowpan_link_ops); +} + +static int lowpan_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + LIST_HEAD(del_list); + struct lowpan_dev_record *entry, *tmp; + + if (dev->type != ARPHRD_IEEE802154) + goto out; + + if (event == NETDEV_UNREGISTER) { + list_for_each_entry_safe(entry, tmp, &lowpan_devices, list) { + if (lowpan_dev_info(entry->ldev)->real_dev == dev) + lowpan_dellink(entry->ldev, &del_list); + } + + 
unregister_netdevice_many(&del_list); + } + +out: + return NOTIFY_DONE; +} + +static struct notifier_block lowpan_dev_notifier = { + .notifier_call = lowpan_device_event, +}; + +static struct packet_type lowpan_packet_type = { + .type = htons(ETH_P_IEEE802154), + .func = lowpan_rcv, +}; + +static int __init lowpan_init_module(void) +{ + int err = 0; + + err = lowpan_net_frag_init(); + if (err < 0) + goto out; + + err = lowpan_netlink_init(); + if (err < 0) + goto out_frag; + + dev_add_pack(&lowpan_packet_type); + + err = register_netdevice_notifier(&lowpan_dev_notifier); + if (err < 0) + goto out_pack; + + return 0; + +out_pack: + dev_remove_pack(&lowpan_packet_type); + lowpan_netlink_fini(); +out_frag: + lowpan_net_frag_exit(); +out: + return err; +} + +static void __exit lowpan_cleanup_module(void) +{ + lowpan_netlink_fini(); + + dev_remove_pack(&lowpan_packet_type); + + lowpan_net_frag_exit(); + + unregister_netdevice_notifier(&lowpan_dev_notifier); +} + +module_init(lowpan_init_module); +module_exit(lowpan_cleanup_module); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("lowpan"); diff --git a/net/ieee802154/Kconfig b/net/ieee802154/Kconfig index b2e06df0076..8af1330b313 100644 --- a/net/ieee802154/Kconfig +++ b/net/ieee802154/Kconfig @@ -13,5 +13,12 @@ config IEEE802154 config IEEE802154_6LOWPAN tristate "6lowpan support over IEEE 802.15.4" depends on IEEE802154 && IPV6 + select 6LOWPAN_IPHC ---help--- - IPv6 compression over IEEE 802.15.4. + IPv6 compression over IEEE 802.15.4. + +config 6LOWPAN_IPHC + tristate + ---help--- + 6lowpan compression code which is shared between IEEE 802.15.4 and Bluetooth + stacks. diff --git a/net/ieee802154/Makefile b/net/ieee802154/Makefile index d7716d64c6b..bf1b51497a4 100644 --- a/net/ieee802154/Makefile +++ b/net/ieee802154/Makefile @@ -1,5 +1,10 @@ obj-$(CONFIG_IEEE802154) += ieee802154.o af_802154.o obj-$(CONFIG_IEEE802154_6LOWPAN) += 6lowpan.o +obj-$(CONFIG_6LOWPAN_IPHC) += 6lowpan_iphc.o -ieee802154-y := netlink.o nl-mac.o nl-phy.o nl_policy.o wpan-class.o +6lowpan-y := 6lowpan_rtnl.o reassembly.o +ieee802154-y := netlink.o nl-mac.o nl-phy.o nl_policy.o wpan-class.o \ + header_ops.o af_802154-y := af_ieee802154.o raw.o dgram.o + +ccflags-y += -D__CHECK_ENDIAN__ diff --git a/net/ieee802154/af802154.h b/net/ieee802154/af802154.h index b1ec5253752..8330a09bfc9 100644 --- a/net/ieee802154/af802154.h +++ b/net/ieee802154/af802154.h @@ -25,12 +25,13 @@ #define AF802154_H struct sk_buff; -struct net_devce; +struct net_device; +struct ieee802154_addr; extern struct proto ieee802154_raw_prot; extern struct proto ieee802154_dgram_prot; void ieee802154_raw_deliver(struct net_device *dev, struct sk_buff *skb); int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb); struct net_device *ieee802154_get_dev(struct net *net, - struct ieee802154_addr *addr); + const struct ieee802154_addr *addr); #endif diff --git a/net/ieee802154/af_ieee802154.c b/net/ieee802154/af_ieee802154.c index 40e606f3788..351d9a94ec2 100644 --- a/net/ieee802154/af_ieee802154.c +++ b/net/ieee802154/af_ieee802154.c @@ -43,25 +43,27 @@ /* * Utility function for families */ -struct net_device *ieee802154_get_dev(struct net *net, - struct ieee802154_addr *addr) +struct net_device* +ieee802154_get_dev(struct net *net, const struct ieee802154_addr *addr) { struct net_device *dev = NULL; struct net_device *tmp; - u16 pan_id, short_addr; + __le16 pan_id, short_addr; + u8 hwaddr[IEEE802154_ADDR_LEN]; - switch (addr->addr_type) { + switch (addr->mode) { case 
IEEE802154_ADDR_LONG: + ieee802154_devaddr_to_raw(hwaddr, addr->extended_addr); rcu_read_lock(); - dev = dev_getbyhwaddr_rcu(net, ARPHRD_IEEE802154, addr->hwaddr); + dev = dev_getbyhwaddr_rcu(net, ARPHRD_IEEE802154, hwaddr); if (dev) dev_hold(dev); rcu_read_unlock(); break; case IEEE802154_ADDR_SHORT: - if (addr->pan_id == 0xffff || - addr->short_addr == IEEE802154_ADDR_UNDEF || - addr->short_addr == 0xffff) + if (addr->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST) || + addr->short_addr == cpu_to_le16(IEEE802154_ADDR_UNDEF) || + addr->short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST)) break; rtnl_lock(); @@ -86,7 +88,7 @@ struct net_device *ieee802154_get_dev(struct net *net, break; default: pr_warning("Unsupported ieee802154 address type: %d\n", - addr->addr_type); + addr->mode); break; } @@ -326,7 +328,7 @@ drop: static struct packet_type ieee802154_packet_type = { - .type = __constant_htons(ETH_P_IEEE802154), + .type = htons(ETH_P_IEEE802154), .func = ieee802154_rcv, }; diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c index 1865fdf5a5a..4f0ed878019 100644 --- a/net/ieee802154/dgram.c +++ b/net/ieee802154/dgram.c @@ -21,6 +21,7 @@ * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> */ +#include <linux/capability.h> #include <linux/net.h> #include <linux/module.h> #include <linux/if_arp.h> @@ -45,7 +46,12 @@ struct dgram_sock { struct ieee802154_addr dst_addr; unsigned int bound:1; + unsigned int connected:1; unsigned int want_ack:1; + unsigned int secen:1; + unsigned int secen_override:1; + unsigned int seclevel:3; + unsigned int seclevel_override:1; }; static inline struct dgram_sock *dgram_sk(const struct sock *sk) @@ -73,10 +79,7 @@ static int dgram_init(struct sock *sk) { struct dgram_sock *ro = dgram_sk(sk); - ro->dst_addr.addr_type = IEEE802154_ADDR_LONG; - ro->dst_addr.pan_id = 0xffff; ro->want_ack = 1; - memset(&ro->dst_addr.hwaddr, 0xff, sizeof(ro->dst_addr.hwaddr)); return 0; } @@ -88,6 +91,7 @@ static void dgram_close(struct sock *sk, long timeout) static int dgram_bind(struct sock *sk, struct sockaddr *uaddr, int len) { struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr; + struct ieee802154_addr haddr; struct dgram_sock *ro = dgram_sk(sk); int err = -EINVAL; struct net_device *dev; @@ -102,7 +106,8 @@ static int dgram_bind(struct sock *sk, struct sockaddr *uaddr, int len) if (addr->family != AF_IEEE802154) goto out; - dev = ieee802154_get_dev(sock_net(sk), &addr->addr); + ieee802154_addr_from_sa(&haddr, &addr->addr); + dev = ieee802154_get_dev(sock_net(sk), &haddr); if (!dev) { err = -ENODEV; goto out; @@ -113,7 +118,7 @@ static int dgram_bind(struct sock *sk, struct sockaddr *uaddr, int len) goto out_put; } - memcpy(&ro->src_addr, &addr->addr, sizeof(struct ieee802154_addr)); + ro->src_addr = haddr; ro->bound = 1; err = 0; @@ -149,8 +154,7 @@ static int dgram_ioctl(struct sock *sk, int cmd, unsigned long arg) * of this packet since that is all * that will be read. 
*/ - /* FIXME: parse the header for more correct value */ - amount = skb->len - (3+8+8); + amount = skb->len - ieee802154_hdr_length(skb); } spin_unlock_bh(&sk->sk_receive_queue.lock); return put_user(amount, (int __user *)arg); @@ -181,7 +185,8 @@ static int dgram_connect(struct sock *sk, struct sockaddr *uaddr, goto out; } - memcpy(&ro->dst_addr, &addr->addr, sizeof(struct ieee802154_addr)); + ieee802154_addr_from_sa(&ro->dst_addr, &addr->addr); + ro->connected = 1; out: release_sock(sk); @@ -193,10 +198,7 @@ static int dgram_disconnect(struct sock *sk, int flags) struct dgram_sock *ro = dgram_sk(sk); lock_sock(sk); - - ro->dst_addr.addr_type = IEEE802154_ADDR_LONG; - memset(&ro->dst_addr.hwaddr, 0xff, sizeof(ro->dst_addr.hwaddr)); - + ro->connected = 0; release_sock(sk); return 0; @@ -208,7 +210,9 @@ static int dgram_sendmsg(struct kiocb *iocb, struct sock *sk, struct net_device *dev; unsigned int mtu; struct sk_buff *skb; + struct ieee802154_mac_cb *cb; struct dgram_sock *ro = dgram_sk(sk); + struct ieee802154_addr dst_addr; int hlen, tlen; int err; @@ -217,6 +221,11 @@ static int dgram_sendmsg(struct kiocb *iocb, struct sock *sk, return -EOPNOTSUPP; } + if (!ro->connected && !msg->msg_name) + return -EDESTADDRREQ; + else if (ro->connected && msg->msg_name) + return -EISCONN; + if (!ro->bound) dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154); else @@ -232,7 +241,7 @@ static int dgram_sendmsg(struct kiocb *iocb, struct sock *sk, if (size > mtu) { pr_debug("size = %Zu, mtu = %u\n", size, mtu); - err = -EINVAL; + err = -EMSGSIZE; goto out_dev; } @@ -248,18 +257,28 @@ static int dgram_sendmsg(struct kiocb *iocb, struct sock *sk, skb_reset_network_header(skb); - mac_cb(skb)->flags = IEEE802154_FC_TYPE_DATA; - if (ro->want_ack) - mac_cb(skb)->flags |= MAC_CB_FLAG_ACKREQ; + cb = mac_cb_init(skb); + cb->type = IEEE802154_FC_TYPE_DATA; + cb->ackreq = ro->want_ack; + + if (msg->msg_name) { + DECLARE_SOCKADDR(struct sockaddr_ieee802154*, daddr, msg->msg_name); + + ieee802154_addr_from_sa(&dst_addr, &daddr->addr); + } else { + dst_addr = ro->dst_addr; + } + + cb->secen = ro->secen; + cb->secen_override = ro->secen_override; + cb->seclevel = ro->seclevel; + cb->seclevel_override = ro->seclevel_override; - mac_cb(skb)->seq = ieee802154_mlme_ops(dev)->get_dsn(dev); - err = dev_hard_header(skb, dev, ETH_P_IEEE802154, &ro->dst_addr, - ro->bound ? &ro->src_addr : NULL, size); + err = dev_hard_header(skb, dev, ETH_P_IEEE802154, &dst_addr, + ro->bound ? 
&ro->src_addr : NULL, size); if (err < 0) goto out_skb; - skb_reset_mac_header(skb); - err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); if (err < 0) goto out_skb; @@ -291,9 +310,7 @@ static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk, size_t copied = 0; int err = -EOPNOTSUPP; struct sk_buff *skb; - struct sockaddr_ieee802154 *saddr; - - saddr = (struct sockaddr_ieee802154 *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_ieee802154 *, saddr, msg->msg_name); skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) @@ -314,7 +331,7 @@ static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk, if (saddr) { saddr->family = AF_IEEE802154; - saddr->addr = mac_cb(skb)->sa; + ieee802154_addr_to_sa(&saddr->addr, &mac_cb(skb)->source); *addr_len = sizeof(*saddr); } @@ -330,6 +347,10 @@ out: static int dgram_rcv_skb(struct sock *sk, struct sk_buff *skb) { + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + return NET_RX_DROP; + if (sock_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; @@ -338,40 +359,43 @@ static int dgram_rcv_skb(struct sock *sk, struct sk_buff *skb) return NET_RX_SUCCESS; } -static inline int ieee802154_match_sock(u8 *hw_addr, u16 pan_id, - u16 short_addr, struct dgram_sock *ro) +static inline bool +ieee802154_match_sock(__le64 hw_addr, __le16 pan_id, __le16 short_addr, + struct dgram_sock *ro) { if (!ro->bound) - return 1; + return true; - if (ro->src_addr.addr_type == IEEE802154_ADDR_LONG && - !memcmp(ro->src_addr.hwaddr, hw_addr, IEEE802154_ADDR_LEN)) - return 1; + if (ro->src_addr.mode == IEEE802154_ADDR_LONG && + hw_addr == ro->src_addr.extended_addr) + return true; - if (ro->src_addr.addr_type == IEEE802154_ADDR_SHORT && - pan_id == ro->src_addr.pan_id && - short_addr == ro->src_addr.short_addr) - return 1; + if (ro->src_addr.mode == IEEE802154_ADDR_SHORT && + pan_id == ro->src_addr.pan_id && + short_addr == ro->src_addr.short_addr) + return true; - return 0; + return false; } int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb) { struct sock *sk, *prev = NULL; int ret = NET_RX_SUCCESS; - u16 pan_id, short_addr; + __le16 pan_id, short_addr; + __le64 hw_addr; /* Data frame processing */ BUG_ON(dev->type != ARPHRD_IEEE802154); pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); short_addr = ieee802154_mlme_ops(dev)->get_short_addr(dev); + hw_addr = ieee802154_devaddr_from_raw(dev->dev_addr); read_lock(&dgram_lock); sk_for_each(sk, &dgram_head) { - if (ieee802154_match_sock(dev->dev_addr, pan_id, short_addr, - dgram_sk(sk))) { + if (ieee802154_match_sock(hw_addr, pan_id, short_addr, + dgram_sk(sk))) { if (prev) { struct sk_buff *clone; clone = skb_clone(skb, GFP_ATOMIC); @@ -413,6 +437,20 @@ static int dgram_getsockopt(struct sock *sk, int level, int optname, case WPAN_WANTACK: val = ro->want_ack; break; + case WPAN_SECURITY: + if (!ro->secen_override) + val = WPAN_SECURITY_DEFAULT; + else if (ro->secen) + val = WPAN_SECURITY_ON; + else + val = WPAN_SECURITY_OFF; + break; + case WPAN_SECURITY_LEVEL: + if (!ro->seclevel_override) + val = WPAN_SECURITY_LEVEL_DEFAULT; + else + val = ro->seclevel; + break; default: return -ENOPROTOOPT; } @@ -428,6 +466,7 @@ static int dgram_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { struct dgram_sock *ro = dgram_sk(sk); + struct net *net = sock_net(sk); int val; int err = 0; @@ -443,6 +482,47 @@ static int dgram_setsockopt(struct sock *sk, int level, int optname, case WPAN_WANTACK: ro->want_ack = !!val; break; + case 
WPAN_SECURITY: + if (!ns_capable(net->user_ns, CAP_NET_ADMIN) && + !ns_capable(net->user_ns, CAP_NET_RAW)) { + err = -EPERM; + break; + } + + switch (val) { + case WPAN_SECURITY_DEFAULT: + ro->secen_override = 0; + break; + case WPAN_SECURITY_ON: + ro->secen_override = 1; + ro->secen = 1; + break; + case WPAN_SECURITY_OFF: + ro->secen_override = 1; + ro->secen = 0; + break; + default: + err = -EINVAL; + break; + } + break; + case WPAN_SECURITY_LEVEL: + if (!ns_capable(net->user_ns, CAP_NET_ADMIN) && + !ns_capable(net->user_ns, CAP_NET_RAW)) { + err = -EPERM; + break; + } + + if (val < WPAN_SECURITY_LEVEL_DEFAULT || + val > IEEE802154_SCF_SECLEVEL_ENC_MIC128) { + err = -EINVAL; + } else if (val == WPAN_SECURITY_LEVEL_DEFAULT) { + ro->seclevel_override = 0; + } else { + ro->seclevel_override = 1; + ro->seclevel = val; + } + break; default: err = -ENOPROTOOPT; break; diff --git a/net/ieee802154/header_ops.c b/net/ieee802154/header_ops.c new file mode 100644 index 00000000000..c09294e39ca --- /dev/null +++ b/net/ieee802154/header_ops.c @@ -0,0 +1,325 @@ +/* + * Copyright (C) 2014 Fraunhofer ITWM + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Written by: + * Phoebe Buckheister <phoebe.buckheister@itwm.fraunhofer.de> + */ + +#include <net/mac802154.h> +#include <net/ieee802154.h> +#include <net/ieee802154_netdev.h> + +static int +ieee802154_hdr_push_addr(u8 *buf, const struct ieee802154_addr *addr, + bool omit_pan) +{ + int pos = 0; + + if (addr->mode == IEEE802154_ADDR_NONE) + return 0; + + if (!omit_pan) { + memcpy(buf + pos, &addr->pan_id, 2); + pos += 2; + } + + switch (addr->mode) { + case IEEE802154_ADDR_SHORT: + memcpy(buf + pos, &addr->short_addr, 2); + pos += 2; + break; + + case IEEE802154_ADDR_LONG: + memcpy(buf + pos, &addr->extended_addr, IEEE802154_ADDR_LEN); + pos += IEEE802154_ADDR_LEN; + break; + + default: + return -EINVAL; + } + + return pos; +} + +static int +ieee802154_hdr_push_sechdr(u8 *buf, const struct ieee802154_sechdr *hdr) +{ + int pos = 5; + + memcpy(buf, hdr, 1); + memcpy(buf + 1, &hdr->frame_counter, 4); + + switch (hdr->key_id_mode) { + case IEEE802154_SCF_KEY_IMPLICIT: + return pos; + + case IEEE802154_SCF_KEY_INDEX: + break; + + case IEEE802154_SCF_KEY_SHORT_INDEX: + memcpy(buf + pos, &hdr->short_src, 4); + pos += 4; + break; + + case IEEE802154_SCF_KEY_HW_INDEX: + memcpy(buf + pos, &hdr->extended_src, IEEE802154_ADDR_LEN); + pos += IEEE802154_ADDR_LEN; + break; + } + + buf[pos++] = hdr->key_id; + + return pos; +} + +int +ieee802154_hdr_push(struct sk_buff *skb, const struct ieee802154_hdr *hdr) +{ + u8 buf[MAC802154_FRAME_HARD_HEADER_LEN]; + int pos = 2; + int rc; + struct ieee802154_hdr_fc fc = hdr->fc; + + buf[pos++] = hdr->seq; + + fc.dest_addr_mode = hdr->dest.mode; + + rc = ieee802154_hdr_push_addr(buf + pos, &hdr->dest, false); + if (rc < 0) + return -EINVAL; + pos += rc; + + fc.source_addr_mode = hdr->source.mode; + + if (hdr->source.pan_id == hdr->dest.pan_id && + hdr->dest.mode != IEEE802154_ADDR_NONE) + fc.intra_pan = true; + + rc = ieee802154_hdr_push_addr(buf + pos, &hdr->source, fc.intra_pan); + if (rc < 0) + return -EINVAL; + pos += rc; + + if 
(fc.security_enabled) { + fc.version = 1; + + rc = ieee802154_hdr_push_sechdr(buf + pos, &hdr->sec); + if (rc < 0) + return -EINVAL; + + pos += rc; + } + + memcpy(buf, &fc, 2); + + memcpy(skb_push(skb, pos), buf, pos); + + return pos; +} +EXPORT_SYMBOL_GPL(ieee802154_hdr_push); + +static int +ieee802154_hdr_get_addr(const u8 *buf, int mode, bool omit_pan, + struct ieee802154_addr *addr) +{ + int pos = 0; + + addr->mode = mode; + + if (mode == IEEE802154_ADDR_NONE) + return 0; + + if (!omit_pan) { + memcpy(&addr->pan_id, buf + pos, 2); + pos += 2; + } + + if (mode == IEEE802154_ADDR_SHORT) { + memcpy(&addr->short_addr, buf + pos, 2); + return pos + 2; + } else { + memcpy(&addr->extended_addr, buf + pos, IEEE802154_ADDR_LEN); + return pos + IEEE802154_ADDR_LEN; + } +} + +static int ieee802154_hdr_addr_len(int mode, bool omit_pan) +{ + int pan_len = omit_pan ? 0 : 2; + + switch (mode) { + case IEEE802154_ADDR_NONE: return 0; + case IEEE802154_ADDR_SHORT: return 2 + pan_len; + case IEEE802154_ADDR_LONG: return IEEE802154_ADDR_LEN + pan_len; + default: return -EINVAL; + } +} + +static int +ieee802154_hdr_get_sechdr(const u8 *buf, struct ieee802154_sechdr *hdr) +{ + int pos = 5; + + memcpy(hdr, buf, 1); + memcpy(&hdr->frame_counter, buf + 1, 4); + + switch (hdr->key_id_mode) { + case IEEE802154_SCF_KEY_IMPLICIT: + return pos; + + case IEEE802154_SCF_KEY_INDEX: + break; + + case IEEE802154_SCF_KEY_SHORT_INDEX: + memcpy(&hdr->short_src, buf + pos, 4); + pos += 4; + break; + + case IEEE802154_SCF_KEY_HW_INDEX: + memcpy(&hdr->extended_src, buf + pos, IEEE802154_ADDR_LEN); + pos += IEEE802154_ADDR_LEN; + break; + } + + hdr->key_id = buf[pos++]; + + return pos; +} + +static int ieee802154_sechdr_lengths[4] = { + [IEEE802154_SCF_KEY_IMPLICIT] = 5, + [IEEE802154_SCF_KEY_INDEX] = 6, + [IEEE802154_SCF_KEY_SHORT_INDEX] = 10, + [IEEE802154_SCF_KEY_HW_INDEX] = 14, +}; + +static int ieee802154_hdr_sechdr_len(u8 sc) +{ + return ieee802154_sechdr_lengths[IEEE802154_SCF_KEY_ID_MODE(sc)]; +} + +static int ieee802154_hdr_minlen(const struct ieee802154_hdr *hdr) +{ + int dlen, slen; + + dlen = ieee802154_hdr_addr_len(hdr->fc.dest_addr_mode, false); + slen = ieee802154_hdr_addr_len(hdr->fc.source_addr_mode, + hdr->fc.intra_pan); + + if (slen < 0 || dlen < 0) + return -EINVAL; + + return 3 + dlen + slen + hdr->fc.security_enabled; +} + +static int +ieee802154_hdr_get_addrs(const u8 *buf, struct ieee802154_hdr *hdr) +{ + int pos = 0; + + pos += ieee802154_hdr_get_addr(buf + pos, hdr->fc.dest_addr_mode, + false, &hdr->dest); + pos += ieee802154_hdr_get_addr(buf + pos, hdr->fc.source_addr_mode, + hdr->fc.intra_pan, &hdr->source); + + if (hdr->fc.intra_pan) + hdr->source.pan_id = hdr->dest.pan_id; + + return pos; +} + +int +ieee802154_hdr_pull(struct sk_buff *skb, struct ieee802154_hdr *hdr) +{ + int pos = 3, rc; + + if (!pskb_may_pull(skb, 3)) + return -EINVAL; + + memcpy(hdr, skb->data, 3); + + rc = ieee802154_hdr_minlen(hdr); + if (rc < 0 || !pskb_may_pull(skb, rc)) + return -EINVAL; + + pos += ieee802154_hdr_get_addrs(skb->data + pos, hdr); + + if (hdr->fc.security_enabled) { + int want = pos + ieee802154_hdr_sechdr_len(skb->data[pos]); + + if (!pskb_may_pull(skb, want)) + return -EINVAL; + + pos += ieee802154_hdr_get_sechdr(skb->data + pos, &hdr->sec); + } + + skb_pull(skb, pos); + return pos; +} +EXPORT_SYMBOL_GPL(ieee802154_hdr_pull); + +int +ieee802154_hdr_peek_addrs(const struct sk_buff *skb, struct ieee802154_hdr *hdr) +{ + const u8 *buf = skb_mac_header(skb); + int pos = 3, rc; + + if (buf + 3 > 
skb_tail_pointer(skb)) + return -EINVAL; + + memcpy(hdr, buf, 3); + + rc = ieee802154_hdr_minlen(hdr); + if (rc < 0 || buf + rc > skb_tail_pointer(skb)) + return -EINVAL; + + pos += ieee802154_hdr_get_addrs(buf + pos, hdr); + return pos; +} +EXPORT_SYMBOL_GPL(ieee802154_hdr_peek_addrs); + +int +ieee802154_hdr_peek(const struct sk_buff *skb, struct ieee802154_hdr *hdr) +{ + const u8 *buf = skb_mac_header(skb); + int pos; + + pos = ieee802154_hdr_peek_addrs(skb, hdr); + if (pos < 0) + return -EINVAL; + + if (hdr->fc.security_enabled) { + u8 key_id_mode = IEEE802154_SCF_KEY_ID_MODE(*(buf + pos)); + int want = pos + ieee802154_sechdr_lengths[key_id_mode]; + + if (buf + want > skb_tail_pointer(skb)) + return -EINVAL; + + pos += ieee802154_hdr_get_sechdr(buf + pos, &hdr->sec); + } + + return pos; +} +EXPORT_SYMBOL_GPL(ieee802154_hdr_peek); + +int ieee802154_max_payload(const struct ieee802154_hdr *hdr) +{ + int hlen = ieee802154_hdr_minlen(hdr); + + if (hdr->fc.security_enabled) { + hlen += ieee802154_sechdr_lengths[hdr->sec.key_id_mode] - 1; + hlen += ieee802154_sechdr_authtag_len(&hdr->sec); + } + + return IEEE802154_MTU - hlen - IEEE802154_MFR_SIZE; +} +EXPORT_SYMBOL_GPL(ieee802154_max_payload); diff --git a/net/ieee802154/ieee802154.h b/net/ieee802154/ieee802154.h index cee4425b995..8b83a231299 100644 --- a/net/ieee802154/ieee802154.h +++ b/net/ieee802154/ieee802154.h @@ -66,5 +66,25 @@ int ieee802154_scan_req(struct sk_buff *skb, struct genl_info *info); int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info); int ieee802154_list_iface(struct sk_buff *skb, struct genl_info *info); int ieee802154_dump_iface(struct sk_buff *skb, struct netlink_callback *cb); +int ieee802154_set_macparams(struct sk_buff *skb, struct genl_info *info); + +int ieee802154_llsec_getparams(struct sk_buff *skb, struct genl_info *info); +int ieee802154_llsec_setparams(struct sk_buff *skb, struct genl_info *info); +int ieee802154_llsec_add_key(struct sk_buff *skb, struct genl_info *info); +int ieee802154_llsec_del_key(struct sk_buff *skb, struct genl_info *info); +int ieee802154_llsec_dump_keys(struct sk_buff *skb, + struct netlink_callback *cb); +int ieee802154_llsec_add_dev(struct sk_buff *skb, struct genl_info *info); +int ieee802154_llsec_del_dev(struct sk_buff *skb, struct genl_info *info); +int ieee802154_llsec_dump_devs(struct sk_buff *skb, + struct netlink_callback *cb); +int ieee802154_llsec_add_devkey(struct sk_buff *skb, struct genl_info *info); +int ieee802154_llsec_del_devkey(struct sk_buff *skb, struct genl_info *info); +int ieee802154_llsec_dump_devkeys(struct sk_buff *skb, + struct netlink_callback *cb); +int ieee802154_llsec_add_seclevel(struct sk_buff *skb, struct genl_info *info); +int ieee802154_llsec_del_seclevel(struct sk_buff *skb, struct genl_info *info); +int ieee802154_llsec_dump_seclevels(struct sk_buff *skb, + struct netlink_callback *cb); #endif diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c index 43f1b2bf469..26efcf4fd2f 100644 --- a/net/ieee802154/netlink.c +++ b/net/ieee802154/netlink.c @@ -123,6 +123,27 @@ static const struct genl_ops ieee8021154_ops[] = { IEEE802154_OP(IEEE802154_START_REQ, ieee802154_start_req), IEEE802154_DUMP(IEEE802154_LIST_IFACE, ieee802154_list_iface, ieee802154_dump_iface), + IEEE802154_OP(IEEE802154_SET_MACPARAMS, ieee802154_set_macparams), + IEEE802154_OP(IEEE802154_LLSEC_GETPARAMS, ieee802154_llsec_getparams), + IEEE802154_OP(IEEE802154_LLSEC_SETPARAMS, ieee802154_llsec_setparams), + IEEE802154_DUMP(IEEE802154_LLSEC_LIST_KEY, 
NULL, + ieee802154_llsec_dump_keys), + IEEE802154_OP(IEEE802154_LLSEC_ADD_KEY, ieee802154_llsec_add_key), + IEEE802154_OP(IEEE802154_LLSEC_DEL_KEY, ieee802154_llsec_del_key), + IEEE802154_DUMP(IEEE802154_LLSEC_LIST_DEV, NULL, + ieee802154_llsec_dump_devs), + IEEE802154_OP(IEEE802154_LLSEC_ADD_DEV, ieee802154_llsec_add_dev), + IEEE802154_OP(IEEE802154_LLSEC_DEL_DEV, ieee802154_llsec_del_dev), + IEEE802154_DUMP(IEEE802154_LLSEC_LIST_DEVKEY, NULL, + ieee802154_llsec_dump_devkeys), + IEEE802154_OP(IEEE802154_LLSEC_ADD_DEVKEY, ieee802154_llsec_add_devkey), + IEEE802154_OP(IEEE802154_LLSEC_DEL_DEVKEY, ieee802154_llsec_del_devkey), + IEEE802154_DUMP(IEEE802154_LLSEC_LIST_SECLEVEL, NULL, + ieee802154_llsec_dump_seclevels), + IEEE802154_OP(IEEE802154_LLSEC_ADD_SECLEVEL, + ieee802154_llsec_add_seclevel), + IEEE802154_OP(IEEE802154_LLSEC_DEL_SECLEVEL, + ieee802154_llsec_del_seclevel), }; static const struct genl_multicast_group ieee802154_mcgrps[] = { diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c index ba5c1e002f3..a3281b8bfd5 100644 --- a/net/ieee802154/nl-mac.c +++ b/net/ieee802154/nl-mac.c @@ -39,6 +39,26 @@ #include "ieee802154.h" +static int nla_put_hwaddr(struct sk_buff *msg, int type, __le64 hwaddr) +{ + return nla_put_u64(msg, type, swab64((__force u64)hwaddr)); +} + +static __le64 nla_get_hwaddr(const struct nlattr *nla) +{ + return ieee802154_devaddr_from_raw(nla_data(nla)); +} + +static int nla_put_shortaddr(struct sk_buff *msg, int type, __le16 addr) +{ + return nla_put_u16(msg, type, le16_to_cpu(addr)); +} + +static __le16 nla_get_shortaddr(const struct nlattr *nla) +{ + return cpu_to_le16(nla_get_u16(nla)); +} + int ieee802154_nl_assoc_indic(struct net_device *dev, struct ieee802154_addr *addr, u8 cap) { @@ -46,7 +66,7 @@ int ieee802154_nl_assoc_indic(struct net_device *dev, pr_debug("%s\n", __func__); - if (addr->addr_type != IEEE802154_ADDR_LONG) { + if (addr->mode != IEEE802154_ADDR_LONG) { pr_err("%s: received non-long source address!\n", __func__); return -EINVAL; } @@ -59,8 +79,8 @@ int ieee802154_nl_assoc_indic(struct net_device *dev, nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || nla_put(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, dev->dev_addr) || - nla_put(msg, IEEE802154_ATTR_SRC_HW_ADDR, IEEE802154_ADDR_LEN, - addr->hwaddr) || + nla_put_hwaddr(msg, IEEE802154_ATTR_SRC_HW_ADDR, + addr->extended_addr) || nla_put_u8(msg, IEEE802154_ATTR_CAPABILITY, cap)) goto nla_put_failure; @@ -72,7 +92,7 @@ nla_put_failure: } EXPORT_SYMBOL(ieee802154_nl_assoc_indic); -int ieee802154_nl_assoc_confirm(struct net_device *dev, u16 short_addr, +int ieee802154_nl_assoc_confirm(struct net_device *dev, __le16 short_addr, u8 status) { struct sk_buff *msg; @@ -87,7 +107,7 @@ int ieee802154_nl_assoc_confirm(struct net_device *dev, u16 short_addr, nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || nla_put(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, dev->dev_addr) || - nla_put_u16(msg, IEEE802154_ATTR_SHORT_ADDR, short_addr) || + nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR, short_addr) || nla_put_u8(msg, IEEE802154_ATTR_STATUS, status)) goto nla_put_failure; return ieee802154_nl_mcast(msg, IEEE802154_COORD_MCGRP); @@ -114,13 +134,13 @@ int ieee802154_nl_disassoc_indic(struct net_device *dev, nla_put(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, dev->dev_addr)) goto nla_put_failure; - if (addr->addr_type == IEEE802154_ADDR_LONG) { - if (nla_put(msg, IEEE802154_ATTR_SRC_HW_ADDR, IEEE802154_ADDR_LEN, - addr->hwaddr)) + if (addr->mode == 
IEEE802154_ADDR_LONG) { + if (nla_put_hwaddr(msg, IEEE802154_ATTR_SRC_HW_ADDR, + addr->extended_addr)) goto nla_put_failure; } else { - if (nla_put_u16(msg, IEEE802154_ATTR_SRC_SHORT_ADDR, - addr->short_addr)) + if (nla_put_shortaddr(msg, IEEE802154_ATTR_SRC_SHORT_ADDR, + addr->short_addr)) goto nla_put_failure; } if (nla_put_u8(msg, IEEE802154_ATTR_REASON, reason)) @@ -157,8 +177,8 @@ nla_put_failure: } EXPORT_SYMBOL(ieee802154_nl_disassoc_confirm); -int ieee802154_nl_beacon_indic(struct net_device *dev, - u16 panid, u16 coord_addr) +int ieee802154_nl_beacon_indic(struct net_device *dev, __le16 panid, + __le16 coord_addr) { struct sk_buff *msg; @@ -172,8 +192,9 @@ int ieee802154_nl_beacon_indic(struct net_device *dev, nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || nla_put(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, dev->dev_addr) || - nla_put_u16(msg, IEEE802154_ATTR_COORD_SHORT_ADDR, coord_addr) || - nla_put_u16(msg, IEEE802154_ATTR_COORD_PAN_ID, panid)) + nla_put_shortaddr(msg, IEEE802154_ATTR_COORD_SHORT_ADDR, + coord_addr) || + nla_put_shortaddr(msg, IEEE802154_ATTR_COORD_PAN_ID, panid)) goto nla_put_failure; return ieee802154_nl_mcast(msg, IEEE802154_COORD_MCGRP); @@ -243,6 +264,8 @@ static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid, { void *hdr; struct wpan_phy *phy; + struct ieee802154_mlme_ops *ops; + __le16 short_addr, pan_id; pr_debug("%s\n", __func__); @@ -251,19 +274,45 @@ static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid, if (!hdr) goto out; - phy = ieee802154_mlme_ops(dev)->get_phy(dev); + ops = ieee802154_mlme_ops(dev); + phy = ops->get_phy(dev); BUG_ON(!phy); + short_addr = ops->get_short_addr(dev); + pan_id = ops->get_pan_id(dev); + if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) || nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || nla_put(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, dev->dev_addr) || - nla_put_u16(msg, IEEE802154_ATTR_SHORT_ADDR, - ieee802154_mlme_ops(dev)->get_short_addr(dev)) || - nla_put_u16(msg, IEEE802154_ATTR_PAN_ID, - ieee802154_mlme_ops(dev)->get_pan_id(dev))) + nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR, short_addr) || + nla_put_shortaddr(msg, IEEE802154_ATTR_PAN_ID, pan_id)) goto nla_put_failure; + + if (ops->get_mac_params) { + struct ieee802154_mac_params params; + + ops->get_mac_params(dev, ¶ms); + + if (nla_put_s8(msg, IEEE802154_ATTR_TXPOWER, + params.transmit_power) || + nla_put_u8(msg, IEEE802154_ATTR_LBT_ENABLED, params.lbt) || + nla_put_u8(msg, IEEE802154_ATTR_CCA_MODE, + params.cca_mode) || + nla_put_s32(msg, IEEE802154_ATTR_CCA_ED_LEVEL, + params.cca_ed_level) || + nla_put_u8(msg, IEEE802154_ATTR_CSMA_RETRIES, + params.csma_retries) || + nla_put_u8(msg, IEEE802154_ATTR_CSMA_MIN_BE, + params.min_be) || + nla_put_u8(msg, IEEE802154_ATTR_CSMA_MAX_BE, + params.max_be) || + nla_put_s8(msg, IEEE802154_ATTR_FRAME_RETRIES, + params.frame_retries)) + goto nla_put_failure; + } + wpan_phy_put(phy); return genlmsg_end(msg, hdr); @@ -322,16 +371,16 @@ int ieee802154_associate_req(struct sk_buff *skb, struct genl_info *info) goto out; if (info->attrs[IEEE802154_ATTR_COORD_HW_ADDR]) { - addr.addr_type = IEEE802154_ADDR_LONG; - nla_memcpy(addr.hwaddr, - info->attrs[IEEE802154_ATTR_COORD_HW_ADDR], - IEEE802154_ADDR_LEN); + addr.mode = IEEE802154_ADDR_LONG; + addr.extended_addr = nla_get_hwaddr( + info->attrs[IEEE802154_ATTR_COORD_HW_ADDR]); } else { - addr.addr_type = IEEE802154_ADDR_SHORT; 
- addr.short_addr = nla_get_u16( + addr.mode = IEEE802154_ADDR_SHORT; + addr.short_addr = nla_get_shortaddr( info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]); } - addr.pan_id = nla_get_u16(info->attrs[IEEE802154_ATTR_COORD_PAN_ID]); + addr.pan_id = nla_get_shortaddr( + info->attrs[IEEE802154_ATTR_COORD_PAN_ID]); if (info->attrs[IEEE802154_ATTR_PAGE]) page = nla_get_u8(info->attrs[IEEE802154_ATTR_PAGE]); @@ -365,14 +414,13 @@ int ieee802154_associate_resp(struct sk_buff *skb, struct genl_info *info) if (!ieee802154_mlme_ops(dev)->assoc_resp) goto out; - addr.addr_type = IEEE802154_ADDR_LONG; - nla_memcpy(addr.hwaddr, info->attrs[IEEE802154_ATTR_DEST_HW_ADDR], - IEEE802154_ADDR_LEN); + addr.mode = IEEE802154_ADDR_LONG; + addr.extended_addr = nla_get_hwaddr( + info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]); addr.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); - ret = ieee802154_mlme_ops(dev)->assoc_resp(dev, &addr, - nla_get_u16(info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]), + nla_get_shortaddr(info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]), nla_get_u8(info->attrs[IEEE802154_ATTR_STATUS])); out: @@ -398,13 +446,12 @@ int ieee802154_disassociate_req(struct sk_buff *skb, struct genl_info *info) goto out; if (info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]) { - addr.addr_type = IEEE802154_ADDR_LONG; - nla_memcpy(addr.hwaddr, - info->attrs[IEEE802154_ATTR_DEST_HW_ADDR], - IEEE802154_ADDR_LEN); + addr.mode = IEEE802154_ADDR_LONG; + addr.extended_addr = nla_get_hwaddr( + info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]); } else { - addr.addr_type = IEEE802154_ADDR_SHORT; - addr.short_addr = nla_get_u16( + addr.mode = IEEE802154_ADDR_SHORT; + addr.short_addr = nla_get_shortaddr( info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]); } addr.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); @@ -449,10 +496,11 @@ int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info) if (!ieee802154_mlme_ops(dev)->start_req) goto out; - addr.addr_type = IEEE802154_ADDR_SHORT; - addr.short_addr = nla_get_u16( + addr.mode = IEEE802154_ADDR_SHORT; + addr.short_addr = nla_get_shortaddr( info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]); - addr.pan_id = nla_get_u16(info->attrs[IEEE802154_ATTR_COORD_PAN_ID]); + addr.pan_id = nla_get_shortaddr( + info->attrs[IEEE802154_ATTR_COORD_PAN_ID]); channel = nla_get_u8(info->attrs[IEEE802154_ATTR_CHANNEL]); bcn_ord = nla_get_u8(info->attrs[IEEE802154_ATTR_BCN_ORD]); @@ -467,7 +515,7 @@ int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info) page = 0; - if (addr.short_addr == IEEE802154_ADDR_BROADCAST) { + if (addr.short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST)) { ieee802154_nl_start_confirm(dev, IEEE802154_NO_SHORT_ADDRESS); dev_put(dev); return -EINVAL; @@ -577,3 +625,902 @@ cont: return skb->len; } + +int ieee802154_set_macparams(struct sk_buff *skb, struct genl_info *info) +{ + struct net_device *dev = NULL; + struct ieee802154_mlme_ops *ops; + struct ieee802154_mac_params params; + struct wpan_phy *phy; + int rc = -EINVAL; + + pr_debug("%s\n", __func__); + + dev = ieee802154_nl_get_dev(info); + if (!dev) + return -ENODEV; + + ops = ieee802154_mlme_ops(dev); + + if (!ops->get_mac_params || !ops->set_mac_params) { + rc = -EOPNOTSUPP; + goto out; + } + + if (netif_running(dev)) { + rc = -EBUSY; + goto out; + } + + if (!info->attrs[IEEE802154_ATTR_LBT_ENABLED] && + !info->attrs[IEEE802154_ATTR_CCA_MODE] && + !info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL] && + !info->attrs[IEEE802154_ATTR_CSMA_RETRIES] && + !info->attrs[IEEE802154_ATTR_CSMA_MIN_BE] && + 
!info->attrs[IEEE802154_ATTR_CSMA_MAX_BE] && + !info->attrs[IEEE802154_ATTR_FRAME_RETRIES]) + goto out; + + phy = ops->get_phy(dev); + + if ((!phy->set_lbt && info->attrs[IEEE802154_ATTR_LBT_ENABLED]) || + (!phy->set_cca_mode && info->attrs[IEEE802154_ATTR_CCA_MODE]) || + (!phy->set_cca_ed_level && + info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]) || + (!phy->set_csma_params && + (info->attrs[IEEE802154_ATTR_CSMA_RETRIES] || + info->attrs[IEEE802154_ATTR_CSMA_MIN_BE] || + info->attrs[IEEE802154_ATTR_CSMA_MAX_BE])) || + (!phy->set_frame_retries && + info->attrs[IEEE802154_ATTR_FRAME_RETRIES])) { + rc = -EOPNOTSUPP; + goto out_phy; + } + + ops->get_mac_params(dev, ¶ms); + + if (info->attrs[IEEE802154_ATTR_TXPOWER]) + params.transmit_power = nla_get_s8(info->attrs[IEEE802154_ATTR_TXPOWER]); + + if (info->attrs[IEEE802154_ATTR_LBT_ENABLED]) + params.lbt = nla_get_u8(info->attrs[IEEE802154_ATTR_LBT_ENABLED]); + + if (info->attrs[IEEE802154_ATTR_CCA_MODE]) + params.cca_mode = nla_get_u8(info->attrs[IEEE802154_ATTR_CCA_MODE]); + + if (info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]) + params.cca_ed_level = nla_get_s32(info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]); + + if (info->attrs[IEEE802154_ATTR_CSMA_RETRIES]) + params.csma_retries = nla_get_u8(info->attrs[IEEE802154_ATTR_CSMA_RETRIES]); + + if (info->attrs[IEEE802154_ATTR_CSMA_MIN_BE]) + params.min_be = nla_get_u8(info->attrs[IEEE802154_ATTR_CSMA_MIN_BE]); + + if (info->attrs[IEEE802154_ATTR_CSMA_MAX_BE]) + params.max_be = nla_get_u8(info->attrs[IEEE802154_ATTR_CSMA_MAX_BE]); + + if (info->attrs[IEEE802154_ATTR_FRAME_RETRIES]) + params.frame_retries = nla_get_s8(info->attrs[IEEE802154_ATTR_FRAME_RETRIES]); + + rc = ops->set_mac_params(dev, ¶ms); + + wpan_phy_put(phy); + dev_put(dev); + return rc; + +out_phy: + wpan_phy_put(phy); +out: + dev_put(dev); + return rc; +} + + + +static int +ieee802154_llsec_parse_key_id(struct genl_info *info, + struct ieee802154_llsec_key_id *desc) +{ + memset(desc, 0, sizeof(*desc)); + + if (!info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE]) + return -EINVAL; + + desc->mode = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE]); + + if (desc->mode == IEEE802154_SCF_KEY_IMPLICIT) { + if (!info->attrs[IEEE802154_ATTR_PAN_ID] && + !(info->attrs[IEEE802154_ATTR_SHORT_ADDR] || + info->attrs[IEEE802154_ATTR_HW_ADDR])) + return -EINVAL; + + desc->device_addr.pan_id = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_PAN_ID]); + + if (info->attrs[IEEE802154_ATTR_SHORT_ADDR]) { + desc->device_addr.mode = IEEE802154_ADDR_SHORT; + desc->device_addr.short_addr = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_SHORT_ADDR]); + } else { + desc->device_addr.mode = IEEE802154_ADDR_LONG; + desc->device_addr.extended_addr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]); + } + } + + if (desc->mode != IEEE802154_SCF_KEY_IMPLICIT && + !info->attrs[IEEE802154_ATTR_LLSEC_KEY_ID]) + return -EINVAL; + + if (desc->mode == IEEE802154_SCF_KEY_SHORT_INDEX && + !info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT]) + return -EINVAL; + + if (desc->mode == IEEE802154_SCF_KEY_HW_INDEX && + !info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED]) + return -EINVAL; + + if (desc->mode != IEEE802154_SCF_KEY_IMPLICIT) + desc->id = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_KEY_ID]); + + switch (desc->mode) { + case IEEE802154_SCF_KEY_SHORT_INDEX: + { + u32 source = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT]); + desc->short_source = cpu_to_le32(source); + break; + } + case IEEE802154_SCF_KEY_HW_INDEX: + desc->extended_source = 
nla_get_hwaddr(info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED]); + break; + } + + return 0; +} + +static int +ieee802154_llsec_fill_key_id(struct sk_buff *msg, + const struct ieee802154_llsec_key_id *desc) +{ + if (nla_put_u8(msg, IEEE802154_ATTR_LLSEC_KEY_MODE, desc->mode)) + return -EMSGSIZE; + + if (desc->mode == IEEE802154_SCF_KEY_IMPLICIT) { + if (nla_put_shortaddr(msg, IEEE802154_ATTR_PAN_ID, + desc->device_addr.pan_id)) + return -EMSGSIZE; + + if (desc->device_addr.mode == IEEE802154_ADDR_SHORT && + nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR, + desc->device_addr.short_addr)) + return -EMSGSIZE; + + if (desc->device_addr.mode == IEEE802154_ADDR_LONG && + nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, + desc->device_addr.extended_addr)) + return -EMSGSIZE; + } + + if (desc->mode != IEEE802154_SCF_KEY_IMPLICIT && + nla_put_u8(msg, IEEE802154_ATTR_LLSEC_KEY_ID, desc->id)) + return -EMSGSIZE; + + if (desc->mode == IEEE802154_SCF_KEY_SHORT_INDEX && + nla_put_u32(msg, IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT, + le32_to_cpu(desc->short_source))) + return -EMSGSIZE; + + if (desc->mode == IEEE802154_SCF_KEY_HW_INDEX && + nla_put_hwaddr(msg, IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED, + desc->extended_source)) + return -EMSGSIZE; + + return 0; +} + +int ieee802154_llsec_getparams(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + struct net_device *dev = NULL; + int rc = -ENOBUFS; + struct ieee802154_mlme_ops *ops; + void *hdr; + struct ieee802154_llsec_params params; + + pr_debug("%s\n", __func__); + + dev = ieee802154_nl_get_dev(info); + if (!dev) + return -ENODEV; + + ops = ieee802154_mlme_ops(dev); + if (!ops->llsec) { + rc = -EOPNOTSUPP; + goto out_dev; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + goto out_dev; + + hdr = genlmsg_put(msg, 0, info->snd_seq, &nl802154_family, 0, + IEEE802154_LLSEC_GETPARAMS); + if (!hdr) + goto out_free; + + rc = ops->llsec->get_params(dev, ¶ms); + if (rc < 0) + goto out_free; + + if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || + nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || + nla_put_u8(msg, IEEE802154_ATTR_LLSEC_ENABLED, params.enabled) || + nla_put_u8(msg, IEEE802154_ATTR_LLSEC_SECLEVEL, params.out_level) || + nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER, + be32_to_cpu(params.frame_counter)) || + ieee802154_llsec_fill_key_id(msg, ¶ms.out_key)) + goto out_free; + + dev_put(dev); + + return ieee802154_nl_reply(msg, info); +out_free: + nlmsg_free(msg); +out_dev: + dev_put(dev); + return rc; +} + +int ieee802154_llsec_setparams(struct sk_buff *skb, struct genl_info *info) +{ + struct net_device *dev = NULL; + int rc = -EINVAL; + struct ieee802154_mlme_ops *ops; + struct ieee802154_llsec_params params; + int changed = 0; + + pr_debug("%s\n", __func__); + + dev = ieee802154_nl_get_dev(info); + if (!dev) + return -ENODEV; + + if (!info->attrs[IEEE802154_ATTR_LLSEC_ENABLED] && + !info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE] && + !info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]) + goto out; + + ops = ieee802154_mlme_ops(dev); + if (!ops->llsec) { + rc = -EOPNOTSUPP; + goto out; + } + + if (info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL] && + nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]) > 7) + goto out; + + if (info->attrs[IEEE802154_ATTR_LLSEC_ENABLED]) { + params.enabled = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_ENABLED]); + changed |= IEEE802154_LLSEC_PARAM_ENABLED; + } + + if (info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE]) { + if 
(ieee802154_llsec_parse_key_id(info, ¶ms.out_key)) + goto out; + + changed |= IEEE802154_LLSEC_PARAM_OUT_KEY; + } + + if (info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]) { + params.out_level = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]); + changed |= IEEE802154_LLSEC_PARAM_OUT_LEVEL; + } + + if (info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]) { + u32 fc = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]); + + params.frame_counter = cpu_to_be32(fc); + changed |= IEEE802154_LLSEC_PARAM_FRAME_COUNTER; + } + + rc = ops->llsec->set_params(dev, ¶ms, changed); + + dev_put(dev); + + return rc; +out: + dev_put(dev); + return rc; +} + + + +struct llsec_dump_data { + struct sk_buff *skb; + int s_idx, s_idx2; + int portid; + int nlmsg_seq; + struct net_device *dev; + struct ieee802154_mlme_ops *ops; + struct ieee802154_llsec_table *table; +}; + +static int +ieee802154_llsec_dump_table(struct sk_buff *skb, struct netlink_callback *cb, + int (*step)(struct llsec_dump_data*)) +{ + struct net *net = sock_net(skb->sk); + struct net_device *dev; + struct llsec_dump_data data; + int idx = 0; + int first_dev = cb->args[0]; + int rc; + + for_each_netdev(net, dev) { + if (idx < first_dev || dev->type != ARPHRD_IEEE802154) + goto skip; + + data.ops = ieee802154_mlme_ops(dev); + if (!data.ops->llsec) + goto skip; + + data.skb = skb; + data.s_idx = cb->args[1]; + data.s_idx2 = cb->args[2]; + data.dev = dev; + data.portid = NETLINK_CB(cb->skb).portid; + data.nlmsg_seq = cb->nlh->nlmsg_seq; + + data.ops->llsec->lock_table(dev); + data.ops->llsec->get_table(data.dev, &data.table); + rc = step(&data); + data.ops->llsec->unlock_table(dev); + + if (rc < 0) + break; + +skip: + idx++; + } + cb->args[0] = idx; + + return skb->len; +} + +static int +ieee802154_nl_llsec_change(struct sk_buff *skb, struct genl_info *info, + int (*fn)(struct net_device*, struct genl_info*)) +{ + struct net_device *dev = NULL; + int rc = -EINVAL; + + dev = ieee802154_nl_get_dev(info); + if (!dev) + return -ENODEV; + + if (!ieee802154_mlme_ops(dev)->llsec) + rc = -EOPNOTSUPP; + else + rc = fn(dev, info); + + dev_put(dev); + return rc; +} + + + +static int +ieee802154_llsec_parse_key(struct genl_info *info, + struct ieee802154_llsec_key *key) +{ + u8 frames; + u32 commands[256 / 32]; + + memset(key, 0, sizeof(*key)); + + if (!info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES] || + !info->attrs[IEEE802154_ATTR_LLSEC_KEY_BYTES]) + return -EINVAL; + + frames = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES]); + if ((frames & BIT(IEEE802154_FC_TYPE_MAC_CMD)) && + !info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS]) + return -EINVAL; + + if (info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS]) { + nla_memcpy(commands, + info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS], + 256 / 8); + + if (commands[0] || commands[1] || commands[2] || commands[3] || + commands[4] || commands[5] || commands[6] || + commands[7] >= BIT(IEEE802154_CMD_GTS_REQ + 1)) + return -EINVAL; + + key->cmd_frame_ids = commands[7]; + } + + key->frame_types = frames; + + nla_memcpy(key->key, info->attrs[IEEE802154_ATTR_LLSEC_KEY_BYTES], + IEEE802154_LLSEC_KEY_SIZE); + + return 0; +} + +static int llsec_add_key(struct net_device *dev, struct genl_info *info) +{ + struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); + struct ieee802154_llsec_key key; + struct ieee802154_llsec_key_id id; + + if (ieee802154_llsec_parse_key(info, &key) || + ieee802154_llsec_parse_key_id(info, &id)) + return -EINVAL; + + return 
ops->llsec->add_key(dev, &id, &key); +} + +int ieee802154_llsec_add_key(struct sk_buff *skb, struct genl_info *info) +{ + if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) != + (NLM_F_CREATE | NLM_F_EXCL)) + return -EINVAL; + + return ieee802154_nl_llsec_change(skb, info, llsec_add_key); +} + +static int llsec_remove_key(struct net_device *dev, struct genl_info *info) +{ + struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); + struct ieee802154_llsec_key_id id; + + if (ieee802154_llsec_parse_key_id(info, &id)) + return -EINVAL; + + return ops->llsec->del_key(dev, &id); +} + +int ieee802154_llsec_del_key(struct sk_buff *skb, struct genl_info *info) +{ + return ieee802154_nl_llsec_change(skb, info, llsec_remove_key); +} + +static int +ieee802154_nl_fill_key(struct sk_buff *msg, u32 portid, u32 seq, + const struct ieee802154_llsec_key_entry *key, + const struct net_device *dev) +{ + void *hdr; + u32 commands[256 / 32]; + + hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI, + IEEE802154_LLSEC_LIST_KEY); + if (!hdr) + goto out; + + if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || + nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || + ieee802154_llsec_fill_key_id(msg, &key->id) || + nla_put_u8(msg, IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES, + key->key->frame_types)) + goto nla_put_failure; + + if (key->key->frame_types & BIT(IEEE802154_FC_TYPE_MAC_CMD)) { + memset(commands, 0, sizeof(commands)); + commands[7] = key->key->cmd_frame_ids; + if (nla_put(msg, IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS, + sizeof(commands), commands)) + goto nla_put_failure; + } + + if (nla_put(msg, IEEE802154_ATTR_LLSEC_KEY_BYTES, + IEEE802154_LLSEC_KEY_SIZE, key->key->key)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); +out: + return -EMSGSIZE; +} + +static int llsec_iter_keys(struct llsec_dump_data *data) +{ + struct ieee802154_llsec_key_entry *pos; + int rc = 0, idx = 0; + + list_for_each_entry(pos, &data->table->keys, list) { + if (idx++ < data->s_idx) + continue; + + if (ieee802154_nl_fill_key(data->skb, data->portid, + data->nlmsg_seq, pos, data->dev)) { + rc = -EMSGSIZE; + break; + } + + data->s_idx++; + } + + return rc; +} + +int ieee802154_llsec_dump_keys(struct sk_buff *skb, struct netlink_callback *cb) +{ + return ieee802154_llsec_dump_table(skb, cb, llsec_iter_keys); +} + + + +static int +llsec_parse_dev(struct genl_info *info, + struct ieee802154_llsec_device *dev) +{ + memset(dev, 0, sizeof(*dev)); + + if (!info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER] || + !info->attrs[IEEE802154_ATTR_HW_ADDR] || + !info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE] || + !info->attrs[IEEE802154_ATTR_LLSEC_DEV_KEY_MODE] || + (!!info->attrs[IEEE802154_ATTR_PAN_ID] != + !!info->attrs[IEEE802154_ATTR_SHORT_ADDR])) + return -EINVAL; + + if (info->attrs[IEEE802154_ATTR_PAN_ID]) { + dev->pan_id = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_PAN_ID]); + dev->short_addr = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_SHORT_ADDR]); + } else { + dev->short_addr = cpu_to_le16(IEEE802154_ADDR_UNDEF); + } + + dev->hwaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]); + dev->frame_counter = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]); + dev->seclevel_exempt = !!nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE]); + dev->key_mode = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_DEV_KEY_MODE]); + + if (dev->key_mode >= __IEEE802154_LLSEC_DEVKEY_MAX) + return -EINVAL; + + return 0; +} + 
+static int llsec_add_dev(struct net_device *dev, struct genl_info *info) +{ + struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); + struct ieee802154_llsec_device desc; + + if (llsec_parse_dev(info, &desc)) + return -EINVAL; + + return ops->llsec->add_dev(dev, &desc); +} + +int ieee802154_llsec_add_dev(struct sk_buff *skb, struct genl_info *info) +{ + if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) != + (NLM_F_CREATE | NLM_F_EXCL)) + return -EINVAL; + + return ieee802154_nl_llsec_change(skb, info, llsec_add_dev); +} + +static int llsec_del_dev(struct net_device *dev, struct genl_info *info) +{ + struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); + __le64 devaddr; + + if (!info->attrs[IEEE802154_ATTR_HW_ADDR]) + return -EINVAL; + + devaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]); + + return ops->llsec->del_dev(dev, devaddr); +} + +int ieee802154_llsec_del_dev(struct sk_buff *skb, struct genl_info *info) +{ + return ieee802154_nl_llsec_change(skb, info, llsec_del_dev); +} + +static int +ieee802154_nl_fill_dev(struct sk_buff *msg, u32 portid, u32 seq, + const struct ieee802154_llsec_device *desc, + const struct net_device *dev) +{ + void *hdr; + + hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI, + IEEE802154_LLSEC_LIST_DEV); + if (!hdr) + goto out; + + if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || + nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || + nla_put_shortaddr(msg, IEEE802154_ATTR_PAN_ID, desc->pan_id) || + nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR, + desc->short_addr) || + nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, desc->hwaddr) || + nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER, + desc->frame_counter) || + nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_OVERRIDE, + desc->seclevel_exempt) || + nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_KEY_MODE, desc->key_mode)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); +out: + return -EMSGSIZE; +} + +static int llsec_iter_devs(struct llsec_dump_data *data) +{ + struct ieee802154_llsec_device *pos; + int rc = 0, idx = 0; + + list_for_each_entry(pos, &data->table->devices, list) { + if (idx++ < data->s_idx) + continue; + + if (ieee802154_nl_fill_dev(data->skb, data->portid, + data->nlmsg_seq, pos, data->dev)) { + rc = -EMSGSIZE; + break; + } + + data->s_idx++; + } + + return rc; +} + +int ieee802154_llsec_dump_devs(struct sk_buff *skb, struct netlink_callback *cb) +{ + return ieee802154_llsec_dump_table(skb, cb, llsec_iter_devs); +} + + + +static int llsec_add_devkey(struct net_device *dev, struct genl_info *info) +{ + struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); + struct ieee802154_llsec_device_key key; + __le64 devaddr; + + if (!info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER] || + !info->attrs[IEEE802154_ATTR_HW_ADDR] || + ieee802154_llsec_parse_key_id(info, &key.key_id)) + return -EINVAL; + + devaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]); + key.frame_counter = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]); + + return ops->llsec->add_devkey(dev, devaddr, &key); +} + +int ieee802154_llsec_add_devkey(struct sk_buff *skb, struct genl_info *info) +{ + if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) != + (NLM_F_CREATE | NLM_F_EXCL)) + return -EINVAL; + + return ieee802154_nl_llsec_change(skb, info, llsec_add_devkey); +} + +static int llsec_del_devkey(struct net_device *dev, struct genl_info *info) +{ + struct ieee802154_mlme_ops 
*ops = ieee802154_mlme_ops(dev); + struct ieee802154_llsec_device_key key; + __le64 devaddr; + + if (!info->attrs[IEEE802154_ATTR_HW_ADDR] || + ieee802154_llsec_parse_key_id(info, &key.key_id)) + return -EINVAL; + + devaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]); + + return ops->llsec->del_devkey(dev, devaddr, &key); +} + +int ieee802154_llsec_del_devkey(struct sk_buff *skb, struct genl_info *info) +{ + return ieee802154_nl_llsec_change(skb, info, llsec_del_devkey); +} + +static int +ieee802154_nl_fill_devkey(struct sk_buff *msg, u32 portid, u32 seq, + __le64 devaddr, + const struct ieee802154_llsec_device_key *devkey, + const struct net_device *dev) +{ + void *hdr; + + hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI, + IEEE802154_LLSEC_LIST_DEVKEY); + if (!hdr) + goto out; + + if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || + nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || + nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, devaddr) || + nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER, + devkey->frame_counter) || + ieee802154_llsec_fill_key_id(msg, &devkey->key_id)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); +out: + return -EMSGSIZE; +} + +static int llsec_iter_devkeys(struct llsec_dump_data *data) +{ + struct ieee802154_llsec_device *dpos; + struct ieee802154_llsec_device_key *kpos; + int rc = 0, idx = 0, idx2; + + list_for_each_entry(dpos, &data->table->devices, list) { + if (idx++ < data->s_idx) + continue; + + idx2 = 0; + + list_for_each_entry(kpos, &dpos->keys, list) { + if (idx2++ < data->s_idx2) + continue; + + if (ieee802154_nl_fill_devkey(data->skb, data->portid, + data->nlmsg_seq, + dpos->hwaddr, kpos, + data->dev)) { + return rc = -EMSGSIZE; + } + + data->s_idx2++; + } + + data->s_idx++; + } + + return rc; +} + +int ieee802154_llsec_dump_devkeys(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return ieee802154_llsec_dump_table(skb, cb, llsec_iter_devkeys); +} + + + +static int +llsec_parse_seclevel(struct genl_info *info, + struct ieee802154_llsec_seclevel *sl) +{ + memset(sl, 0, sizeof(*sl)); + + if (!info->attrs[IEEE802154_ATTR_LLSEC_FRAME_TYPE] || + !info->attrs[IEEE802154_ATTR_LLSEC_SECLEVELS] || + !info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE]) + return -EINVAL; + + sl->frame_type = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_TYPE]); + if (sl->frame_type == IEEE802154_FC_TYPE_MAC_CMD) { + if (!info->attrs[IEEE802154_ATTR_LLSEC_CMD_FRAME_ID]) + return -EINVAL; + + sl->cmd_frame_id = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_CMD_FRAME_ID]); + } + + sl->sec_levels = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_SECLEVELS]); + sl->device_override = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE]); + + return 0; +} + +static int llsec_add_seclevel(struct net_device *dev, struct genl_info *info) +{ + struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); + struct ieee802154_llsec_seclevel sl; + + if (llsec_parse_seclevel(info, &sl)) + return -EINVAL; + + return ops->llsec->add_seclevel(dev, &sl); +} + +int ieee802154_llsec_add_seclevel(struct sk_buff *skb, struct genl_info *info) +{ + if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) != + (NLM_F_CREATE | NLM_F_EXCL)) + return -EINVAL; + + return ieee802154_nl_llsec_change(skb, info, llsec_add_seclevel); +} + +static int llsec_del_seclevel(struct net_device *dev, struct genl_info *info) +{ + struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); + struct 
ieee802154_llsec_seclevel sl; + + if (llsec_parse_seclevel(info, &sl)) + return -EINVAL; + + return ops->llsec->del_seclevel(dev, &sl); +} + +int ieee802154_llsec_del_seclevel(struct sk_buff *skb, struct genl_info *info) +{ + return ieee802154_nl_llsec_change(skb, info, llsec_del_seclevel); +} + +static int +ieee802154_nl_fill_seclevel(struct sk_buff *msg, u32 portid, u32 seq, + const struct ieee802154_llsec_seclevel *sl, + const struct net_device *dev) +{ + void *hdr; + + hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI, + IEEE802154_LLSEC_LIST_SECLEVEL); + if (!hdr) + goto out; + + if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || + nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || + nla_put_u8(msg, IEEE802154_ATTR_LLSEC_FRAME_TYPE, sl->frame_type) || + nla_put_u8(msg, IEEE802154_ATTR_LLSEC_SECLEVELS, sl->sec_levels) || + nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_OVERRIDE, + sl->device_override)) + goto nla_put_failure; + + if (sl->frame_type == IEEE802154_FC_TYPE_MAC_CMD && + nla_put_u8(msg, IEEE802154_ATTR_LLSEC_CMD_FRAME_ID, + sl->cmd_frame_id)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); +out: + return -EMSGSIZE; +} + +static int llsec_iter_seclevels(struct llsec_dump_data *data) +{ + struct ieee802154_llsec_seclevel *pos; + int rc = 0, idx = 0; + + list_for_each_entry(pos, &data->table->security_levels, list) { + if (idx++ < data->s_idx) + continue; + + if (ieee802154_nl_fill_seclevel(data->skb, data->portid, + data->nlmsg_seq, pos, + data->dev)) { + rc = -EMSGSIZE; + break; + } + + data->s_idx++; + } + + return rc; +} + +int ieee802154_llsec_dump_seclevels(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return ieee802154_llsec_dump_table(skb, cb, llsec_iter_seclevels); +} diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c index d08c7a43dcd..89b265aea15 100644 --- a/net/ieee802154/nl-phy.c +++ b/net/ieee802154/nl-phy.c @@ -221,8 +221,10 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info) if (info->attrs[IEEE802154_ATTR_DEV_TYPE]) { type = nla_get_u8(info->attrs[IEEE802154_ATTR_DEV_TYPE]); - if (type >= __IEEE802154_DEV_MAX) - return -EINVAL; + if (type >= __IEEE802154_DEV_MAX) { + rc = -EINVAL; + goto nla_put_failure; + } } dev = phy->add_iface(phy, devname, type); diff --git a/net/ieee802154/nl_policy.c b/net/ieee802154/nl_policy.c index 6adda4d46f9..3a703ab8834 100644 --- a/net/ieee802154/nl_policy.c +++ b/net/ieee802154/nl_policy.c @@ -52,5 +52,31 @@ const struct nla_policy ieee802154_policy[IEEE802154_ATTR_MAX + 1] = { [IEEE802154_ATTR_DURATION] = { .type = NLA_U8, }, [IEEE802154_ATTR_ED_LIST] = { .len = 27 }, [IEEE802154_ATTR_CHANNEL_PAGE_LIST] = { .len = 32 * 4, }, + + [IEEE802154_ATTR_TXPOWER] = { .type = NLA_S8, }, + [IEEE802154_ATTR_LBT_ENABLED] = { .type = NLA_U8, }, + [IEEE802154_ATTR_CCA_MODE] = { .type = NLA_U8, }, + [IEEE802154_ATTR_CCA_ED_LEVEL] = { .type = NLA_S32, }, + [IEEE802154_ATTR_CSMA_RETRIES] = { .type = NLA_U8, }, + [IEEE802154_ATTR_CSMA_MIN_BE] = { .type = NLA_U8, }, + [IEEE802154_ATTR_CSMA_MAX_BE] = { .type = NLA_U8, }, + + [IEEE802154_ATTR_FRAME_RETRIES] = { .type = NLA_S8, }, + + [IEEE802154_ATTR_LLSEC_ENABLED] = { .type = NLA_U8, }, + [IEEE802154_ATTR_LLSEC_SECLEVEL] = { .type = NLA_U8, }, + [IEEE802154_ATTR_LLSEC_KEY_MODE] = { .type = NLA_U8, }, + [IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT] = { .type = NLA_U32, }, + [IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED] = { .type = NLA_HW_ADDR, }, + 
[IEEE802154_ATTR_LLSEC_KEY_ID] = { .type = NLA_U8, }, + [IEEE802154_ATTR_LLSEC_FRAME_COUNTER] = { .type = NLA_U32 }, + [IEEE802154_ATTR_LLSEC_KEY_BYTES] = { .len = 16, }, + [IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES] = { .type = NLA_U8, }, + [IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS] = { .len = 258 / 8 }, + [IEEE802154_ATTR_LLSEC_FRAME_TYPE] = { .type = NLA_U8, }, + [IEEE802154_ATTR_LLSEC_CMD_FRAME_ID] = { .type = NLA_U8, }, + [IEEE802154_ATTR_LLSEC_SECLEVELS] = { .type = NLA_U8, }, + [IEEE802154_ATTR_LLSEC_DEV_OVERRIDE] = { .type = NLA_U8, }, + [IEEE802154_ATTR_LLSEC_DEV_KEY_MODE] = { .type = NLA_U8, }, }; diff --git a/net/ieee802154/raw.c b/net/ieee802154/raw.c index 41f538b8e59..74d54fae33d 100644 --- a/net/ieee802154/raw.c +++ b/net/ieee802154/raw.c @@ -28,6 +28,7 @@ #include <linux/slab.h> #include <net/sock.h> #include <net/af_ieee802154.h> +#include <net/ieee802154_netdev.h> #include "af802154.h" @@ -55,21 +56,24 @@ static void raw_close(struct sock *sk, long timeout) sk_common_release(sk); } -static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int len) +static int raw_bind(struct sock *sk, struct sockaddr *_uaddr, int len) { - struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr; + struct ieee802154_addr addr; + struct sockaddr_ieee802154 *uaddr = (struct sockaddr_ieee802154 *)_uaddr; int err = 0; struct net_device *dev = NULL; - if (len < sizeof(*addr)) + if (len < sizeof(*uaddr)) return -EINVAL; - if (addr->family != AF_IEEE802154) + uaddr = (struct sockaddr_ieee802154 *)_uaddr; + if (uaddr->family != AF_IEEE802154) return -EINVAL; lock_sock(sk); - dev = ieee802154_get_dev(sock_net(sk), &addr->addr); + ieee802154_addr_from_sa(&addr, &uaddr->addr); + dev = ieee802154_get_dev(sock_net(sk), &addr); if (!dev) { err = -ENODEV; goto out; @@ -209,6 +213,10 @@ out: static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) { + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + return NET_RX_DROP; + if (sock_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; diff --git a/net/ieee802154/reassembly.c b/net/ieee802154/reassembly.c new file mode 100644 index 00000000000..6f1428c4870 --- /dev/null +++ b/net/ieee802154/reassembly.c @@ -0,0 +1,585 @@ +/* 6LoWPAN fragment reassembly + * + * + * Authors: + * Alexander Aring <aar@pengutronix.de> + * + * Based on: net/ipv6/reassembly.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#define pr_fmt(fmt) "6LoWPAN: " fmt + +#include <linux/net.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/random.h> +#include <linux/jhash.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/export.h> + +#include <net/ieee802154_netdev.h> +#include <net/6lowpan.h> +#include <net/ipv6.h> +#include <net/inet_frag.h> + +#include "reassembly.h" + +struct lowpan_frag_info { + __be16 d_tag; + u16 d_size; + u8 d_offset; +}; + +static struct lowpan_frag_info *lowpan_cb(struct sk_buff *skb) +{ + return (struct lowpan_frag_info *)skb->cb; +} + +static struct inet_frags lowpan_frags; + +static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, + struct sk_buff *prev, struct net_device *dev); + +static unsigned int lowpan_hash_frag(__be16 tag, u16 d_size, + const struct ieee802154_addr *saddr, + const struct ieee802154_addr *daddr) +{ + u32 c; + + net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd)); + c = jhash_3words(ieee802154_addr_hash(saddr), + ieee802154_addr_hash(daddr), + (__force u32)(tag + (d_size << 16)), + lowpan_frags.rnd); + + return c & (INETFRAGS_HASHSZ - 1); +} + +static unsigned int lowpan_hashfn(struct inet_frag_queue *q) +{ + struct lowpan_frag_queue *fq; + + fq = container_of(q, struct lowpan_frag_queue, q); + return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr); +} + +static bool lowpan_frag_match(struct inet_frag_queue *q, void *a) +{ + struct lowpan_frag_queue *fq; + struct lowpan_create_arg *arg = a; + + fq = container_of(q, struct lowpan_frag_queue, q); + return fq->tag == arg->tag && fq->d_size == arg->d_size && + ieee802154_addr_equal(&fq->saddr, arg->src) && + ieee802154_addr_equal(&fq->daddr, arg->dst); +} + +static void lowpan_frag_init(struct inet_frag_queue *q, void *a) +{ + struct lowpan_frag_queue *fq; + struct lowpan_create_arg *arg = a; + + fq = container_of(q, struct lowpan_frag_queue, q); + + fq->tag = arg->tag; + fq->d_size = arg->d_size; + fq->saddr = *arg->src; + fq->daddr = *arg->dst; +} + +static void lowpan_frag_expire(unsigned long data) +{ + struct frag_queue *fq; + struct net *net; + + fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + net = container_of(fq->q.net, struct net, ieee802154_lowpan.frags); + + spin_lock(&fq->q.lock); + + if (fq->q.last_in & INET_FRAG_COMPLETE) + goto out; + + inet_frag_kill(&fq->q, &lowpan_frags); +out: + spin_unlock(&fq->q.lock); + inet_frag_put(&fq->q, &lowpan_frags); +} + +static inline struct lowpan_frag_queue * +fq_find(struct net *net, const struct lowpan_frag_info *frag_info, + const struct ieee802154_addr *src, + const struct ieee802154_addr *dst) +{ + struct inet_frag_queue *q; + struct lowpan_create_arg arg; + unsigned int hash; + struct netns_ieee802154_lowpan *ieee802154_lowpan = + net_ieee802154_lowpan(net); + + arg.tag = frag_info->d_tag; + arg.d_size = frag_info->d_size; + arg.src = src; + arg.dst = dst; + + read_lock(&lowpan_frags.lock); + hash = lowpan_hash_frag(frag_info->d_tag, frag_info->d_size, src, dst); + + q = inet_frag_find(&ieee802154_lowpan->frags, + &lowpan_frags, &arg, hash); + if (IS_ERR_OR_NULL(q)) { + inet_frag_maybe_warn_overflow(q, pr_fmt()); + return NULL; + } + return container_of(q, struct lowpan_frag_queue, q); +} + +static int lowpan_frag_queue(struct lowpan_frag_queue *fq, + struct sk_buff *skb, const u8 frag_type) +{ + struct sk_buff *prev, *next; + struct net_device *dev; + int end, offset; + + if (fq->q.last_in & INET_FRAG_COMPLETE) + goto err; + + offset = 
lowpan_cb(skb)->d_offset << 3; + end = lowpan_cb(skb)->d_size; + + /* Is this the final fragment? */ + if (offset + skb->len == end) { + /* If we already have some bits beyond end + * or have different end, the segment is corrupted. + */ + if (end < fq->q.len || + ((fq->q.last_in & INET_FRAG_LAST_IN) && end != fq->q.len)) + goto err; + fq->q.last_in |= INET_FRAG_LAST_IN; + fq->q.len = end; + } else { + if (end > fq->q.len) { + /* Some bits beyond end -> corruption. */ + if (fq->q.last_in & INET_FRAG_LAST_IN) + goto err; + fq->q.len = end; + } + } + + /* Find out which fragments are in front and at the back of us + * in the chain of fragments so far. We must know where to put + * this fragment, right? + */ + prev = fq->q.fragments_tail; + if (!prev || lowpan_cb(prev)->d_offset < lowpan_cb(skb)->d_offset) { + next = NULL; + goto found; + } + prev = NULL; + for (next = fq->q.fragments; next != NULL; next = next->next) { + if (lowpan_cb(next)->d_offset >= lowpan_cb(skb)->d_offset) + break; /* bingo! */ + prev = next; + } + +found: + /* Insert this fragment in the chain of fragments. */ + skb->next = next; + if (!next) + fq->q.fragments_tail = skb; + if (prev) + prev->next = skb; + else + fq->q.fragments = skb; + + dev = skb->dev; + if (dev) + skb->dev = NULL; + + fq->q.stamp = skb->tstamp; + if (frag_type == LOWPAN_DISPATCH_FRAG1) { + /* Calculate uncomp. 6lowpan header to estimate full size */ + fq->q.meat += lowpan_uncompress_size(skb, NULL); + fq->q.last_in |= INET_FRAG_FIRST_IN; + } else { + fq->q.meat += skb->len; + } + add_frag_mem_limit(&fq->q, skb->truesize); + + if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && + fq->q.meat == fq->q.len) { + int res; + unsigned long orefdst = skb->_skb_refdst; + + skb->_skb_refdst = 0UL; + res = lowpan_frag_reasm(fq, prev, dev); + skb->_skb_refdst = orefdst; + return res; + } + + inet_frag_lru_move(&fq->q); + return -1; +err: + kfree_skb(skb); + return -1; +} + +/* Check if this packet is complete. + * Returns NULL on failure by any reason, and pointer + * to current nexthdr field in reassembled frame. + * + * It is called with locked fq, and caller must check that + * queue is eligible for reassembly i.e. it is not COMPLETE, + * the last and the first frames arrived and all the bits are here. + */ +static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, + struct net_device *dev) +{ + struct sk_buff *fp, *head = fq->q.fragments; + int sum_truesize; + + inet_frag_kill(&fq->q, &lowpan_frags); + + /* Make the one we just received the head. */ + if (prev) { + head = prev->next; + fp = skb_clone(head, GFP_ATOMIC); + + if (!fp) + goto out_oom; + + fp->next = head->next; + if (!fp->next) + fq->q.fragments_tail = fp; + prev->next = fp; + + skb_morph(head, fq->q.fragments); + head->next = fq->q.fragments->next; + + consume_skb(fq->q.fragments); + fq->q.fragments = head; + } + + /* Head of list must not be cloned. */ + if (skb_unclone(head, GFP_ATOMIC)) + goto out_oom; + + /* If the first fragment is fragmented itself, we split + * it to two chunks: the first with data and paged part + * and the second, holding only fragments. 
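+ * (The block below moves head's frag_list onto a zero-size clone linked in right after it, so the head keeps only its linear data and page frags.)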
+ */ + if (skb_has_frag_list(head)) { + struct sk_buff *clone; + int i, plen = 0; + + clone = alloc_skb(0, GFP_ATOMIC); + if (!clone) + goto out_oom; + clone->next = head->next; + head->next = clone; + skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; + skb_frag_list_init(head); + for (i = 0; i < skb_shinfo(head)->nr_frags; i++) + plen += skb_frag_size(&skb_shinfo(head)->frags[i]); + clone->len = head->data_len - plen; + clone->data_len = clone->len; + head->data_len -= clone->len; + head->len -= clone->len; + add_frag_mem_limit(&fq->q, clone->truesize); + } + + WARN_ON(head == NULL); + + sum_truesize = head->truesize; + for (fp = head->next; fp;) { + bool headstolen; + int delta; + struct sk_buff *next = fp->next; + + sum_truesize += fp->truesize; + if (skb_try_coalesce(head, fp, &headstolen, &delta)) { + kfree_skb_partial(fp, headstolen); + } else { + if (!skb_shinfo(head)->frag_list) + skb_shinfo(head)->frag_list = fp; + head->data_len += fp->len; + head->len += fp->len; + head->truesize += fp->truesize; + } + fp = next; + } + sub_frag_mem_limit(&fq->q, sum_truesize); + + head->next = NULL; + head->dev = dev; + head->tstamp = fq->q.stamp; + + fq->q.fragments = NULL; + fq->q.fragments_tail = NULL; + + return 1; +out_oom: + net_dbg_ratelimited("lowpan_frag_reasm: no memory for reassembly\n"); + return -1; +} + +static int lowpan_get_frag_info(struct sk_buff *skb, const u8 frag_type, + struct lowpan_frag_info *frag_info) +{ + bool fail; + u8 pattern = 0, low = 0; + + fail = lowpan_fetch_skb(skb, &pattern, 1); + fail |= lowpan_fetch_skb(skb, &low, 1); + frag_info->d_size = (pattern & 7) << 8 | low; + fail |= lowpan_fetch_skb(skb, &frag_info->d_tag, 2); + + if (frag_type == LOWPAN_DISPATCH_FRAGN) { + fail |= lowpan_fetch_skb(skb, &frag_info->d_offset, 1); + } else { + skb_reset_network_header(skb); + frag_info->d_offset = 0; + } + + if (unlikely(fail)) + return -EIO; + + return 0; +} + +int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type) +{ + struct lowpan_frag_queue *fq; + struct net *net = dev_net(skb->dev); + struct lowpan_frag_info *frag_info = lowpan_cb(skb); + struct ieee802154_addr source, dest; + struct netns_ieee802154_lowpan *ieee802154_lowpan = + net_ieee802154_lowpan(net); + int err; + + source = mac_cb(skb)->source; + dest = mac_cb(skb)->dest; + + err = lowpan_get_frag_info(skb, frag_type, frag_info); + if (err < 0) + goto err; + + if (frag_info->d_size > ieee802154_lowpan->max_dsize) + goto err; + + inet_frag_evictor(&ieee802154_lowpan->frags, &lowpan_frags, false); + + fq = fq_find(net, frag_info, &source, &dest); + if (fq != NULL) { + int ret; + spin_lock(&fq->q.lock); + ret = lowpan_frag_queue(fq, skb, frag_type); + spin_unlock(&fq->q.lock); + + inet_frag_put(&fq->q, &lowpan_frags); + return ret; + } + +err: + kfree_skb(skb); + return -1; +} +EXPORT_SYMBOL(lowpan_frag_rcv); + +#ifdef CONFIG_SYSCTL +static struct ctl_table lowpan_frags_ns_ctl_table[] = { + { + .procname = "6lowpanfrag_high_thresh", + .data = &init_net.ieee802154_lowpan.frags.high_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "6lowpanfrag_low_thresh", + .data = &init_net.ieee802154_lowpan.frags.low_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "6lowpanfrag_time", + .data = &init_net.ieee802154_lowpan.frags.timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "6lowpanfrag_max_datagram_size", + .data = 
&init_net.ieee802154_lowpan.max_dsize, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { } +}; + +static struct ctl_table lowpan_frags_ctl_table[] = { + { + .procname = "6lowpanfrag_secret_interval", + .data = &lowpan_frags.secret_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; + +static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) +{ + struct ctl_table *table; + struct ctl_table_header *hdr; + struct netns_ieee802154_lowpan *ieee802154_lowpan = + net_ieee802154_lowpan(net); + + table = lowpan_frags_ns_ctl_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(lowpan_frags_ns_ctl_table), + GFP_KERNEL); + if (table == NULL) + goto err_alloc; + + table[0].data = &ieee802154_lowpan->frags.high_thresh; + table[1].data = &ieee802154_lowpan->frags.low_thresh; + table[2].data = &ieee802154_lowpan->frags.timeout; + table[3].data = &ieee802154_lowpan->max_dsize; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + table[0].procname = NULL; + } + + hdr = register_net_sysctl(net, "net/ieee802154/6lowpan", table); + if (hdr == NULL) + goto err_reg; + + ieee802154_lowpan->sysctl.frags_hdr = hdr; + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void __net_exit lowpan_frags_ns_sysctl_unregister(struct net *net) +{ + struct ctl_table *table; + struct netns_ieee802154_lowpan *ieee802154_lowpan = + net_ieee802154_lowpan(net); + + table = ieee802154_lowpan->sysctl.frags_hdr->ctl_table_arg; + unregister_net_sysctl_table(ieee802154_lowpan->sysctl.frags_hdr); + if (!net_eq(net, &init_net)) + kfree(table); +} + +static struct ctl_table_header *lowpan_ctl_header; + +static int lowpan_frags_sysctl_register(void) +{ + lowpan_ctl_header = register_net_sysctl(&init_net, + "net/ieee802154/6lowpan", + lowpan_frags_ctl_table); + return lowpan_ctl_header == NULL ? 
-ENOMEM : 0; +} + +static void lowpan_frags_sysctl_unregister(void) +{ + unregister_net_sysctl_table(lowpan_ctl_header); +} +#else +static inline int lowpan_frags_ns_sysctl_register(struct net *net) +{ + return 0; +} + +static inline void lowpan_frags_ns_sysctl_unregister(struct net *net) +{ +} + +static inline int lowpan_frags_sysctl_register(void) +{ + return 0; +} + +static inline void lowpan_frags_sysctl_unregister(void) +{ +} +#endif + +static int __net_init lowpan_frags_init_net(struct net *net) +{ + struct netns_ieee802154_lowpan *ieee802154_lowpan = + net_ieee802154_lowpan(net); + + ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH; + ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH; + ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT; + ieee802154_lowpan->max_dsize = 0xFFFF; + + inet_frags_init_net(&ieee802154_lowpan->frags); + + return lowpan_frags_ns_sysctl_register(net); +} + +static void __net_exit lowpan_frags_exit_net(struct net *net) +{ + struct netns_ieee802154_lowpan *ieee802154_lowpan = + net_ieee802154_lowpan(net); + + lowpan_frags_ns_sysctl_unregister(net); + inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags); +} + +static struct pernet_operations lowpan_frags_ops = { + .init = lowpan_frags_init_net, + .exit = lowpan_frags_exit_net, +}; + +int __init lowpan_net_frag_init(void) +{ + int ret; + + ret = lowpan_frags_sysctl_register(); + if (ret) + return ret; + + ret = register_pernet_subsys(&lowpan_frags_ops); + if (ret) + goto err_pernet; + + lowpan_frags.hashfn = lowpan_hashfn; + lowpan_frags.constructor = lowpan_frag_init; + lowpan_frags.destructor = NULL; + lowpan_frags.skb_free = NULL; + lowpan_frags.qsize = sizeof(struct frag_queue); + lowpan_frags.match = lowpan_frag_match; + lowpan_frags.frag_expire = lowpan_frag_expire; + lowpan_frags.secret_interval = 10 * 60 * HZ; + inet_frags_init(&lowpan_frags); + + return ret; +err_pernet: + lowpan_frags_sysctl_unregister(); + return ret; +} + +void lowpan_net_frag_exit(void) +{ + inet_frags_fini(&lowpan_frags); + lowpan_frags_sysctl_unregister(); + unregister_pernet_subsys(&lowpan_frags_ops); +} diff --git a/net/ieee802154/reassembly.h b/net/ieee802154/reassembly.h new file mode 100644 index 00000000000..74e4a7c9819 --- /dev/null +++ b/net/ieee802154/reassembly.h @@ -0,0 +1,41 @@ +#ifndef __IEEE802154_6LOWPAN_REASSEMBLY_H__ +#define __IEEE802154_6LOWPAN_REASSEMBLY_H__ + +#include <net/inet_frag.h> + +struct lowpan_create_arg { + __be16 tag; + u16 d_size; + const struct ieee802154_addr *src; + const struct ieee802154_addr *dst; +}; + +/* Equivalent of ipv4 struct ip + */ +struct lowpan_frag_queue { + struct inet_frag_queue q; + + __be16 tag; + u16 d_size; + struct ieee802154_addr saddr; + struct ieee802154_addr daddr; +}; + +static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a) +{ + switch (a->mode) { + case IEEE802154_ADDR_LONG: + return (((__force u64)a->extended_addr) >> 32) ^ + (((__force u64)a->extended_addr) & 0xffffffff); + case IEEE802154_ADDR_SHORT: + return (__force u32)(a->short_addr); + default: + return 0; + } +} + +int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type); +void lowpan_net_frag_exit(void); +int lowpan_net_frag_init(void); + +#endif /* __IEEE802154_6LOWPAN_REASSEMBLY_H__ */ diff --git a/net/ieee802154/wpan-class.c b/net/ieee802154/wpan-class.c index ef56ab5b35f..8d6f6704da8 100644 --- a/net/ieee802154/wpan-class.c +++ b/net/ieee802154/wpan-class.c @@ -44,9 +44,7 @@ static DEVICE_ATTR_RO(name); MASTER_SHOW(current_channel, "%d"); 
MASTER_SHOW(current_page, "%d"); -MASTER_SHOW_COMPLEX(transmit_power, "%d +- %d dB", - ((signed char) (phy->transmit_power << 2)) >> 2, - (phy->transmit_power >> 6) ? (phy->transmit_power >> 6) * 3 : 1 ); +MASTER_SHOW(transmit_power, "%d +- 1 dB"); MASTER_SHOW(cca_mode, "%d"); static ssize_t channels_supported_show(struct device *dev, diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 4b81e91c80f..f032688d20d 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -11,7 +11,7 @@ obj-y := route.o inetpeer.o protocol.o \ tcp_offload.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o \ - inet_fragment.o ping.o ip_tunnel_core.o + inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o @@ -19,7 +19,7 @@ obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_MROUTE) += ipmr.o obj-$(CONFIG_NET_IPIP) += ipip.o -gre-y := gre_demux.o gre_offload.o +gre-y := gre_demux.o obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o obj-$(CONFIG_NET_IPVTI) += ip_vti.o @@ -55,4 +55,4 @@ obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ - xfrm4_output.o + xfrm4_output.o xfrm4_protocol.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 70011e029ac..d156b3c5f36 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -126,9 +126,6 @@ static struct list_head inetsw[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw_lock); -struct ipv4_config ipv4_config; -EXPORT_SYMBOL(ipv4_config); - /* New destruction routine */ void inet_sock_destruct(struct sock *sk) @@ -257,7 +254,6 @@ static int inet_create(struct net *net, struct socket *sock, int protocol, struct inet_sock *inet; struct proto *answer_prot; unsigned char answer_flags; - char answer_no_check; int try_loading_module = 0; int err; @@ -315,7 +311,6 @@ lookup_protocol: sock->ops = answer->ops; answer_prot = answer->prot; - answer_no_check = answer->no_check; answer_flags = answer->flags; rcu_read_unlock(); @@ -327,7 +322,6 @@ lookup_protocol: goto out; err = 0; - sk->sk_no_check = answer_no_check; if (INET_PROTOSW_REUSE & answer_flags) sk->sk_reuse = SK_CAN_REUSE; @@ -342,7 +336,7 @@ lookup_protocol: inet->hdrincl = 1; } - if (ipv4_config.no_pmtu_disc) + if (net->ipv4.sysctl_ip_no_pmtu_disc) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; @@ -1005,7 +999,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_TCP, .prot = &tcp_prot, .ops = &inet_stream_ops, - .no_check = 0, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }, @@ -1015,7 +1008,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_UDP, .prot = &udp_prot, .ops = &inet_dgram_ops, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_PERMANENT, }, @@ -1024,7 +1016,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_ICMP, .prot = &ping_prot, .ops = &inet_dgram_ops, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_REUSE, }, @@ -1033,7 +1024,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_IP, /* wild card */ .prot = &raw_prot, .ops = &inet_sockraw_ops, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_REUSE, } }; @@ -1133,7 +1123,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) fl4 = &inet->cork.fl.u.ip4; rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk), 
sk->sk_bound_dev_if, sk->sk_protocol, - inet->inet_sport, inet->inet_dport, sk, false); + inet->inet_sport, inet->inet_dport, sk); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -1264,10 +1254,12 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_SIT | SKB_GSO_TCPV6 | SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_MPLS | 0))) goto out; @@ -1299,8 +1291,11 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ERR_PTR(-EPROTONOSUPPORT); - /* Note : following gso_segment() might change skb->encapsulation */ - udpfrag = !skb->encapsulation && proto == IPPROTO_UDP; + if (skb->encapsulation && + skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP)) + udpfrag = proto == IPPROTO_UDP && encap; + else + udpfrag = proto == IPPROTO_UDP && !skb->encapsulation; ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) @@ -1377,8 +1372,12 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, if (!NAPI_GRO_CB(p)->same_flow) continue; - iph2 = ip_hdr(p); - + iph2 = (struct iphdr *)(p->data + off); + /* The above works because, with the exception of the top + * (inner most) layer, we only aggregate pkts with the same + * hdr length so all the hdrs we'll need to verify will start + * at the same offset. + */ if ((iph->protocol ^ iph2->protocol) | ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { @@ -1390,13 +1389,24 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, NAPI_GRO_CB(p)->flush |= (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | - (__force int)((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)) | - ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); + ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)); + /* Save the IP ID check to be included later when we get to + * the transport layer so only the inner most IP ID is checked. + * This is because some GSO/TSO implementations do not + * correctly increment the IP ID for the outer hdrs. + */ + NAPI_GRO_CB(p)->flush_id = + ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); NAPI_GRO_CB(p)->flush |= flush; } NAPI_GRO_CB(skb)->flush |= flush; + skb_set_network_header(skb, off); + /* The above will be needed by the transport layer if there is one + * immediately following this IP hdr. + */ + skb_gro_pull(skb, sizeof(*iph)); skb_set_transport_header(skb, skb_gro_offset(skb)); @@ -1411,14 +1421,17 @@ out: return pp; } -static int inet_gro_complete(struct sk_buff *skb) +static int inet_gro_complete(struct sk_buff *skb, int nhoff) { - __be16 newlen = htons(skb->len - skb_network_offset(skb)); - struct iphdr *iph = ip_hdr(skb); + __be16 newlen = htons(skb->len - nhoff); + struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); const struct net_offload *ops; int proto = iph->protocol; int err = -ENOSYS; + if (skb->encapsulation) + skb_set_inner_network_header(skb, nhoff); + csum_replace2(&iph->check, iph->tot_len, newlen); iph->tot_len = newlen; @@ -1427,7 +1440,11 @@ static int inet_gro_complete(struct sk_buff *skb) if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out_unlock; - err = ops->callbacks.gro_complete(skb); + /* Only need to add sizeof(*iph) to get to the next hdr below + * because any hdr with option will have been flushed in + * inet_gro_receive(). 
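+ * (nhoff is this IPv4 header's offset from skb->data, so nhoff + sizeof(*iph) is where the next protocol's header starts.)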
+ */ + err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph)); out_unlock: rcu_read_unlock(); @@ -1457,22 +1474,20 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, } EXPORT_SYMBOL_GPL(inet_ctl_sock_create); -unsigned long snmp_fold_field(void __percpu *mib[], int offt) +unsigned long snmp_fold_field(void __percpu *mib, int offt) { unsigned long res = 0; - int i, j; + int i; - for_each_possible_cpu(i) { - for (j = 0; j < SNMP_ARRAY_SZ; j++) - res += *(((unsigned long *) per_cpu_ptr(mib[j], i)) + offt); - } + for_each_possible_cpu(i) + res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt); return res; } EXPORT_SYMBOL_GPL(snmp_fold_field); #if BITS_PER_LONG==32 -u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset) +u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) { u64 res = 0; int cpu; @@ -1483,12 +1498,12 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset) u64 v; unsigned int start; - bhptr = per_cpu_ptr(mib[0], cpu); + bhptr = per_cpu_ptr(mib, cpu); syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); do { - start = u64_stats_fetch_begin_bh(syncp); + start = u64_stats_fetch_begin_irq(syncp); v = *(((u64 *) bhptr) + offt); - } while (u64_stats_fetch_retry_bh(syncp, start)); + } while (u64_stats_fetch_retry_irq(syncp, start)); res += v; } @@ -1497,25 +1512,6 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset) EXPORT_SYMBOL_GPL(snmp_fold_field64); #endif -int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align) -{ - BUG_ON(ptr == NULL); - ptr[0] = __alloc_percpu(mibsize, align); - if (!ptr[0]) - return -ENOMEM; - -#if SNMP_ARRAY_SZ == 2 - ptr[1] = __alloc_percpu(mibsize, align); - if (!ptr[1]) { - free_percpu(ptr[0]); - ptr[0] = NULL; - return -ENOMEM; - } -#endif - return 0; -} -EXPORT_SYMBOL_GPL(snmp_mib_init); - #ifdef CONFIG_IP_MULTICAST static const struct net_protocol igmp_protocol = { .handler = igmp_rcv, @@ -1529,6 +1525,7 @@ static const struct net_protocol tcp_protocol = { .err_handler = tcp_v4_err, .no_policy = 1, .netns_ok = 1, + .icmp_strict_tag_validation = 1, }; static const struct net_protocol udp_protocol = { @@ -1550,40 +1547,30 @@ static __net_init int ipv4_mib_init_net(struct net *net) { int i; - if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics, - sizeof(struct tcp_mib), - __alignof__(struct tcp_mib)) < 0) + net->mib.tcp_statistics = alloc_percpu(struct tcp_mib); + if (!net->mib.tcp_statistics) goto err_tcp_mib; - if (snmp_mib_init((void __percpu **)net->mib.ip_statistics, - sizeof(struct ipstats_mib), - __alignof__(struct ipstats_mib)) < 0) + net->mib.ip_statistics = alloc_percpu(struct ipstats_mib); + if (!net->mib.ip_statistics) goto err_ip_mib; for_each_possible_cpu(i) { struct ipstats_mib *af_inet_stats; - af_inet_stats = per_cpu_ptr(net->mib.ip_statistics[0], i); - u64_stats_init(&af_inet_stats->syncp); -#if SNMP_ARRAY_SZ == 2 - af_inet_stats = per_cpu_ptr(net->mib.ip_statistics[1], i); + af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i); u64_stats_init(&af_inet_stats->syncp); -#endif } - if (snmp_mib_init((void __percpu **)net->mib.net_statistics, - sizeof(struct linux_mib), - __alignof__(struct linux_mib)) < 0) + net->mib.net_statistics = alloc_percpu(struct linux_mib); + if (!net->mib.net_statistics) goto err_net_mib; - if (snmp_mib_init((void __percpu **)net->mib.udp_statistics, - sizeof(struct udp_mib), - __alignof__(struct udp_mib)) < 0) + net->mib.udp_statistics = alloc_percpu(struct udp_mib); + if 
(!net->mib.udp_statistics) goto err_udp_mib; - if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics, - sizeof(struct udp_mib), - __alignof__(struct udp_mib)) < 0) + net->mib.udplite_statistics = alloc_percpu(struct udp_mib); + if (!net->mib.udplite_statistics) goto err_udplite_mib; - if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics, - sizeof(struct icmp_mib), - __alignof__(struct icmp_mib)) < 0) + net->mib.icmp_statistics = alloc_percpu(struct icmp_mib); + if (!net->mib.icmp_statistics) goto err_icmp_mib; net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib), GFP_KERNEL); @@ -1594,17 +1581,17 @@ static __net_init int ipv4_mib_init_net(struct net *net) return 0; err_icmpmsg_mib: - snmp_mib_free((void __percpu **)net->mib.icmp_statistics); + free_percpu(net->mib.icmp_statistics); err_icmp_mib: - snmp_mib_free((void __percpu **)net->mib.udplite_statistics); + free_percpu(net->mib.udplite_statistics); err_udplite_mib: - snmp_mib_free((void __percpu **)net->mib.udp_statistics); + free_percpu(net->mib.udp_statistics); err_udp_mib: - snmp_mib_free((void __percpu **)net->mib.net_statistics); + free_percpu(net->mib.net_statistics); err_net_mib: - snmp_mib_free((void __percpu **)net->mib.ip_statistics); + free_percpu(net->mib.ip_statistics); err_ip_mib: - snmp_mib_free((void __percpu **)net->mib.tcp_statistics); + free_percpu(net->mib.tcp_statistics); err_tcp_mib: return -ENOMEM; } @@ -1612,12 +1599,12 @@ err_tcp_mib: static __net_exit void ipv4_mib_exit_net(struct net *net) { kfree(net->mib.icmpmsg_statistics); - snmp_mib_free((void __percpu **)net->mib.icmp_statistics); - snmp_mib_free((void __percpu **)net->mib.udplite_statistics); - snmp_mib_free((void __percpu **)net->mib.udp_statistics); - snmp_mib_free((void __percpu **)net->mib.net_statistics); - snmp_mib_free((void __percpu **)net->mib.ip_statistics); - snmp_mib_free((void __percpu **)net->mib.tcp_statistics); + free_percpu(net->mib.icmp_statistics); + free_percpu(net->mib.udplite_statistics); + free_percpu(net->mib.udp_statistics); + free_percpu(net->mib.net_statistics); + free_percpu(net->mib.ip_statistics); + free_percpu(net->mib.tcp_statistics); } static __net_initdata struct pernet_operations ipv4_mib_ops = { @@ -1630,6 +1617,39 @@ static int __init init_ipv4_mibs(void) return register_pernet_subsys(&ipv4_mib_ops); } +static __net_init int inet_init_net(struct net *net) +{ + /* + * Set defaults for local port range + */ + seqlock_init(&net->ipv4.ip_local_ports.lock); + net->ipv4.ip_local_ports.range[0] = 32768; + net->ipv4.ip_local_ports.range[1] = 61000; + + seqlock_init(&net->ipv4.ping_group_range.lock); + /* + * Sane defaults - nobody may create ping sockets. + * Boot scripts should set this to distro-specific group. 
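+ * (The defaults below set the low bound to GID 1 and the high bound to GID 0, an empty range, so no group may create ping sockets until the sysctl is widened.)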
+ */ + net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1); + net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0); + return 0; +} + +static __net_exit void inet_exit_net(struct net *net) +{ +} + +static __net_initdata struct pernet_operations af_inet_ops = { + .init = inet_init_net, + .exit = inet_exit_net, +}; + +static int __init init_inet_pernet_ops(void) +{ + return register_pernet_subsys(&af_inet_ops); +} + static int ipv4_proc_init(void); /* @@ -1683,13 +1703,9 @@ static int __init inet_init(void) BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb)); - sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); - if (!sysctl_local_reserved_ports) - goto out; - rc = proto_register(&tcp_prot, 1); if (rc) - goto out_free_reserved_ports; + goto out; rc = proto_register(&udp_prot, 1); if (rc) @@ -1774,6 +1790,9 @@ static int __init inet_init(void) if (ip_mr_init()) pr_crit("%s: Cannot init ipv4 mroute\n", __func__); #endif + + if (init_inet_pernet_ops()) + pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__); /* * Initialise per-cpu ipv4 mibs */ @@ -1796,8 +1815,6 @@ out_unregister_udp_proto: proto_unregister(&udp_prot); out_unregister_tcp_proto: proto_unregister(&tcp_prot); -out_free_reserved_ports: - kfree(sysctl_local_reserved_ports); goto out; } diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 717902669d2..a2afa89513a 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -155,6 +155,10 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) struct iphdr *iph, *top_iph; struct ip_auth_hdr *ah; struct ah_data *ahp; + int seqhi_len = 0; + __be32 *seqhi; + int sglists = 0; + struct scatterlist *seqhisg; ahp = x->data; ahash = ahp->ahash; @@ -167,14 +171,19 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) ah = ip_auth_hdr(skb); ihl = ip_hdrlen(skb); + if (x->props.flags & XFRM_STATE_ESN) { + sglists = 1; + seqhi_len = sizeof(*seqhi); + } err = -ENOMEM; - iph = ah_alloc_tmp(ahash, nfrags, ihl); + iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len); if (!iph) goto out; - - icv = ah_tmp_icv(ahash, iph, ihl); + seqhi = (__be32 *)((char *)iph + ihl); + icv = ah_tmp_icv(ahash, seqhi, seqhi_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); + seqhisg = sg + nfrags; memset(ah->auth_data, 0, ahp->icv_trunc_len); @@ -210,10 +219,15 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) ah->spi = x->id.spi; ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); - sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, 0, skb->len); + sg_init_table(sg, nfrags + sglists); + skb_to_sgvec_nomark(skb, sg, 0, skb->len); - ahash_request_set_crypt(req, sg, icv, skb->len); + if (x->props.flags & XFRM_STATE_ESN) { + /* Attach seqhi sg right after packet payload */ + *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + sg_set_buf(seqhisg, seqhi, seqhi_len); + } + ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah_output_done, skb); AH_SKB_CB(skb)->tmp = iph; @@ -295,6 +309,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) struct ip_auth_hdr *ah; struct ah_data *ahp; int err = -ENOMEM; + int seqhi_len = 0; + __be32 *seqhi; + int sglists = 0; + struct scatterlist *seqhisg; if (!pskb_may_pull(skb, sizeof(*ah))) goto out; @@ -335,14 +353,22 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) iph = ip_hdr(skb); ihl = ip_hdrlen(skb); - work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len); + if (x->props.flags & 
XFRM_STATE_ESN) { + sglists = 1; + seqhi_len = sizeof(*seqhi); + } + + work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + + ahp->icv_trunc_len + seqhi_len); if (!work_iph) goto out; - auth_data = ah_tmp_auth(work_iph, ihl); + seqhi = (__be32 *)((char *)work_iph + ihl); + auth_data = ah_tmp_auth(seqhi, seqhi_len); icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); + seqhisg = sg + nfrags; memcpy(work_iph, iph, ihl); memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); @@ -361,10 +387,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) skb_push(skb, ihl); - sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, 0, skb->len); + sg_init_table(sg, nfrags + sglists); + skb_to_sgvec_nomark(skb, sg, 0, skb->len); - ahash_request_set_crypt(req, sg, icv, skb->len); + if (x->props.flags & XFRM_STATE_ESN) { + /* Attach seqhi sg right after packet payload */ + *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; + sg_set_buf(seqhisg, seqhi, seqhi_len); + } + ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah_input_done, skb); AH_SKB_CB(skb)->tmp = work_iph; @@ -397,7 +428,7 @@ out: return err; } -static void ah4_err(struct sk_buff *skb, u32 info) +static int ah4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); const struct iphdr *iph = (const struct iphdr *)skb->data; @@ -407,23 +438,25 @@ static void ah4_err(struct sk_buff *skb, u32 info) switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) - return; + return 0; case ICMP_REDIRECT: break; default: - return; + return 0; } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); if (!x) - return; + return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0); else ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0); xfrm_state_put(x); + + return 0; } static int ah_init_state(struct xfrm_state *x) @@ -505,6 +538,10 @@ static void ah_destroy(struct xfrm_state *x) kfree(ahp); } +static int ah4_rcv_cb(struct sk_buff *skb, int err) +{ + return 0; +} static const struct xfrm_type ah_type = { @@ -518,11 +555,12 @@ static const struct xfrm_type ah_type = .output = ah_output }; -static const struct net_protocol ah4_protocol = { +static struct xfrm4_protocol ah4_protocol = { .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = ah4_rcv_cb, .err_handler = ah4_err, - .no_policy = 1, - .netns_ok = 1, + .priority = 0, }; static int __init ah4_init(void) @@ -531,7 +569,7 @@ static int __init ah4_init(void) pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } - if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) { + if (xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ah_type, AF_INET); return -EAGAIN; @@ -541,7 +579,7 @@ static int __init ah4_init(void) static void __exit ah4_fini(void) { - if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0) + if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0) pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&ah_type, AF_INET) < 0) pr_info("%s: can't remove xfrm type\n", __func__); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index f0418586394..1a9b99e0446 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -381,6 +381,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) static int 
arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) { + struct net *net = dev_net(in_dev->dev); int scope; switch (IN_DEV_ARP_IGNORE(in_dev)) { @@ -399,6 +400,7 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) case 3: /* Do not reply for scope host addresses */ sip = 0; scope = RT_SCOPE_LINK; + in_dev = NULL; break; case 4: /* Reserved */ case 5: @@ -410,7 +412,7 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) default: return 0; } - return !inet_confirm_addr(in_dev, sip, tip, scope); + return !inet_confirm_addr(net, in_dev, sip, tip, scope); } static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) @@ -730,6 +732,7 @@ static int arp_process(struct sk_buff *skb) int addr_type; struct neighbour *n; struct net *net = dev_net(dev); + bool is_garp = false; /* arp_rcv below verifies the ARP header and verifies the device * is ARP'able. @@ -896,10 +899,12 @@ static int arp_process(struct sk_buff *skb) It is possible, that this option should be enabled for some devices (strip is candidate) */ + is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip && + inet_addr_type(net, sip) == RTN_UNICAST; + if (n == NULL && - (arp->ar_op == htons(ARPOP_REPLY) || - (arp->ar_op == htons(ARPOP_REQUEST) && tip == sip)) && - inet_addr_type(net, sip) == RTN_UNICAST) + ((arp->ar_op == htons(ARPOP_REPLY) && + inet_addr_type(net, sip) == RTN_UNICAST) || is_garp)) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } @@ -912,8 +917,10 @@ static int arp_process(struct sk_buff *skb) agents are active. Taking the first reply prevents arp trashing and chooses the fastest router. */ - override = time_after(jiffies, n->updated + - NEIGH_VAR(n->parms, LOCKTIME)); + override = time_after(jiffies, + n->updated + + NEIGH_VAR(n->parms, LOCKTIME)) || + is_garp; /* Broadcast replies and request packets do not assert neighbour reachability. 
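The arp_process() hunks above add gratuitous-ARP handling: an ARP request whose sender and target IP are identical and whose sender address is unicast may now create a neighbour entry, and it bypasses the LOCKTIME throttle when updating one. A minimal stand-alone sketch of that predicate, with the inet_addr_type() routing lookup replaced by a stub (illustrative only, not kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

#define ARPOP_REQUEST 1  /* same value as the kernel's ARPOP_REQUEST */

/* stand-in for inet_addr_type(net, sip) == RTN_UNICAST */
static bool sip_is_unicast(uint32_t sip)
{
        return sip != 0 && sip != INADDR_BROADCAST;
}

static bool is_gratuitous_arp(uint16_t ar_op, uint32_t sip, uint32_t tip)
{
        /* mirrors: arp->ar_op == htons(ARPOP_REQUEST) && tip == sip &&
         *          inet_addr_type(net, sip) == RTN_UNICAST
         */
        return ar_op == htons(ARPOP_REQUEST) && tip == sip &&
               sip_is_unicast(sip);
}

int main(void)
{
        uint32_t a = inet_addr("192.0.2.1");
        uint32_t b = inet_addr("192.0.2.2");

        /* a host announcing its own address: request with sip == tip */
        printf("announcement: %d\n",
               is_gratuitous_arp(htons(ARPOP_REQUEST), a, a));
        /* an ordinary who-has request for another host */
        printf("who-has:      %d\n",
               is_gratuitous_arp(htons(ARPOP_REQUEST), a, b));
        return 0;
}

With is_garp set, the override test in the hunk above evaluates true regardless of how recently the neighbour entry was updated, so an address announcement takes effect immediately.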
@@ -1110,7 +1117,7 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev) return err; } -int arp_invalidate(struct net_device *dev, __be32 ip) +static int arp_invalidate(struct net_device *dev, __be32 ip) { struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev); int err = -ENXIO; @@ -1125,7 +1132,6 @@ int arp_invalidate(struct net_device *dev, __be32 ip) return err; } -EXPORT_SYMBOL(arp_invalidate); static int arp_req_delete_public(struct net *net, struct arpreq *r, struct net_device *dev) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 4b59b6e488d..69e77c8ff28 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1335,8 +1335,7 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def, secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { - secattr->attr.mls.cat = - netlbl_secattr_catmap_alloc(GFP_ATOMIC); + secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); if (secattr->attr.mls.cat == NULL) return -ENOMEM; @@ -1431,8 +1430,7 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def, secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { - secattr->attr.mls.cat = - netlbl_secattr_catmap_alloc(GFP_ATOMIC); + secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); if (secattr->attr.mls.cat == NULL) return -ENOMEM; @@ -1526,8 +1524,7 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { - secattr->attr.mls.cat = - netlbl_secattr_catmap_alloc(GFP_ATOMIC); + secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); if (secattr->attr.mls.cat == NULL) return -ENOMEM; diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 19e36376d2a..a3095fdefbe 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -53,7 +53,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, RT_CONN_FLAGS(sk), oif, sk->sk_protocol, - inet->inet_sport, usin->sin_port, sk, true); + inet->inet_sport, usin->sin_port, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); if (err == -ENETUNREACH) @@ -86,18 +86,26 @@ out: } EXPORT_SYMBOL(ip4_datagram_connect); +/* Because UDP xmit path can manipulate sk_dst_cache without holding + * socket lock, we need to use sk_dst_set() here, + * even if we own the socket lock. + */ void ip4_datagram_release_cb(struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); const struct ip_options_rcu *inet_opt; __be32 daddr = inet->inet_daddr; + struct dst_entry *dst; struct flowi4 fl4; struct rtable *rt; - if (! __sk_dst_get(sk) || __sk_dst_check(sk, 0)) - return; - rcu_read_lock(); + + dst = __sk_dst_get(sk); + if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) { + rcu_read_unlock(); + return; + } inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; @@ -105,8 +113,10 @@ void ip4_datagram_release_cb(struct sock *sk) inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); - if (!IS_ERR(rt)) - __sk_dst_set(sk, &rt->dst); + + dst = !IS_ERR(rt) ? 
&rt->dst : NULL; + sk_dst_set(sk, dst); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ip4_datagram_release_cb); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 43065be3630..e9449376b58 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -99,13 +99,13 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_BROADCAST] = { .type = NLA_U32 }, [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, + [IFA_FLAGS] = { .type = NLA_U32 }, }; #define IN4_ADDR_HSIZE_SHIFT 8 #define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT) static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; -static DEFINE_SPINLOCK(inet_addr_hash_lock); static u32 inet_addr_hash(struct net *net, __be32 addr) { @@ -118,16 +118,14 @@ static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) { u32 hash = inet_addr_hash(net, ifa->ifa_local); - spin_lock(&inet_addr_hash_lock); + ASSERT_RTNL(); hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]); - spin_unlock(&inet_addr_hash_lock); } static void inet_hash_remove(struct in_ifaddr *ifa) { - spin_lock(&inet_addr_hash_lock); + ASSERT_RTNL(); hlist_del_init_rcu(&ifa->hash); - spin_unlock(&inet_addr_hash_lock); } /** @@ -463,7 +461,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, } if (!(ifa->ifa_flags & IFA_F_SECONDARY)) { - net_srandom(ifa->ifa_local); + prandom_seed((__force u32) ifa->ifa_local); ifap = last_primary; } @@ -473,7 +471,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, inet_hash_insert(dev_net(in_dev->dev), ifa); cancel_delayed_work(&check_lifetime_work); - schedule_delayed_work(&check_lifetime_work, 0); + queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0); /* Send message first, then call notifier. Notifier will trigger FIB update, so that @@ -683,7 +681,8 @@ static void check_lifetime(struct work_struct *work) if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX)) next_sched = now + ADDRCONF_TIMER_FUZZ_MAX; - schedule_delayed_work(&check_lifetime_work, next_sched - now); + queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, + next_sched - now); } static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, @@ -757,7 +756,8 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, INIT_HLIST_NODE(&ifa->hash); ifa->ifa_prefixlen = ifm->ifa_prefixlen; ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); - ifa->ifa_flags = ifm->ifa_flags; + ifa->ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : + ifm->ifa_flags; ifa->ifa_scope = ifm->ifa_scope; ifa->ifa_dev = in_dev; @@ -827,7 +827,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) ifa_existing = find_matching_ifa(ifa); if (!ifa_existing) { /* It would be best to check for !NLM_F_CREATE here but - * userspace alreay relies on not having to provide this. + * userspace already relies on not having to provide this. 
*/ set_ifa_lifetime(ifa, valid_lft, prefered_lft); return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); @@ -840,7 +840,8 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) ifa = ifa_existing; set_ifa_lifetime(ifa, valid_lft, prefered_lft); cancel_delayed_work(&check_lifetime_work); - schedule_delayed_work(&check_lifetime_work, 0); + queue_delayed_work(system_power_efficient_wq, + &check_lifetime_work, 0); rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); } @@ -1238,22 +1239,21 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, /* * Confirm that local IP address exists using wildcards: - * - in_dev: only on this interface, 0=any interface + * - net: netns to check, cannot be NULL + * - in_dev: only on this interface, NULL=any interface * - dst: only in the same subnet as dst, 0=any dst * - local: address, 0=autoselect the local address * - scope: maximum allowed scope value for the local address */ -__be32 inet_confirm_addr(struct in_device *in_dev, +__be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst, __be32 local, int scope) { __be32 addr = 0; struct net_device *dev; - struct net *net; - if (scope != RT_SCOPE_LINK) + if (in_dev != NULL) return confirm_addr_indev(in_dev, dst, local, scope); - net = dev_net(in_dev->dev); rcu_read_lock(); for_each_netdev_rcu(net, dev) { in_dev = __in_dev_get_rcu(dev); @@ -1384,6 +1384,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); + ipv4_devconf_setall(in_dev); + neigh_parms_data_state_setall(in_dev->arp_parms); inet_insert_ifa(ifa); } } @@ -1437,7 +1439,9 @@ static size_t inet_nlmsg_size(void) + nla_total_size(4) /* IFA_ADDRESS */ + nla_total_size(4) /* IFA_LOCAL */ + nla_total_size(4) /* IFA_BROADCAST */ - + nla_total_size(IFNAMSIZ); /* IFA_LABEL */ + + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ + + nla_total_size(4) /* IFA_FLAGS */ + + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ } static inline u32 cstamp_delta(unsigned long cstamp) @@ -1505,6 +1509,7 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || (ifa->ifa_label[0] && nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || + nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) || put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp, preferred, valid)) goto nla_put_failure; @@ -1693,6 +1698,8 @@ static int inet_netconf_msgsize_devconf(int type) size += nla_total_size(4); if (type == -1 || type == NETCONFA_MC_FORWARDING) size += nla_total_size(4); + if (type == -1 || type == NETCONFA_PROXY_NEIGH) + size += nla_total_size(4); return size; } @@ -1729,6 +1736,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, nla_put_s32(skb, NETCONFA_MC_FORWARDING, IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0) goto nla_put_failure; + if ((type == -1 || type == NETCONFA_PROXY_NEIGH) && + nla_put_s32(skb, NETCONFA_PROXY_NEIGH, + IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) + goto nla_put_failure; return nlmsg_end(skb, nlh); @@ -1766,6 +1777,7 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { [NETCONFA_IFINDEX] = { .len = sizeof(int) }, [NETCONFA_FORWARDING] = { .len = sizeof(int) }, [NETCONFA_RP_FILTER] = { .len = sizeof(int) }, + [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) }, }; static int 
inet_netconf_get_devconf(struct sk_buff *in_skb, @@ -1947,6 +1959,19 @@ static void inet_forward_change(struct net *net) } } +static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) +{ + if (cnf == net->ipv4.devconf_dflt) + return NETCONFA_IFINDEX_DEFAULT; + else if (cnf == net->ipv4.devconf_all) + return NETCONFA_IFINDEX_ALL; + else { + struct in_device *idev + = container_of(cnf, struct in_device, cnf); + return idev->dev->ifindex; + } +} + static int devinet_conf_proc(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -1959,6 +1984,7 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, struct ipv4_devconf *cnf = ctl->extra1; struct net *net = ctl->extra2; int i = (int *)ctl->data - cnf->data; + int ifindex; set_bit(i, cnf->state); @@ -1968,23 +1994,19 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, i == IPV4_DEVCONF_ROUTE_LOCALNET - 1) if ((new_value == 0) && (old_value != 0)) rt_cache_flush(net); + if (i == IPV4_DEVCONF_RP_FILTER - 1 && new_value != old_value) { - int ifindex; - - if (cnf == net->ipv4.devconf_dflt) - ifindex = NETCONFA_IFINDEX_DEFAULT; - else if (cnf == net->ipv4.devconf_all) - ifindex = NETCONFA_IFINDEX_ALL; - else { - struct in_device *idev = - container_of(cnf, struct in_device, - cnf); - ifindex = idev->dev->ifindex; - } + ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, NETCONFA_RP_FILTER, ifindex, cnf); } + if (i == IPV4_DEVCONF_PROXY_ARP - 1 && + new_value != old_value) { + ifindex = devinet_conf_ifindex(net, cnf); + inet_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, + ifindex, cnf); + } } return ret; @@ -2300,7 +2322,7 @@ void __init devinet_init(void) register_gifconf(PF_INET, inet_gifconf); register_netdevice_notifier(&ip_netdev_notifier); - schedule_delayed_work(&check_lifetime_work, 0); + queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0); rtnl_af_register(&inet_af_ops); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 7785b28061a..360b565918c 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -473,7 +473,7 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) net_adj) & ~(blksize - 1)) + net_adj - 2; } -static void esp4_err(struct sk_buff *skb, u32 info) +static int esp4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); const struct iphdr *iph = (const struct iphdr *)skb->data; @@ -483,23 +483,25 @@ static void esp4_err(struct sk_buff *skb, u32 info) switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) - return; + return 0; case ICMP_REDIRECT: break; default: - return; + return 0; } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); if (!x) - return; + return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0); else ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0); xfrm_state_put(x); + + return 0; } static void esp_destroy(struct xfrm_state *x) @@ -672,6 +674,11 @@ error: return err; } +static int esp4_rcv_cb(struct sk_buff *skb, int err) +{ + return 0; +} + static const struct xfrm_type esp_type = { .description = "ESP4", @@ -685,11 +692,12 @@ static const struct xfrm_type esp_type = .output = esp_output }; -static const struct net_protocol esp4_protocol = { +static struct xfrm4_protocol esp4_protocol = { .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = esp4_rcv_cb, .err_handler = esp4_err, - .no_policy = 1, - 
.netns_ok = 1, + .priority = 0, }; static int __init esp4_init(void) @@ -698,7 +706,7 @@ static int __init esp4_init(void) pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } - if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) { + if (xfrm4_protocol_register(&esp4_protocol, IPPROTO_ESP) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&esp_type, AF_INET); return -EAGAIN; @@ -708,7 +716,7 @@ static int __init esp4_init(void) static void __exit esp4_fini(void) { - if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0) + if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0) pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&esp_type, AF_INET) < 0) pr_info("%s: can't remove xfrm type\n", __func__); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index d846304b7b8..255aa9946fe 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -250,7 +250,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, bool dev_match; fl4.flowi4_oif = 0; - fl4.flowi4_iif = oif; + fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; fl4.flowi4_tos = tos; @@ -659,7 +659,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) && ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED) - return ip_rt_dump(skb, cb); + return skb->len; s_h = cb->args[0]; s_e = cb->args[1]; @@ -1047,6 +1047,8 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo } in_dev = __in_dev_get_rtnl(dev); + if (!in_dev) + return NOTIFY_DONE; switch (event) { case NETDEV_UP: diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 388d113fd28..1e4f6600b31 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -33,8 +33,6 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id, void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); -int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, int *last_idx, int dflt); static inline void fib_result_assign(struct fib_result *res, struct fib_info *fi) diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 523be38e37d..f2e15738534 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -104,7 +104,10 @@ errout: static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) { struct fib_result *result = (struct fib_result *) arg->result; - struct net_device *dev = result->fi->fib_dev; + struct net_device *dev = NULL; + + if (result->fi) + dev = result->fi->fib_dev; /* do not accept result if the route does * not meet the required prefix length diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e63f47a4e65..b10cd43a472 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -426,8 +426,9 @@ struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio) return NULL; } -int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, int *last_idx, int dflt) +static int fib_detect_death(struct fib_info *fi, int order, + struct fib_info **last_resort, int *last_idx, + int dflt) { struct neighbour *n; int state = NUD_NONE; @@ -630,6 +631,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, .daddr = 
nh->nh_gw, .flowi4_scope = cfg->fc_scope + 1, .flowi4_oif = nh->nh_oif, + .flowi4_iif = LOOPBACK_IFINDEX, }; /* It is not necessary, but requires a bit of thinking */ @@ -819,13 +821,13 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); if (fi == NULL) goto failure; + fib_info_cnt++; if (cfg->fc_mx) { fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); if (!fi->fib_metrics) goto failure; } else fi->fib_metrics = (u32 *) dst_default_metrics; - fib_info_cnt++; fi->fib_net = hold_net(net); fi->fib_protocol = cfg->fc_protocol; diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 5893e99e829..0485bf7f8f0 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -68,6 +68,7 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, skb_push(skb, hdr_len); + skb_reset_transport_header(skb); greh = (struct gre_base_hdr *)skb->data; greh->flags = tnl_flags_to_gre_flags(tpi->flags); greh->protocol = tpi->proto; @@ -84,7 +85,8 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, ptr--; } if (tpi->flags&TUNNEL_CSUM && - !(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) { + !(skb_shinfo(skb)->gso_type & + (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) { *ptr = 0; *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, skb->len, 0)); @@ -93,28 +95,6 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, } EXPORT_SYMBOL_GPL(gre_build_header); -static __sum16 check_checksum(struct sk_buff *skb) -{ - __sum16 csum = 0; - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - csum = csum_fold(skb->csum); - - if (!csum) - break; - /* Fall through. */ - - case CHECKSUM_NONE: - skb->csum = 0; - csum = __skb_checksum_complete(skb); - skb->ip_summed = CHECKSUM_COMPLETE; - break; - } - - return csum; -} - static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, bool *csum_err) { @@ -141,7 +121,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, options = (__be32 *)(greh + 1); if (greh->flags & GRE_CSUM) { - if (check_checksum(skb)) { + if (skb_checksum_simple_validate(skb)) { *csum_err = true; return -EINVAL; } @@ -182,6 +162,14 @@ static int gre_cisco_rcv(struct sk_buff *skb) int i; bool csum_err = false; +#ifdef CONFIG_NET_IPGRE_BROADCAST + if (ipv4_is_multicast(ip_hdr(skb)->daddr)) { + /* Looped back packet, drop it! 
*/ + if (rt_is_output_route(skb_rtable(skb))) + goto drop; + } +#endif + if (parse_gre_header(skb, &tpi, &csum_err) < 0) goto drop; @@ -355,14 +343,7 @@ static int __init gre_init(void) goto err_gre; } - if (gre_offload_init()) { - pr_err("can't add protocol offload\n"); - goto err_gso; - } - return 0; -err_gso: - gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); err_gre: inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); err: @@ -371,8 +352,6 @@ err: static void __exit gre_exit(void) { - gre_offload_exit(); - gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); } diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index e5d43618846..f0bdd47bbbc 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -11,6 +11,7 @@ */ #include <linux/skbuff.h> +#include <linux/init.h> #include <net/protocol.h> #include <net/gre.h> @@ -26,8 +27,9 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, { struct sk_buff *segs = ERR_PTR(-EINVAL); netdev_features_t enc_features; - int ghl = GRE_HEADER_SECTION; + int ghl; struct gre_base_hdr *greh; + u16 mac_offset = skb->mac_header; int mac_len = skb->mac_len; __be16 protocol = skb->protocol; int tnl_hlen; @@ -40,6 +42,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | SKB_GSO_IPIP))) goto out; @@ -48,23 +51,21 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, greh = (struct gre_base_hdr *)skb_transport_header(skb); - if (greh->flags & GRE_KEY) - ghl += GRE_HEADER_SECTION; - if (greh->flags & GRE_SEQ) - ghl += GRE_HEADER_SECTION; - if (greh->flags & GRE_CSUM) { - ghl += GRE_HEADER_SECTION; - csum = true; - } else - csum = false; + ghl = skb_inner_network_header(skb) - skb_transport_header(skb); + if (unlikely(ghl < sizeof(*greh))) + goto out; - /* setup inner skb. */ - skb->protocol = greh->protocol; - skb->encapsulation = 0; + csum = !!(greh->flags & GRE_CSUM); + if (csum) + skb->encap_hdr_csum = 1; if (unlikely(!pskb_may_pull(skb, ghl))) goto out; + /* setup inner skb. */ + skb->protocol = greh->protocol; + skb->encapsulation = 0; + __skb_pull(skb, ghl); skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); @@ -73,8 +74,10 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, /* segment inner packet. */ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); segs = skb_mac_gso_segment(skb, enc_features); - if (!segs || IS_ERR(segs)) + if (!segs || IS_ERR(segs)) { + skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len); goto out; + } skb = segs; tnl_hlen = skb_tnl_header_len(skb); @@ -94,10 +97,13 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, } } - greh = (struct gre_base_hdr *)(skb->data); + skb_reset_transport_header(skb); + + greh = (struct gre_base_hdr *) + skb_transport_header(skb); pcsum = (__be32 *)(greh + 1); *pcsum = 0; - *(__sum16 *)pcsum = csum_fold(skb_checksum(skb, 0, skb->len, 0)); + *(__sum16 *)pcsum = gso_make_checksum(skb, 0); } __skb_push(skb, tnl_hlen - ghl); @@ -113,19 +119,180 @@ out: return segs; } +/* Compute the whole skb csum in s/w and store it, then verify GRO csum + * starting from gro_offset. 
+ */ +static __sum16 gro_skb_checksum(struct sk_buff *skb) +{ + __sum16 sum; + + skb->csum = skb_checksum(skb, 0, skb->len, 0); + NAPI_GRO_CB(skb)->csum = csum_sub(skb->csum, + csum_partial(skb->data, skb_gro_offset(skb), 0)); + sum = csum_fold(NAPI_GRO_CB(skb)->csum); + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) { + if (unlikely(!sum) && !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev); + } else { + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum_complete_sw = 1; + } + + return sum; +} + +static struct sk_buff **gre_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + struct sk_buff **pp = NULL; + struct sk_buff *p; + const struct gre_base_hdr *greh; + unsigned int hlen, grehlen; + unsigned int off; + int flush = 1; + struct packet_offload *ptype; + __be16 type; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*greh); + greh = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + greh = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!greh)) + goto out; + } + + /* Only support version 0 and K (key), C (csum) flags. Note that + * although the support for the S (seq#) flag can be added easily + * for GRO, this is problematic for GSO hence can not be enabled + * here because a GRO pkt may end up in the forwarding path, thus + * requiring GSO support to break it up correctly. + */ + if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0) + goto out; + + type = greh->protocol; + + rcu_read_lock(); + ptype = gro_find_receive_by_type(type); + if (ptype == NULL) + goto out_unlock; + + grehlen = GRE_HEADER_SECTION; + + if (greh->flags & GRE_KEY) + grehlen += GRE_HEADER_SECTION; + + if (greh->flags & GRE_CSUM) + grehlen += GRE_HEADER_SECTION; + + hlen = off + grehlen; + if (skb_gro_header_hard(skb, hlen)) { + greh = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!greh)) + goto out_unlock; + } + if (greh->flags & GRE_CSUM) { /* Need to verify GRE csum first */ + __sum16 csum = 0; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum = csum_fold(NAPI_GRO_CB(skb)->csum); + /* Don't trust csum error calculated/reported by h/w */ + if (skb->ip_summed == CHECKSUM_NONE || csum != 0) + csum = gro_skb_checksum(skb); + + /* GRE CSUM is the 1's complement of the 1's complement sum + * of the GRE hdr plus payload so it should add up to 0xffff + * (and 0 after csum_fold()) just like the IPv4 hdr csum. + */ + if (csum) + goto out_unlock; + } + flush = 0; + + for (p = *head; p; p = p->next) { + const struct gre_base_hdr *greh2; + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + /* The following checks are needed to ensure only pkts + * from the same tunnel are considered for aggregation. + * The criteria for "the same tunnel" includes: + * 1) same version (we only support version 0 here) + * 2) same protocol (we only support ETH_P_IP for now) + * 3) same set of flags + * 4) same key if the key field is present. 
+ */ + greh2 = (struct gre_base_hdr *)(p->data + off); + + if (greh2->flags != greh->flags || + greh2->protocol != greh->protocol) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + if (greh->flags & GRE_KEY) { + /* compare keys */ + if (*(__be32 *)(greh2+1) != *(__be32 *)(greh+1)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + } + + skb_gro_pull(skb, grehlen); + + /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ + skb_gro_postpull_rcsum(skb, greh, grehlen); + + pp = ptype->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +static int gre_gro_complete(struct sk_buff *skb, int nhoff) +{ + struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff); + struct packet_offload *ptype; + unsigned int grehlen = sizeof(*greh); + int err = -ENOENT; + __be16 type; + + skb->encapsulation = 1; + skb_shinfo(skb)->gso_type = SKB_GSO_GRE; + + type = greh->protocol; + if (greh->flags & GRE_KEY) + grehlen += GRE_HEADER_SECTION; + + if (greh->flags & GRE_CSUM) + grehlen += GRE_HEADER_SECTION; + + rcu_read_lock(); + ptype = gro_find_complete_by_type(type); + if (ptype != NULL) + err = ptype->callbacks.gro_complete(skb, nhoff + grehlen); + + rcu_read_unlock(); + return err; +} + static const struct net_offload gre_offload = { .callbacks = { .gso_send_check = gre_gso_send_check, .gso_segment = gre_gso_segment, + .gro_receive = gre_gro_receive, + .gro_complete = gre_gro_complete, }, }; -int __init gre_offload_init(void) +static int __init gre_offload_init(void) { return inet_add_offload(&gre_offload, IPPROTO_GRE); } - -void __exit gre_offload_exit(void) -{ - inet_del_offload(&gre_offload, IPPROTO_GRE); -} +device_initcall(gre_offload_init); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 5c0e8bc6e5b..42b7bcf8045 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -337,6 +337,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) struct sock *sk; struct inet_sock *inet; __be32 daddr, saddr; + u32 mark = IP4_REPLY_MARK(net, skb->mark); if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) return; @@ -349,6 +350,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) icmp_param->data.icmph.checksum = 0; inet->tos = ip_hdr(skb)->tos; + sk->sk_mark = mark; daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); ipc.opt = NULL; @@ -364,6 +366,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; fl4.saddr = saddr; + fl4.flowi4_mark = mark; fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); @@ -382,7 +385,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, const struct iphdr *iph, - __be32 saddr, u8 tos, + __be32 saddr, u8 tos, u32 mark, int type, int code, struct icmp_bxm *param) { @@ -394,6 +397,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->daddr = (param->replyopts.opt.opt.srr ? 
param->replyopts.opt.opt.faddr : iph->saddr); fl4->saddr = saddr; + fl4->flowi4_mark = mark; fl4->flowi4_tos = RT_TOS(tos); fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; @@ -491,6 +495,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) struct flowi4 fl4; __be32 saddr; u8 tos; + u32 mark; struct net *net; struct sock *sk; @@ -592,6 +597,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) | IPTOS_PREC_INTERNETCONTROL) : iph->tos; + mark = IP4_REPLY_MARK(net, skb_in->mark); if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in)) goto out_unlock; @@ -608,13 +614,14 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) icmp_param->skb = skb_in; icmp_param->offset = skb_network_offset(skb_in); inet_sk(sk)->tos = tos; + sk->sk_mark = mark; ipc.addr = iph->saddr; ipc.opt = &icmp_param->replyopts.opt; ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; - rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, + rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, type, code, icmp_param); if (IS_ERR(rt)) goto out_unlock; @@ -668,6 +675,16 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info) rcu_read_unlock(); } +static bool icmp_tag_validation(int proto) +{ + bool ok; + + rcu_read_lock(); + ok = rcu_dereference(inet_protos[proto])->icmp_strict_tag_validation; + rcu_read_unlock(); + return ok; +} + /* * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and * ICMP_PARAMETERPROB. @@ -705,13 +722,23 @@ static void icmp_unreach(struct sk_buff *skb) case ICMP_PORT_UNREACH: break; case ICMP_FRAG_NEEDED: - if (ipv4_config.no_pmtu_disc) { + /* for documentation of the ip_no_pmtu_disc + * values please see + * Documentation/networking/ip-sysctl.txt + */ + switch (net->ipv4.sysctl_ip_no_pmtu_disc) { + default: LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"), &iph->daddr); - } else { - info = ntohs(icmph->un.frag.mtu); - if (!info) + break; + case 2: + goto out; + case 3: + if (!icmp_tag_validation(iph->protocol)) goto out; + /* fall through */ + case 0: + info = ntohs(icmph->un.frag.mtu); } break; case ICMP_SR_FAILED: @@ -886,16 +913,8 @@ int icmp_rcv(struct sk_buff *skb) ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS); - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (!csum_fold(skb->csum)) - break; - /* fall through */ - case CHECKSUM_NONE: - skb->csum = 0; - if (__skb_checksum_complete(skb)) - goto csum_error; - } + if (skb_checksum_simple_validate(skb)) + goto csum_error; if (!pskb_pull(skb, sizeof(*icmph))) goto error; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 7defdc9ba16..db710b059ba 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -211,7 +211,7 @@ static void igmp_stop_timer(struct ip_mc_list *im) /* It must be called with locked im->lock */ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) { - int tv = net_random() % max_delay; + int tv = prandom_u32() % max_delay; im->tm_running = 1; if (!mod_timer(&im->timer, jiffies+tv+2)) @@ -220,7 +220,7 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) static void igmp_gq_start_timer(struct in_device *in_dev) { - int tv = net_random() % in_dev->mr_maxdelay; + int tv = prandom_u32() % in_dev->mr_maxdelay; in_dev->mr_gq_running = 1; if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2)) @@ -229,7 +229,7 @@ static void igmp_gq_start_timer(struct in_device *in_dev) static void igmp_ifc_start_timer(struct 
in_device *in_dev, int delay) { - int tv = net_random() % delay; + int tv = prandom_u32() % delay; if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2)) in_dev_hold(in_dev); @@ -310,7 +310,7 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) struct ip_sf_list *psf; int scount = 0; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (!is_in(pmc, psf, type, gdeleted, sdeleted)) continue; scount++; @@ -369,7 +369,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) pip->saddr = fl4.saddr; pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ - ip_select_ident(skb, &rt->dst, NULL); + ip_select_ident(skb, NULL); ((u8 *)&pip[1])[0] = IPOPT_RA; ((u8 *)&pip[1])[1] = 4; ((u8 *)&pip[1])[2] = 0; @@ -463,7 +463,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, } first = 1; psf_prev = NULL; - for (psf=*psf_list; psf; psf=psf_next) { + for (psf = *psf_list; psf; psf = psf_next) { __be32 *psrc; psf_next = psf->sf_next; @@ -520,7 +520,7 @@ empty_source: return skb; if (pmc->crcount || isquery) { /* make sure we have room for group header */ - if (skb && AVAILABLE(skb)<sizeof(struct igmpv3_grec)) { + if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)) { igmpv3_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } @@ -576,7 +576,7 @@ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf) struct ip_sf_list *psf_prev, *psf_next, *psf; psf_prev = NULL; - for (psf=*ppsf; psf; psf = psf_next) { + for (psf = *ppsf; psf; psf = psf_next) { psf_next = psf->sf_next; if (psf->sf_crcount == 0) { if (psf_prev) @@ -600,7 +600,7 @@ static void igmpv3_send_cr(struct in_device *in_dev) /* deleted MCA's */ pmc_prev = NULL; - for (pmc=in_dev->mc_tomb; pmc; pmc=pmc_next) { + for (pmc = in_dev->mc_tomb; pmc; pmc = pmc_next) { pmc_next = pmc->next; if (pmc->sfmode == MCAST_INCLUDE) { type = IGMPV3_BLOCK_OLD_SOURCES; @@ -714,7 +714,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, iph->daddr = dst; iph->saddr = fl4.saddr; iph->protocol = IPPROTO_IGMP; - ip_select_ident(skb, &rt->dst, NULL); + ip_select_ident(skb, NULL); ((u8 *)&iph[1])[0] = IPOPT_RA; ((u8 *)&iph[1])[1] = 4; ((u8 *)&iph[1])[2] = 0; @@ -764,7 +764,7 @@ static void igmp_ifc_event(struct in_device *in_dev) static void igmp_timer_expire(unsigned long data) { - struct ip_mc_list *im=(struct ip_mc_list *)data; + struct ip_mc_list *im = (struct ip_mc_list *)data; struct in_device *in_dev = im->interface; spin_lock(&im->lock); @@ -794,10 +794,10 @@ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) int i, scount; scount = 0; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; - for (i=0; i<nsrcs; i++) { + for (i = 0; i < nsrcs; i++) { /* skip inactive filters */ if (psf->sf_count[MCAST_INCLUDE] || pmc->sfcount[MCAST_EXCLUDE] != @@ -825,10 +825,10 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) /* mark INCLUDE-mode sources */ scount = 0; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; - for (i=0; i<nsrcs; i++) + for (i = 0; i < nsrcs; i++) if (srcs[i] == psf->sf_inaddr) { psf->sf_gsresp = 1; scount++; @@ -988,16 +988,8 @@ int igmp_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct igmphdr))) goto drop; - switch (skb->ip_summed) { - case 
CHECKSUM_COMPLETE: - if (!csum_fold(skb->csum)) - break; - /* fall through */ - case CHECKSUM_NONE: - skb->csum = 0; - if (__skb_checksum_complete(skb)) - goto drop; - } + if (skb_checksum_simple_validate(skb)) + goto drop; ih = igmp_hdr(skb); switch (ih->type) { @@ -1103,7 +1095,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) pmc->tomb = im->tomb; pmc->sources = im->sources; im->tomb = im->sources = NULL; - for (psf=pmc->sources; psf; psf=psf->sf_next) + for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = pmc->crcount; } spin_unlock_bh(&im->lock); @@ -1121,7 +1113,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr) spin_lock_bh(&in_dev->mc_tomb_lock); pmc_prev = NULL; - for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) { + for (pmc = in_dev->mc_tomb; pmc; pmc = pmc->next) { if (pmc->multiaddr == multiaddr) break; pmc_prev = pmc; @@ -1134,7 +1126,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr) } spin_unlock_bh(&in_dev->mc_tomb_lock); if (pmc) { - for (psf=pmc->tomb; psf; psf=psf_next) { + for (psf = pmc->tomb; psf; psf = psf_next) { psf_next = psf->sf_next; kfree(psf); } @@ -1167,7 +1159,7 @@ static void igmpv3_clear_delrec(struct in_device *in_dev) psf = pmc->tomb; pmc->tomb = NULL; spin_unlock_bh(&pmc->lock); - for (; psf; psf=psf_next) { + for (; psf; psf = psf_next) { psf_next = psf->sf_next; kfree(psf); } @@ -1557,7 +1549,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, int rv = 0; psf_prev = NULL; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == *psfsrc) break; psf_prev = psf; @@ -1630,7 +1622,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, pmc->sfcount[sfmode]--; } err = 0; - for (i=0; i<sfcount; i++) { + for (i = 0; i < sfcount; i++) { int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]); changerec |= rv > 0; @@ -1650,7 +1642,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, pmc->crcount = in_dev->mr_qrv ? 
in_dev->mr_qrv : IGMP_Unsolicited_Report_Count; in_dev->mr_ifc_count = pmc->crcount; - for (psf=pmc->sources; psf; psf = psf->sf_next) + for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(pmc->interface); } else if (sf_setstate(pmc) || changerec) { @@ -1671,7 +1663,7 @@ static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode, struct ip_sf_list *psf, *psf_prev; psf_prev = NULL; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == *psfsrc) break; psf_prev = psf; @@ -1699,7 +1691,7 @@ static void sf_markstate(struct ip_mc_list *pmc) struct ip_sf_list *psf; int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; - for (psf=pmc->sources; psf; psf=psf->sf_next) + for (psf = pmc->sources; psf; psf = psf->sf_next) if (pmc->sfcount[MCAST_EXCLUDE]) { psf->sf_oldin = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && @@ -1716,7 +1708,7 @@ static int sf_setstate(struct ip_mc_list *pmc) int new_in, rv; rv = 0; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (pmc->sfcount[MCAST_EXCLUDE]) { new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; @@ -1726,7 +1718,7 @@ static int sf_setstate(struct ip_mc_list *pmc) if (!psf->sf_oldin) { struct ip_sf_list *prev = NULL; - for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next) { + for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) { if (dpsf->sf_inaddr == psf->sf_inaddr) break; prev = dpsf; @@ -1748,7 +1740,7 @@ static int sf_setstate(struct ip_mc_list *pmc) * add or update "delete" records if an active filter * is now inactive */ - for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next) + for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) if (dpsf->sf_inaddr == psf->sf_inaddr) break; if (!dpsf) { @@ -1800,7 +1792,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, if (!delta) pmc->sfcount[sfmode]++; err = 0; - for (i=0; i<sfcount; i++) { + for (i = 0; i < sfcount; i++) { err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]); if (err) break; @@ -1810,7 +1802,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, if (!delta) pmc->sfcount[sfmode]--; - for (j=0; j<i; j++) + for (j = 0; j < i; j++) (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { #ifdef CONFIG_IP_MULTICAST @@ -1829,7 +1821,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, pmc->crcount = in_dev->mr_qrv ? 
in_dev->mr_qrv : IGMP_Unsolicited_Report_Count; in_dev->mr_ifc_count = pmc->crcount; - for (psf=pmc->sources; psf; psf = psf->sf_next) + for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(in_dev); } else if (sf_setstate(pmc)) { @@ -1844,12 +1836,12 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc) { struct ip_sf_list *psf, *nextpsf; - for (psf=pmc->tomb; psf; psf=nextpsf) { + for (psf = pmc->tomb; psf; psf = nextpsf) { nextpsf = psf->sf_next; kfree(psf); } pmc->tomb = NULL; - for (psf=pmc->sources; psf; psf=nextpsf) { + for (psf = pmc->sources; psf; psf = nextpsf) { nextpsf = psf->sf_next; kfree(psf); } @@ -1952,6 +1944,10 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) rtnl_lock(); in_dev = ip_mc_find_dev(net, imr); + if (!in_dev) { + ret = -ENODEV; + goto out; + } ifindex = imr->imr_ifindex; for (imlp = &inet->mc_list; (iml = rtnl_dereference(*imlp)) != NULL; @@ -1969,16 +1965,14 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) *imlp = iml->next_rcu; - if (in_dev) - ip_mc_dec_group(in_dev, group); + ip_mc_dec_group(in_dev, group); rtnl_unlock(); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); kfree_rcu(iml, rcu); return 0; } - if (!in_dev) - ret = -ENODEV; +out: rtnl_unlock(); return ret; } @@ -2043,7 +2037,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct if (!psl) goto done; /* err = -EADDRNOTAVAIL */ rv = !0; - for (i=0; i<psl->sl_count; i++) { + for (i = 0; i < psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, sizeof(__be32)); if (rv == 0) @@ -2062,7 +2056,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1, &mreqs->imr_sourceaddr, 1); - for (j=i+1; j<psl->sl_count; j++) + for (j = i+1; j < psl->sl_count; j++) psl->sl_addr[j-1] = psl->sl_addr[j]; psl->sl_count--; err = 0; @@ -2088,7 +2082,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct newpsl->sl_max = count; newpsl->sl_count = count - IP_SFBLOCK; if (psl) { - for (i=0; i<psl->sl_count; i++) + for (i = 0; i < psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; /* decrease mem now to avoid the memleak warning */ atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); @@ -2098,7 +2092,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ - for (i=0; i<psl->sl_count; i++) { + for (i = 0; i < psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, sizeof(__be32)); if (rv == 0) @@ -2106,7 +2100,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct } if (rv == 0) /* address already there is an error */ goto done; - for (j=psl->sl_count-1; j>=i; j--) + for (j = psl->sl_count-1; j >= i; j--) psl->sl_addr[j+1] = psl->sl_addr[j]; psl->sl_addr[i] = mreqs->imr_sourceaddr; psl->sl_count++; @@ -2305,7 +2299,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { return -EFAULT; } - for (i=0; i<copycount; i++) { + for (i = 0; i < copycount; i++) { struct sockaddr_storage ss; psin = (struct sockaddr_in *)&ss; @@ -2350,7 +2344,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif) if (!psl) goto unlock; - for (i=0; i<psl->sl_count; i++) { + for (i = 0; i < psl->sl_count; i++) { if (psl->sl_addr[i] == rmt_addr) break; } @@ -2423,7 +2417,7 @@ int ip_check_mc_rcu(struct in_device *in_dev, __be32 
mc_addr, __be32 src_addr, u rv = 1; } else if (im) { if (src_addr) { - for (psf=im->sources; psf; psf=psf->sf_next) { + for (psf = im->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == src_addr) break; } @@ -2762,6 +2756,7 @@ static struct pernet_operations igmp_net_ops = { .init = igmp_net_init, .exit = igmp_net_exit, }; +#endif static int igmp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) @@ -2785,8 +2780,9 @@ static struct notifier_block igmp_notifier = { .notifier_call = igmp_netdev_event, }; -int __init igmp_mc_proc_init(void) +int __init igmp_mc_init(void) { +#if defined(CONFIG_PROC_FS) int err; err = register_pernet_subsys(&igmp_net_ops); @@ -2800,5 +2796,7 @@ int __init igmp_mc_proc_init(void) reg_notif_fail: unregister_pernet_subsys(&igmp_net_ops); return err; -} +#else + return register_netdevice_notifier(&igmp_notifier); #endif +} diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index fc0e649cc00..14d02ea905b 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -29,19 +29,16 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; EXPORT_SYMBOL(inet_csk_timer_bug_msg); #endif -unsigned long *sysctl_local_reserved_ports; -EXPORT_SYMBOL(sysctl_local_reserved_ports); - void inet_get_local_port_range(struct net *net, int *low, int *high) { unsigned int seq; do { - seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock); + seq = read_seqbegin(&net->ipv4.ip_local_ports.lock); - *low = net->ipv4.sysctl_local_ports.range[0]; - *high = net->ipv4.sysctl_local_ports.range[1]; - } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq)); + *low = net->ipv4.ip_local_ports.range[0]; + *high = net->ipv4.ip_local_ports.range[1]; + } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq)); } EXPORT_SYMBOL(inet_get_local_port_range); @@ -109,11 +106,11 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) again: inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - smallest_rover = rover = net_random() % remaining + low; + smallest_rover = rover = prandom_u32() % remaining + low; smallest_size = -1; do { - if (inet_is_reserved_local_port(rover)) + if (inet_is_local_reserved_port(net, rover)) goto next_nolock; head = &hashinfo->bhash[inet_bhashfn(net, rover, hashinfo->bhash_size)]; @@ -408,7 +405,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, struct net *net = sock_net(sk); int flags = inet_sk_flowi_flags(sk); - flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, flags, @@ -445,7 +442,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, rcu_read_lock(); opt = rcu_dereference(newinet->inet_opt); - flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? 
opt->opt.faddr : ireq->ir_rmt_addr, @@ -680,6 +677,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num); newsk->sk_write_space = sk_stream_write_space; + newsk->sk_mark = inet_rsk(req)->ir_mark; + newicsk->icsk_retransmits = 0; newicsk->icsk_backoff = 0; newicsk->icsk_probes_out = 0; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 56a964a553d..e34dccbc4d7 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -106,6 +106,10 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, r->id.idiag_sport = inet->inet_sport; r->id.idiag_dport = inet->inet_dport; + + memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); + memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + r->id.idiag_src[0] = inet->inet_rcv_saddr; r->id.idiag_dst[0] = inet->inet_daddr; @@ -240,12 +244,19 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, r->idiag_family = tw->tw_family; r->idiag_retrans = 0; + r->id.idiag_if = tw->tw_bound_dev_if; sock_diag_save_cookie(tw, r->id.idiag_cookie); + r->id.idiag_sport = tw->tw_sport; r->id.idiag_dport = tw->tw_dport; + + memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); + memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + r->id.idiag_src[0] = tw->tw_rcv_saddr; r->id.idiag_dst[0] = tw->tw_daddr; + r->idiag_state = tw->tw_substate; r->idiag_timer = 3; r->idiag_expires = jiffies_to_msecs(tmo); @@ -726,8 +737,13 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, r->id.idiag_sport = inet->inet_sport; r->id.idiag_dport = ireq->ir_rmt_port; + + memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); + memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + r->id.idiag_src[0] = ireq->ir_loc_addr; r->id.idiag_dst[0] = ireq->ir_rmt_addr; + r->idiag_expires = jiffies_to_msecs(tmo); r->idiag_rqueue = 0; r->idiag_wqueue = 0; @@ -914,12 +930,15 @@ skip_listen_ht: spin_lock_bh(lock); sk_nulls_for_each(sk, node, &head->chain) { int res; + int state; if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) goto next_normal; - if (!(r->idiag_states & (1 << sk->sk_state))) + state = (sk->sk_state == TCP_TIME_WAIT) ? 
+ inet_twsk(sk)->tw_substate : sk->sk_state; + if (!(r->idiag_states & (1 << state))) goto next_normal; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index bb075fc9a14..3b01959bf4b 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -208,7 +208,7 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force) } work = frag_mem_limit(nf) - nf->low_thresh; - while (work > 0) { + while (work > 0 || force) { spin_lock(&nf->lru_lock); if (list_empty(&nf->lru_list)) { @@ -278,9 +278,10 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, atomic_inc(&qp->refcnt); hlist_add_head(&qp->list, &hb->chain); + inet_frag_lru_add(nf, qp); spin_unlock(&hb->chain_lock); read_unlock(&f->lock); - inet_frag_lru_add(nf, qp); + return qp; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 8b9cf279450..43116e8c8e1 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -274,7 +274,7 @@ struct sock *__inet_lookup_established(struct net *net, const __be32 daddr, const u16 hnum, const int dif) { - INET_ADDR_COOKIE(acookie, saddr, daddr) + INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(sport, hnum); struct sock *sk; const struct hlist_nulls_node *node; @@ -327,7 +327,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, __be32 daddr = inet->inet_rcv_saddr; __be32 saddr = inet->inet_daddr; int dif = sk->sk_bound_dev_if; - INET_ADDR_COOKIE(acookie, saddr, daddr) + INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); struct net *net = sock_net(sk); unsigned int hash = inet_ehashfn(net, daddr, lport, @@ -500,7 +500,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, local_bh_disable(); for (i = 1; i <= remaining; i++) { port = low + (i + offset) % remaining; - if (inet_is_reserved_local_port(port)) + if (inet_is_local_reserved_port(net, port)) continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c index 1975f52933c..f17ea49b28f 100644 --- a/net/ipv4/inet_lro.c +++ b/net/ipv4/inet_lro.c @@ -230,29 +230,6 @@ static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb, lro_desc->last_skb = skb; } -static void lro_add_frags(struct net_lro_desc *lro_desc, - int len, int hlen, int truesize, - struct skb_frag_struct *skb_frags, - struct iphdr *iph, struct tcphdr *tcph) -{ - struct sk_buff *skb = lro_desc->parent; - int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); - - lro_add_common(lro_desc, iph, tcph, tcp_data_len); - - skb->truesize += truesize; - - skb_frags[0].page_offset += hlen; - skb_frag_size_sub(&skb_frags[0], hlen); - - while (tcp_data_len > 0) { - *(lro_desc->next_frag) = *skb_frags; - tcp_data_len -= skb_frag_size(skb_frags); - lro_desc->next_frag++; - skb_frags++; - skb_shinfo(skb)->nr_frags++; - } -} static int lro_check_tcp_conn(struct net_lro_desc *lro_desc, struct iphdr *iph, @@ -371,128 +348,6 @@ out: return 1; } - -static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr, - struct skb_frag_struct *frags, - int len, int true_size, - void *mac_hdr, - int hlen, __wsum sum, - u32 ip_summed) -{ - struct sk_buff *skb; - struct skb_frag_struct *skb_frags; - int data_len = len; - int hdr_len = min(len, hlen); - - skb = netdev_alloc_skb(lro_mgr->dev, hlen + 
lro_mgr->frag_align_pad); - if (!skb) - return NULL; - - skb_reserve(skb, lro_mgr->frag_align_pad); - skb->len = len; - skb->data_len = len - hdr_len; - skb->truesize += true_size; - skb->tail += hdr_len; - - memcpy(skb->data, mac_hdr, hdr_len); - - skb_frags = skb_shinfo(skb)->frags; - while (data_len > 0) { - *skb_frags = *frags; - data_len -= skb_frag_size(frags); - skb_frags++; - frags++; - skb_shinfo(skb)->nr_frags++; - } - - skb_shinfo(skb)->frags[0].page_offset += hdr_len; - skb_frag_size_sub(&skb_shinfo(skb)->frags[0], hdr_len); - - skb->ip_summed = ip_summed; - skb->csum = sum; - skb->protocol = eth_type_trans(skb, lro_mgr->dev); - return skb; -} - -static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr, - struct skb_frag_struct *frags, - int len, int true_size, - void *priv, __wsum sum) -{ - struct net_lro_desc *lro_desc; - struct iphdr *iph; - struct tcphdr *tcph; - struct sk_buff *skb; - u64 flags; - void *mac_hdr; - int mac_hdr_len; - int hdr_len = LRO_MAX_PG_HLEN; - int vlan_hdr_len = 0; - - if (!lro_mgr->get_frag_header || - lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, - (void *)&tcph, &flags, priv)) { - mac_hdr = skb_frag_address(frags); - goto out1; - } - - if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) - goto out1; - - hdr_len = (int)((void *)(tcph) + TCP_HDR_LEN(tcph) - mac_hdr); - mac_hdr_len = (int)((void *)(iph) - mac_hdr); - - lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); - if (!lro_desc) - goto out1; - - if (!lro_desc->active) { /* start new lro session */ - if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, NULL)) - goto out1; - - skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr, - hdr_len, 0, lro_mgr->ip_summed_aggr); - if (!skb) - goto out; - - if ((skb->protocol == htons(ETH_P_8021Q)) && - !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) - vlan_hdr_len = VLAN_HLEN; - - iph = (void *)(skb->data + vlan_hdr_len); - tcph = (void *)((u8 *)skb->data + vlan_hdr_len - + IP_HDR_LEN(iph)); - - lro_init_desc(lro_desc, skb, iph, tcph); - LRO_INC_STATS(lro_mgr, aggregated); - return NULL; - } - - if (lro_desc->tcp_next_seq != ntohl(tcph->seq)) - goto out2; - - if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, lro_desc)) - goto out2; - - lro_add_frags(lro_desc, len, hdr_len, true_size, frags, iph, tcph); - LRO_INC_STATS(lro_mgr, aggregated); - - if ((skb_shinfo(lro_desc->parent)->nr_frags >= lro_mgr->max_aggr) || - lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu)) - lro_flush(lro_mgr, lro_desc); - - return NULL; - -out2: /* send aggregated packets to the stack */ - lro_flush(lro_mgr, lro_desc); - -out1: /* Original packet has to be posted to the stack */ - skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr, - hdr_len, sum, lro_mgr->ip_summed); -out: - return skb; -} - void lro_receive_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, void *priv) @@ -506,23 +361,6 @@ void lro_receive_skb(struct net_lro_mgr *lro_mgr, } EXPORT_SYMBOL(lro_receive_skb); -void lro_receive_frags(struct net_lro_mgr *lro_mgr, - struct skb_frag_struct *frags, - int len, int true_size, void *priv, __wsum sum) -{ - struct sk_buff *skb; - - skb = __lro_proc_segment(lro_mgr, frags, len, true_size, priv, sum); - if (!skb) - return; - - if (lro_mgr->features & LRO_F_NAPI) - netif_receive_skb(skb); - else - netif_rx(skb); -} -EXPORT_SYMBOL(lro_receive_frags); - void lro_flush_all(struct net_lro_mgr *lro_mgr) { int i; @@ -534,14 +372,3 @@ void lro_flush_all(struct net_lro_mgr *lro_mgr) } } EXPORT_SYMBOL(lro_flush_all); - -void 
lro_flush_pkt(struct net_lro_mgr *lro_mgr, - struct iphdr *iph, struct tcphdr *tcph) -{ - struct net_lro_desc *lro_desc; - - lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); - if (lro_desc->active) - lro_flush(lro_mgr, lro_desc); -} -EXPORT_SYMBOL(lro_flush_pkt); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 33d5537881e..bd5f5928167 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -26,20 +26,7 @@ * Theory of operations. * We keep one entry for each peer IP address. The nodes contains long-living * information about the peer which doesn't depend on routes. - * At this moment this information consists only of ID field for the next - * outgoing IP packet. This field is incremented with each packet as encoded - * in inet_getid() function (include/net/inetpeer.h). - * At the moment of writing this notes identifier of IP packets is generated - * to be unpredictable using this code only for packets subjected - * (actually or potentially) to defragmentation. I.e. DF packets less than - * PMTU in size when local fragmentation is disabled use a constant ID and do - * not use this code (see ip_select_ident() in include/net/ip.h). * - * Route cache entries hold references to our nodes. - * New cache entries get references via lookup by destination IP address in - * the avl tree. The reference is grabbed only when it's needed i.e. only - * when we try to output IP packet which needs an unpredictable ID (see - * __ip_select_ident() in net/ipv4/route.c). * Nodes are removed only when reference counter goes to 0. * When it's happened the node may be removed when a sufficient amount of * time has been passed since its last use. The less-recently-used entry can @@ -62,7 +49,6 @@ * refcnt: atomically against modifications on other CPU; * usually under some other lock to prevent node disappearing * daddr: unchangeable - * ip_id_count: atomic value (no lock needed) */ static struct kmem_cache *peer_cachep __read_mostly; @@ -109,13 +95,6 @@ static inline void flush_check(struct inet_peer_base *base, int family) } } -void inetpeer_invalidate_family(int family) -{ - atomic_t *fp = inetpeer_seq_ptr(family); - - atomic_inc(fp); -} - #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ /* Exported for sysctl_net_ipv4. */ @@ -127,7 +106,7 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min static void inetpeer_gc_worker(struct work_struct *work) { struct inet_peer *p, *n, *c; - LIST_HEAD(list); + struct list_head list; spin_lock_bh(&gc_lock); list_replace_init(&gc_list, &list); @@ -227,7 +206,7 @@ static int addr_compare(const struct inetpeer_addr *a, stackptr = _stack; \ *stackptr++ = &_base->root; \ for (u = rcu_deref_locked(_base->root, _base); \ - u != peer_avl_empty; ) { \ + u != peer_avl_empty;) { \ int cmp = addr_compare(_daddr, &u->daddr); \ if (cmp == 0) \ break; \ @@ -282,7 +261,7 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, *stackptr++ = &start->avl_left; \ v = &start->avl_left; \ for (u = rcu_deref_locked(*v, base); \ - u->avl_right != peer_avl_empty_rcu; ) { \ + u->avl_right != peer_avl_empty_rcu;) { \ v = &u->avl_right; \ *stackptr++ = v; \ u = rcu_deref_locked(*v, base); \ @@ -504,10 +483,6 @@ relookup: p->daddr = *daddr; atomic_set(&p->refcnt, 1); atomic_set(&p->rid, 0); - atomic_set(&p->ip_id_count, - (daddr->family == AF_INET) ? 
- secure_ip_id(daddr->addr.a4) : - secure_ipv6_id(daddr->addr.a6)); p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; /* 60*HZ is arbitrary, but chosen enough high so that the first @@ -529,7 +504,7 @@ EXPORT_SYMBOL_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { p->dtime = (__u32)jiffies; - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&p->refcnt); } EXPORT_SYMBOL_GPL(inet_putpeer); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 694de3b7aeb..3a83ce5efa8 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -39,6 +39,24 @@ #include <net/route.h> #include <net/xfrm.h> +static bool ip_may_fragment(const struct sk_buff *skb) +{ + return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) || + skb->ignore_df; +} + +static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu) + return false; + + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + return false; + + return true; +} + + static int ip_forward_finish(struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); @@ -54,10 +72,15 @@ static int ip_forward_finish(struct sk_buff *skb) int ip_forward(struct sk_buff *skb) { + u32 mtu; struct iphdr *iph; /* Our header */ struct rtable *rt; /* Route we use */ struct ip_options *opt = &(IPCB(skb)->opt); + /* that should never happen */ + if (skb->pkt_type != PACKET_HOST) + goto drop; + if (skb_warn_if_lro(skb)) goto drop; @@ -67,9 +90,6 @@ int ip_forward(struct sk_buff *skb) if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb)) return NET_RX_SUCCESS; - if (skb->pkt_type != PACKET_HOST) - goto drop; - skb_forward_csum(skb); /* @@ -88,11 +108,12 @@ int ip_forward(struct sk_buff *skb) if (opt->is_strictroute && rt->rt_uses_gateway) goto sr_failed; - if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && - (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { + IPCB(skb)->flags |= IPSKB_FORWARDED; + mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); + if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) { IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(dst_mtu(&rt->dst))); + htonl(mtu)); goto drop; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 2481993a497..ed32313e307 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -232,8 +232,9 @@ static void ip_expire(unsigned long arg) * "Fragment Reassembly Timeout" message, per RFC792. 
*/ if (qp->user == IP_DEFRAG_AF_PACKET || - (qp->user == IP_DEFRAG_CONNTRACK_IN && - skb_rtable(head)->rt_type != RTN_LOCAL)) + ((qp->user >= IP_DEFRAG_CONNTRACK_IN) && + (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) && + (skb_rtable(head)->rt_type != RTN_LOCAL))) goto out_rcu_unlock; @@ -704,7 +705,7 @@ struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); if (ip_defrag(skb, user)) return NULL; - skb->rxhash = 0; + skb_clear_hash(skb); } } return skb; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index d7aea4c5b94..9b842544aea 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -178,7 +178,7 @@ static int ipgre_err(struct sk_buff *skb, u32 info, else itn = net_generic(net, ipgre_net_id); - iph = (const struct iphdr *)skb->data; + iph = (const struct iphdr *)(icmp_hdr(skb) + 1); t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, iph->daddr, iph->saddr, tpi->key); @@ -217,6 +217,7 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) iph->saddr, iph->daddr, tpi->key); if (tunnel) { + skb_pop_mac_header(skb); ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error); return PACKET_RCVD; } @@ -277,7 +278,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, return NETDEV_TX_OK; free_skb: - dev_kfree_skb(skb); + kfree_skb(skb); out: dev->stats.tx_dropped++; return NETDEV_TX_OK; @@ -300,7 +301,7 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, return NETDEV_TX_OK; free_skb: - dev_kfree_skb(skb); + kfree_skb(skb); out: dev->stats.tx_dropped++; return NETDEV_TX_OK; @@ -409,7 +410,7 @@ static int ipgre_open(struct net_device *dev) struct flowi4 fl4; struct rtable *rt; - rt = ip_route_output_gre(dev_net(dev), &fl4, + rt = ip_route_output_gre(t->net, &fl4, t->parms.iph.daddr, t->parms.iph.saddr, t->parms.o_key, @@ -433,7 +434,7 @@ static int ipgre_close(struct net_device *dev) if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { struct in_device *in_dev; - in_dev = inetdev_by_index(dev_net(dev), t->mlink); + in_dev = inetdev_by_index(t->net, t->mlink); if (in_dev) ip_mc_dec_group(in_dev, t->parms.iph.daddr); } @@ -462,6 +463,7 @@ static const struct net_device_ops ipgre_netdev_ops = { static void ipgre_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipgre_netdev_ops; + dev->type = ARPHRD_IPGRE; ip_tunnel_setup(dev, ipgre_net_id); } @@ -476,7 +478,7 @@ static void __gre_tunnel_init(struct net_device *dev) dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; - dev->features |= NETIF_F_NETNS_LOCAL | GRE_FEATURES; + dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) { @@ -500,7 +502,6 @@ static int ipgre_tunnel_init(struct net_device *dev) memcpy(dev->dev_addr, &iph->saddr, 4); memcpy(dev->broadcast, &iph->daddr, 4); - dev->type = ARPHRD_IPGRE; dev->flags = IFF_NOARP; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; dev->addr_len = 4; @@ -648,6 +649,7 @@ static void ipgre_tap_setup(struct net_device *dev) { ether_setup(dev); dev->netdev_ops = &gre_tap_netdev_ops; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; ip_tunnel_setup(dev, gre_tap_net_id); } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 054a3e97d82..3d4da2c16b6 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -314,7 +314,7 @@ static int ip_rcv_finish(struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; - if (sysctl_ip_early_demux && !skb_dst(skb)) { + if (sysctl_ip_early_demux 
&& !skb_dst(skb) && skb->sk == NULL) { const struct net_protocol *ipprot; int protocol = iph->protocol; diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ec7264514a8..ad382499bac 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -167,7 +167,7 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) soffset -= 4; if (soffset > 3) { memcpy(&faddr, &start[soffset-1], 4); - for (soffset-=4, doffset=4; soffset > 3; soffset-=4, doffset+=4) + for (soffset -= 4, doffset = 4; soffset > 3; soffset -= 4, doffset += 4) memcpy(&dptr[doffset-1], &start[soffset-1], 4); /* * RFC1812 requires to fix illegal source routes. @@ -227,7 +227,7 @@ void ip_options_fragment(struct sk_buff *skb) continue; } optlen = optptr[1]; - if (optlen<2 || optlen>l) + if (optlen < 2 || optlen > l) return; if (!IPOPT_COPIED(*optptr)) memset(optptr, IPOPT_NOOP, optlen); @@ -275,27 +275,31 @@ int ip_options_compile(struct net *net, for (l = opt->optlen; l > 0; ) { switch (*optptr) { - case IPOPT_END: - for (optptr++, l--; l>0; optptr++, l--) { + case IPOPT_END: + for (optptr++, l--; l > 0; optptr++, l--) { if (*optptr != IPOPT_END) { *optptr = IPOPT_END; opt->is_changed = 1; } } goto eol; - case IPOPT_NOOP: + case IPOPT_NOOP: l--; optptr++; continue; } + if (unlikely(l < 2)) { + pp_ptr = optptr; + goto error; + } optlen = optptr[1]; - if (optlen<2 || optlen>l) { + if (optlen < 2 || optlen > l) { pp_ptr = optptr; goto error; } switch (*optptr) { - case IPOPT_SSRR: - case IPOPT_LSRR: + case IPOPT_SSRR: + case IPOPT_LSRR: if (optlen < 3) { pp_ptr = optptr + 1; goto error; @@ -321,7 +325,7 @@ int ip_options_compile(struct net *net, opt->is_strictroute = (optptr[0] == IPOPT_SSRR); opt->srr = optptr - iph; break; - case IPOPT_RR: + case IPOPT_RR: if (opt->rr) { pp_ptr = optptr; goto error; @@ -349,7 +353,7 @@ int ip_options_compile(struct net *net, } opt->rr = optptr - iph; break; - case IPOPT_TIMESTAMP: + case IPOPT_TIMESTAMP: if (opt->ts) { pp_ptr = optptr; goto error; @@ -364,19 +368,19 @@ int ip_options_compile(struct net *net, } if (optptr[2] <= optlen) { unsigned char *timeptr = NULL; - if (optptr[2]+3 > optptr[1]) { + if (optptr[2]+3 > optlen) { pp_ptr = optptr + 2; goto error; } switch (optptr[3]&0xF) { - case IPOPT_TS_TSONLY: + case IPOPT_TS_TSONLY: if (skb) timeptr = &optptr[optptr[2]-1]; opt->ts_needtime = 1; optptr[2] += 4; break; - case IPOPT_TS_TSANDADDR: - if (optptr[2]+7 > optptr[1]) { + case IPOPT_TS_TSANDADDR: + if (optptr[2]+7 > optlen) { pp_ptr = optptr + 2; goto error; } @@ -389,8 +393,8 @@ int ip_options_compile(struct net *net, opt->ts_needtime = 1; optptr[2] += 8; break; - case IPOPT_TS_PRESPEC: - if (optptr[2]+7 > optptr[1]) { + case IPOPT_TS_PRESPEC: + if (optptr[2]+7 > optlen) { pp_ptr = optptr + 2; goto error; } @@ -405,7 +409,7 @@ int ip_options_compile(struct net *net, opt->ts_needtime = 1; optptr[2] += 8; break; - default: + default: if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { pp_ptr = optptr + 3; goto error; @@ -433,7 +437,7 @@ int ip_options_compile(struct net *net, } opt->ts = optptr - iph; break; - case IPOPT_RA: + case IPOPT_RA: if (optlen < 4) { pp_ptr = optptr + 1; goto error; @@ -441,7 +445,7 @@ int ip_options_compile(struct net *net, if (optptr[2] == 0 && optptr[3] == 0) opt->router_alert = optptr - iph; break; - case IPOPT_CIPSO: + case IPOPT_CIPSO: if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) { pp_ptr = optptr; goto error; @@ -452,9 +456,9 @@ int ip_options_compile(struct net *net, goto error; } break; - case IPOPT_SEC: 
- case IPOPT_SID: - default: + case IPOPT_SEC: + case IPOPT_SID: + default: if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { pp_ptr = optptr; goto error; @@ -572,7 +576,7 @@ void ip_forward_options(struct sk_buff *skb) optptr = raw + opt->srr; - for ( srrptr=optptr[2], srrspace = optptr[1]; + for ( srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4 ) { @@ -628,7 +632,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) if (rt->rt_type != RTN_LOCAL) return -EINVAL; - for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { + for (srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { if (srrptr + 3 > srrspace) { icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24)); return -EINVAL; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 912402752f2..8d3b6b0e985 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -101,17 +101,17 @@ int __ip_local_out(struct sk_buff *skb) skb_dst(skb)->dev, dst_output); } -int ip_local_out(struct sk_buff *skb) +int ip_local_out_sk(struct sock *sk, struct sk_buff *skb) { int err; err = __ip_local_out(skb); if (likely(err == 1)) - err = dst_output(skb); + err = dst_output_sk(sk, skb); return err; } -EXPORT_SYMBOL_GPL(ip_local_out); +EXPORT_SYMBOL_GPL(ip_local_out_sk); static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) { @@ -148,7 +148,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; iph->protocol = sk->sk_protocol; - ip_select_ident(skb, &rt->dst, sk); + ip_select_ident(skb, sk); if (opt && opt->opt.optlen) { iph->ihl += opt->opt.optlen>>2; @@ -211,6 +211,48 @@ static inline int ip_finish_output2(struct sk_buff *skb) return -EINVAL; } +static int ip_finish_output_gso(struct sk_buff *skb) +{ + netdev_features_t features; + struct sk_buff *segs; + int ret = 0; + + /* common case: locally created skb or seglen is <= mtu */ + if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || + skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb)) + return ip_finish_output2(skb); + + /* Slowpath - GSO segment length is exceeding the dst MTU. + * + * This can happen in two cases: + * 1) TCP GRO packet, DF bit not set + * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly + * from host network stack. 
+ */ + features = netif_skb_features(skb); + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR(segs)) { + kfree_skb(skb); + return -ENOMEM; + } + + consume_skb(skb); + + do { + struct sk_buff *nskb = segs->next; + int err; + + segs->next = NULL; + err = ip_fragment(segs, ip_finish_output2); + + if (err && ret == 0) + ret = err; + segs = nskb; + } while (segs); + + return ret; +} + static int ip_finish_output(struct sk_buff *skb) { #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) @@ -220,15 +262,17 @@ static int ip_finish_output(struct sk_buff *skb) return dst_output(skb); } #endif - if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb)) + if (skb_is_gso(skb)) + return ip_finish_output_gso(skb); + + if (skb->len > ip_skb_dst_mtu(skb)) return ip_fragment(skb, ip_finish_output2); - else - return ip_finish_output2(skb); + + return ip_finish_output2(skb); } -int ip_mc_output(struct sk_buff *skb) +int ip_mc_output(struct sock *sk, struct sk_buff *skb) { - struct sock *sk = skb->sk; struct rtable *rt = skb_rtable(skb); struct net_device *dev = rt->dst.dev; @@ -287,7 +331,7 @@ int ip_mc_output(struct sk_buff *skb) !(IPCB(skb)->flags & IPSKB_REROUTED)); } -int ip_output(struct sk_buff *skb) +int ip_output(struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; @@ -315,9 +359,9 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) sizeof(fl4->saddr) + sizeof(fl4->daddr)); } -int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) +/* Note: skb->sk can be different from sk, in case of tunnels */ +int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { - struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); struct ip_options_rcu *inet_opt; struct flowi4 *fl4; @@ -371,7 +415,7 @@ packet_routed: skb_reset_network_header(skb); iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); - if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df) + if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; @@ -386,9 +430,9 @@ packet_routed: ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); } - ip_select_ident_more(skb, &rt->dst, sk, - (skb_shinfo(skb)->gso_segs ?: 1) - 1); + ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1); + /* TODO : should we use skb->sk here instead of sk ? 
*/ skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; @@ -422,9 +466,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif nf_copy(to, from); -#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) - to->nf_trace = from->nf_trace; -#endif #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) to->ipvs_property = from->ipvs_property; #endif @@ -458,12 +499,13 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) iph = ip_hdr(skb); - if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) || + mtu = ip_skb_dst_mtu(skb); + if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || (IPCB(skb)->frag_max_size && - IPCB(skb)->frag_max_size > dst_mtu(&rt->dst)))) { + IPCB(skb)->frag_max_size > mtu))) { IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(ip_skb_dst_mtu(skb))); + htonl(mtu)); kfree_skb(skb); return -EMSGSIZE; } @@ -473,7 +515,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) */ hlen = iph->ihl * 4; - mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */ + mtu = mtu - hlen; /* Size of data space */ #ifdef CONFIG_BRIDGE_NETFILTER if (skb->nf_bridge) mtu -= nf_bridge_mtu_reduction(skb); @@ -823,12 +865,11 @@ static int __ip_append_data(struct sock *sk, fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; - maxnonfragsize = (inet->pmtudisc >= IP_PMTUDISC_DO) ? - mtu : 0xFFFF; + maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; if (cork->length + length > maxnonfragsize - fragheaderlen) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, - mtu-exthdrlen); + mtu - (opt ? opt->optlen : 0)); return -EMSGSIZE; } @@ -1147,11 +1188,11 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; - maxnonfragsize = (inet->pmtudisc >= IP_PMTUDISC_DO) ? - mtu : 0xFFFF; + maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; if (cork->length + size > maxnonfragsize - fragheaderlen) { - ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu); + ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, + mtu - (opt ? opt->optlen : 0)); return -EMSGSIZE; } @@ -1308,11 +1349,10 @@ struct sk_buff *__ip_make_skb(struct sock *sk, * to fragment the frame generated here. No matter, what transforms * how transforms change size of the packet, it will come out. */ - if (inet->pmtudisc < IP_PMTUDISC_DO) - skb->local_df = 1; + skb->ignore_df = ip_sk_ignore_df(sk); /* DF bit is set when we want to see DF on outgoing frames. - * If local_df is set too, we still allow to fragment this frame + * If ignore_df is set too, we still allow to fragment this frame * locally. 
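/*
 * Worked example (userspace, illustrative only) of the size limits used in
 * __ip_append_data()/ip_append_page() above: maxfraglen rounds the per
 * fragment payload space down to a multiple of 8, and maxnonfragsize is
 * either the path MTU or 64K-1 when the socket ignores DF.  The sample MTU
 * and option length are arbitrary.
 */
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	unsigned int mtu = 1500;
	unsigned int optlen = 12;			/* example IP option bytes            */
	unsigned int fragheaderlen = 20 + optlen;	/* sizeof(struct iphdr) + opt->optlen */
	unsigned int maxfraglen = ((mtu - fragheaderlen) & ~7u) + fragheaderlen;
	bool ignore_df = false;				/* stands in for ip_sk_ignore_df(sk)  */
	unsigned int maxnonfragsize = ignore_df ? 0xFFFF : mtu;

	printf("fragheaderlen  = %u\n", fragheaderlen);		/* 32   */
	printf("maxfraglen     = %u\n", maxfraglen);		/* 1496 */
	printf("maxnonfragsize = %u\n", maxnonfragsize);	/* 1500 */
	/* EMSGSIZE is now reported against mtu - optlen rather than mtu - exthdrlen */
	printf("reported mtu   = %u\n", mtu - optlen);		/* 1488 */
	return 0;
}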
*/ if (inet->pmtudisc == IP_PMTUDISC_DO || inet->pmtudisc == IP_PMTUDISC_PROBE || @@ -1338,7 +1378,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, iph->ttl = ttl; iph->protocol = sk->sk_protocol; ip_copy_addrs(iph, fl4); - ip_select_ident(skb, &rt->dst, sk); + ip_select_ident(skb, sk); if (opt) { iph->ihl += opt->optlen>>2; @@ -1505,7 +1545,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, daddr = replyopts.opt.opt.faddr; } - flowi4_init_output(&fl4, arg->bound_dev_if, 0, + flowi4_init_output(&fl4, arg->bound_dev_if, + IP4_REPLY_MARK(net, skb->mark), RT_TOS(arg->tos), RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, ip_reply_arg_flowi_flags(arg), @@ -1550,7 +1591,7 @@ void __init ip_init(void) ip_rt_init(); inet_initpeers(); -#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS) - igmp_mc_proc_init(); +#if defined(CONFIG_IP_MULTICAST) + igmp_mc_init(); #endif } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ddf32a6bc41..64741b93863 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -56,7 +56,6 @@ /* * SOL_IP control messages. */ -#define PKTINFO_SKB_CB(__skb) ((struct in_pktinfo *)((__skb)->cb)) static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) { @@ -187,7 +186,8 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) } EXPORT_SYMBOL(ip_cmsg_recv); -int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) +int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, + bool allow_ipv6) { int err, val; struct cmsghdr *cmsg; @@ -195,6 +195,22 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; +#if defined(CONFIG_IPV6) + if (allow_ipv6 && + cmsg->cmsg_level == SOL_IPV6 && + cmsg->cmsg_type == IPV6_PKTINFO) { + struct in6_pktinfo *src_info; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(*src_info))) + return -EINVAL; + src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); + if (!ipv6_addr_v4mapped(&src_info->ipi6_addr)) + return -EINVAL; + ipc->oif = src_info->ipi6_ifindex; + ipc->addr = src_info->ipi6_addr.s6_addr32[3]; + continue; + } +#endif if (cmsg->cmsg_level != SOL_IP) continue; switch (cmsg->cmsg_type) { @@ -390,7 +406,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct sock_exterr_skb *serr; struct sk_buff *skb, *skb2; - struct sockaddr_in *sin; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct { struct sock_extended_err ee; struct sockaddr_in offender; @@ -416,7 +432,6 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr = SKB_EXT_ERR(skb); - sin = (struct sockaddr_in *)msg->msg_name; if (sin) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + @@ -628,7 +643,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, inet->nodefrag = val ? 1 : 0; break; case IP_MTU_DISCOVER: - if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_INTERFACE) + if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) goto e_inval; inet->pmtudisc = val; break; @@ -1051,14 +1066,15 @@ e_inval: * * To support IP_CMSG_PKTINFO option, we store rt_iif and specific * destination in skb->cb[] before dst drop. - * This way, receiver doesnt make cache line misses to read rtable. + * This way, receiver doesn't make cache line misses to read rtable. 
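/*
 * Userspace illustration of the scenario the ip_cmsg_send() change above
 * appears to enable: a dual-stack UDP socket sending an IPv4 datagram can
 * attach an IPV6_PKTINFO ancillary message whose address is IPv4-mapped, and
 * the IPv4 code takes the interface index and source address from it.  This
 * is a sketch, not a tested program; the destination, source address and
 * interface index are example values supplied by the caller.
 */
#define _GNU_SOURCE
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/uio.h>

static ssize_t send_with_v4mapped_pktinfo(int fd, const struct sockaddr_in6 *dst,
					   const void *buf, size_t len,
					   int ifindex, const char *src4)
{
	union {
		char buf[CMSG_SPACE(sizeof(struct in6_pktinfo))];
		struct cmsghdr align;	/* forces proper alignment of the buffer */
	} u;
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = {
		.msg_name = (void *)dst, .msg_namelen = sizeof(*dst),
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg;
	struct in6_pktinfo *pi;

	memset(&u, 0, sizeof(u));
	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_IPV6;
	cmsg->cmsg_type = IPV6_PKTINFO;
	cmsg->cmsg_len = CMSG_LEN(sizeof(*pi));
	pi = (struct in6_pktinfo *)CMSG_DATA(cmsg);
	pi->ipi6_ifindex = ifindex;
	/* build the IPv4-mapped form ::ffff:a.b.c.d of the chosen source */
	pi->ipi6_addr.s6_addr[10] = 0xff;
	pi->ipi6_addr.s6_addr[11] = 0xff;
	inet_pton(AF_INET, src4, &pi->ipi6_addr.s6_addr[12]);

	return sendmsg(fd, &msg, 0);
}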
*/ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) { struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); + bool prepare = (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) || + ipv6_sk_rxinfo(sk); - if ((inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) && - skb_rtable(skb)) { + if (prepare && skb_rtable(skb)) { pktinfo->ipi_ifindex = inet_iif(skb); pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); } else { diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 90ff9570d7d..6f9de61dce5 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -40,6 +40,7 @@ #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/rculist.h> +#include <linux/err.h> #include <net/sock.h> #include <net/ip.h> @@ -61,57 +62,59 @@ #include <net/ip6_route.h> #endif -static unsigned int ip_tunnel_hash(struct ip_tunnel_net *itn, - __be32 key, __be32 remote) +static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) { return hash_32((__force u32)key ^ (__force u32)remote, IP_TNL_HASH_BITS); } -/* Often modified stats are per cpu, other are shared (netdev->stats) */ -struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *tot) +static void __tunnel_dst_set(struct ip_tunnel_dst *idst, + struct dst_entry *dst) { - int i; + struct dst_entry *old_dst; - for_each_possible_cpu(i) { - const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); - u64 rx_packets, rx_bytes, tx_packets, tx_bytes; - unsigned int start; - - do { - start = u64_stats_fetch_begin_bh(&tstats->syncp); - rx_packets = tstats->rx_packets; - tx_packets = tstats->tx_packets; - rx_bytes = tstats->rx_bytes; - tx_bytes = tstats->tx_bytes; - } while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); - - tot->rx_packets += rx_packets; - tot->tx_packets += tx_packets; - tot->rx_bytes += rx_bytes; - tot->tx_bytes += tx_bytes; - } + dst_clone(dst); + old_dst = xchg((__force struct dst_entry **)&idst->dst, dst); + dst_release(old_dst); +} - tot->multicast = dev->stats.multicast; +static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst) +{ + __tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst); +} - tot->rx_crc_errors = dev->stats.rx_crc_errors; - tot->rx_fifo_errors = dev->stats.rx_fifo_errors; - tot->rx_length_errors = dev->stats.rx_length_errors; - tot->rx_frame_errors = dev->stats.rx_frame_errors; - tot->rx_errors = dev->stats.rx_errors; +static void tunnel_dst_reset(struct ip_tunnel *t) +{ + tunnel_dst_set(t, NULL); +} - tot->tx_fifo_errors = dev->stats.tx_fifo_errors; - tot->tx_carrier_errors = dev->stats.tx_carrier_errors; - tot->tx_dropped = dev->stats.tx_dropped; - tot->tx_aborted_errors = dev->stats.tx_aborted_errors; - tot->tx_errors = dev->stats.tx_errors; +void ip_tunnel_dst_reset_all(struct ip_tunnel *t) +{ + int i; - tot->collisions = dev->stats.collisions; + for_each_possible_cpu(i) + __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL); +} +EXPORT_SYMBOL(ip_tunnel_dst_reset_all); - return tot; +static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie) +{ + struct dst_entry *dst; + + rcu_read_lock(); + dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst); + if (dst && !atomic_inc_not_zero(&dst->__refcnt)) + dst = NULL; + if (dst) { + if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + tunnel_dst_reset(t); + dst_release(dst); + dst = NULL; + } + } + rcu_read_unlock(); + return (struct rtable *)dst; } -EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, __be16 flags, 
__be32 key) @@ -146,7 +149,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, struct ip_tunnel *t, *cand = NULL; struct hlist_head *head; - hash = ip_tunnel_hash(itn, key, remote); + hash = ip_tunnel_hash(key, remote); head = &itn->tunnels[hash]; hlist_for_each_entry_rcu(t, head, hash_node) { @@ -166,6 +169,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, hlist_for_each_entry_rcu(t, head, hash_node) { if (remote != t->parms.iph.daddr || + t->parms.iph.saddr != 0 || !(t->dev->flags & IFF_UP)) continue; @@ -178,14 +182,15 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, cand = t; } - hash = ip_tunnel_hash(itn, key, 0); + hash = ip_tunnel_hash(key, 0); head = &itn->tunnels[hash]; hlist_for_each_entry_rcu(t, head, hash_node) { - if ((local != t->parms.iph.saddr && - (local != t->parms.iph.daddr || - !ipv4_is_multicast(local))) || - !(t->dev->flags & IFF_UP)) + if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) && + (local != t->parms.iph.daddr || !ipv4_is_multicast(local))) + continue; + + if (!(t->dev->flags & IFF_UP)) continue; if (!ip_tunnel_key_match(&t->parms, flags, key)) @@ -202,6 +207,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, hlist_for_each_entry_rcu(t, head, hash_node) { if (t->parms.i_key != key || + t->parms.iph.saddr != 0 || + t->parms.iph.daddr != 0 || !(t->dev->flags & IFF_UP)) continue; @@ -228,13 +235,17 @@ static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, { unsigned int h; __be32 remote; + __be32 i_key = parms->i_key; if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr)) remote = parms->iph.daddr; else remote = 0; - h = ip_tunnel_hash(itn, parms->i_key, remote); + if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI)) + i_key = 0; + + h = ip_tunnel_hash(i_key, remote); return &itn->tunnels[h]; } @@ -257,6 +268,7 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; __be32 key = parms->i_key; + __be16 flags = parms->i_flags; int link = parms->link; struct ip_tunnel *t = NULL; struct hlist_head *head = ip_bucket(itn, parms); @@ -264,9 +276,9 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, hlist_for_each_entry_rcu(t, head, hash_node) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && - key == t->parms.i_key && link == t->parms.link && - type == t->dev->type) + type == t->dev->type && + ip_tunnel_key_match(&t->parms, flags, key)) break; } return t; @@ -318,11 +330,10 @@ failed: return ERR_PTR(err); } -static inline struct rtable *ip_route_output_tunnel(struct net *net, - struct flowi4 *fl4, - int proto, - __be32 daddr, __be32 saddr, - __be32 key, __u8 tos, int oif) +static inline void init_tunnel_flow(struct flowi4 *fl4, + int proto, + __be32 daddr, __be32 saddr, + __be32 key, __u8 tos, int oif) { memset(fl4, 0, sizeof(*fl4)); fl4->flowi4_oif = oif; @@ -331,7 +342,6 @@ static inline struct rtable *ip_route_output_tunnel(struct net *net, fl4->flowi4_tos = tos; fl4->flowi4_proto = proto; fl4->fl4_gre_key = key; - return ip_route_output_key(net, fl4); } static int ip_tunnel_bind_dev(struct net_device *dev) @@ -350,14 +360,14 @@ static int ip_tunnel_bind_dev(struct net_device *dev) struct flowi4 fl4; struct rtable *rt; - rt = ip_route_output_tunnel(tunnel->net, &fl4, - tunnel->parms.iph.protocol, - iph->daddr, iph->saddr, - tunnel->parms.o_key, - RT_TOS(iph->tos), - tunnel->parms.link); + init_tunnel_flow(&fl4, iph->protocol, iph->daddr, + 
iph->saddr, tunnel->parms.o_key, + RT_TOS(iph->tos), tunnel->parms.link); + rt = ip_route_output_key(tunnel->net, &fl4); + if (!IS_ERR(rt)) { tdev = rt->dst.dev; + tunnel_dst_set(tunnel, &rt->dst); ip_rt_put(rt); } if (dev->type != ARPHRD_ETHER) @@ -386,14 +396,13 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, struct ip_tunnel_net *itn, struct ip_tunnel_parm *parms) { - struct ip_tunnel *nt, *fbt; + struct ip_tunnel *nt; struct net_device *dev; BUG_ON(!itn->fb_tunnel_dev); - fbt = netdev_priv(itn->fb_tunnel_dev); dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms); if (IS_ERR(dev)) - return NULL; + return ERR_CAST(dev); dev->mtu = ip_tunnel_bind_dev(dev); @@ -405,15 +414,12 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, bool log_ecn_error) { - struct pcpu_tstats *tstats; + struct pcpu_sw_netstats *tstats; const struct iphdr *iph = ip_hdr(skb); int err; #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { - /* Looped back packet, drop it! */ - if (rt_is_output_route(skb_rtable(skb))) - goto drop; tunnel->dev->stats.multicast++; skb->pkt_type = PACKET_BROADCAST; } @@ -436,6 +442,8 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, tunnel->i_seqno = ntohl(tpi->seq) + 1; } + skb_reset_network_header(skb); + err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { if (log_ecn_error) @@ -532,8 +540,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, unsigned int max_headroom; /* The extra header space needed */ __be32 dst; int err; + bool connected; inner_iph = (const struct iphdr *)skb_inner_network_header(skb); + connected = (tunnel->parms.iph.daddr != 0); dst = tnl_params->daddr; if (dst == 0) { @@ -581,27 +591,38 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, #endif else goto tx_error; + + connected = false; } tos = tnl_params->tos; if (tos & 0x1) { tos &= ~0x1; - if (skb->protocol == htons(ETH_P_IP)) + if (skb->protocol == htons(ETH_P_IP)) { tos = inner_iph->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) + connected = false; + } else if (skb->protocol == htons(ETH_P_IPV6)) { tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); + connected = false; + } } - rt = ip_route_output_tunnel(tunnel->net, &fl4, - protocol, - dst, tnl_params->saddr, - tunnel->parms.o_key, - RT_TOS(tos), - tunnel->parms.link); - if (IS_ERR(rt)) { - dev->stats.tx_carrier_errors++; - goto tx_error; + init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, + tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); + + rt = connected ? 
tunnel_rtable_get(tunnel, 0) : NULL; + + if (!rt) { + rt = ip_route_output_key(tunnel->net, &fl4); + + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error; + } + if (connected) + tunnel_dst_set(tunnel, &rt->dst); } + if (rt->dst.dev == dev) { ip_rt_put(rt); dev->stats.collisions++; @@ -618,6 +639,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); dst_link_failure(skb); } else tunnel->err_count = 0; @@ -646,12 +668,13 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, dev->needed_headroom = max_headroom; if (skb_cow_head(skb, dev->needed_headroom)) { + ip_rt_put(rt); dev->stats.tx_dropped++; - dev_kfree_skb(skb); + kfree_skb(skb); return; } - err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol, + err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); @@ -663,7 +686,7 @@ tx_error_icmp: #endif tx_error: dev->stats.tx_errors++; - dev_kfree_skb(skb); + kfree_skb(skb); } EXPORT_SYMBOL_GPL(ip_tunnel_xmit); @@ -696,25 +719,25 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, if (set_mtu) dev->mtu = mtu; } + ip_tunnel_dst_reset_all(t); netdev_state_change(dev); } int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { int err = 0; - struct ip_tunnel *t; - struct net *net = dev_net(dev); - struct ip_tunnel *tunnel = netdev_priv(dev); - struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); + struct ip_tunnel *t = netdev_priv(dev); + struct net *net = t->net; + struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id); BUG_ON(!itn->fb_tunnel_dev); switch (cmd) { case SIOCGETTUNNEL: - t = NULL; - if (dev == itn->fb_tunnel_dev) + if (dev == itn->fb_tunnel_dev) { t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); - if (t == NULL) - t = netdev_priv(dev); + if (t == NULL) + t = netdev_priv(dev); + } memcpy(p, &t->parms, sizeof(*p)); break; @@ -725,16 +748,20 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) goto done; if (p->iph.ttl) p->iph.frag_off |= htons(IP_DF); - if (!(p->i_flags&TUNNEL_KEY)) - p->i_key = 0; - if (!(p->o_flags&TUNNEL_KEY)) - p->o_key = 0; + if (!(p->i_flags & VTI_ISVTI)) { + if (!(p->i_flags & TUNNEL_KEY)) + p->i_key = 0; + if (!(p->o_flags & TUNNEL_KEY)) + p->o_key = 0; + } t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); - if (!t && (cmd == SIOCADDTUNNEL)) + if (!t && (cmd == SIOCADDTUNNEL)) { t = ip_tunnel_create(net, itn, p); - + err = PTR_ERR_OR_ZERO(t); + break; + } if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { if (t != NULL) { if (t->dev != dev) { @@ -761,8 +788,9 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) if (t) { err = 0; ip_tunnel_update(itn, t, dev, p, true); - } else - err = (cmd == SIOCADDTUNNEL ? 
-ENOBUFS : -ENOENT); + } else { + err = -ENOENT; + } break; case SIOCDELTUNNEL: @@ -811,6 +839,7 @@ static void ip_tunnel_dev_free(struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); gro_cells_destroy(&tunnel->gro_cells); + free_percpu(tunnel->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -855,11 +884,12 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, */ if (!IS_ERR(itn->fb_tunnel_dev)) { itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; + itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); } rtnl_unlock(); - return PTR_RET(itn->fb_tunnel_dev); + return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev); } EXPORT_SYMBOL_GPL(ip_tunnel_init_net); @@ -976,21 +1006,22 @@ int ip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; - int i, err; + int err; dev->destructor = ip_tunnel_dev_free; - dev->tstats = alloc_percpu(struct pcpu_tstats); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; - for_each_possible_cpu(i) { - struct pcpu_tstats *ipt_stats; - ipt_stats = per_cpu_ptr(dev->tstats, i); - u64_stats_init(&ipt_stats->syncp); + tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); + if (!tunnel->dst_cache) { + free_percpu(dev->tstats); + return -ENOMEM; } err = gro_cells_init(&tunnel->gro_cells, dev); if (err) { + free_percpu(tunnel->dst_cache); free_percpu(dev->tstats); return err; } @@ -1015,6 +1046,8 @@ void ip_tunnel_uninit(struct net_device *dev) /* fb_tunnel_dev will be unregisted in net-exit call. */ if (itn->fb_tunnel_dev != dev) ip_tunnel_del(netdev_priv(dev)); + + ip_tunnel_dst_reset_all(tunnel); } EXPORT_SYMBOL_GPL(ip_tunnel_uninit); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 42ffbc8d65c..f4c987bb7e9 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -46,7 +46,7 @@ #include <net/netns/generic.h> #include <net/rtnetlink.h> -int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, +int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, __be16 df, bool xnet) { @@ -56,7 +56,7 @@ int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, skb_scrub_packet(skb, xnet); - skb->rxhash = 0; + skb_clear_hash(skb); skb_dst_set(skb, &rt->dst); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); @@ -74,9 +74,9 @@ int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, iph->daddr = dst; iph->saddr = src; iph->ttl = ttl; - __ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1); + __ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1); - err = ip_local_out(skb); + err = ip_local_out_sk(sk, skb); if (unlikely(net_xmit_eval(err))) pkt_len = 0; return pkt_len; @@ -107,8 +107,7 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) nf_reset(skb); secpath_reset(skb); - if (!skb->l4_rxhash) - skb->rxhash = 0; + skb_clear_hash_if_not_l4(skb); skb_dst_drop(skb); skb->vlan_tci = 0; skb_set_queue_mapping(skb, 0); @@ -136,6 +135,14 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, return skb; } + /* If packet is not gso and we are resolving any partial checksum, + * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL + * on the outer header without confusing devices that implement + * NETIF_F_IP_CSUM with encapsulation. 
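/*
 * Illustrative sketch of the offload handling order iptunnel_handle_offloads()
 * ends up with around this point: GSO packets are left for later segmentation,
 * while non-GSO packets that still carry a partial checksum get it resolved in
 * software, and the encapsulation flag is cleared first so a device that only
 * advertises plain NETIF_F_IP_CSUM is not confused by an "encapsulated"
 * CHECKSUM_PARTIAL skb.  Userspace model; the enum values and struct are
 * stand-ins for the kernel's skb fields.
 */
#include <stdbool.h>
#include <stdio.h>

enum model_csum { MODEL_CHECKSUM_NONE, MODEL_CHECKSUM_PARTIAL };

struct model_skb {
	bool gso;
	bool encapsulation;
	enum model_csum ip_summed;
};

static void handle_offloads(struct model_skb *skb, bool csum_help)
{
	if (skb->gso)
		return;		/* handled by the GSO path instead */

	if (csum_help)
		skb->encapsulation = false;	/* keep NETIF_F_IP_CSUM devices happy */

	if (skb->ip_summed == MODEL_CHECKSUM_PARTIAL && csum_help)
		skb->ip_summed = MODEL_CHECKSUM_NONE;	/* checksum resolved in software */
}

int main(void)
{
	struct model_skb skb = { .gso = false, .encapsulation = true,
				 .ip_summed = MODEL_CHECKSUM_PARTIAL };

	handle_offloads(&skb, true);
	printf("encapsulation=%d ip_summed=%d\n", skb.encapsulation, skb.ip_summed);
	return 0;
}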
+ */ + if (csum_help) + skb->encapsulation = 0; + if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) { err = skb_checksum_help(skb); if (unlikely(err)) @@ -149,3 +156,49 @@ error: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(iptunnel_handle_offloads); + +/* Often modified stats are per cpu, other are shared (netdev->stats) */ +struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *tot) +{ + int i; + + for_each_possible_cpu(i) { + const struct pcpu_sw_netstats *tstats = + per_cpu_ptr(dev->tstats, i); + u64 rx_packets, rx_bytes, tx_packets, tx_bytes; + unsigned int start; + + do { + start = u64_stats_fetch_begin_irq(&tstats->syncp); + rx_packets = tstats->rx_packets; + tx_packets = tstats->tx_packets; + rx_bytes = tstats->rx_bytes; + tx_bytes = tstats->tx_bytes; + } while (u64_stats_fetch_retry_irq(&tstats->syncp, start)); + + tot->rx_packets += rx_packets; + tot->tx_packets += tx_packets; + tot->rx_bytes += rx_bytes; + tot->tx_bytes += tx_bytes; + } + + tot->multicast = dev->stats.multicast; + + tot->rx_crc_errors = dev->stats.rx_crc_errors; + tot->rx_fifo_errors = dev->stats.rx_fifo_errors; + tot->rx_length_errors = dev->stats.rx_length_errors; + tot->rx_frame_errors = dev->stats.rx_frame_errors; + tot->rx_errors = dev->stats.rx_errors; + + tot->tx_fifo_errors = dev->stats.tx_fifo_errors; + tot->tx_carrier_errors = dev->stats.tx_carrier_errors; + tot->tx_dropped = dev->stats.tx_dropped; + tot->tx_aborted_errors = dev->stats.tx_aborted_errors; + tot->tx_errors = dev->stats.tx_errors; + + tot->collisions = dev->stats.collisions; + + return tot; +} +EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 52b802a0cd8..b8960f3527f 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -34,6 +34,7 @@ #include <linux/init.h> #include <linux/netfilter_ipv4.h> #include <linux/if_ether.h> +#include <linux/icmpv6.h> #include <net/sock.h> #include <net/ip.h> @@ -49,8 +50,8 @@ static struct rtnl_link_ops vti_link_ops __read_mostly; static int vti_net_id __read_mostly; static int vti_tunnel_init(struct net_device *dev); -/* We dont digest the packet therefore let the packet pass */ -static int vti_rcv(struct sk_buff *skb) +static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) { struct ip_tunnel *tunnel; const struct iphdr *iph = ip_hdr(skb); @@ -60,79 +61,120 @@ static int vti_rcv(struct sk_buff *skb) tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, iph->saddr, iph->daddr, 0); if (tunnel != NULL) { - struct pcpu_tstats *tstats; - u32 oldmark = skb->mark; - int ret; - - - /* temporarily mark the skb with the tunnel o_key, to - * only match policies with this mark. 
- */ - skb->mark = be32_to_cpu(tunnel->parms.o_key); - ret = xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb); - skb->mark = oldmark; - if (!ret) - return -1; - - tstats = this_cpu_ptr(tunnel->dev->tstats); - u64_stats_update_begin(&tstats->syncp); - tstats->rx_packets++; - tstats->rx_bytes += skb->len; - u64_stats_update_end(&tstats->syncp); - - secpath_reset(skb); - skb->dev = tunnel->dev; + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto drop; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; + skb->mark = be32_to_cpu(tunnel->parms.i_key); + + return xfrm_input(skb, nexthdr, spi, encap_type); + } + + return -EINVAL; +drop: + kfree_skb(skb); + return 0; +} + +static int vti_rcv(struct sk_buff *skb) +{ + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + + return vti_input(skb, ip_hdr(skb)->protocol, 0, 0); +} + +static int vti_rcv_cb(struct sk_buff *skb, int err) +{ + unsigned short family; + struct net_device *dev; + struct pcpu_sw_netstats *tstats; + struct xfrm_state *x; + struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; + + if (!tunnel) return 1; + + dev = tunnel->dev; + + if (err) { + dev->stats.rx_errors++; + dev->stats.rx_dropped++; + + return 0; } - return -1; + x = xfrm_input_state(skb); + family = x->inner_mode->afinfo->family; + + if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + return -EPERM; + + skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev))); + skb->dev = dev; + + tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); + + return 0; } -/* This function assumes it is being called from dev_queue_xmit() - * and that skb is filled properly by that function. - */ +static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src) +{ + xfrm_address_t *daddr = (xfrm_address_t *)&dst; + xfrm_address_t *saddr = (xfrm_address_t *)&src; -static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) + /* if there is no transform then this tunnel is not functional. + * Or if the xfrm is not mode tunnel. 
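/*
 * Userspace model of the vti_state_check() test introduced below: a VTI
 * tunnel only transmits if the transform bound to the route is an IPv4
 * tunnel-mode state and, when the tunnel has a fixed remote endpoint, the
 * state's endpoint addresses match the tunnel's.  The struct here is a
 * stand-in for struct xfrm_state, not the real layout.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct model_xfrm_state {
	bool tunnel_mode;	/* x->props.mode == XFRM_MODE_TUNNEL */
	bool ipv4;		/* x->props.family == AF_INET        */
	uint32_t saddr, daddr;	/* outer addresses of the state      */
};

static bool vti_state_ok(const struct model_xfrm_state *x,
			 uint32_t tnl_daddr, uint32_t tnl_saddr)
{
	if (!x || !x->tunnel_mode || !x->ipv4)
		return false;

	/* wildcard remote endpoint: only the local address must match */
	if (!tnl_daddr)
		return x->saddr == tnl_saddr;

	return x->daddr == tnl_daddr && x->saddr == tnl_saddr;
}

int main(void)
{
	struct model_xfrm_state x = { .tunnel_mode = true, .ipv4 = true,
				      .saddr = 0x0a000001, .daddr = 0x0a000002 };

	printf("matching endpoints : %d\n", vti_state_ok(&x, 0x0a000002, 0x0a000001));
	printf("wildcard remote    : %d\n", vti_state_ok(&x, 0, 0x0a000001));
	printf("wrong remote       : %d\n", vti_state_ok(&x, 0x0a000003, 0x0a000001));
	return 0;
}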
+ */ + if (!x || x->props.mode != XFRM_MODE_TUNNEL || + x->props.family != AF_INET) + return false; + + if (!dst) + return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET); + + if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET)) + return false; + + return true; +} + +static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, + struct flowi *fl) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct iphdr *tiph = &tunnel->parms.iph; - u8 tos; - struct rtable *rt; /* Route to the other host */ + struct ip_tunnel_parm *parms = &tunnel->parms; + struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ - struct iphdr *old_iph = ip_hdr(skb); - __be32 dst = tiph->daddr; - struct flowi4 fl4; int err; - if (skb->protocol != htons(ETH_P_IP)) - goto tx_error; - - tos = old_iph->tos; + if (!dst) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } - memset(&fl4, 0, sizeof(fl4)); - flowi4_init_output(&fl4, tunnel->parms.link, - be32_to_cpu(tunnel->parms.o_key), RT_TOS(tos), - RT_SCOPE_UNIVERSE, - IPPROTO_IPIP, 0, - dst, tiph->saddr, 0, 0); - rt = ip_route_output_key(dev_net(dev), &fl4); - if (IS_ERR(rt)) { + dst_hold(dst); + dst = xfrm_lookup(tunnel->net, dst, fl, NULL, 0); + if (IS_ERR(dst)) { dev->stats.tx_carrier_errors++; goto tx_error_icmp; } - /* if there is no transform then this tunnel is not functional. - * Or if the xfrm is not mode tunnel. - */ - if (!rt->dst.xfrm || - rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) { + + if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) { dev->stats.tx_carrier_errors++; - ip_rt_put(rt); + dst_release(dst); goto tx_error_icmp; } - tdev = rt->dst.dev; + + tdev = dst->dev; if (tdev == dev) { - ip_rt_put(rt); + dst_release(dst); dev->stats.collisions++; goto tx_error; } @@ -146,10 +188,8 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) tunnel->err_count = 0; } - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - nf_reset(skb); + skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); + skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; err = dst_output(skb); @@ -162,10 +202,102 @@ tx_error_icmp: dst_link_failure(skb); tx_error: dev->stats.tx_errors++; - dev_kfree_skb(skb); + kfree_skb(skb); return NETDEV_TX_OK; } +/* This function assumes it is being called from dev_queue_xmit() + * and that skb is filled properly by that function. 
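/*
 * Sketch (userspace, simplified) of the dispatch the new vti_tunnel_xmit()
 * performs before handing off to vti_xmit(): the skb is marked with the
 * tunnel output key so policy and state lookups can match on it, the flow is
 * decoded per address family, and anything that is neither IPv4 nor IPv6 is
 * dropped.  The ETH_P_* values are the real ones; everything else is a
 * stand-in for the example.
 */
#include <stdint.h>
#include <stdio.h>

#define MODEL_ETH_P_IP   0x0800
#define MODEL_ETH_P_IPV6 0x86DD

enum xmit_verdict { XMIT_DECODE_V4, XMIT_DECODE_V6, XMIT_DROP };

static enum xmit_verdict vti_xmit_dispatch(uint16_t protocol, uint32_t o_key,
					   uint32_t *mark)
{
	*mark = o_key;		/* skb->mark = be32_to_cpu(tunnel->parms.o_key) */

	switch (protocol) {
	case MODEL_ETH_P_IP:
		return XMIT_DECODE_V4;	/* xfrm_decode_session(skb, &fl, AF_INET)  */
	case MODEL_ETH_P_IPV6:
		return XMIT_DECODE_V6;	/* xfrm_decode_session(skb, &fl, AF_INET6) */
	default:
		return XMIT_DROP;	/* counted as tx_errors, skb freed */
	}
}

int main(void)
{
	uint32_t mark;

	printf("IPv4  -> %d\n", vti_xmit_dispatch(MODEL_ETH_P_IP, 42, &mark));
	printf("IPv6  -> %d\n", vti_xmit_dispatch(MODEL_ETH_P_IPV6, 42, &mark));
	printf("other -> %d\n", vti_xmit_dispatch(0x0806, 42, &mark));
	return 0;
}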
+ */ +static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + + skb->mark = be32_to_cpu(tunnel->parms.o_key); + + switch (skb->protocol) { + case htons(ETH_P_IP): + xfrm_decode_session(skb, &fl, AF_INET); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + break; + case htons(ETH_P_IPV6): + xfrm_decode_session(skb, &fl, AF_INET6); + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + break; + default: + dev->stats.tx_errors++; + dev_kfree_skb(skb); + return NETDEV_TX_OK; + } + + return vti_xmit(skb, dev, &fl); +} + +static int vti4_err(struct sk_buff *skb, u32 info) +{ + __be32 spi; + __u32 mark; + struct xfrm_state *x; + struct ip_tunnel *tunnel; + struct ip_esp_hdr *esph; + struct ip_auth_hdr *ah ; + struct ip_comp_hdr *ipch; + struct net *net = dev_net(skb->dev); + const struct iphdr *iph = (const struct iphdr *)skb->data; + int protocol = iph->protocol; + struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + iph->daddr, iph->saddr, 0); + if (!tunnel) + return -1; + + mark = be32_to_cpu(tunnel->parms.o_key); + + switch (protocol) { + case IPPROTO_ESP: + esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); + spi = esph->spi; + break; + case IPPROTO_AH: + ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); + spi = ah->spi; + break; + case IPPROTO_COMP: + ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); + spi = htonl(ntohs(ipch->cpi)); + break; + default: + return 0; + } + + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return 0; + case ICMP_REDIRECT: + break; + default: + return 0; + } + + x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr, + spi, protocol, AF_INET); + if (!x) + return 0; + + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) + ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0); + else + ipv4_redirect(skb, net, 0, 0, protocol, 0); + xfrm_state_put(x); + + return 0; +} + static int vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { @@ -181,12 +313,19 @@ vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EINVAL; } + if (!(p.i_flags & GRE_KEY)) + p.i_key = 0; + if (!(p.o_flags & GRE_KEY)) + p.o_key = 0; + + p.i_flags = VTI_ISVTI; + err = ip_tunnel_ioctl(dev, &p, cmd); if (err) return err; if (cmd != SIOCDELTUNNEL) { - p.i_flags |= GRE_KEY | VTI_ISVTI; + p.i_flags |= GRE_KEY; p.o_flags |= GRE_KEY; } @@ -207,6 +346,7 @@ static const struct net_device_ops vti_netdev_ops = { static void vti_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &vti_netdev_ops; + dev->type = ARPHRD_TUNNEL; ip_tunnel_setup(dev, vti_net_id); } @@ -218,13 +358,11 @@ static int vti_tunnel_init(struct net_device *dev) memcpy(dev->dev_addr, &iph->saddr, 4); memcpy(dev->broadcast, &iph->daddr, 4); - dev->type = ARPHRD_TUNNEL; dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); dev->mtu = ETH_DATA_LEN; dev->flags = IFF_NOARP; dev->iflink = 0; dev->addr_len = 4; - dev->features |= NETIF_F_NETNS_LOCAL; dev->features |= NETIF_F_LLTX; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; @@ -241,9 +379,28 @@ static void __net_init vti_fb_tunnel_init(struct net_device *dev) iph->ihl = 5; } -static struct xfrm_tunnel_notifier vti_handler __read_mostly = { +static struct xfrm4_protocol vti_esp4_protocol __read_mostly = { .handler = vti_rcv, - .priority = 1, + .input_handler = vti_input, + 
.cb_handler = vti_rcv_cb, + .err_handler = vti4_err, + .priority = 100, +}; + +static struct xfrm4_protocol vti_ah4_protocol __read_mostly = { + .handler = vti_rcv, + .input_handler = vti_input, + .cb_handler = vti_rcv_cb, + .err_handler = vti4_err, + .priority = 100, +}; + +static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { + .handler = vti_rcv, + .input_handler = vti_input, + .cb_handler = vti_rcv_cb, + .err_handler = vti4_err, + .priority = 100, }; static int __net_init vti_init_net(struct net *net) @@ -287,6 +444,8 @@ static void vti_netlink_parms(struct nlattr *data[], if (!data) return; + parms->i_flags = VTI_ISVTI; + if (data[IFLA_VTI_LINK]) parms->link = nla_get_u32(data[IFLA_VTI_LINK]); @@ -382,10 +541,31 @@ static int __init vti_init(void) err = register_pernet_device(&vti_net_ops); if (err < 0) return err; - err = xfrm4_mode_tunnel_input_register(&vti_handler); + err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP); if (err < 0) { unregister_pernet_device(&vti_net_ops); pr_info("vti init: can't register tunnel\n"); + + return err; + } + + err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH); + if (err < 0) { + xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); + unregister_pernet_device(&vti_net_ops); + pr_info("vti init: can't register tunnel\n"); + + return err; + } + + err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP); + if (err < 0) { + xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); + xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); + unregister_pernet_device(&vti_net_ops); + pr_info("vti init: can't register tunnel\n"); + + return err; } err = rtnl_link_register(&vti_link_ops); @@ -395,7 +575,9 @@ static int __init vti_init(void) return err; rtnl_link_failed: - xfrm4_mode_tunnel_input_deregister(&vti_handler); + xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); + xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); + xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); unregister_pernet_device(&vti_net_ops); return err; } @@ -403,8 +585,13 @@ rtnl_link_failed: static void __exit vti_fini(void) { rtnl_link_unregister(&vti_link_ops); - if (xfrm4_mode_tunnel_input_deregister(&vti_handler)) + if (xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP)) pr_info("vti close: can't deregister tunnel\n"); + if (xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH)) + pr_info("vti close: can't deregister tunnel\n"); + if (xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP)) + pr_info("vti close: can't deregister tunnel\n"); + unregister_pernet_device(&vti_net_ops); } diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 826be4cb482..c0855d50a3f 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -23,7 +23,7 @@ #include <net/protocol.h> #include <net/sock.h> -static void ipcomp4_err(struct sk_buff *skb, u32 info) +static int ipcomp4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); __be32 spi; @@ -34,24 +34,26 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) - return; + return 0; case ICMP_REDIRECT: break; default: - return; + return 0; } spi = htonl(ntohs(ipch->cpi)); x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET); if (!x) - return; + return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0); else 
ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0); xfrm_state_put(x); + + return 0; } /* We always hold one tunnel user reference to indicate a tunnel */ @@ -147,6 +149,11 @@ out: return err; } +static int ipcomp4_rcv_cb(struct sk_buff *skb, int err) +{ + return 0; +} + static const struct xfrm_type ipcomp_type = { .description = "IPCOMP4", .owner = THIS_MODULE, @@ -157,11 +164,12 @@ static const struct xfrm_type ipcomp_type = { .output = ipcomp_output }; -static const struct net_protocol ipcomp4_protocol = { +static struct xfrm4_protocol ipcomp4_protocol = { .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = ipcomp4_rcv_cb, .err_handler = ipcomp4_err, - .no_policy = 1, - .netns_ok = 1, + .priority = 0, }; static int __init ipcomp4_init(void) @@ -170,7 +178,7 @@ static int __init ipcomp4_init(void) pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } - if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) { + if (xfrm4_protocol_register(&ipcomp4_protocol, IPPROTO_COMP) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ipcomp_type, AF_INET); return -EAGAIN; @@ -180,7 +188,7 @@ static int __init ipcomp4_init(void) static void __exit ipcomp4_fini(void) { - if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) + if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0) pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0) pr_info("%s: can't remove xfrm type\n", __func__); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index efa1138fa52..b3e86ea7b71 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -273,7 +273,7 @@ static int __init ic_open_devs(void) msleep(1); - if time_before(jiffies, next_msg) + if (time_before(jiffies, next_msg)) continue; elapsed = jiffies_to_msecs(jiffies - start); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index fe3e9f7f1f0..62eaa005e14 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -149,13 +149,13 @@ static int ipip_err(struct sk_buff *skb, u32 info) if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, dev_net(skb->dev), info, - t->dev->ifindex, 0, IPPROTO_IPIP, 0); + t->parms.link, 0, IPPROTO_IPIP, 0); err = 0; goto out; } if (type == ICMP_REDIRECT) { - ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0, + ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, IPPROTO_IPIP, 0); err = 0; goto out; @@ -228,7 +228,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; tx_error: - dev_kfree_skb(skb); + kfree_skb(skb); out: dev->stats.tx_errors++; return NETDEV_TX_OK; @@ -486,4 +486,5 @@ static void __exit ipip_fini(void) module_init(ipip_init); module_exit(ipip_fini); MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("ipip"); MODULE_ALIAS_NETDEV("tunl0"); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 421a24934ff..65bcaa78904 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -157,9 +157,12 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id) static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { - struct ipmr_result res; - struct fib_lookup_arg arg = { .result = &res, }; int err; + struct ipmr_result res; + struct fib_lookup_arg arg = { + .result = &res, + .flags = FIB_LOOKUP_NOREF, + }; err = fib_rules_lookup(net->ipv4.mr_rules_ops, flowi4_to_flowi(flp4), 0, &arg); @@ -452,7 +455,7 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) struct mr_table 
*mrt; struct flowi4 fl4 = { .flowi4_oif = dev->ifindex, - .flowi4_iif = skb->skb_iif, + .flowi4_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi4_mark = skb->mark, }; int err; @@ -481,7 +484,7 @@ static void reg_vif_setup(struct net_device *dev) dev->type = ARPHRD_PIMREG; dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8; dev->flags = IFF_NOARP; - dev->netdev_ops = ®_vif_netdev_ops, + dev->netdev_ops = ®_vif_netdev_ops; dev->destructor = free_netdev; dev->features |= NETIF_F_NETNS_LOCAL; } @@ -1660,7 +1663,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) iph->protocol = IPPROTO_IPIP; iph->ihl = 5; iph->tot_len = htons(skb->len); - ip_select_ident(skb, skb_dst(skb), NULL); + ip_select_ident(skb, NULL); ip_send_check(iph); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -2252,13 +2255,14 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, } static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, - u32 portid, u32 seq, struct mfc_cache *c, int cmd) + u32 portid, u32 seq, struct mfc_cache *c, int cmd, + int flags) { struct nlmsghdr *nlh; struct rtmsg *rtm; int err; - nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), NLM_F_MULTI); + nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags); if (nlh == NULL) return -EMSGSIZE; @@ -2326,7 +2330,7 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, if (skb == NULL) goto errout; - err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd); + err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0); if (err < 0) goto errout; @@ -2365,7 +2369,8 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) if (ipmr_fill_mroute(mrt, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - mfc, RTM_NEWROUTE) < 0) + mfc, RTM_NEWROUTE, + NLM_F_MULTI) < 0) goto done; next_entry: e++; @@ -2379,7 +2384,8 @@ next_entry: if (ipmr_fill_mroute(mrt, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - mfc, RTM_NEWROUTE) < 0) { + mfc, RTM_NEWROUTE, + NLM_F_MULTI) < 0) { spin_unlock_bh(&mfc_unres_lock); goto done; } diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index c3e0adea9c2..7ebd6e37875 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -61,7 +61,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type) skb_dst_set(skb, NULL); dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0); if (IS_ERR(dst)) - return PTR_ERR(dst);; + return PTR_ERR(dst); skb_dst_set(skb, dst); } #endif diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 40d56073cd1..a26ce035e3f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -39,23 +39,38 @@ config NF_CONNTRACK_PROC_COMPAT config NF_TABLES_IPV4 depends on NF_TABLES tristate "IPv4 nf_tables support" - -config NFT_REJECT_IPV4 - depends on NF_TABLES_IPV4 - tristate "nf_tables IPv4 reject support" + help + This option enables the IPv4 support for nf_tables. config NFT_CHAIN_ROUTE_IPV4 depends on NF_TABLES_IPV4 tristate "IPv4 nf_tables route chain support" + help + This option enables the "route" chain for IPv4 in nf_tables. This + chain type is used to force packet re-routing after mangling header + fields such as the source, destination, type of service and + the packet mark. config NFT_CHAIN_NAT_IPV4 depends on NF_TABLES_IPV4 depends on NF_NAT_IPV4 && NFT_NAT tristate "IPv4 nf_tables nat chain support" + help + This option enables the "nat" chain for IPv4 in nf_tables. 
This + chain type is used to perform Network Address Translation (NAT) + packet transformations such as the source, destination address and + source and destination ports. + +config NFT_REJECT_IPV4 + depends on NF_TABLES_IPV4 + default NFT_REJECT + tristate config NF_TABLES_ARP depends on NF_TABLES tristate "ARP nf_tables support" + help + This option enables the ARP support for nf_tables. config IP_NF_IPTABLES tristate "IP tables support (required for filtering/masq/NAT)" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 19df72b7ba8..90b82405331 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -28,9 +28,9 @@ obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o -obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o +obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o # generic IP tables diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 59da7cde072..f95b6f93814 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1044,8 +1044,10 @@ static int __do_replace(struct net *net, const char *name, xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, - sizeof(struct xt_counters) * num_counters) != 0) - ret = -EFAULT; + sizeof(struct xt_counters) * num_counters) != 0) { + /* Silent error, can't fail, new table is already in place */ + net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n"); + } vfree(counters); xt_table_unlock(t); return ret; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 718dfbd30cb..99e810f8467 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1231,8 +1231,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, - sizeof(struct xt_counters) * num_counters) != 0) - ret = -EFAULT; + sizeof(struct xt_counters) * num_counters) != 0) { + /* Silent error, can't fail, new table is already in place */ + net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n"); + } vfree(counters); xt_table_unlock(t); return ret; diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index b969131ad1c..5b6e0df4ccf 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -17,10 +17,6 @@ #include <linux/udp.h> #include <linux/icmp.h> #include <net/icmp.h> -#include <net/ip.h> -#include <net/tcp.h> -#include <net/route.h> -#include <net/dst.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_REJECT.h> @@ -28,128 +24,12 @@ #include <linux/netfilter_bridge.h> #endif +#include <net/netfilter/ipv4/nf_reject.h> + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv4"); -/* Send RST reply */ -static void send_reset(struct sk_buff *oldskb, int hook) -{ - struct sk_buff *nskb; - const struct iphdr *oiph; - struct iphdr *niph; - const struct tcphdr *oth; - struct tcphdr _otcph, *tcph; - - /* IP header checks: fragment. 
*/ - if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) - return; - - oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb), - sizeof(_otcph), &_otcph); - if (oth == NULL) - return; - - /* No RST for RST. */ - if (oth->rst) - return; - - if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) - return; - - /* Check checksum */ - if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) - return; - oiph = ip_hdr(oldskb); - - nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + - LL_MAX_HEADER, GFP_ATOMIC); - if (!nskb) - return; - - skb_reserve(nskb, LL_MAX_HEADER); - - skb_reset_network_header(nskb); - niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); - niph->version = 4; - niph->ihl = sizeof(struct iphdr) / 4; - niph->tos = 0; - niph->id = 0; - niph->frag_off = htons(IP_DF); - niph->protocol = IPPROTO_TCP; - niph->check = 0; - niph->saddr = oiph->daddr; - niph->daddr = oiph->saddr; - - skb_reset_transport_header(nskb); - tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); - memset(tcph, 0, sizeof(*tcph)); - tcph->source = oth->dest; - tcph->dest = oth->source; - tcph->doff = sizeof(struct tcphdr) / 4; - - if (oth->ack) - tcph->seq = oth->ack_seq; - else { - tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + - oldskb->len - ip_hdrlen(oldskb) - - (oth->doff << 2)); - tcph->ack = 1; - } - - tcph->rst = 1; - tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr, - niph->daddr, 0); - nskb->ip_summed = CHECKSUM_PARTIAL; - nskb->csum_start = (unsigned char *)tcph - nskb->head; - nskb->csum_offset = offsetof(struct tcphdr, check); - - /* ip_route_me_harder expects skb->dst to be set */ - skb_dst_set_noref(nskb, skb_dst(oldskb)); - - nskb->protocol = htons(ETH_P_IP); - if (ip_route_me_harder(nskb, RTN_UNSPEC)) - goto free_nskb; - - niph->ttl = ip4_dst_hoplimit(skb_dst(nskb)); - - /* "Never happens" */ - if (nskb->len > dst_mtu(skb_dst(nskb))) - goto free_nskb; - - nf_ct_attach(nskb, oldskb); - -#ifdef CONFIG_BRIDGE_NETFILTER - /* If we use ip_local_out for bridged traffic, the MAC source on - * the RST will be ours, instead of the destination's. This confuses - * some routers/firewalls, and they drop the packet. So we need to - * build the eth header using the original destination's MAC as the - * source, and send the RST packet directly. 
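/*
 * The send_reset() body removed here moves behind the shared nf_send_reset()
 * helper that the target now calls; the interesting part is how the RST's
 * sequence numbers are chosen, sketched below as plain host-order arithmetic.
 * If the offending segment carried an ACK, the RST simply reuses that
 * acknowledgment number as its own sequence number; otherwise the RST sets
 * the ACK flag and acknowledges everything the segment occupied in sequence
 * space (SYN and FIN each count for one).  Userspace illustration only.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct seen_segment {
	uint32_t seq, ack_seq;
	bool syn, fin, ack;
	uint32_t payload_len;	/* skb->len - ip_hdrlen - (doff << 2) */
};

static void build_rst_numbers(const struct seen_segment *in,
			      uint32_t *rst_seq, uint32_t *rst_ack,
			      bool *rst_has_ack)
{
	if (in->ack) {
		*rst_seq = in->ack_seq;
		*rst_ack = 0;
		*rst_has_ack = false;
	} else {
		*rst_seq = 0;
		*rst_ack = in->seq + in->syn + in->fin + in->payload_len;
		*rst_has_ack = true;
	}
}

int main(void)
{
	struct seen_segment syn = { .seq = 1000, .syn = true };
	uint32_t seq, ack;
	bool has_ack;

	build_rst_numbers(&syn, &seq, &ack, &has_ack);
	printf("RST for a bare SYN: seq=%u ack=%u ack-flag=%d\n", seq, ack, has_ack);
	return 0;
}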
- */ - if (oldskb->nf_bridge) { - struct ethhdr *oeth = eth_hdr(oldskb); - nskb->dev = oldskb->nf_bridge->physindev; - niph->tot_len = htons(nskb->len); - ip_send_check(niph); - if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), - oeth->h_source, oeth->h_dest, nskb->len) < 0) - goto free_nskb; - dev_queue_xmit(nskb); - } else -#endif - ip_local_out(nskb); - - return; - - free_nskb: - kfree_skb(nskb); -} - -static inline void send_unreach(struct sk_buff *skb_in, int code) -{ - icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); -} - static unsigned int reject_tg(struct sk_buff *skb, const struct xt_action_param *par) { @@ -157,28 +37,28 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par) switch (reject->with) { case IPT_ICMP_NET_UNREACHABLE: - send_unreach(skb, ICMP_NET_UNREACH); + nf_send_unreach(skb, ICMP_NET_UNREACH); break; case IPT_ICMP_HOST_UNREACHABLE: - send_unreach(skb, ICMP_HOST_UNREACH); + nf_send_unreach(skb, ICMP_HOST_UNREACH); break; case IPT_ICMP_PROT_UNREACHABLE: - send_unreach(skb, ICMP_PROT_UNREACH); + nf_send_unreach(skb, ICMP_PROT_UNREACH); break; case IPT_ICMP_PORT_UNREACHABLE: - send_unreach(skb, ICMP_PORT_UNREACH); + nf_send_unreach(skb, ICMP_PORT_UNREACH); break; case IPT_ICMP_NET_PROHIBITED: - send_unreach(skb, ICMP_NET_ANO); + nf_send_unreach(skb, ICMP_NET_ANO); break; case IPT_ICMP_HOST_PROHIBITED: - send_unreach(skb, ICMP_HOST_ANO); + nf_send_unreach(skb, ICMP_HOST_ANO); break; case IPT_ICMP_ADMIN_PROHIBITED: - send_unreach(skb, ICMP_PKT_FILTERED); + nf_send_unreach(skb, ICMP_PKT_FILTERED); break; case IPT_TCP_RESET: - send_reset(skb, par->hooknum); + nf_send_reset(skb, par->hooknum); case IPT_ICMP_ECHOREPLY: /* Doesn't happen. */ break; diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index f13bd91d9a5..a313c3fbeb4 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -423,6 +423,7 @@ static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) static struct xt_target synproxy_tg4_reg __read_mostly = { .name = "SYNPROXY", .family = NFPROTO_IPV4, + .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD), .target = synproxy_tg4, .targetsize = sizeof(struct xt_synproxy_info), .checkentry = synproxy_tg4_check, diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index c49dcd0284a..4bfaedf9b34 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -89,11 +89,8 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) if (ipv4_is_multicast(iph->daddr)) { if (ipv4_is_zeronet(iph->saddr)) return ipv4_is_local_multicast(iph->daddr) ^ invert; - flow.flowi4_iif = 0; - } else { - flow.flowi4_iif = LOOPBACK_IFINDEX; } - + flow.flowi4_iif = LOOPBACK_IFINDEX; flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); flow.flowi4_oif = 0; diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index ee2886126e3..f1787c04a4d 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -91,17 +91,9 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, if (nf_ct_is_untracked(ct)) return NF_ACCEPT; - nat = nfct_nat(ct); - if (!nat) { - /* NAT module was loaded late. 
*/ - if (nf_ct_is_confirmed(ct)) - return NF_ACCEPT; - nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); - if (nat == NULL) { - pr_debug("failed to add NAT extension\n"); - return NF_ACCEPT; - } - } + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; switch (ctinfo) { case IP_CT_RELATED: diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index ecd8bec411c..8127dc80286 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -548,9 +548,3 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void) module_init(nf_conntrack_l3proto_ipv4_init); module_exit(nf_conntrack_l3proto_ipv4_fini); - -void need_ipv4_conntrack(void) -{ - return; -} -EXPORT_SYMBOL_GPL(need_ipv4_conntrack); diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 12e13bd82b5..b8f6381c7d0 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -22,7 +22,6 @@ #endif #include <net/netfilter/nf_conntrack_zones.h> -/* Returns new sk_buff, or NULL */ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) { int err; @@ -33,8 +32,10 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) err = ip_defrag(skb, user); local_bh_enable(); - if (!err) + if (!err) { ip_send_check(ip_hdr(skb)); + skb->ignore_df = 1; + } return err; } diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 9eea059dd62..574f7ebba0b 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -229,7 +229,10 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, ret = nf_ct_expect_related(rtcp_exp); if (ret == 0) break; - else if (ret != -EBUSY) { + else if (ret == -EBUSY) { + nf_ct_unexpect_related(rtp_exp); + continue; + } else if (ret < 0) { nf_ct_unexpect_related(rtp_exp); nated_port = 0; break; diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 61a942265e8..7c676671329 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -461,14 +461,14 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx, } if (subid < 40) { - optr [0] = 0; - optr [1] = subid; + optr[0] = 0; + optr[1] = subid; } else if (subid < 80) { - optr [0] = 1; - optr [1] = subid - 40; + optr[0] = 1; + optr[1] = subid - 40; } else { - optr [0] = 2; - optr [1] = subid - 80; + optr[0] = 2; + optr[1] = subid - 80; } *len = 2; @@ -1198,8 +1198,8 @@ static int snmp_translate(struct nf_conn *ct, map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip); } else { /* DNAT replies */ - map.from = NOCT1(&ct->tuplehash[dir].tuple.src.u3.ip); - map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip); + map.from = NOCT1(&ct->tuplehash[!dir].tuple.src.u3.ip); + map.to = NOCT1(&ct->tuplehash[dir].tuple.dst.u3.ip); } if (map.from == map.to) diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index 3e67ef1c676..19412a4063f 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -14,10 +14,30 @@ #include <linux/netfilter_arp.h> #include <net/netfilter/nf_tables.h> +static unsigned int +nft_do_chain_arp(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo(&pkt, ops, skb, in, out); + + return 
nft_do_chain(&pkt, ops); +} + static struct nft_af_info nft_af_arp __read_mostly = { .family = NFPROTO_ARP, .nhooks = NF_ARP_NUMHOOKS, .owner = THIS_MODULE, + .nops = 1, + .hooks = { + [NF_ARP_IN] = nft_do_chain_arp, + [NF_ARP_OUT] = nft_do_chain_arp, + [NF_ARP_FORWARD] = nft_do_chain_arp, + }, }; static int nf_tables_arp_init_net(struct net *net) @@ -48,32 +68,14 @@ static struct pernet_operations nf_tables_arp_net_ops = { .exit = nf_tables_arp_exit_net, }; -static unsigned int -nft_do_chain_arp(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo(&pkt, ops, skb, in, out); - - return nft_do_chain_pktinfo(&pkt, ops); -} - -static struct nf_chain_type filter_arp = { - .family = NFPROTO_ARP, +static const struct nf_chain_type filter_arp = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_ARP, + .owner = THIS_MODULE, .hook_mask = (1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | (1 << NF_ARP_FORWARD), - .fn = { - [NF_ARP_IN] = nft_do_chain_arp, - [NF_ARP_OUT] = nft_do_chain_arp, - [NF_ARP_FORWARD] = nft_do_chain_arp, - }, }; static int __init nf_tables_arp_init(void) diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index 0f4cbfeb19b..6820c8c4084 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -18,14 +18,25 @@ #include <net/ip.h> #include <net/netfilter/nf_tables_ipv4.h> +static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + + return nft_do_chain(&pkt, ops); +} + static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - struct nft_pktinfo pkt; - if (unlikely(skb->len < sizeof(struct iphdr) || ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) { if (net_ratelimit()) @@ -33,19 +44,24 @@ static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, "packet\n"); return NF_ACCEPT; } - nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); - return nft_do_chain_pktinfo(&pkt, ops); + return nft_do_chain_ipv4(ops, skb, in, out, okfn); } -static struct nft_af_info nft_af_ipv4 __read_mostly = { +struct nft_af_info nft_af_ipv4 __read_mostly = { .family = NFPROTO_IPV4, .nhooks = NF_INET_NUMHOOKS, .owner = THIS_MODULE, + .nops = 1, .hooks = { + [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, [NF_INET_LOCAL_OUT] = nft_ipv4_output, + [NF_INET_FORWARD] = nft_do_chain_ipv4, + [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, + [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, }, }; +EXPORT_SYMBOL_GPL(nft_af_ipv4); static int nf_tables_ipv4_init_net(struct net *net) { @@ -75,42 +91,28 @@ static struct pernet_operations nf_tables_ipv4_net_ops = { .exit = nf_tables_ipv4_exit_net, }; -static unsigned int -nft_do_chain_ipv4(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); - - return nft_do_chain_pktinfo(&pkt, ops); -} - -static struct nf_chain_type filter_ipv4 = { - .family = NFPROTO_IPV4, +static const struct nf_chain_type filter_ipv4 = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, + .family = 
NFPROTO_IPV4, + .owner = THIS_MODULE, .hook_mask = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING), - .fn = { - [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, - [NF_INET_LOCAL_OUT] = nft_ipv4_output, - [NF_INET_FORWARD] = nft_do_chain_ipv4, - [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, - [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, - }, }; static int __init nf_tables_ipv4_init(void) { + int ret; + nft_register_chain_type(&filter_ipv4); - return register_pernet_subsys(&nf_tables_ipv4_net_ops); + ret = register_pernet_subsys(&nf_tables_ipv4_net_ops); + if (ret < 0) + nft_unregister_chain_type(&filter_ipv4); + + return ret; } static void __exit nf_tables_ipv4_exit(void) diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index cf2c792cd97..3964157d826 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -48,15 +48,9 @@ static unsigned int nf_nat_fn(const struct nf_hook_ops *ops, NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))); - nat = nfct_nat(ct); - if (nat == NULL) { - /* Conntrack module was loaded late, can't add extension. */ - if (nf_ct_is_confirmed(ct)) - return NF_ACCEPT; - nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); - if (nat == NULL) - return NF_ACCEPT; - } + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; switch (ctinfo) { case IP_CT_RELATED: @@ -75,7 +69,7 @@ static unsigned int nf_nat_fn(const struct nf_hook_ops *ops, nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); - ret = nft_do_chain_pktinfo(&pkt, ops); + ret = nft_do_chain(&pkt, ops); if (ret != NF_ACCEPT) return ret; if (!nf_nat_initialized(ct, maniptype)) { @@ -164,21 +158,21 @@ static unsigned int nf_nat_output(const struct nf_hook_ops *ops, return ret; } -static struct nf_chain_type nft_chain_nat_ipv4 = { - .family = NFPROTO_IPV4, +static const struct nf_chain_type nft_chain_nat_ipv4 = { .name = "nat", .type = NFT_CHAIN_T_NAT, + .family = NFPROTO_IPV4, + .owner = THIS_MODULE, .hook_mask = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), - .fn = { + .hooks = { [NF_INET_PRE_ROUTING] = nf_nat_prerouting, [NF_INET_POST_ROUTING] = nf_nat_postrouting, [NF_INET_LOCAL_OUT] = nf_nat_output, [NF_INET_LOCAL_IN] = nf_nat_fn, }, - .me = THIS_MODULE, }; static int __init nft_chain_nat_init(void) diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c index 4e6bf9a3d7a..125b66766c0 100644 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -47,7 +47,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, daddr = iph->daddr; tos = iph->tos; - ret = nft_do_chain_pktinfo(&pkt, ops); + ret = nft_do_chain(&pkt, ops); if (ret != NF_DROP && ret != NF_QUEUE) { iph = ip_hdr(skb); @@ -61,15 +61,15 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, return ret; } -static struct nf_chain_type nft_chain_route_ipv4 = { - .family = NFPROTO_IPV4, +static const struct nf_chain_type nft_chain_route_ipv4 = { .name = "route", .type = NFT_CHAIN_T_ROUTE, + .family = NFPROTO_IPV4, + .owner = THIS_MODULE, .hook_mask = (1 << NF_INET_LOCAL_OUT), - .fn = { + .hooks = { [NF_INET_LOCAL_OUT] = nf_route_table_hook, }, - .me = THIS_MODULE, }; static int __init nft_chain_route_init(void) diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c 
b/net/ipv4/netfilter/nft_reject_ipv4.c index fff5ba1a33b..e79718a382f 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2013 Eric Leblond <eric@regit.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -16,108 +17,59 @@ #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/icmp.h> +#include <net/netfilter/ipv4/nf_reject.h> +#include <net/netfilter/nft_reject.h> -struct nft_reject { - enum nft_reject_types type:8; - u8 icmp_code; -}; - -static void nft_reject_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) +void nft_reject_ipv4_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) { struct nft_reject *priv = nft_expr_priv(expr); switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - icmp_send(pkt->skb, ICMP_DEST_UNREACH, priv->icmp_code, 0); + nf_send_unreach(pkt->skb, priv->icmp_code); break; case NFT_REJECT_TCP_RST: + nf_send_reset(pkt->skb, pkt->ops->hooknum); break; } data[NFT_REG_VERDICT].verdict = NF_DROP; } +EXPORT_SYMBOL_GPL(nft_reject_ipv4_eval); -static const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { - [NFTA_REJECT_TYPE] = { .type = NLA_U32 }, - [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 }, -}; - -static int nft_reject_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_reject *priv = nft_expr_priv(expr); - - if (tb[NFTA_REJECT_TYPE] == NULL) - return -EINVAL; - - priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - if (tb[NFTA_REJECT_ICMP_CODE] == NULL) - return -EINVAL; - priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); - case NFT_REJECT_TCP_RST: - break; - default: - return -EINVAL; - } - - return 0; -} - -static int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) -{ - const struct nft_reject *priv = nft_expr_priv(expr); - - if (nla_put_be32(skb, NFTA_REJECT_TYPE, priv->type)) - goto nla_put_failure; - - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) - goto nla_put_failure; - break; - } - - return 0; - -nla_put_failure: - return -1; -} - -static struct nft_expr_type nft_reject_type; -static const struct nft_expr_ops nft_reject_ops = { - .type = &nft_reject_type, +static struct nft_expr_type nft_reject_ipv4_type; +static const struct nft_expr_ops nft_reject_ipv4_ops = { + .type = &nft_reject_ipv4_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), - .eval = nft_reject_eval, + .eval = nft_reject_ipv4_eval, .init = nft_reject_init, .dump = nft_reject_dump, }; -static struct nft_expr_type nft_reject_type __read_mostly = { +static struct nft_expr_type nft_reject_ipv4_type __read_mostly = { + .family = NFPROTO_IPV4, .name = "reject", - .ops = &nft_reject_ops, + .ops = &nft_reject_ipv4_ops, .policy = nft_reject_policy, .maxattr = NFTA_REJECT_MAX, .owner = THIS_MODULE, }; -static int __init nft_reject_module_init(void) +static int __init nft_reject_ipv4_module_init(void) { - return nft_register_expr(&nft_reject_type); + return nft_register_expr(&nft_reject_ipv4_type); } -static void __exit nft_reject_module_exit(void) +static void __exit 
nft_reject_ipv4_module_exit(void) { - nft_unregister_expr(&nft_reject_type); + nft_unregister_expr(&nft_reject_ipv4_type); } -module_init(nft_reject_module_init); -module_exit(nft_reject_module_exit); +module_init(nft_reject_ipv4_module_init); +module_exit(nft_reject_ipv4_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_EXPR("reject"); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "reject"); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 242e7f4ed6f..044a0ddf6a7 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -53,8 +53,12 @@ #include <net/transp_v6.h> #endif +struct ping_table { + struct hlist_nulls_head hash[PING_HTABLE_SIZE]; + rwlock_t lock; +}; -struct ping_table ping_table; +static struct ping_table ping_table; struct pingv6_ops pingv6_ops; EXPORT_SYMBOL_GPL(pingv6_ops); @@ -232,15 +236,15 @@ exit: static void inet_get_ping_group_range_net(struct net *net, kgid_t *low, kgid_t *high) { - kgid_t *data = net->ipv4.sysctl_ping_group_range; + kgid_t *data = net->ipv4.ping_group_range.range; unsigned int seq; do { - seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock); + seq = read_seqbegin(&net->ipv4.ping_group_range.lock); *low = data[0]; *high = data[1]; - } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq)); + } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq)); } @@ -248,26 +252,33 @@ int ping_init_sock(struct sock *sk) { struct net *net = sock_net(sk); kgid_t group = current_egid(); - struct group_info *group_info = get_current_groups(); - int i, j, count = group_info->ngroups; + struct group_info *group_info; + int i, j, count; kgid_t low, high; + int ret = 0; inet_get_ping_group_range_net(net, &low, &high); if (gid_lte(low, group) && gid_lte(group, high)) return 0; + group_info = get_current_groups(); + count = group_info->ngroups; for (i = 0; i < group_info->nblocks; i++) { int cp_count = min_t(int, NGROUPS_PER_BLOCK, count); for (j = 0; j < cp_count; j++) { kgid_t gid = group_info->blocks[i][j]; if (gid_lte(low, gid) && gid_lte(gid, high)) - return 0; + goto out_release_group; } count -= cp_count; } - return -EACCES; + ret = -EACCES; + +out_release_group: + put_group_info(group_info); + return ret; } EXPORT_SYMBOL_GPL(ping_init_sock); @@ -316,6 +327,9 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, if (addr_len < sizeof(*addr)) return -EINVAL; + if (addr->sin6_family != AF_INET6) + return -EINVAL; + pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n", sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port)); @@ -668,8 +682,8 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, } EXPORT_SYMBOL_GPL(ping_common_sendmsg); -int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len) +static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) { struct net *net = sock_net(sk); struct flowi4 fl4; @@ -696,7 +710,7 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, */ if (msg->msg_name) { - struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) @@ -720,7 +734,7 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, sock_tx_timestamp(sk, &ipc.tx_flags); if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc); + err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); if 
(err) return err; if (ipc.opt) @@ -869,7 +883,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* Copy the address and add cmsg data. */ if (family == AF_INET) { - struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); if (sin) { sin->sin_family = AF_INET; @@ -886,8 +900,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } else if (family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6hdr *ip6 = ipv6_hdr(skb); - struct sockaddr_in6 *sin6 = - (struct sockaddr_in6 *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); if (sin6) { sin6->sin6_family = AF_INET6; @@ -903,7 +916,12 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } if (inet6_sk(sk)->rxopt.all) - pingv6_ops.ip6_datagram_recv_ctl(sk, msg, skb); + pingv6_ops.ip6_datagram_recv_common_ctl(sk, msg, skb); + if (skb->protocol == htons(ETH_P_IPV6) && + inet6_sk(sk)->rxopt.all) + pingv6_ops.ip6_datagram_recv_specific_ctl(sk, msg, skb); + else if (skb->protocol == htons(ETH_P_IP) && isk->cmsg_flags) + ip_cmsg_recv(msg, skb); #endif } else { BUG(); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 8ecd7ad959b..ae0af9386f7 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -273,6 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK), SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE), SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE), + SNMP_MIB_ITEM("TCPFastOpenActiveFail", LINUX_MIB_TCPFASTOPENACTIVEFAIL), SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE), SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL), SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), @@ -280,6 +281,11 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING), + SNMP_MIB_ITEM("TCPFromZeroWindowAdv", LINUX_MIB_TCPFROMZEROWINDOWADV), + SNMP_MIB_ITEM("TCPToZeroWindowAdv", LINUX_MIB_TCPTOZEROWINDOWADV), + SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV), + SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS), + SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT), SNMP_MIB_SENTINEL }; @@ -333,22 +339,22 @@ static void icmp_put(struct seq_file *seq) atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs; seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); - for (i=0; icmpmibmap[i].name != NULL; i++) + for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " In%s", icmpmibmap[i].name); seq_printf(seq, " OutMsgs OutErrors"); - for (i=0; icmpmibmap[i].name != NULL; i++) + for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu %lu", - snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS), - snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS), - snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS)); - for (i=0; icmpmibmap[i].name != NULL; i++) + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS), + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS), + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS)); + for 
(i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + icmpmibmap[i].index)); seq_printf(seq, " %lu %lu", - snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), - snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); - for (i=0; icmpmibmap[i].name != NULL; i++) + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); + for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); } @@ -373,7 +379,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) seq_printf(seq, " %llu", - snmp_fold_field64((void __percpu **)net->mib.ip_statistics, + snmp_fold_field64(net->mib.ip_statistics, snmp4_ipstats_list[i].entry, offsetof(struct ipstats_mib, syncp))); @@ -389,11 +395,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v) /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", - snmp_fold_field((void __percpu **)net->mib.tcp_statistics, + snmp_fold_field(net->mib.tcp_statistics, snmp4_tcp_list[i].entry)); else seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **)net->mib.tcp_statistics, + snmp_fold_field(net->mib.tcp_statistics, snmp4_tcp_list[i].entry)); } @@ -404,7 +410,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **)net->mib.udp_statistics, + snmp_fold_field(net->mib.udp_statistics, snmp4_udp_list[i].entry)); /* the UDP and UDP-Lite MIBs are the same */ @@ -415,7 +421,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdpLite:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **)net->mib.udplite_statistics, + snmp_fold_field(net->mib.udplite_statistics, snmp4_udp_list[i].entry)); seq_putc(seq, '\n'); @@ -452,7 +458,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nTcpExt:"); for (i = 0; snmp4_net_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **)net->mib.net_statistics, + snmp_fold_field(net->mib.net_statistics, snmp4_net_list[i].entry)); seq_puts(seq, "\nIpExt:"); @@ -462,7 +468,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nIpExt:"); for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) seq_printf(seq, " %llu", - snmp_fold_field64((void __percpu **)net->mib.ip_statistics, + snmp_fold_field64(net->mib.ip_statistics, snmp4_ipextstats_list[i].entry, offsetof(struct ipstats_mib, syncp))); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 23c3e5b5bb5..2c65160565e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -389,7 +389,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, iph->check = 0; iph->tot_len = htons(length); if (!iph->id) - ip_select_ident(skb, &rt->dst, NULL); + ip_select_ident(skb, NULL); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } @@ -493,7 +493,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, */ if (msg->msg_namelen) { - struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); err = -EINVAL; if 
(msg->msg_namelen < sizeof(*usin)) goto out; @@ -524,7 +524,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc); + err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); if (err) goto out; if (ipc.opt) @@ -575,7 +575,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, - inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP | + inet_sk_flowi_flags(sk) | (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0); @@ -690,7 +690,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct inet_sock *inet = inet_sk(sk); size_t copied = 0; int err = -EOPNOTSUPP; - struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; if (flags & MSG_OOB) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f8da2827801..190199851c9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -89,6 +89,7 @@ #include <linux/rcupdate.h> #include <linux/times.h> #include <linux/slab.h> +#include <linux/jhash.h> #include <net/dst.h> #include <net/net_namespace.h> #include <net/protocol.h> @@ -112,9 +113,6 @@ #define RT_FL_TOS(oldflp4) \ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) -/* IPv4 datagram length is stored into 16bit field (tot_len) */ -#define IP_MAX_MTU 0xFFFF - #define RT_GC_TIMEOUT (300*HZ) static int ip_rt_max_size; @@ -142,11 +140,6 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static void ipv4_dst_destroy(struct dst_entry *dst); -static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int how) -{ -} - static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) { WARN_ON(1); @@ -165,7 +158,6 @@ static struct dst_ops ipv4_dst_ops = { .mtu = ipv4_mtu, .cow_metrics = ipv4_cow_metrics, .destroy = ipv4_dst_destroy, - .ifdown = ipv4_dst_ifdown, .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, @@ -197,7 +189,7 @@ const __u8 ip_tos2prio[16] = { EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); -#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) +#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) #ifdef CONFIG_PROC_FS static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) @@ -465,39 +457,45 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, return neigh_create(&arp_tbl, pkey, dev); } -/* - * Peer allocation may fail only in serious out-of-memory conditions. However - * we still can generate some output. - * Random ID selection looks a bit dangerous because we have no chances to - * select ID being unique in a reasonable period of time. - * But broken packet identifier may be better than no packet at all. +#define IP_IDENTS_SZ 2048u +struct ip_ident_bucket { + atomic_t id; + u32 stamp32; +}; + +static struct ip_ident_bucket *ip_idents __read_mostly; + +/* In order to protect privacy, we add a perturbation to identifiers + * if one generator is seldom used. This makes hard for an attacker + * to infer how many packets were sent between two points in time. 
*/ -static void ip_select_fb_ident(struct iphdr *iph) +u32 ip_idents_reserve(u32 hash, int segs) { - static DEFINE_SPINLOCK(ip_fb_id_lock); - static u32 ip_fallback_id; - u32 salt; + struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ; + u32 old = ACCESS_ONCE(bucket->stamp32); + u32 now = (u32)jiffies; + u32 delta = 0; - spin_lock_bh(&ip_fb_id_lock); - salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); - iph->id = htons(salt & 0xFFFF); - ip_fallback_id = salt; - spin_unlock_bh(&ip_fb_id_lock); + if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) + delta = prandom_u32_max(now - old); + + return atomic_add_return(segs + delta, &bucket->id) - segs; } +EXPORT_SYMBOL(ip_idents_reserve); -void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) +void __ip_select_ident(struct iphdr *iph, int segs) { - struct net *net = dev_net(dst->dev); - struct inet_peer *peer; + static u32 ip_idents_hashrnd __read_mostly; + u32 hash, id; - peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1); - if (peer) { - iph->id = htons(inet_getid(peer, more)); - inet_putpeer(peer); - return; - } + net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd)); - ip_select_fb_ident(iph); + hash = jhash_3words((__force u32)iph->daddr, + (__force u32)iph->saddr, + iph->protocol, + ip_idents_hashrnd); + id = ip_idents_reserve(hash, segs); + iph->id = htons(id); } EXPORT_SYMBOL(__ip_select_ident); @@ -700,7 +698,6 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, out_unlock: spin_unlock_bh(&fnhe_lock); - return; } static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, @@ -1003,6 +1000,9 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, struct flowi4 fl4; struct rtable *rt; + if (!mark) + mark = IP4_REPLY_MARK(net, skb->mark); + __build_flow_key(&fl4, NULL, iph, oif, RT_TOS(iph->tos), protocol, mark, flow_flags); rt = __ip_route_output_key(net, &fl4); @@ -1020,6 +1020,10 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) struct rtable *rt; __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + + if (!fl4.flowi4_mark) + fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); + rt = __ip_route_output_key(sock_net(sk), &fl4); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); @@ -1032,7 +1036,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) const struct iphdr *iph = (const struct iphdr *) skb->data; struct flowi4 fl4; struct rtable *rt; - struct dst_entry *dst; + struct dst_entry *odst = NULL; bool new = false; bh_lock_sock(sk); @@ -1040,16 +1044,17 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) if (!ip_sk_accept_pmtu(sk)) goto out; - rt = (struct rtable *) __sk_dst_get(sk); + odst = sk_dst_get(sk); - if (sock_owned_by_user(sk) || !rt) { + if (sock_owned_by_user(sk) || !odst) { __ipv4_sk_update_pmtu(skb, sk, mtu); goto out; } __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); - if (!__sk_dst_check(sk, 0)) { + rt = (struct rtable *)odst; + if (odst->obsolete && odst->ops->check(odst, 0) == NULL) { rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) goto out; @@ -1059,8 +1064,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu); - dst = dst_check(&rt->dst, 0); - if (!dst) { + if (!dst_check(&rt->dst, 0)) { if (new) dst_release(&rt->dst); @@ -1072,10 +1076,11 @@ void ipv4_sk_update_pmtu(struct 
sk_buff *skb, struct sock *sk, u32 mtu) } if (new) - __sk_dst_set(sk, &rt->dst); + sk_dst_set(sk, &rt->dst); out: bh_unlock_sock(sk); + dst_release(odst); } EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); @@ -1139,7 +1144,7 @@ static void ipv4_link_failure(struct sk_buff *skb) dst_set_expires(&rt->dst, 0); } -static int ip_rt_bug(struct sk_buff *skb) +static int ip_rt_bug(struct sock *sk, struct sk_buff *skb) { pr_debug("%s: %pI4 -> %pI4, %s\n", __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, @@ -1529,7 +1534,7 @@ static int __mkroute_input(struct sk_buff *skb, struct in_device *out_dev; unsigned int flags = 0; bool do_cache; - u32 itag; + u32 itag = 0; /* get a working reference to the output device */ out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); @@ -1600,6 +1605,7 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; rth->dst.output = ip_output; @@ -1698,25 +1704,27 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.daddr = daddr; fl4.saddr = saddr; err = fib_lookup(net, &fl4, &res); - if (err != 0) + if (err != 0) { + if (!IN_DEV_FORWARD(in_dev)) + err = -EHOSTUNREACH; goto no_route; - - RT_CACHE_STAT_INC(in_slow_tot); + } if (res.type == RTN_BROADCAST) goto brd_input; if (res.type == RTN_LOCAL) { err = fib_validate_source(skb, saddr, daddr, tos, - LOOPBACK_IFINDEX, - dev, in_dev, &itag); + 0, dev, in_dev, &itag); if (err < 0) goto martian_source_keep_err; goto local_input; } - if (!IN_DEV_FORWARD(in_dev)) + if (!IN_DEV_FORWARD(in_dev)) { + err = -EHOSTUNREACH; goto no_route; + } if (res.type != RTN_UNICAST) goto martian_destination; @@ -1771,6 +1779,7 @@ local_input: rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; @@ -2223,7 +2232,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or new->__use = 1; new->input = dst_discard; - new->output = dst_discard; + new->output = dst_discard_sk; new->dev = ort->dst.dev; if (new->dev) @@ -2362,7 +2371,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, } } else #endif - if (nla_put_u32(skb, RTA_IIF, rt->rt_iif)) + if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex)) goto nla_put_failure; } @@ -2473,11 +2482,6 @@ errout_free: goto errout; } -int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) -{ - return skb->len; -} - void ip_rt_multicast_event(struct in_device *in_dev) { rt_cache_flush(dev_net(in_dev->dev)); @@ -2715,6 +2719,12 @@ int __init ip_rt_init(void) { int rc = 0; + ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); + if (!ip_idents) + panic("IP: failed to allocate ip_idents\n"); + + prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); + #ifdef CONFIG_IP_ROUTE_CLASSID ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index b95331e6c07..c86624b36a6 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -121,7 +121,7 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr, cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */ - diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS); + diff = (count - 
(cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); if (diff >= MAX_SYNCOOKIE_AGE) return (__u32)-1; @@ -303,6 +303,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, ireq->ir_rmt_port = th->source; ireq->ir_loc_addr = ip_hdr(skb)->daddr; ireq->ir_rmt_addr = ip_hdr(skb)->saddr; + ireq->ir_mark = inet_request_mark(sk, skb); ireq->ecn_ok = ecn_ok; ireq->snd_wscale = tcp_opt.snd_wscale; ireq->sack_ok = tcp_opt.sack_ok; @@ -339,7 +340,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, * hasn't changed since we received the original syn, but I see * no easy way to do this. */ - flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark, + flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, inet_sk_flowi_flags(sk), (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 38c8ec90ff6..79a007c5255 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -45,10 +45,10 @@ static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; /* Update system visible IP port range */ static void set_local_port_range(struct net *net, int range[2]) { - write_seqlock(&net->ipv4.sysctl_local_ports.lock); - net->ipv4.sysctl_local_ports.range[0] = range[0]; - net->ipv4.sysctl_local_ports.range[1] = range[1]; - write_sequnlock(&net->ipv4.sysctl_local_ports.lock); + write_seqlock(&net->ipv4.ip_local_ports.lock); + net->ipv4.ip_local_ports.range[0] = range[0]; + net->ipv4.ip_local_ports.range[1] = range[1]; + write_sequnlock(&net->ipv4.ip_local_ports.lock); } /* Validate changes from /proc interface. */ @@ -57,7 +57,7 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, size_t *lenp, loff_t *ppos) { struct net *net = - container_of(table->data, struct net, ipv4.sysctl_local_ports.range); + container_of(table->data, struct net, ipv4.ip_local_ports.range); int ret; int range[2]; struct ctl_table tmp = { @@ -87,14 +87,14 @@ static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low { kgid_t *data = table->data; struct net *net = - container_of(table->data, struct net, ipv4.sysctl_ping_group_range); + container_of(table->data, struct net, ipv4.ping_group_range.range); unsigned int seq; do { - seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock); + seq = read_seqbegin(&net->ipv4.ip_local_ports.lock); *low = data[0]; *high = data[1]; - } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq)); + } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq)); } /* Update system visible IP port range */ @@ -102,11 +102,11 @@ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t hig { kgid_t *data = table->data; struct net *net = - container_of(table->data, struct net, ipv4.sysctl_ping_group_range); - write_seqlock(&net->ipv4.sysctl_local_ports.lock); + container_of(table->data, struct net, ipv4.ping_group_range.range); + write_seqlock(&net->ipv4.ip_local_ports.lock); data[0] = low; data[1] = high; - write_sequnlock(&net->ipv4.sysctl_local_ports.lock); + write_sequnlock(&net->ipv4.ip_local_ports.lock); } /* Validate changes from /proc interface. 
*/ @@ -286,13 +286,6 @@ static struct ctl_table ipv4_table[] = { .extra2 = &ip_ttl_max, }, { - .procname = "ip_no_pmtu_disc", - .data = &ipv4_config.no_pmtu_disc, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "ip_nonlocal_bind", .data = &sysctl_ip_nonlocal_bind, .maxlen = sizeof(int), @@ -444,13 +437,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "ip_local_reserved_ports", - .data = NULL, /* initialized in sysctl_ipv4_init */ - .maxlen = 65536, - .mode = 0644, - .proc_handler = proc_do_large_bitmap, - }, - { .procname = "igmp_max_memberships", .data = &sysctl_igmp_max_memberships, .maxlen = sizeof(int), @@ -707,7 +693,7 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { + { .procname = "tcp_thin_dupack", .data = &sysctl_tcp_thin_dupack, .maxlen = sizeof(int), @@ -812,7 +798,7 @@ static struct ctl_table ipv4_net_table[] = { }, { .procname = "ping_group_range", - .data = &init_net.ipv4.sysctl_ping_group_range, + .data = &init_net.ipv4.ping_group_range.range, .maxlen = sizeof(gid_t)*2, .mode = 0644, .proc_handler = ipv4_ping_group_range, @@ -826,11 +812,46 @@ static struct ctl_table ipv4_net_table[] = { }, { .procname = "ip_local_port_range", - .maxlen = sizeof(init_net.ipv4.sysctl_local_ports.range), - .data = &init_net.ipv4.sysctl_local_ports.range, + .maxlen = sizeof(init_net.ipv4.ip_local_ports.range), + .data = &init_net.ipv4.ip_local_ports.range, .mode = 0644, .proc_handler = ipv4_local_port_range, }, + { + .procname = "ip_local_reserved_ports", + .data = &init_net.ipv4.sysctl_local_reserved_ports, + .maxlen = 65536, + .mode = 0644, + .proc_handler = proc_do_large_bitmap, + }, + { + .procname = "ip_no_pmtu_disc", + .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip_forward_use_pmtu", + .data = &init_net.ipv4.sysctl_ip_fwd_use_pmtu, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "fwmark_reflect", + .data = &init_net.ipv4.sysctl_fwmark_reflect, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_fwmark_accept", + .data = &init_net.ipv4.sysctl_tcp_fwmark_accept, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; @@ -851,26 +872,18 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) table[i].data += (void *)net - (void *)&init_net; } - /* - * Sane defaults - nobody may create ping sockets. - * Boot scripts should set this to distro-specific group. 
- */ - net->ipv4.sysctl_ping_group_range[0] = make_kgid(&init_user_ns, 1); - net->ipv4.sysctl_ping_group_range[1] = make_kgid(&init_user_ns, 0); - - /* - * Set defaults for local port range - */ - seqlock_init(&net->ipv4.sysctl_local_ports.lock); - net->ipv4.sysctl_local_ports.range[0] = 32768; - net->ipv4.sysctl_local_ports.range[1] = 61000; - net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); if (net->ipv4.ipv4_hdr == NULL) goto err_reg; + net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); + if (!net->ipv4.sysctl_local_reserved_ports) + goto err_ports; + return 0; +err_ports: + unregister_net_sysctl_table(net->ipv4.ipv4_hdr); err_reg: if (!net_eq(net, &init_net)) kfree(table); @@ -882,6 +895,7 @@ static __net_exit void ipv4_sysctl_exit_net(struct net *net) { struct ctl_table *table; + kfree(net->ipv4.sysctl_local_reserved_ports); table = net->ipv4.ipv4_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.ipv4_hdr); kfree(table); @@ -895,16 +909,6 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = { static __init int sysctl_ipv4_init(void) { struct ctl_table_header *hdr; - struct ctl_table *i; - - for (i = ipv4_table; i->procname; i++) { - if (strcmp(i->procname, "ip_local_reserved_ports") == 0) { - i->data = sysctl_local_reserved_ports; - break; - } - } - if (!i->procname) - return -EINVAL; hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table); if (hdr == NULL) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0ca87547bec..9d2118e5fbc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -381,13 +381,13 @@ void tcp_init_sock(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - skb_queue_head_init(&tp->out_of_order_queue); + __skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); INIT_LIST_HEAD(&tp->tsq_node); icsk->icsk_rto = TCP_TIMEOUT_INIT; - tp->mdev = TCP_TIMEOUT_INIT; + tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -622,19 +622,21 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) } /* If a not yet filled skb is pushed, do not send it if - * we have packets in Qdisc or NIC queues : + * we have data packets in Qdisc or NIC queues : * Because TX completion will happen shortly, it gives a chance * to coalesce future sendmsg() payload into this skb, without * need for a timer, and with no latency trade off. * As packets containing data payload have a bigger truesize - * than pure acks (dataless) packets, the last check prevents - * autocorking if we only have an ACK in Qdisc/NIC queues. + * than pure acks (dataless) packets, the last checks prevent + * autocorking if we only have an ACK in Qdisc/NIC queues, + * or if TX completion was delayed after we processed ACK packet. */ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, int size_goal) { return skb->len < size_goal && sysctl_tcp_autocorking && + skb != tcp_write_queue_head(sk) && atomic_read(&sk->sk_wmem_alloc) > skb->truesize; } @@ -660,7 +662,11 @@ static void tcp_push(struct sock *sk, int flags, int mss_now, NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); set_bit(TSQ_THROTTLED, &tp->tsq_flags); } - return; + /* It is possible TX completion already happened + * before we set TSQ_THROTTLED. 
+ */ + if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize) + return; } if (flags & MSG_MORE) @@ -1038,7 +1044,8 @@ void tcp_free_fastopen_req(struct tcp_sock *tp) } } -static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size) +static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, + int *copied, size_t size) { struct tcp_sock *tp = tcp_sk(sk); int err, flags; @@ -1053,11 +1060,12 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size) if (unlikely(tp->fastopen_req == NULL)) return -ENOBUFS; tp->fastopen_req->data = msg; + tp->fastopen_req->size = size; flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; err = __inet_stream_connect(sk->sk_socket, msg->msg_name, msg->msg_namelen, flags); - *size = tp->fastopen_req->copied; + *copied = tp->fastopen_req->copied; tcp_free_fastopen_req(tp); return err; } @@ -1077,7 +1085,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, flags = msg->msg_flags; if (flags & MSG_FASTOPEN) { - err = tcp_sendmsg_fastopen(sk, msg, &copied_syn); + err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size); if (err == -EINPROGRESS && copied_syn > 0) goto out; else if (err) @@ -1100,7 +1108,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (unlikely(tp->repair)) { if (tp->repair_queue == TCP_RECV_QUEUE) { copied = tcp_send_rcvq(sk, msg, size); - goto out; + goto out_nopush; } err = -EINVAL; @@ -1274,6 +1282,7 @@ wait_for_memory: out: if (copied) tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); +out_nopush: release_sock(sk); return copied + copied_syn; @@ -1660,11 +1669,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && !sysctl_tcp_low_latency && net_dma_find_channel()) { - preempt_enable_no_resched(); + preempt_enable(); tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); } else { - preempt_enable_no_resched(); + preempt_enable(); } } #endif @@ -2223,7 +2232,7 @@ adjudge_to_death: /* This is a (useful) BSD violating of the RFC. There is a * problem with TCP as specified in that the other end could * keep a socket open forever with no application left this end. - * We use a 3 minute timeout (about the same as BSD) then kill + * We use a 1 minute timeout (about the same as BSD) then kill * our end. If they send after that then tough - BUT: long enough * that we won't make the old 4*rto = almost no time - whoops * reset mistake. @@ -2333,7 +2342,7 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); - tp->srtt = 0; + tp->srtt_us = 0; if ((tp->write_seq += tp->max_window + 2) == 0) tp->write_seq = 1; icsk->icsk_backoff = 0; @@ -2777,8 +2786,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) info->tcpi_pmtu = icsk->icsk_pmtu_cookie; info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; - info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; - info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; + info->tcpi_rtt = tp->srtt_us >> 3; + info->tcpi_rttvar = tp->mdev_us >> 2; info->tcpi_snd_ssthresh = tp->snd_ssthresh; info->tcpi_snd_cwnd = tp->snd_cwnd; info->tcpi_advmss = tp->advmss; @@ -2788,6 +2797,11 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) info->tcpi_rcv_space = tp->rcvq_space.space; info->tcpi_total_retrans = tp->total_retrans; + + info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ? + sk->sk_pacing_rate : ~0ULL; + info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ? 
+ sk->sk_max_pacing_rate : ~0ULL; } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2903,6 +2917,14 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_USER_TIMEOUT: val = jiffies_to_msecs(icsk->icsk_user_timeout); break; + + case TCP_FASTOPEN: + if (icsk->icsk_accept_queue.fastopenq != NULL) + val = icsk->icsk_accept_queue.fastopenq->max_qlen; + else + val = 0; + break; + case TCP_TIMESTAMP: val = tcp_time_stamp + tp->tsoffset; break; diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 821846fb0a7..d5de69bc04f 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -140,13 +140,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 1; } -static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index ad37bf18ae4..7b09d8b49fa 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -276,27 +276,6 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) return err; } -/* RFC2861 Check whether we are limited by application or congestion window - * This is the inverse of cwnd check in tcp_tso_should_defer - */ -bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) -{ - const struct tcp_sock *tp = tcp_sk(sk); - u32 left; - - if (in_flight >= tp->snd_cwnd) - return true; - - left = tp->snd_cwnd - in_flight; - if (sk_can_gso(sk) && - left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && - left * tp->mss_cache < sk->sk_gso_max_size && - left < sk->sk_gso_max_segs) - return true; - return left <= tcp_max_tso_deferred_mss(tp); -} -EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); - /* Slow start is used when congestion window is no greater than the slow start * threshold. We base on RFC2581 and also handle stretch ACKs properly. * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but @@ -338,11 +317,11 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ -void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) +void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; /* In "safe" area, increase. */ @@ -362,21 +341,12 @@ u32 tcp_reno_ssthresh(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); -/* Lower bound on congestion window with halving. 
*/ -u32 tcp_reno_min_cwnd(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - return tp->snd_ssthresh/2; -} -EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); - struct tcp_congestion_ops tcp_reno = { .flags = TCP_CONG_NON_RESTRICTED, .name = "reno", .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, }; /* Initial congestion control used (until SYN) @@ -388,6 +358,5 @@ struct tcp_congestion_ops tcp_init_congestion_ops = { .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, }; EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 828e4c3ffba..a9bd8a4828a 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -304,13 +304,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 1; } -static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) { @@ -409,7 +408,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT; ratio += cnt; - ca->delayed_ack = min(ratio, ACK_RATIO_LIMIT); + ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT); } /* Some calls are for duplicates without timetamps */ @@ -476,10 +475,6 @@ static int __init cubictcp_register(void) /* divide by bic_scale and by constant Srtt (100ms) */ do_div(cube_factor, bic_scale * 10); - /* hystart needs ms clock resolution */ - if (hystart && HZ < 1000) - cubictcp.flags |= TCP_CONG_RTT_STAMP; - return tcp_register_congestion_control(&cubictcp); } diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index f195d9316e5..9771563ab56 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -72,25 +72,224 @@ error: kfree(ctx); return err; } -/* Computes the fastopen cookie for the IP path. - * The path is a 128 bits long (pad with zeros for IPv4). - * - * The caller must check foc->len to determine if a valid cookie - * has been generated successfully. -*/ -void tcp_fastopen_cookie_gen(__be32 src, __be32 dst, - struct tcp_fastopen_cookie *foc) +static bool __tcp_fastopen_cookie_gen(const void *path, + struct tcp_fastopen_cookie *foc) { - __be32 path[4] = { src, dst, 0, 0 }; struct tcp_fastopen_context *ctx; + bool ok = false; tcp_fastopen_init_key_once(true); rcu_read_lock(); ctx = rcu_dereference(tcp_fastopen_ctx); if (ctx) { - crypto_cipher_encrypt_one(ctx->tfm, foc->val, (__u8 *)path); + crypto_cipher_encrypt_one(ctx->tfm, foc->val, path); foc->len = TCP_FASTOPEN_COOKIE_SIZE; + ok = true; } rcu_read_unlock(); + return ok; +} + +/* Generate the fastopen cookie by doing aes128 encryption on both + * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6 + * addresses. For the longer IPv6 addresses use CBC-MAC. + * + * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. 
+ */ +static bool tcp_fastopen_cookie_gen(struct request_sock *req, + struct sk_buff *syn, + struct tcp_fastopen_cookie *foc) +{ + if (req->rsk_ops->family == AF_INET) { + const struct iphdr *iph = ip_hdr(syn); + + __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; + return __tcp_fastopen_cookie_gen(path, foc); + } + +#if IS_ENABLED(CONFIG_IPV6) + if (req->rsk_ops->family == AF_INET6) { + const struct ipv6hdr *ip6h = ipv6_hdr(syn); + struct tcp_fastopen_cookie tmp; + + if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { + struct in6_addr *buf = (struct in6_addr *) tmp.val; + int i = 4; + + for (i = 0; i < 4; i++) + buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; + return __tcp_fastopen_cookie_gen(buf, foc); + } + } +#endif + return false; +} + +static bool tcp_fastopen_create_child(struct sock *sk, + struct sk_buff *skb, + struct dst_entry *dst, + struct request_sock *req) +{ + struct tcp_sock *tp; + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; + struct sock *child; + + req->num_retrans = 0; + req->num_timeout = 0; + req->sk = NULL; + + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); + if (child == NULL) + return false; + + spin_lock(&queue->fastopenq->lock); + queue->fastopenq->qlen++; + spin_unlock(&queue->fastopenq->lock); + + /* Initialize the child socket. Have to fix some values to take + * into account the child is a Fast Open socket and is created + * only out of the bits carried in the SYN packet. + */ + tp = tcp_sk(child); + + tp->fastopen_rsk = req; + /* Do a hold on the listner sk so that if the listener is being + * closed, the child that has been accepted can live on and still + * access listen_lock. + */ + sock_hold(sk); + tcp_rsk(req)->listener = sk; + + /* RFC1323: The window in SYN & SYN/ACK segments is never + * scaled. So correct it appropriately. + */ + tp->snd_wnd = ntohs(tcp_hdr(skb)->window); + + /* Activate the retrans timer so that SYNACK can be retransmitted. + * The request socket is not added to the SYN table of the parent + * because it's been added to the accept queue directly. + */ + inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, + TCP_TIMEOUT_INIT, TCP_RTO_MAX); + + /* Add the child socket directly into the accept queue */ + inet_csk_reqsk_queue_add(sk, req, child); + + /* Now finish processing the fastopen child socket. */ + inet_csk(child)->icsk_af_ops->rebuild_header(child); + tcp_init_congestion_control(child); + tcp_mtup_init(child); + tcp_init_metrics(child); + tcp_init_buffer_space(child); + + /* Queue the data carried in the SYN packet. We need to first + * bump skb's refcnt because the caller will attempt to free it. + * + * XXX (TFO) - we honor a zero-payload TFO request for now, + * (any reason not to?) but no need to queue the skb since + * there is no data. How about SYN+FIN? 
+ */ + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1) { + skb = skb_get(skb); + skb_dst_drop(skb); + __skb_pull(skb, tcp_hdr(skb)->doff * 4); + skb_set_owner_r(skb, child); + __skb_queue_tail(&child->sk_receive_queue, skb); + tp->syn_data_acked = 1; + } + tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + sk->sk_data_ready(sk); + bh_unlock_sock(child); + sock_put(child); + WARN_ON(req->sk == NULL); + return true; +} +EXPORT_SYMBOL(tcp_fastopen_create_child); + +static bool tcp_fastopen_queue_check(struct sock *sk) +{ + struct fastopen_queue *fastopenq; + + /* Make sure the listener has enabled fastopen, and we don't + * exceed the max # of pending TFO requests allowed before trying + * to validating the cookie in order to avoid burning CPU cycles + * unnecessarily. + * + * XXX (TFO) - The implication of checking the max_qlen before + * processing a cookie request is that clients can't differentiate + * between qlen overflow causing Fast Open to be disabled + * temporarily vs a server not supporting Fast Open at all. + */ + fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq; + if (fastopenq == NULL || fastopenq->max_qlen == 0) + return false; + + if (fastopenq->qlen >= fastopenq->max_qlen) { + struct request_sock *req1; + spin_lock(&fastopenq->lock); + req1 = fastopenq->rskq_rst_head; + if ((req1 == NULL) || time_after(req1->expires, jiffies)) { + spin_unlock(&fastopenq->lock); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); + return false; + } + fastopenq->rskq_rst_head = req1->dl_next; + fastopenq->qlen--; + spin_unlock(&fastopenq->lock); + reqsk_free(req1); + } + return true; +} + +/* Returns true if we should perform Fast Open on the SYN. The cookie (foc) + * may be updated and return the client in the SYN-ACK later. E.g., Fast Open + * cookie request (foc->len == 0). + */ +bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + struct dst_entry *dst) +{ + struct tcp_fastopen_cookie valid_foc = { .len = -1 }; + bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; + + if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + (syn_data || foc->len >= 0) && + tcp_fastopen_queue_check(sk))) { + foc->len = -1; + return false; + } + + if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) + goto fastopen; + + if (tcp_fastopen_cookie_gen(req, skb, &valid_foc) && + foc->len == TCP_FASTOPEN_COOKIE_SIZE && + foc->len == valid_foc.len && + !memcmp(foc->val, valid_foc.val, foc->len)) { + /* Cookie is valid. Create a (full) child socket to accept + * the data in SYN before returning a SYN-ACK to ack the + * data. If we fail to create the socket, fall back and + * ack the ISN only but includes the same cookie. + * + * Note: Data-less SYN with valid cookie is allowed to send + * data in SYN_RECV state. + */ +fastopen: + if (tcp_fastopen_create_child(sk, skb, dst, req)) { + foc->len = -1; + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENPASSIVE); + return true; + } + } + + NET_INC_STATS_BH(sock_net(sk), foc->len ? 
+ LINUX_MIB_TCPFASTOPENPASSIVEFAIL : + LINUX_MIB_TCPFASTOPENCOOKIEREQD); + *foc = valid_foc; + return false; } +EXPORT_SYMBOL(tcp_try_fastopen); diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 8ed9305dfdf..1c4908280d9 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -109,12 +109,12 @@ static void hstcp_init(struct sock *sk) tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); } -static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) +static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct hstcp *ca = inet_csk_ca(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) @@ -162,7 +162,6 @@ static struct tcp_congestion_ops tcp_highspeed __read_mostly = { .init = hstcp_init, .ssthresh = hstcp_ssthresh, .cong_avoid = hstcp_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .owner = THIS_MODULE, .name = "highspeed" diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 4a194acfd92..031361311a8 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -227,12 +227,12 @@ static u32 htcp_recalc_ssthresh(struct sock *sk) return max((tp->snd_cwnd * ca->beta) >> 7, 2U); } -static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) +static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 478fe82611b..d8f8f05a495 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -21,7 +21,7 @@ struct hybla { u32 rho2; /* Rho * Rho, integer part */ u32 rho_3ls; /* Rho parameter, <<3 */ u32 rho2_7ls; /* Rho^2, <<7 */ - u32 minrtt; /* Minimum smoothed round trip time value seen */ + u32 minrtt_us; /* Minimum smoothed round trip time value seen */ }; /* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */ @@ -35,7 +35,9 @@ static inline void hybla_recalc_param (struct sock *sk) { struct hybla *ca = inet_csk_ca(sk); - ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); + ca->rho_3ls = max_t(u32, + tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC), + 8U); ca->rho = ca->rho_3ls >> 3; ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; ca->rho2 = ca->rho2_7ls >> 7; @@ -59,7 +61,7 @@ static void hybla_init(struct sock *sk) hybla_recalc_param(sk); /* set minimum rtt as this is the 1st ever seen */ - ca->minrtt = tp->srtt; + ca->minrtt_us = tp->srtt_us; tp->snd_cwnd = ca->rho; } @@ -85,8 +87,7 @@ static inline u32 hybla_fraction(u32 odds) * o Give cwnd a new value based on the model proposed * o remember increments <1 */ -static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct hybla *ca = inet_csk_ca(sk); @@ -94,16 +95,16 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked, int is_slowstart = 0; /* Recalculate rho only if this srtt is the lowest */ - if (tp->srtt < ca->minrtt){ + if (tp->srtt_us < ca->minrtt_us) { hybla_recalc_param(sk); - ca->minrtt = tp->srtt; + ca->minrtt_us = tp->srtt_us; } - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (!ca->hybla_en) { - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + 
tcp_reno_cong_avoid(sk, ack, acked); return; } @@ -166,7 +167,6 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked, static struct tcp_congestion_ops tcp_hybla __read_mostly = { .init = hybla_init, .ssthresh = tcp_reno_ssthresh, - .min_cwnd = tcp_reno_min_cwnd, .cong_avoid = hybla_cong_avoid, .set_state = hybla_state, diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 8a520996f3d..5999b3972e6 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -23,7 +23,6 @@ #define ALPHA_MIN ((3*ALPHA_SCALE)/10) /* ~0.3 */ #define ALPHA_MAX (10*ALPHA_SCALE) /* 10.0 */ #define ALPHA_BASE ALPHA_SCALE /* 1.0 */ -#define U32_MAX ((u32)~0U) #define RTT_MAX (U32_MAX / ALPHA_MAX) /* 3.3 secs */ #define BETA_SHIFT 6 @@ -256,8 +255,7 @@ static void tcp_illinois_state(struct sock *sk, u8 new_state) /* * Increase window in response to successful acknowledgment. */ -static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct illinois *ca = inet_csk_ca(sk); @@ -266,7 +264,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked, update_params(sk); /* RFC2861 only increase cwnd if fully utilized */ - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; /* In slow start */ @@ -326,10 +324,8 @@ static void tcp_illinois_info(struct sock *sk, u32 ext, } static struct tcp_congestion_ops tcp_illinois __read_mostly = { - .flags = TCP_CONG_RTT_STAMP, .init = tcp_illinois_init, .ssthresh = tcp_illinois_ssthresh, - .min_cwnd = tcp_reno_min_cwnd, .cong_avoid = tcp_illinois_cong_avoid, .set_state = tcp_illinois_state, .get_info = tcp_illinois_info, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c53b7f35c51..40639c288dc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -667,10 +667,11 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) +static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) { struct tcp_sock *tp = tcp_sk(sk); - long m = mrtt; /* RTT */ + long m = mrtt_us; /* RTT */ + u32 srtt = tp->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev @@ -688,14 +689,12 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ - if (m == 0) - m = 1; - if (tp->srtt != 0) { - m -= (tp->srtt >> 3); /* m is now error in rtt est */ - tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */ + if (srtt != 0) { + m -= (srtt >> 3); /* m is now error in rtt est */ + srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ - m -= (tp->mdev >> 2); /* similar update on mdev */ + m -= (tp->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. 
* This solution is a bit different: we use finer gain @@ -707,27 +706,29 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) if (m > 0) m >>= 3; } else { - m -= (tp->mdev >> 2); /* similar update on mdev */ + m -= (tp->mdev_us >> 2); /* similar update on mdev */ } - tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ - if (tp->mdev > tp->mdev_max) { - tp->mdev_max = tp->mdev; - if (tp->mdev_max > tp->rttvar) - tp->rttvar = tp->mdev_max; + tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ + if (tp->mdev_us > tp->mdev_max_us) { + tp->mdev_max_us = tp->mdev_us; + if (tp->mdev_max_us > tp->rttvar_us) + tp->rttvar_us = tp->mdev_max_us; } if (after(tp->snd_una, tp->rtt_seq)) { - if (tp->mdev_max < tp->rttvar) - tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2; + if (tp->mdev_max_us < tp->rttvar_us) + tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; tp->rtt_seq = tp->snd_nxt; - tp->mdev_max = tcp_rto_min(sk); + tp->mdev_max_us = tcp_rto_min_us(sk); } } else { /* no previous measure. */ - tp->srtt = m << 3; /* take the measured time to be rtt */ - tp->mdev = m << 1; /* make sure rto = 3*rtt */ - tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); + srtt = m << 3; /* take the measured time to be rtt */ + tp->mdev_us = m << 1; /* make sure rto = 3*rtt */ + tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); + tp->mdev_max_us = tp->rttvar_us; tp->rtt_seq = tp->snd_nxt; } + tp->srtt_us = max(1U, srtt); } /* Set the sk_pacing_rate to allow proper sizing of TSO packets. @@ -742,18 +743,12 @@ static void tcp_update_pacing_rate(struct sock *sk) u64 rate; /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ - rate = (u64)tp->mss_cache * 2 * (HZ << 3); + rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3); rate *= max(tp->snd_cwnd, tp->packets_out); - /* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3), - * be conservative and assume srtt = 1 (125 us instead of 1.25 ms) - * We probably need usec resolution in the future. - * Note: This also takes care of possible srtt=0 case, - * when tcp_rtt_estimator() was not yet called. - */ - if (tp->srtt > 8 + 2) - do_div(rate, tp->srtt); + if (likely(tp->srtt_us)) + do_div(rate, tp->srtt_us); /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate * without any lock. We want to make sure compiler wont store @@ -766,7 +761,7 @@ static void tcp_update_pacing_rate(struct sock *sk) /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ -void tcp_set_rto(struct sock *sk) +static void tcp_set_rto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 8) @@ -1111,7 +1106,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } /* D-SACK for already forgotten data... Do dumb counting. */ - if (dup_sack && tp->undo_marker && tp->undo_retrans && + if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 && !after(end_seq_0, prior_snd_una) && after(end_seq_0, tp->undo_marker)) tp->undo_retrans--; @@ -1120,10 +1115,10 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } struct tcp_sacktag_state { - int reord; - int fack_count; - int flag; - s32 rtt; /* RTT measured by SACKing never-retransmitted data */ + int reord; + int fack_count; + long rtt_us; /* RTT measured by SACKing never-retransmitted data */ + int flag; }; /* Check if skb is fully within the SACK block. 
In presence of GSO skbs, @@ -1167,12 +1162,12 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, unsigned int new_len = (pkt_len / mss) * mss; if (!in_sack && new_len < pkt_len) { new_len += mss; - if (new_len > skb->len) + if (new_len >= skb->len) return 0; } pkt_len = new_len; } - err = tcp_fragment(sk, skb, pkt_len, mss); + err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC); if (err < 0) return err; } @@ -1184,14 +1179,15 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, u32 start_seq, u32 end_seq, - int dup_sack, int pcount, u32 xmit_time) + int dup_sack, int pcount, + const struct skb_mstamp *xmit_time) { struct tcp_sock *tp = tcp_sk(sk); int fack_count = state->fack_count; /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { - if (tp->undo_marker && tp->undo_retrans && + if (tp->undo_marker && tp->undo_retrans > 0 && after(end_seq, tp->undo_marker)) tp->undo_retrans--; if (sacked & TCPCB_SACKED_ACKED) @@ -1225,8 +1221,13 @@ static u8 tcp_sacktag_one(struct sock *sk, if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; /* Pick the earliest sequence sacked for RTT */ - if (state->rtt < 0) - state->rtt = tcp_time_stamp - xmit_time; + if (state->rtt_us < 0) { + struct skb_mstamp now; + + skb_mstamp_get(&now); + state->rtt_us = skb_mstamp_us_delta(&now, + xmit_time); + } } if (sacked & TCPCB_LOST) { @@ -1285,7 +1286,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, */ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, start_seq, end_seq, dup_sack, pcount, - TCP_SKB_CB(skb)->when); + &skb->skb_mstamp); if (skb == tp->lost_skb_hint) tp->lost_cnt_hint += pcount; @@ -1563,7 +1564,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, TCP_SKB_CB(skb)->end_seq, dup_sack, tcp_skb_pcount(skb), - TCP_SKB_CB(skb)->when); + &skb->skb_mstamp); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) @@ -1620,7 +1621,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl static int tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, - u32 prior_snd_una, s32 *sack_rtt) + u32 prior_snd_una, long *sack_rtt_us) { struct tcp_sock *tp = tcp_sk(sk); const unsigned char *ptr = (skb_transport_header(ack_skb) + @@ -1638,7 +1639,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, state.flag = 0; state.reord = tp->packets_out; - state.rtt = -1; + state.rtt_us = -1L; if (!tp->sacked_out) { if (WARN_ON(tp->fackets_out)) @@ -1822,7 +1823,7 @@ out: WARN_ON((int)tp->retrans_out < 0); WARN_ON((int)tcp_packets_in_flight(tp) < 0); #endif - *sack_rtt = state.rtt; + *sack_rtt_us = state.rtt_us; return state.flag; } @@ -1892,7 +1893,7 @@ static void tcp_clear_retrans_partial(struct tcp_sock *tp) tp->lost_out = 0; tp->undo_marker = 0; - tp->undo_retrans = 0; + tp->undo_retrans = -1; } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1943,8 +1944,9 @@ void tcp_enter_loss(struct sock *sk, int how) if (skb == tcp_send_head(sk)) break; - if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) tp->undo_marker = 0; + TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; @@ -2032,10 +2034,12 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int 
flag) * available, or RTO is scheduled to fire first. */ if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || - (flag & FLAG_ECE) || !tp->srtt) + (flag & FLAG_ECE) || !tp->srtt_us) return false; - delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); + delay = max(usecs_to_jiffies(tp->srtt_us >> 5), + msecs_to_jiffies(2)); + if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) return false; @@ -2237,7 +2241,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) break; mss = skb_shinfo(skb)->gso_size; - err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss); + err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, + mss, GFP_ATOMIC); if (err < 0) break; cnt = packets; @@ -2660,7 +2665,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) tp->prior_ssthresh = 0; tp->undo_marker = tp->snd_una; - tp->undo_retrans = tp->retrans_out; + tp->undo_retrans = tp->retrans_out ? : -1; if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { if (!ece_ack) @@ -2680,13 +2685,12 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) bool recovered = !before(tp->snd_una, tp->high_seq); if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ - if (flag & FLAG_ORIG_SACK_ACKED) { - /* Step 3.b. A timeout is spurious if not all data are - * lost, i.e., never-retransmitted data are (s)acked. - */ - tcp_try_undo_loss(sk, true); + /* Step 3.b. A timeout is spurious if not all data are + * lost, i.e., never-retransmitted data are (s)acked. + */ + if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED)) return; - } + if (after(tp->snd_nxt, tp->high_seq) && (flag & FLAG_DATA_SACKED || is_dupack)) { tp->frto = 0; /* Loss was real: 2nd part of step 3.a */ @@ -2882,7 +2886,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, } static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, - s32 seq_rtt, s32 sack_rtt) + long seq_rtt_us, long sack_rtt_us) { const struct tcp_sock *tp = tcp_sk(sk); @@ -2892,10 +2896,10 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, * is acked (RFC6298). */ if (flag & FLAG_RETRANS_DATA_ACKED) - seq_rtt = -1; + seq_rtt_us = -1L; - if (seq_rtt < 0) - seq_rtt = sack_rtt; + if (seq_rtt_us < 0) + seq_rtt_us = sack_rtt_us; /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment @@ -2903,14 +2907,14 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, * left edge of the send window. * See draft-ietf-tcplw-high-performance-00, section 3.3. */ - if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) - seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr); - if (seq_rtt < 0) + if (seq_rtt_us < 0) return false; - tcp_rtt_estimator(sk, seq_rtt); + tcp_rtt_estimator(sk, seq_rtt_us); tcp_set_rto(sk); /* RFC6298: only reset backoff on valid RTT measurement. 
*/ @@ -2922,22 +2926,23 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp) { struct tcp_sock *tp = tcp_sk(sk); - s32 seq_rtt = -1; + long seq_rtt_us = -1L; if (synack_stamp && !tp->total_retrans) - seq_rtt = tcp_time_stamp - synack_stamp; + seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp); /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() */ - if (!tp->srtt) - tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1); + if (!tp->srtt_us) + tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L); } -static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) +static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { const struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_ca_ops->cong_avoid(sk, ack, acked, in_flight); + + icsk->icsk_ca_ops->cong_avoid(sk, ack, acked); tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; } @@ -3020,26 +3025,27 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) * arrived at the other end. */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una, s32 sack_rtt) + u32 prior_snd_una, long sack_rtt_us) { - struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); - struct sk_buff *skb; - u32 now = tcp_time_stamp; + struct skb_mstamp first_ackt, last_ackt, now; + struct tcp_sock *tp = tcp_sk(sk); + u32 prior_sacked = tp->sacked_out; + u32 reord = tp->packets_out; bool fully_acked = true; - int flag = 0; + long ca_seq_rtt_us = -1L; + long seq_rtt_us = -1L; + struct sk_buff *skb; u32 pkts_acked = 0; - u32 reord = tp->packets_out; - u32 prior_sacked = tp->sacked_out; - s32 seq_rtt = -1; - s32 ca_seq_rtt = -1; - ktime_t last_ackt = net_invalid_timestamp(); bool rtt_update; + int flag = 0; + + first_ackt.v64 = 0; while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - u32 acked_pcount; u8 sacked = scb->sacked; + u32 acked_pcount; /* Determine how many packets and what bytes were acked, tso and else */ if (after(scb->end_seq, tp->snd_una)) { @@ -3061,11 +3067,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->retrans_out -= acked_pcount; flag |= FLAG_RETRANS_DATA_ACKED; } else { - ca_seq_rtt = now - scb->when; - last_ackt = skb->tstamp; - if (seq_rtt < 0) { - seq_rtt = ca_seq_rtt; - } + last_ackt = skb->skb_mstamp; + WARN_ON_ONCE(last_ackt.v64 == 0); + if (!first_ackt.v64) + first_ackt = last_ackt; + if (!(sacked & TCPCB_SACKED_ACKED)) reord = min(pkts_acked, reord); if (!after(scb->end_seq, tp->high_seq)) @@ -3111,7 +3117,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; - rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt); + skb_mstamp_get(&now); + if (first_ackt.v64) { + seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); + ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); + } + + rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); if (flag & FLAG_ACKED) { const struct tcp_congestion_ops *ca_ops @@ -3139,25 +3151,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, tp->fackets_out); - if (ca_ops->pkts_acked) { - s32 rtt_us = -1; - - /* Is the ACK triggering packet unambiguous? 
*/ - if (!(flag & FLAG_RETRANS_DATA_ACKED)) { - /* High resolution needed and available? */ - if (ca_ops->flags & TCP_CONG_RTT_STAMP && - !ktime_equal(last_ackt, - net_invalid_timestamp())) - rtt_us = ktime_us_delta(ktime_get_real(), - last_ackt); - else if (ca_seq_rtt >= 0) - rtt_us = jiffies_to_usecs(ca_seq_rtt); - } + if (ca_ops->pkts_acked) + ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us); - ca_ops->pkts_acked(sk, pkts_acked, rtt_us); - } - } else if (skb && rtt_update && sack_rtt >= 0 && - sack_rtt > (s32)(now - TCP_SKB_CB(skb)->when)) { + } else if (skb && rtt_update && sack_rtt_us >= 0 && + sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. @@ -3367,12 +3365,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; - u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt; u32 prior_fackets; int prior_packets = tp->packets_out; const int prior_unsacked = tp->packets_out - tp->sacked_out; int acked = 0; /* Number of packets newly acked */ - s32 sack_rtt = -1; + long sack_rtt_us = -1L; /* If the ack is older than previous acks * then we can probably ignore it. @@ -3400,7 +3397,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= FLAG_SND_UNA_ADVANCED; prior_fackets = tp->fackets_out; - prior_in_flight = tcp_packets_in_flight(tp); /* ts_recent update must be made after we are sure that the packet * is in window. @@ -3430,7 +3426,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_rtt); + &sack_rtt_us); if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; @@ -3449,12 +3445,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ acked = tp->packets_out; - flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt); + flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, + sack_rtt_us); acked -= tp->packets_out; /* Advance cwnd if state allows */ if (tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, acked, prior_in_flight); + tcp_cong_avoid(sk, ack, acked); if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); @@ -3472,8 +3469,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); - if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd) - tcp_update_pacing_rate(sk); + tcp_update_pacing_rate(sk); return 1; no_queue: @@ -3502,7 +3498,7 @@ old_ack: */ if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_rtt); + &sack_rtt_us); tcp_fastretrans_alert(sk, acked, prior_unsacked, is_dupack, flag); } @@ -3686,7 +3682,7 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th) int opcode = *ptr++; int opsize; - switch(opcode) { + switch (opcode) { case TCPOPT_EOL: return NULL; case TCPOPT_NOP: @@ -4046,7 +4042,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) WARN_ON(before(tp->rcv_nxt, sp->end_seq)); /* Zap this SACK, by moving forward any other SACKS. 
*/ - for (i=this_sack+1; i < num_sacks; i++) + for (i = this_sack+1; i < num_sacks; i++) tp->selective_acks[i-1] = tp->selective_acks[i]; num_sacks--; continue; @@ -4416,7 +4412,7 @@ queue_and_out: if (eaten > 0) kfree_skb_partial(skb, fragstolen); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); return; } @@ -4706,28 +4702,6 @@ static int tcp_prune_queue(struct sock *sk) return -1; } -/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. - * As additional protections, we do not touch cwnd in retransmission phases, - * and if application hit its sndbuf limit recently. - */ -void tcp_cwnd_application_limited(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && - sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - /* Limited by application or receiver window. */ - u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); - u32 win_used = max(tp->snd_cwnd_used, init_win); - if (win_used < tp->snd_cwnd) { - tp->snd_ssthresh = tcp_current_ssthresh(sk); - tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; - } - tp->snd_cwnd_used = 0; - } - tp->snd_cwnd_stamp = tcp_time_stamp; -} - static bool tcp_should_expand_sndbuf(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -4917,7 +4891,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t BUG(); tp->urg_data = TCP_URG_VALID | tmp; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } } } @@ -5003,11 +4977,11 @@ static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) || (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } } else if (chunk > 0) { tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } out: return copied_early; @@ -5278,7 +5252,7 @@ no_ack: #endif if (eaten) kfree_skb_partial(skb, fragstolen); - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); return; } } @@ -5398,9 +5372,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, break; } tcp_rearm_rto(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); return true; } tp->syn_data_acked = tp->syn_data; + if (tp->syn_data_acked) + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); return false; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 06721392475..77cccda1ad0 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -173,7 +173,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, IPPROTO_TCP, - orig_sport, orig_dport, sk, true); + orig_sport, orig_dport, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); if (err == -ENETUNREACH) @@ -336,8 +336,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) const int code = icmp_hdr(icmp_skb)->code; struct sock *sk; struct sk_buff *skb; - struct request_sock *req; - __u32 seq; + struct request_sock *fastopen; + __u32 seq, snd_una; __u32 remaining; int err; struct net *net = dev_net(icmp_skb->dev); @@ -378,12 +378,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) icsk = inet_csk(sk); tp = tcp_sk(sk); - req = tp->fastopen_rsk; seq = ntohl(th->seq); + /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ + fastopen = tp->fastopen_rsk; + snd_una = fastopen ? 
tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && - !between(seq, tp->snd_una, tp->snd_nxt) && - (req == NULL || seq != tcp_rsk(req)->snt_isn)) { - /* For a Fast Open socket, allow seq to be snt_isn. */ + !between(seq, snd_una, tp->snd_nxt)) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } @@ -426,16 +426,14 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) break; if (seq != tp->snd_una || !icsk->icsk_retransmits || - !icsk->icsk_backoff) + !icsk->icsk_backoff || fastopen) break; - /* XXX (TFO) - revisit the following logic for TFO */ - if (sock_owned_by_user(sk)) break; icsk->icsk_backoff--; - inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) : + inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT) << icsk->icsk_backoff; tcp_bound_rto(sk); @@ -462,14 +460,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) goto out; } - /* XXX (TFO) - if it's a TFO socket and has been accepted, rather - * than following the TCP_SYN_RECV case and closing the socket, - * we ignore the ICMP error and keep trying like a fully established - * socket. Is this the right thing to do? - */ - if (req && req->sk == NULL) - goto out; - switch (sk->sk_state) { struct request_sock *req, **prev; case TCP_LISTEN: @@ -502,10 +492,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) goto out; case TCP_SYN_SENT: - case TCP_SYN_RECV: /* Cannot happen. - It can f.e. if SYNs crossed, - or Fast Open. - */ + case TCP_SYN_RECV: + /* Only in fast or simultaneous open. If a fast open socket is + * is already accepted it is treated as a connected one below. + */ + if (fastopen && fastopen->sk == NULL) + break; + if (!sock_owned_by_user(sk)) { sk->sk_err = err; @@ -822,18 +815,19 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, */ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req, - u16 queue_mapping) + u16 queue_mapping, + struct tcp_fastopen_cookie *foc) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; int err = -1; - struct sk_buff * skb; + struct sk_buff *skb; /* First, grab a route. 
*/ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req, NULL); + skb = tcp_make_synack(sk, dst, req, foc); if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); @@ -852,10 +846,12 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) { - int res = tcp_v4_send_synack(sk, NULL, req, 0); + int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL); - if (!res) + if (!res) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + } return res; } @@ -878,8 +874,6 @@ bool tcp_syn_flood_action(struct sock *sk, bool want_cookie = false; struct listen_sock *lopt; - - #ifdef CONFIG_SYN_COOKIES if (sysctl_tcp_syncookies) { msg = "Sending cookies"; @@ -1260,187 +1254,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { }; #endif -static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct tcp_fastopen_cookie *foc, - struct tcp_fastopen_cookie *valid_foc) -{ - bool skip_cookie = false; - struct fastopen_queue *fastopenq; - - if (likely(!fastopen_cookie_present(foc))) { - /* See include/net/tcp.h for the meaning of these knobs */ - if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) || - ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) && - (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1))) - skip_cookie = true; /* no cookie to validate */ - else - return false; - } - fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq; - /* A FO option is present; bump the counter. */ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); - - /* Make sure the listener has enabled fastopen, and we don't - * exceed the max # of pending TFO requests allowed before trying - * to validating the cookie in order to avoid burning CPU cycles - * unnecessarily. - * - * XXX (TFO) - The implication of checking the max_qlen before - * processing a cookie request is that clients can't differentiate - * between qlen overflow causing Fast Open to be disabled - * temporarily vs a server not supporting Fast Open at all. - */ - if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 || - fastopenq == NULL || fastopenq->max_qlen == 0) - return false; - - if (fastopenq->qlen >= fastopenq->max_qlen) { - struct request_sock *req1; - spin_lock(&fastopenq->lock); - req1 = fastopenq->rskq_rst_head; - if ((req1 == NULL) || time_after(req1->expires, jiffies)) { - spin_unlock(&fastopenq->lock); - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); - /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/ - foc->len = -1; - return false; - } - fastopenq->rskq_rst_head = req1->dl_next; - fastopenq->qlen--; - spin_unlock(&fastopenq->lock); - reqsk_free(req1); - } - if (skip_cookie) { - tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - return true; - } - - if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) { - if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) { - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, valid_foc); - if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) || - memcmp(&foc->val[0], &valid_foc->val[0], - TCP_FASTOPEN_COOKIE_SIZE) != 0) - return false; - valid_foc->len = -1; - } - /* Acknowledge the data received from the peer. 
*/ - tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - return true; - } else if (foc->len == 0) { /* Client requesting a cookie */ - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, valid_foc); - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENCOOKIEREQD); - } else { - /* Client sent a cookie with wrong size. Treat it - * the same as invalid and return a valid one. - */ - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, valid_foc); - } - return false; -} - -static int tcp_v4_conn_req_fastopen(struct sock *sk, - struct sk_buff *skb, - struct sk_buff *skb_synack, - struct request_sock *req) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; - const struct inet_request_sock *ireq = inet_rsk(req); - struct sock *child; - int err; - - req->num_retrans = 0; - req->num_timeout = 0; - req->sk = NULL; - - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); - if (child == NULL) { - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENPASSIVEFAIL); - kfree_skb(skb_synack); - return -1; - } - err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr, - ireq->ir_rmt_addr, ireq->opt); - err = net_xmit_eval(err); - if (!err) - tcp_rsk(req)->snt_synack = tcp_time_stamp; - /* XXX (TFO) - is it ok to ignore error and continue? */ - - spin_lock(&queue->fastopenq->lock); - queue->fastopenq->qlen++; - spin_unlock(&queue->fastopenq->lock); - - /* Initialize the child socket. Have to fix some values to take - * into account the child is a Fast Open socket and is created - * only out of the bits carried in the SYN packet. - */ - tp = tcp_sk(child); - - tp->fastopen_rsk = req; - /* Do a hold on the listner sk so that if the listener is being - * closed, the child that has been accepted can live on and still - * access listen_lock. - */ - sock_hold(sk); - tcp_rsk(req)->listener = sk; - - /* RFC1323: The window in SYN & SYN/ACK segments is never - * scaled. So correct it appropriately. - */ - tp->snd_wnd = ntohs(tcp_hdr(skb)->window); - - /* Activate the retrans timer so that SYNACK can be retransmitted. - * The request socket is not added to the SYN table of the parent - * because it's been added to the accept queue directly. - */ - inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, - TCP_TIMEOUT_INIT, TCP_RTO_MAX); - - /* Add the child socket directly into the accept queue */ - inet_csk_reqsk_queue_add(sk, req, child); - - /* Now finish processing the fastopen child socket. */ - inet_csk(child)->icsk_af_ops->rebuild_header(child); - tcp_init_congestion_control(child); - tcp_mtup_init(child); - tcp_init_metrics(child); - tcp_init_buffer_space(child); - - /* Queue the data carried in the SYN packet. We need to first - * bump skb's refcnt because the caller will attempt to free it. - * - * XXX (TFO) - we honor a zero-payload TFO request for now. - * (Any reason not to?) - */ - if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) { - /* Don't queue the skb if there is no payload in SYN. - * XXX (TFO) - How about SYN+FIN? 
- */ - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - } else { - skb = skb_get(skb); - skb_dst_drop(skb); - __skb_pull(skb, tcp_hdr(skb)->doff * 4); - skb_set_owner_r(skb, child); - __skb_queue_tail(&child->sk_receive_queue, skb); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - tp->syn_data_acked = 1; - } - sk->sk_data_ready(sk, 0); - bh_unlock_sock(child); - sock_put(child); - WARN_ON(req->sk == NULL); - return 0; -} - int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct tcp_options_received tmp_opt; @@ -1451,12 +1264,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) __be32 saddr = ip_hdr(skb)->saddr; __be32 daddr = ip_hdr(skb)->daddr; __u32 isn = TCP_SKB_CB(skb)->when; - bool want_cookie = false; + bool want_cookie = false, fastopen; struct flowi4 fl4; struct tcp_fastopen_cookie foc = { .len = -1 }; - struct tcp_fastopen_cookie valid_foc = { .len = -1 }; - struct sk_buff *skb_synack; - int do_fastopen; + int err; /* Never answer to SYNs send to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -1507,6 +1318,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) ireq->ir_rmt_addr = saddr; ireq->no_srccheck = inet_sk(sk)->transparent; ireq->opt = tcp_v4_save_options(skb); + ireq->ir_mark = inet_request_mark(sk, skb); if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; @@ -1555,52 +1367,24 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) isn = tcp_v4_init_sequence(skb); } - tcp_rsk(req)->snt_isn = isn; - - if (dst == NULL) { - dst = inet_csk_route_req(sk, &fl4, req); - if (dst == NULL) - goto drop_and_free; - } - do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc); - - /* We don't call tcp_v4_send_synack() directly because we need - * to make sure a child socket can be created successfully before - * sending back synack! - * - * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack() - * (or better yet, call tcp_send_synack() in the child context - * directly, but will have to fix bunch of other code first) - * after syn_recv_sock() except one will need to first fix the - * latter to remove its dependency on the current implementation - * of tcp_v4_send_synack()->tcp_select_initial_window(). - */ - skb_synack = tcp_make_synack(sk, dst, req, - fastopen_cookie_present(&valid_foc) ? 
&valid_foc : NULL); - - if (skb_synack) { - __tcp_v4_send_check(skb_synack, ireq->ir_loc_addr, ireq->ir_rmt_addr); - skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb)); - } else + if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) goto drop_and_free; - if (likely(!do_fastopen)) { - int err; - err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr, - ireq->ir_rmt_addr, ireq->opt); - err = net_xmit_eval(err); + tcp_rsk(req)->snt_isn = isn; + tcp_rsk(req)->snt_synack = tcp_time_stamp; + tcp_openreq_init_rwin(req, sk, dst); + fastopen = !want_cookie && + tcp_try_fastopen(sk, skb, req, &foc, dst); + err = tcp_v4_send_synack(sk, dst, req, + skb_get_queue_mapping(skb), &foc); + if (!fastopen) { if (err || want_cookie) goto drop_and_free; tcp_rsk(req)->snt_synack = tcp_time_stamp; tcp_rsk(req)->listener = NULL; - /* Add the request_sock to the SYN table */ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); - if (fastopen_cookie_present(&foc) && foc.len != 0) - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENPASSIVEFAIL); - } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req)) - goto drop_and_free; + } return 0; @@ -1668,7 +1452,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, } sk_setup_caps(newsk, dst); - tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric_advmss(dst); if (tcp_sk(sk)->rx_opt.user_mss && @@ -1745,28 +1528,6 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) return sk; } -static __sum16 tcp_v4_checksum_init(struct sk_buff *skb) -{ - const struct iphdr *iph = ip_hdr(skb); - - if (skb->ip_summed == CHECKSUM_COMPLETE) { - if (!tcp_v4_check(skb->len, iph->saddr, - iph->daddr, skb->csum)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - return 0; - } - } - - skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, - skb->len, IPPROTO_TCP, 0); - - if (skb->len <= 76) { - return __skb_checksum_complete(skb); - } - return 0; -} - - /* The socket must have it's spinlock held when we get * here. * @@ -1961,7 +1722,8 @@ int tcp_v4_rcv(struct sk_buff *skb) * Packet length and doff are validated by header prediction, * provided case of th->doff==0 is eliminated. * So, we defer the checks. */ - if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb)) + + if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) goto csum_error; th = tcp_hdr(skb); @@ -2629,7 +2391,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw, { __be32 dest, src; __u16 destp, srcp; - long delta = tw->tw_ttd - jiffies; + s32 delta = tw->tw_ttd - inet_tw_time_stamp(); dest = tw->tw_daddr; src = tw->tw_rcv_saddr; diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index 991d62a2f9b..1e70fa8fa79 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -115,13 +115,12 @@ static void tcp_lp_init(struct sock *sk) * Will only call newReno CA when away from inference. * From TCP-LP's paper, this will be handled in additive increasement. 
*/ -static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct lp *lp = inet_csk_ca(sk); if (!(lp->flag & LP_WITHIN_INF)) - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); } /** @@ -315,11 +314,9 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us) } static struct tcp_congestion_ops tcp_lp __read_mostly = { - .flags = TCP_CONG_RTT_STAMP, .init = tcp_lp_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_lp_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .pkts_acked = tcp_lp_pkts_acked, .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index f7e522c558b..f7a2ec3ac58 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -102,17 +102,19 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) return 0; } -static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buffer) +static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned long long val; int ret = 0; - switch (cft->private) { + buf = strstrip(buf); + + switch (of_cft(of)->private) { case RES_LIMIT: /* see memcontrol.c */ - ret = res_counter_memparse_write_strategy(buffer, &val); + ret = res_counter_memparse_write_strategy(buf, &val); if (ret) break; ret = tcp_update_limit(memcg, val); @@ -121,7 +123,7 @@ static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, ret = -EINVAL; break; } - return ret; + return ret ?: nbytes; } static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val) @@ -168,17 +170,18 @@ static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) return val; } -static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) +static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg; struct cg_proto *cg_proto; - memcg = mem_cgroup_from_css(css); + memcg = mem_cgroup_from_css(of_css(of)); cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) - return 0; + return nbytes; - switch (event) { + switch (of_cft(of)->private) { case RES_MAX_USAGE: res_counter_reset_max(&cg_proto->memory_allocated); break; @@ -187,13 +190,13 @@ static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) break; } - return 0; + return nbytes; } static struct cftype tcp_files[] = { { .name = "kmem.tcp.limit_in_bytes", - .write_string = tcp_cgroup_write, + .write = tcp_cgroup_write, .read_u64 = tcp_cgroup_read, .private = RES_LIMIT, }, @@ -205,13 +208,13 @@ static struct cftype tcp_files[] = { { .name = "kmem.tcp.failcnt", .private = RES_FAILCNT, - .trigger = tcp_cgroup_reset, + .write = tcp_cgroup_reset, .read_u64 = tcp_cgroup_read, }, { .name = "kmem.tcp.max_usage_in_bytes", .private = RES_MAX_USAGE, - .trigger = tcp_cgroup_reset, + .write = tcp_cgroup_reset, .read_u64 = tcp_cgroup_read, }, { } /* terminate */ @@ -219,7 +222,7 @@ static struct cftype tcp_files[] = { static int __init tcp_memcontrol_init(void) { - WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files)); + WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files)); return 0; } __initcall(tcp_memcontrol_init); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 
06493736fbc..4fe04180598 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -22,6 +22,10 @@ int sysctl_tcp_nometrics_save __read_mostly; +static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, + const struct inetpeer_addr *daddr, + struct net *net, unsigned int hash); + struct tcp_fastopen_metrics { u16 mss; u16 syn_loss:10; /* Recurring Fast Open SYN losses */ @@ -29,14 +33,20 @@ struct tcp_fastopen_metrics { struct tcp_fastopen_cookie cookie; }; +/* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility + * Kernel only stores RTT and RTTVAR in usec resolution + */ +#define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2) + struct tcp_metrics_block { struct tcp_metrics_block __rcu *tcpm_next; - struct inetpeer_addr tcpm_addr; + struct inetpeer_addr tcpm_saddr; + struct inetpeer_addr tcpm_daddr; unsigned long tcpm_stamp; u32 tcpm_ts; u32 tcpm_ts_stamp; u32 tcpm_lock; - u32 tcpm_vals[TCP_METRIC_MAX + 1]; + u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1]; struct tcp_fastopen_metrics tcpm_fastopen; struct rcu_head rcu_head; @@ -54,12 +64,6 @@ static u32 tcp_metric_get(struct tcp_metrics_block *tm, return tm->tcpm_vals[idx]; } -static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm, - enum tcp_metric_index idx) -{ - return msecs_to_jiffies(tm->tcpm_vals[idx]); -} - static void tcp_metric_set(struct tcp_metrics_block *tm, enum tcp_metric_index idx, u32 val) @@ -67,13 +71,6 @@ static void tcp_metric_set(struct tcp_metrics_block *tm, tm->tcpm_vals[idx] = val; } -static void tcp_metric_set_msecs(struct tcp_metrics_block *tm, - enum tcp_metric_index idx, - u32 val) -{ - tm->tcpm_vals[idx] = jiffies_to_msecs(val); -} - static bool addr_same(const struct inetpeer_addr *a, const struct inetpeer_addr *b) { @@ -96,9 +93,11 @@ struct tcpm_hash_bucket { static DEFINE_SPINLOCK(tcp_metrics_lock); -static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst, +static void tcpm_suck_dst(struct tcp_metrics_block *tm, + const struct dst_entry *dst, bool fastopen_clear) { + u32 msval; u32 val; tm->tcpm_stamp = jiffies; @@ -116,8 +115,11 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst, val |= 1 << TCP_METRIC_REORDERING; tm->tcpm_lock = val; - tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT); - tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR); + msval = dst_metric_raw(dst, RTAX_RTT); + tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC; + + msval = dst_metric_raw(dst, RTAX_RTTVAR); + tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC; tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); @@ -130,16 +132,42 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst, } } +#define TCP_METRICS_TIMEOUT (60 * 60 * HZ) + +static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) +{ + if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) + tcpm_suck_dst(tm, dst, false); +} + +#define TCP_METRICS_RECLAIM_DEPTH 5 +#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL + static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, - struct inetpeer_addr *addr, - unsigned int hash, - bool reclaim) + struct inetpeer_addr *saddr, + struct inetpeer_addr *daddr, + unsigned int hash) { struct tcp_metrics_block *tm; struct net *net; + bool 
reclaim = false; spin_lock_bh(&tcp_metrics_lock); net = dev_net(dst->dev); + + /* While waiting for the spin-lock the cache might have been populated + * with this entry and so we have to check again. + */ + tm = __tcp_get_metrics(saddr, daddr, net, hash); + if (tm == TCP_METRICS_RECLAIM_PTR) { + reclaim = true; + tm = NULL; + } + if (tm) { + tcpm_check_stamp(tm, dst); + goto out_unlock; + } + if (unlikely(reclaim)) { struct tcp_metrics_block *oldest; @@ -155,7 +183,8 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, if (!tm) goto out_unlock; } - tm->tcpm_addr = *addr; + tm->tcpm_saddr = *saddr; + tm->tcpm_daddr = *daddr; tcpm_suck_dst(tm, dst, true); @@ -169,17 +198,6 @@ out_unlock: return tm; } -#define TCP_METRICS_TIMEOUT (60 * 60 * HZ) - -static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) -{ - if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) - tcpm_suck_dst(tm, dst, false); -} - -#define TCP_METRICS_RECLAIM_DEPTH 5 -#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL - static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth) { if (tm) @@ -189,7 +207,8 @@ static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, in return NULL; } -static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr, +static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, + const struct inetpeer_addr *daddr, struct net *net, unsigned int hash) { struct tcp_metrics_block *tm; @@ -197,7 +216,8 @@ static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *a for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { - if (addr_same(&tm->tcpm_addr, addr)) + if (addr_same(&tm->tcpm_saddr, saddr) && + addr_same(&tm->tcpm_daddr, daddr)) break; depth++; } @@ -208,19 +228,22 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, struct dst_entry *dst) { struct tcp_metrics_block *tm; - struct inetpeer_addr addr; + struct inetpeer_addr saddr, daddr; unsigned int hash; struct net *net; - addr.family = req->rsk_ops->family; - switch (addr.family) { + saddr.family = req->rsk_ops->family; + daddr.family = req->rsk_ops->family; + switch (daddr.family) { case AF_INET: - addr.addr.a4 = inet_rsk(req)->ir_rmt_addr; - hash = (__force unsigned int) addr.addr.a4; + saddr.addr.a4 = inet_rsk(req)->ir_loc_addr; + daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr; + hash = (__force unsigned int) daddr.addr.a4; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - *(struct in6_addr *)addr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr; + *(struct in6_addr *)saddr.addr.a6 = inet_rsk(req)->ir_v6_loc_addr; + *(struct in6_addr *)daddr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr; hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr); break; #endif @@ -233,7 +256,8 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { - if (addr_same(&tm->tcpm_addr, &addr)) + if (addr_same(&tm->tcpm_saddr, &saddr) && + addr_same(&tm->tcpm_daddr, &daddr)) break; } tcpm_check_stamp(tm, dst); @@ -243,32 +267,44 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw) { struct tcp_metrics_block *tm; - struct inetpeer_addr addr; + struct 
inetpeer_addr saddr, daddr; unsigned int hash; struct net *net; - addr.family = tw->tw_family; - switch (addr.family) { - case AF_INET: - addr.addr.a4 = tw->tw_daddr; - hash = (__force unsigned int) addr.addr.a4; - break; + if (tw->tw_family == AF_INET) { + saddr.family = AF_INET; + saddr.addr.a4 = tw->tw_rcv_saddr; + daddr.family = AF_INET; + daddr.addr.a4 = tw->tw_daddr; + hash = (__force unsigned int) daddr.addr.a4; + } #if IS_ENABLED(CONFIG_IPV6) - case AF_INET6: - *(struct in6_addr *)addr.addr.a6 = tw->tw_v6_daddr; - hash = ipv6_addr_hash(&tw->tw_v6_daddr); - break; + else if (tw->tw_family == AF_INET6) { + if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) { + saddr.family = AF_INET; + saddr.addr.a4 = tw->tw_rcv_saddr; + daddr.family = AF_INET; + daddr.addr.a4 = tw->tw_daddr; + hash = (__force unsigned int) daddr.addr.a4; + } else { + saddr.family = AF_INET6; + *(struct in6_addr *)saddr.addr.a6 = tw->tw_v6_rcv_saddr; + daddr.family = AF_INET6; + *(struct in6_addr *)daddr.addr.a6 = tw->tw_v6_daddr; + hash = ipv6_addr_hash(&tw->tw_v6_daddr); + } + } #endif - default: + else return NULL; - } net = twsk_net(tw); hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { - if (addr_same(&tm->tcpm_addr, &addr)) + if (addr_same(&tm->tcpm_saddr, &saddr) && + addr_same(&tm->tcpm_daddr, &daddr)) break; } return tm; @@ -279,38 +315,45 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, bool create) { struct tcp_metrics_block *tm; - struct inetpeer_addr addr; + struct inetpeer_addr saddr, daddr; unsigned int hash; struct net *net; - bool reclaim; - addr.family = sk->sk_family; - switch (addr.family) { - case AF_INET: - addr.addr.a4 = inet_sk(sk)->inet_daddr; - hash = (__force unsigned int) addr.addr.a4; - break; + if (sk->sk_family == AF_INET) { + saddr.family = AF_INET; + saddr.addr.a4 = inet_sk(sk)->inet_saddr; + daddr.family = AF_INET; + daddr.addr.a4 = inet_sk(sk)->inet_daddr; + hash = (__force unsigned int) daddr.addr.a4; + } #if IS_ENABLED(CONFIG_IPV6) - case AF_INET6: - *(struct in6_addr *)addr.addr.a6 = sk->sk_v6_daddr; - hash = ipv6_addr_hash(&sk->sk_v6_daddr); - break; + else if (sk->sk_family == AF_INET6) { + if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { + saddr.family = AF_INET; + saddr.addr.a4 = inet_sk(sk)->inet_saddr; + daddr.family = AF_INET; + daddr.addr.a4 = inet_sk(sk)->inet_daddr; + hash = (__force unsigned int) daddr.addr.a4; + } else { + saddr.family = AF_INET6; + *(struct in6_addr *)saddr.addr.a6 = sk->sk_v6_rcv_saddr; + daddr.family = AF_INET6; + *(struct in6_addr *)daddr.addr.a6 = sk->sk_v6_daddr; + hash = ipv6_addr_hash(&sk->sk_v6_daddr); + } + } #endif - default: + else return NULL; - } net = dev_net(dst->dev); hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); - tm = __tcp_get_metrics(&addr, net, hash); - reclaim = false; - if (tm == TCP_METRICS_RECLAIM_PTR) { - reclaim = true; + tm = __tcp_get_metrics(&saddr, &daddr, net, hash); + if (tm == TCP_METRICS_RECLAIM_PTR) tm = NULL; - } if (!tm && create) - tm = tcpm_new(dst, &addr, hash, reclaim); + tm = tcpm_new(dst, &saddr, &daddr, hash); else tcpm_check_stamp(tm, dst); @@ -338,7 +381,7 @@ void tcp_update_metrics(struct sock *sk) dst_confirm(dst); rcu_read_lock(); - if (icsk->icsk_backoff || !tp->srtt) { + if (icsk->icsk_backoff || !tp->srtt_us) { /* This session failed to estimate rtt. Why? * Probably, no packets returned in time. Reset our * results. 
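The tcp_input.c and tcp_metrics.c hunks above (and the tcp_update_metrics hunk below) move TCP's RTT state — srtt, mdev/rttvar and the cached per-destination metrics — from jiffies and milliseconds to microsecond resolution, while keeping the classic Jacobson/Karels smoothing. The following is a minimal user-space sketch of that smoothing in microseconds, illustrative only: names such as rtt_state and rtt_sample are not kernel symbols, and the mdev_max/rtt_seq windowing and RTO clamping of the real tcp_rtt_estimator() are deliberately omitted.

#include <stdio.h>

struct rtt_state {
	long srtt_us;   /* smoothed RTT, stored << 3 (i.e. 8 * srtt) */
	long rttvar_us; /* mean deviation, stored << 2 (i.e. 4 * rttvar) */
};

static void rtt_sample(struct rtt_state *s, long m_us)
{
	if (s->srtt_us) {
		long m = m_us - (s->srtt_us >> 3);  /* error vs. current estimate */

		s->srtt_us += m;                    /* srtt = 7/8 srtt + 1/8 sample */
		if (m < 0)
			m = -m;                     /* |error| */
		m -= (s->rttvar_us >> 2);
		s->rttvar_us += m;                  /* rttvar = 3/4 rttvar + 1/4 |error| */
	} else {
		s->srtt_us = m_us << 3;             /* first sample seeds the estimate */
		s->rttvar_us = m_us << 1;           /* so the initial RTO is ~3 * RTT */
	}
}

int main(void)
{
	struct rtt_state s = { 0, 0 };
	long samples_us[] = { 40000, 42000, 39000, 55000 };
	int i;

	for (i = 0; i < 4; i++) {
		rtt_sample(&s, samples_us[i]);
		printf("srtt=%ld us rttvar=%ld us rto~%ld us\n",
		       s.srtt_us >> 3, s.rttvar_us >> 2,
		       (s.srtt_us >> 3) + s.rttvar_us); /* RTO ~ srtt + 4*rttvar */
	}
	return 0;
}

Keeping srtt shifted left by 3 and the deviation shifted left by 2 lets the 1/8 and 1/4 gains be applied with integer shifts, which is why the kernel hunks in this section work on srtt_us >> 3 and mdev_us >> 2 rather than on the plain values, and why the metrics cache now stores RTT and RTTVAR in microseconds (msval * USEC_PER_MSEC) instead of jiffies.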
@@ -353,8 +396,8 @@ void tcp_update_metrics(struct sock *sk) if (!tm) goto out_unlock; - rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); - m = rtt - tp->srtt; + rtt = tcp_metric_get(tm, TCP_METRIC_RTT); + m = rtt - tp->srtt_us; /* If newly calculated rtt larger than stored one, store new * one. Otherwise, use EWMA. Remember, rtt overestimation is @@ -362,10 +405,10 @@ void tcp_update_metrics(struct sock *sk) */ if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) { if (m <= 0) - rtt = tp->srtt; + rtt = tp->srtt_us; else rtt -= (m >> 3); - tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt); + tcp_metric_set(tm, TCP_METRIC_RTT, rtt); } if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) { @@ -376,16 +419,16 @@ void tcp_update_metrics(struct sock *sk) /* Scale deviation to rttvar fixed point */ m >>= 1; - if (m < tp->mdev) - m = tp->mdev; + if (m < tp->mdev_us) + m = tp->mdev_us; - var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); + var = tcp_metric_get(tm, TCP_METRIC_RTTVAR); if (m >= var) var = m; else var -= (var - m) >> 2; - tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var); + tcp_metric_set(tm, TCP_METRIC_RTTVAR, var); } if (tcp_in_initial_slowstart(tp)) { @@ -482,7 +525,7 @@ void tcp_init_metrics(struct sock *sk) tp->reordering = val; } - crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); + crtt = tcp_metric_get(tm, TCP_METRIC_RTT); rcu_read_unlock(); reset: /* The initial RTT measurement from the SYN/SYN-ACK is not ideal @@ -505,18 +548,20 @@ reset: * to low value, and then abruptly stops to do it and starts to delay * ACKs, wait for troubles. */ - if (crtt > tp->srtt) { + if (crtt > tp->srtt_us) { /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */ - crtt >>= 3; + crtt /= 8 * USEC_PER_MSEC; inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk)); - } else if (tp->srtt == 0) { + } else if (tp->srtt_us == 0) { /* RFC6298: 5.7 We've failed to get a valid RTT sample from * 3WHS. This is most likely due to retransmission, * including spurious one. Reset the RTO back to 3secs * from the more aggressive 1sec to avoid more spurious * retransmission. 
*/ - tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; + tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK); + tp->mdev_us = tp->mdev_max_us = tp->rttvar_us; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; } /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been @@ -724,15 +769,21 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, struct nlattr *nest; int i; - switch (tm->tcpm_addr.family) { + switch (tm->tcpm_daddr.family) { case AF_INET: if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4, - tm->tcpm_addr.addr.a4) < 0) + tm->tcpm_daddr.addr.a4) < 0) + goto nla_put_failure; + if (nla_put_be32(msg, TCP_METRICS_ATTR_SADDR_IPV4, + tm->tcpm_saddr.addr.a4) < 0) goto nla_put_failure; break; case AF_INET6: if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16, - tm->tcpm_addr.addr.a6) < 0) + tm->tcpm_daddr.addr.a6) < 0) + goto nla_put_failure; + if (nla_put(msg, TCP_METRICS_ATTR_SADDR_IPV6, 16, + tm->tcpm_saddr.addr.a6) < 0) goto nla_put_failure; break; default: @@ -757,10 +808,26 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS); if (!nest) goto nla_put_failure; - for (i = 0; i < TCP_METRIC_MAX + 1; i++) { - if (!tm->tcpm_vals[i]) + for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) { + u32 val = tm->tcpm_vals[i]; + + if (!val) continue; - if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0) + if (i == TCP_METRIC_RTT) { + if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1, + val) < 0) + goto nla_put_failure; + n++; + val = max(val / 1000, 1U); + } + if (i == TCP_METRIC_RTTVAR) { + if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1, + val) < 0) + goto nla_put_failure; + n++; + val = max(val / 1000, 1U); + } + if (nla_put_u32(msg, i + 1, val) < 0) goto nla_put_failure; n++; } @@ -855,44 +922,66 @@ done: return skb->len; } -static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, - unsigned int *hash, int optional) +static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, + unsigned int *hash, int optional, int v4, int v6) { struct nlattr *a; - a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV4]; + a = info->attrs[v4]; if (a) { addr->family = AF_INET; addr->addr.a4 = nla_get_be32(a); - *hash = (__force unsigned int) addr->addr.a4; + if (hash) + *hash = (__force unsigned int) addr->addr.a4; return 0; } - a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV6]; + a = info->attrs[v6]; if (a) { if (nla_len(a) != sizeof(struct in6_addr)) return -EINVAL; addr->family = AF_INET6; memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6)); - *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6); + if (hash) + *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6); return 0; } return optional ? 
1 : -EAFNOSUPPORT; } +static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, + unsigned int *hash, int optional) +{ + return __parse_nl_addr(info, addr, hash, optional, + TCP_METRICS_ATTR_ADDR_IPV4, + TCP_METRICS_ATTR_ADDR_IPV6); +} + +static int parse_nl_saddr(struct genl_info *info, struct inetpeer_addr *addr) +{ + return __parse_nl_addr(info, addr, NULL, 0, + TCP_METRICS_ATTR_SADDR_IPV4, + TCP_METRICS_ATTR_SADDR_IPV6); +} + static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct tcp_metrics_block *tm; - struct inetpeer_addr addr; + struct inetpeer_addr saddr, daddr; unsigned int hash; struct sk_buff *msg; struct net *net = genl_info_net(info); void *reply; int ret; + bool src = true; - ret = parse_nl_addr(info, &addr, &hash, 0); + ret = parse_nl_addr(info, &daddr, &hash, 0); if (ret < 0) return ret; + ret = parse_nl_saddr(info, &saddr); + if (ret < 0) + src = false; + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; @@ -907,7 +996,8 @@ static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) rcu_read_lock(); for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { - if (addr_same(&tm->tcpm_addr, &addr)) { + if (addr_same(&tm->tcpm_daddr, &daddr) && + (!src || addr_same(&tm->tcpm_saddr, &saddr))) { ret = tcp_metrics_fill_info(msg, tm); break; } @@ -962,32 +1052,38 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) struct tcpm_hash_bucket *hb; struct tcp_metrics_block *tm; struct tcp_metrics_block __rcu **pp; - struct inetpeer_addr addr; + struct inetpeer_addr saddr, daddr; unsigned int hash; struct net *net = genl_info_net(info); int ret; + bool src = true, found = false; - ret = parse_nl_addr(info, &addr, &hash, 1); + ret = parse_nl_addr(info, &daddr, &hash, 1); if (ret < 0) return ret; if (ret > 0) return tcp_metrics_flush_all(net); + ret = parse_nl_saddr(info, &saddr); + if (ret < 0) + src = false; hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); hb = net->ipv4.tcp_metrics_hash + hash; pp = &hb->chain; spin_lock_bh(&tcp_metrics_lock); - for (tm = deref_locked_genl(*pp); tm; - pp = &tm->tcpm_next, tm = deref_locked_genl(*pp)) { - if (addr_same(&tm->tcpm_addr, &addr)) { + for (tm = deref_locked_genl(*pp); tm; tm = deref_locked_genl(*pp)) { + if (addr_same(&tm->tcpm_daddr, &daddr) && + (!src || addr_same(&tm->tcpm_saddr, &saddr))) { *pp = tm->tcpm_next; - break; + kfree_rcu(tm, rcu_head); + found = true; + } else { + pp = &tm->tcpm_next; } } spin_unlock_bh(&tcp_metrics_lock); - if (!tm) + if (!found) return -ESRCH; - kfree_rcu(tm, rcu_head); return 0; } @@ -1063,10 +1159,7 @@ static void __net_exit tcp_net_metrics_exit(struct net *net) tm = next; } } - if (is_vmalloc_addr(net->ipv4.tcp_metrics_hash)) - vfree(net->ipv4.tcp_metrics_hash); - else - kfree(net->ipv4.tcp_metrics_hash); + kvfree(net->ipv4.tcp_metrics_hash); } static __net_initdata struct pernet_operations tcp_net_metrics_ops = { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 97b68415986..e68e0d4af6c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -297,6 +297,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_v6_daddr = sk->sk_v6_daddr; tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; + tw->tw_flowlabel = np->flow_label >> 12; tw->tw_ipv6only = np->ipv6only; } #endif @@ -361,6 +362,37 @@ void tcp_twsk_destructor(struct sock *sk) } 
EXPORT_SYMBOL_GPL(tcp_twsk_destructor); +void tcp_openreq_init_rwin(struct request_sock *req, + struct sock *sk, struct dst_entry *dst) +{ + struct inet_request_sock *ireq = inet_rsk(req); + struct tcp_sock *tp = tcp_sk(sk); + __u8 rcv_wscale; + int mss = dst_metric_advmss(dst); + + if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) + mss = tp->rx_opt.user_mss; + + /* Set this up on the first call only */ + req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + + /* limit the window selection if the user enforce a smaller rx buffer */ + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) + req->window_clamp = tcp_full_space(sk); + + /* tcp_full_space because it is guaranteed to be the first packet */ + tcp_select_initial_window(tcp_full_space(sk), + mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), + &req->rcv_wnd, + &req->window_clamp, + ireq->wscale_ok, + &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); + ireq->rcv_wscale = rcv_wscale; +} +EXPORT_SYMBOL(tcp_openreq_init_rwin); + static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, struct request_sock *req) { @@ -397,8 +429,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_init_wl(newtp, treq->rcv_isn); - newtp->srtt = 0; - newtp->mdev = TCP_TIMEOUT_INIT; + newtp->srtt_us = 0; + newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); newicsk->icsk_rto = TCP_TIMEOUT_INIT; newtp->packets_out = 0; @@ -425,7 +457,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_set_ca_state(newsk, TCP_CA_Open); tcp_init_xmit_timers(newsk); - skb_queue_head_init(&newtp->out_of_order_queue); + __skb_queue_head_init(&newtp->out_of_order_queue); newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; newtp->rx_opt.saw_tstamp = 0; @@ -744,7 +776,7 @@ int tcp_child_process(struct sock *parent, struct sock *child, skb->len); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) - parent->sk_data_ready(parent, 0); + parent->sk_data_ready(parent); } else { /* Alas, it is possible again, because we do lookup * in main socket hash table and lock on listening diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 05606353c7e..55046ecd083 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -57,10 +57,12 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, SKB_GSO_TCP_ECN | SKB_GSO_TCPV6 | SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_SIT | SKB_GSO_MPLS | SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | 0) || !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) goto out; @@ -97,9 +99,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, th->check = newcheck; if (skb->ip_summed != CHECKSUM_PARTIAL) - th->check = - csum_fold(csum_partial(skb_transport_header(skb), - thlen, skb->csum)); + th->check = gso_make_checksum(skb, ~th->check); seq += mss; if (copy_destructor) { @@ -133,12 +133,10 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, th->check = ~csum_fold((__force __wsum)((__force u32)th->check + (__force u32)delta)); if (skb->ip_summed != CHECKSUM_PARTIAL) - th->check = csum_fold(csum_partial(skb_transport_header(skb), - thlen, skb->csum)); + th->check = gso_make_checksum(skb, ~th->check); out: return segs; } -EXPORT_SYMBOL(tcp_gso_segment); struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -197,7 +195,8 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) goto 
out_check_final; found: - flush = NAPI_GRO_CB(p)->flush; + /* Include the IP ID check below from the inner most IP hdr */ + flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id; flush |= (__force int)(flags & TCP_FLAG_CWR); flush |= (__force int)((flags ^ tcp_flag_word(th2)) & ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); @@ -230,17 +229,16 @@ out_check_final: pp = head; out: - NAPI_GRO_CB(skb)->flush |= flush; + NAPI_GRO_CB(skb)->flush |= (flush != 0); return pp; } -EXPORT_SYMBOL(tcp_gro_receive); int tcp_gro_complete(struct sk_buff *skb) { struct tcphdr *th = tcp_hdr(skb); - skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_start = (unsigned char *)th - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); skb->ip_summed = CHECKSUM_PARTIAL; @@ -272,6 +270,7 @@ static int tcp_v4_gso_send_check(struct sk_buff *skb) static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) { + /* Use the IP hdr immediately proceeding for this transport */ const struct iphdr *iph = skb_gro_network_header(skb); __wsum wsum; @@ -279,7 +278,7 @@ static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff * if (NAPI_GRO_CB(skb)->flush) goto skip_csum; - wsum = skb->csum; + wsum = NAPI_GRO_CB(skb)->csum; switch (skb->ip_summed) { case CHECKSUM_NONE: @@ -303,14 +302,14 @@ skip_csum: return tcp_gro_receive(head, skb); } -static int tcp4_gro_complete(struct sk_buff *skb) +static int tcp4_gro_complete(struct sk_buff *skb, int thoff) { const struct iphdr *iph = ip_hdr(skb); struct tcphdr *th = tcp_hdr(skb); - th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb), - iph->saddr, iph->daddr, 0); - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; + th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, + iph->daddr, 0); + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; return tcp_gro_complete(skb); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 993da005e08..179b51e6bda 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -86,6 +86,9 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { tcp_rearm_rto(sk); } + + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, + tcp_skb_pcount(skb)); } /* SND.NXT, if window was not shrunk. @@ -269,6 +272,7 @@ EXPORT_SYMBOL(tcp_select_initial_window); static u16 tcp_select_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + u32 old_win = tp->rcv_wnd; u32 cur_win = tcp_receive_window(tp); u32 new_win = __tcp_select_window(sk); @@ -281,6 +285,9 @@ static u16 tcp_select_window(struct sock *sk) * * Relax Will Robinson. */ + if (new_win == 0) + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPWANTZEROWINDOWADV); new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); } tp->rcv_wnd = new_win; @@ -298,8 +305,14 @@ static u16 tcp_select_window(struct sock *sk) new_win >>= tp->rx_opt.rcv_wscale; /* If we advertise zero window, disable fast path. */ - if (new_win == 0) + if (new_win == 0) { tp->pred_flags = 0; + if (old_win) + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPTOZEROWINDOWADV); + } else if (old_win == 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV); + } return new_win; } @@ -408,7 +421,7 @@ struct tcp_out_options { * Beware: Something in the Internet is very sensitive to the ordering of * TCP options, we learned this through the hard way, so be careful here. 
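The tcp_select_window() hunk above adds three counters around zero-window advertisements. A minimal userspace model of that accounting, with plain integers standing in for the SNMP MIB entries and the receive-scale ALIGN step left out:

#include <stdio.h>
#include <stdint.h>

static unsigned long want_zero_win_adv;	/* LINUX_MIB_TCPWANTZEROWINDOWADV */
static unsigned long to_zero_win_adv;	/* LINUX_MIB_TCPTOZEROWINDOWADV */
static unsigned long from_zero_win_adv;	/* LINUX_MIB_TCPFROMZEROWINDOWADV */

/* old_win: window advertised in the previous ACK, cur_win: what is still
 * offered to the peer, new_win: what __tcp_select_window() just computed.
 */
static uint32_t select_window(uint32_t old_win, uint32_t cur_win, uint32_t new_win)
{
	if (new_win < cur_win) {
		if (new_win == 0)
			want_zero_win_adv++;	/* wanted zero, but must not shrink */
		new_win = cur_win;
	}
	if (new_win == 0) {
		if (old_win)
			to_zero_win_adv++;	/* real transition into a zero window */
	} else if (old_win == 0) {
		from_zero_win_adv++;		/* window re-opens after being zero */
	}
	return new_win;
}

int main(void)
{
	uint32_t win = select_window(29200, 29200, 0);	/* counted as "want" only */
	win = select_window(win, 0, 0);			/* counted as "to" */
	win = select_window(win, 0, 14600);		/* counted as "from" */
	printf("want=%lu to=%lu from=%lu win=%u\n",
	       want_zero_win_adv, to_zero_win_adv, from_zero_win_adv, win);
	return 0;
}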
* Luckily we can at least blame others for their non-compliance but from - * inter-operatibility perspective it seems that we're somewhat stuck with + * inter-operability perspective it seems that we're somewhat stuck with * the ordering which we have been using if we want to keep working with * those broken things (not that it currently hurts anybody as there isn't * particular reason why the ordering would need to be changed). @@ -614,7 +627,7 @@ static unsigned int tcp_synack_options(struct sock *sk, if (unlikely(!ireq->tstamp_ok)) remaining -= TCPOLEN_SACKPERM_ALIGNED; } - if (foc != NULL) { + if (foc != NULL && foc->len >= 0) { u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; need = (need + 3) & ~3U; /* Align to 32 bits */ if (remaining >= need) { @@ -681,7 +694,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb * * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb * needs to be reallocated in a driver. - * The invariant being skb->truesize substracted from sk->sk_wmem_alloc + * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc * * Since transmit from skb destructor is forbidden, we use a tasklet * to process all sockets that eventually need to send more skbs. @@ -698,12 +711,13 @@ static void tcp_tsq_handler(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) - tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC); + tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle, + 0, GFP_ATOMIC); } /* - * One tasklest per cpu tries to send more skbs. + * One tasklet per cpu tries to send more skbs. * We run in tasklet context but need to disable irqs when - * transfering tsq->head because tcp_wfree() might + * transferring tsq->head because tcp_wfree() might * interrupt us (non NAPI drivers) */ static void tcp_tasklet_func(unsigned long data) @@ -766,6 +780,17 @@ void tcp_release_cb(struct sock *sk) if (flags & (1UL << TCP_TSQ_DEFERRED)) tcp_tsq_handler(sk); + /* Here begins the tricky part : + * We are called from release_sock() with : + * 1) BH disabled + * 2) sk_lock.slock spinlock held + * 3) socket owned by us (sk->sk_lock.owned == 1) + * + * But following code is meant to be called from BH handlers, + * so we should keep BH disabled, but early release socket ownership + */ + sock_release_ownership(sk); + if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) { tcp_write_timer_handler(sk); __sock_put(sk); @@ -797,7 +822,7 @@ void __init tcp_tasklet_init(void) /* * Write buffer destructor automatically called from kfree_skb. - * We cant xmit new skbs from this context, as we might already + * We can't xmit new skbs from this context, as we might already * hold qdisc lock. */ void tcp_wfree(struct sk_buff *skb) @@ -853,18 +878,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, BUG_ON(!skb || !tcp_skb_pcount(skb)); if (clone_it) { - const struct sk_buff *fclone = skb + 1; - - /* If congestion control is doing timestamping, we must - * take such a timestamp before we potentially clone/copy. 
- */ - if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) - __net_timestamp(skb); - - if (unlikely(skb->fclone == SKB_FCLONE_ORIG && - fclone->fclone == SKB_FCLONE_CLONE)) - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); + skb_mstamp_get(&skb->skb_mstamp); if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); @@ -872,6 +886,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb = skb_clone(skb, gfp_mask); if (unlikely(!skb)) return -ENOBUFS; + /* Our usage of tstamp should remain private */ + skb->tstamp.tv64 = 0; } inet = inet_sk(sk); @@ -958,7 +974,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); - err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); if (likely(err <= 0)) return err; @@ -1058,7 +1074,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de * Remember, these are still headerless SKBs at this point. */ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, - unsigned int mss_now) + unsigned int mss_now, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; @@ -1073,11 +1089,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, if (nsize < 0) nsize = 0; - if (skb_unclone(skb, GFP_ATOMIC)) + if (skb_unclone(skb, gfp)) return -ENOMEM; /* Get a new skb... force flag on. */ - buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); + buff = sk_stream_alloc_skb(sk, nsize, gfp); if (buff == NULL) return -ENOMEM; /* We'll just try again later. */ @@ -1364,12 +1380,43 @@ unsigned int tcp_current_mss(struct sock *sk) return mss_now; } -/* Congestion window validation. (RFC2861) */ -static void tcp_cwnd_validate(struct sock *sk) +/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. + * As additional protections, we do not touch cwnd in retransmission phases, + * and if application hit its sndbuf limit recently. + */ +static void tcp_cwnd_application_limited(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && + sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { + /* Limited by application or receiver window. */ + u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); + u32 win_used = max(tp->snd_cwnd_used, init_win); + if (win_used < tp->snd_cwnd) { + tp->snd_ssthresh = tcp_current_ssthresh(sk); + tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; + } + tp->snd_cwnd_used = 0; + } + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) { struct tcp_sock *tp = tcp_sk(sk); - if (tp->packets_out >= tp->snd_cwnd) { + /* Track the maximum number of outstanding packets in each + * window, and remember whether we were cwnd-limited then. + */ + if (!before(tp->snd_una, tp->max_packets_seq) || + tp->packets_out > tp->max_packets_out) { + tp->max_packets_out = tp->packets_out; + tp->max_packets_seq = tp->snd_nxt; + tp->is_cwnd_limited = is_cwnd_limited; + } + + if (tcp_is_cwnd_limited(sk)) { /* Network is feed fully. */ tp->snd_cwnd_used = 0; tp->snd_cwnd_stamp = tcp_time_stamp; @@ -1384,23 +1431,51 @@ static void tcp_cwnd_validate(struct sock *sk) } } -/* Returns the portion of skb which can be sent right away without - * introducing MSS oddities to segment boundaries. 
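tcp_cwnd_validate() now takes an is_cwnd_limited flag and remembers the largest in-flight count seen in each window of data. A self-contained sketch of that bookkeeping, with the relevant tcp_sock fields collapsed into a local struct and before()/after() re-expressed on uint32_t:

#include <stdbool.h>
#include <stdint.h>

struct cwnd_track {
	uint32_t snd_una;		/* oldest unacknowledged sequence */
	uint32_t snd_nxt;		/* next sequence to be sent */
	uint32_t packets_out;		/* packets currently in flight */
	uint32_t max_packets_out;	/* in-flight peak for the current window */
	uint32_t max_packets_seq;	/* snd_nxt when that peak was recorded */
	bool	 is_cwnd_limited;	/* was the sender cwnd-limited at the peak? */
};

/* seq1 >= seq2 under 32-bit sequence wraparound, i.e. !before(seq1, seq2) */
static bool seq_geq(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq1 - seq2) >= 0;
}

/* Start a new sample once the old peak is fully acknowledged, or whenever the
 * in-flight count sets a new high; congestion control can later read
 * is_cwnd_limited instead of being handed an in_flight argument.
 */
static void track_cwnd_limited(struct cwnd_track *t, bool is_cwnd_limited)
{
	if (seq_geq(t->snd_una, t->max_packets_seq) ||
	    t->packets_out > t->max_packets_out) {
		t->max_packets_out = t->packets_out;
		t->max_packets_seq = t->snd_nxt;
		t->is_cwnd_limited = is_cwnd_limited;
	}
}

int main(void)
{
	struct cwnd_track t = { .snd_una = 1000, .snd_nxt = 1000 };

	t.packets_out = 8;  t.snd_nxt += 8 * 1448;
	track_cwnd_limited(&t, true);		/* first sample of this window */

	t.packets_out = 5;  t.snd_nxt += 5 * 1448;
	track_cwnd_limited(&t, false);		/* smaller burst: peak is kept */

	return t.is_cwnd_limited ? 0 : 1;	/* still reflects the 8-packet peak */
}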
In rare cases where - * mss_now != mss_cache, we will request caller to create a small skb - * per input skb which could be mostly avoided here (if desired). - * - * We explicitly want to create a request for splitting write queue tail - * to a small skb for Nagle purposes while avoiding unnecessary modulos, - * thus all the complexity (cwnd_len is always MSS multiple which we - * return whenever allowed by the other factors). Basically we need the - * modulo only when the receiver window alone is the limiting factor or - * when we would be allowed to send the split-due-to-Nagle skb fully. +/* Minshall's variant of the Nagle send check. */ +static bool tcp_minshall_check(const struct tcp_sock *tp) +{ + return after(tp->snd_sml, tp->snd_una) && + !after(tp->snd_sml, tp->snd_nxt); +} + +/* Update snd_sml if this skb is under mss + * Note that a TSO packet might end with a sub-mss segment + * The test is really : + * if ((skb->len % mss) != 0) + * tp->snd_sml = TCP_SKB_CB(skb)->end_seq; + * But we can avoid doing the divide again given we already have + * skb_pcount = skb->len / mss_now */ -static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, - unsigned int mss_now, unsigned int max_segs) +static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, + const struct sk_buff *skb) +{ + if (skb->len < tcp_skb_pcount(skb) * mss_now) + tp->snd_sml = TCP_SKB_CB(skb)->end_seq; +} + +/* Return false, if packet can be sent now without violation Nagle's rules: + * 1. It is full sized. (provided by caller in %partial bool) + * 2. Or it contains FIN. (already checked by caller) + * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. + * 4. Or TCP_CORK is not set, and all sent packets are ACKed. + * With Minshall's modification: all sent small packets are ACKed. + */ +static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, + int nonagle) +{ + return partial && + ((nonagle & TCP_NAGLE_CORK) || + (!nonagle && tp->packets_out && tcp_minshall_check(tp))); +} +/* Returns the portion of skb which can be sent right away */ +static unsigned int tcp_mss_split_point(const struct sock *sk, + const struct sk_buff *skb, + unsigned int mss_now, + unsigned int max_segs, + int nonagle) { const struct tcp_sock *tp = tcp_sk(sk); - u32 needed, window, max_len; + u32 partial, needed, window, max_len; window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; max_len = mss_now * max_segs; @@ -1413,7 +1488,15 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_b if (max_len <= needed) return max_len; - return needed - needed % mss_now; + partial = needed % mss_now; + /* If last segment is not a full MSS, check if Nagle rules allow us + * to include this last segment in this skb. + * Otherwise, we'll split the skb at last MSS boundary + */ + if (tcp_nagle_check(partial != 0, tp, nonagle)) + return needed - partial; + + return needed; } /* Can at least one segment of SKB be sent right now, according to the @@ -1453,28 +1536,6 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, return tso_segs; } -/* Minshall's variant of the Nagle send check. */ -static inline bool tcp_minshall_check(const struct tcp_sock *tp) -{ - return after(tp->snd_sml, tp->snd_una) && - !after(tp->snd_sml, tp->snd_nxt); -} - -/* Return false, if packet can be sent now without violation Nagle's rules: - * 1. It is full sized. - * 2. Or it contains FIN. (already checked by caller) - * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. - * 4. 
Or TCP_CORK is not set, and all sent packets are ACKed. - * With Minshall's modification: all sent small packets are ACKed. - */ -static inline bool tcp_nagle_check(const struct tcp_sock *tp, - const struct sk_buff *skb, - unsigned int mss_now, int nonagle) -{ - return skb->len < mss_now && - ((nonagle & TCP_NAGLE_CORK) || - (!nonagle && tp->packets_out && tcp_minshall_check(tp))); -} /* Return true if the Nagle test allows this packet to be * sent now. @@ -1495,7 +1556,7 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) return true; - if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) + if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle)) return true; return false; @@ -1564,7 +1625,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* All of a TSO frame must be composed of paged data. */ if (skb->len != skb->data_len) - return tcp_fragment(sk, skb, len, mss_now); + return tcp_fragment(sk, skb, len, mss_now, gfp); buff = sk_stream_alloc_skb(sk, 0, gfp); if (unlikely(buff == NULL)) @@ -1607,7 +1668,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, * * This algorithm is from John Heffner. */ -static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) +static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, + bool *is_cwnd_limited) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1671,6 +1733,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) if (!tp->tso_deferred) tp->tso_deferred = 1 | (jiffies << 1); + if (cong_win < send_win && cong_win < skb->len) + *is_cwnd_limited = true; + return true; send_now: @@ -1831,6 +1896,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, unsigned int tso_segs, sent_pkts; int cwnd_quota; int result; + bool is_cwnd_limited = false; sent_pkts = 0; @@ -1855,6 +1921,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, cwnd_quota = tcp_cwnd_test(tp, skb); if (!cwnd_quota) { + is_cwnd_limited = true; if (push_one == 2) /* Force out a loss probe pkt. */ cwnd_quota = 1; @@ -1871,7 +1938,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, nonagle : TCP_NAGLE_PUSH)))) break; } else { - if (!push_one && tcp_tso_should_defer(sk, skb)) + if (!push_one && + tcp_tso_should_defer(sk, skb, &is_cwnd_limited)) break; } @@ -1890,7 +1958,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (atomic_read(&sk->sk_wmem_alloc) > limit) { set_bit(TSQ_THROTTLED, &tp->tsq_flags); - break; + /* It is possible TX completion already happened + * before we set TSQ_THROTTLED, so we must + * test again the condition. + */ + smp_mb__after_atomic(); + if (atomic_read(&sk->sk_wmem_alloc) > limit) + break; } limit = mss_now; @@ -1898,7 +1972,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, limit = tcp_mss_split_point(sk, skb, mss_now, min_t(unsigned int, cwnd_quota, - sk->sk_gso_max_segs)); + sk->sk_gso_max_segs), + nonagle); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) @@ -1929,7 +2004,7 @@ repair: /* Send one loss probe per tail loss episode. 
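The reworked Nagle helpers above hinge on a single bool: does the candidate send end with a sub-MSS tail? A standalone sketch of tcp_minshall_check()/tcp_nagle_check() and the split-point decision, with the kernel's nonagle bitmask reduced to two booleans:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* A previously sent sub-MSS segment is still unacknowledged */
static bool minshall_check(uint32_t snd_sml, uint32_t snd_una, uint32_t snd_nxt)
{
	return (int32_t)(snd_sml - snd_una) > 0 &&	/* after(snd_sml, snd_una) */
	       (int32_t)(snd_sml - snd_nxt) <= 0;	/* !after(snd_sml, snd_nxt) */
}

/* Hold a sub-MSS tail back if TCP_CORK is set, or if Nagle is enabled and
 * small data is already in flight (Minshall's modification).
 */
static bool nagle_holds_back(bool partial, bool cork, bool nagle_enabled,
			     uint32_t packets_out, uint32_t snd_sml,
			     uint32_t snd_una, uint32_t snd_nxt)
{
	return partial &&
	       (cork || (nagle_enabled && packets_out &&
			 minshall_check(snd_sml, snd_una, snd_nxt)));
}

/* Either split the send at the last full-MSS boundary, or let the sub-MSS
 * tail ride along when the Nagle rules permit it.
 */
static uint32_t split_point(uint32_t needed, uint32_t mss, bool cork,
			    bool nagle_enabled, uint32_t packets_out,
			    uint32_t snd_sml, uint32_t snd_una, uint32_t snd_nxt)
{
	uint32_t partial = needed % mss;

	if (nagle_holds_back(partial != 0, cork, nagle_enabled, packets_out,
			     snd_sml, snd_una, snd_nxt))
		return needed - partial;
	return needed;
}

int main(void)
{
	/* 4000 bytes queued, 1448-byte MSS, Nagle on, a small segment in flight */
	uint32_t send = split_point(4000, 1448, false, true, 3, 2000, 1500, 6000);

	printf("send %u of 4000 bytes now\n", send);	/* 2896: tail held back */
	return 0;
}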
*/ if (push_one != 2) tcp_schedule_loss_probe(sk); - tcp_cwnd_validate(sk); + tcp_cwnd_validate(sk, is_cwnd_limited); return false; } return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); @@ -1940,7 +2015,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 timeout, tlp_time_stamp, rto_time_stamp; - u32 rtt = tp->srtt >> 3; + u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) return false; @@ -1962,7 +2037,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ - if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out || + if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out || !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) return false; @@ -1993,6 +2068,25 @@ bool tcp_schedule_loss_probe(struct sock *sk) return true; } +/* Thanks to skb fast clones, we can detect if a prior transmit of + * a packet is still in a qdisc or driver queue. + * In this case, there is very little point doing a retransmit ! + * Note: This is called from BH context only. + */ +static bool skb_still_in_host_queue(const struct sock *sk, + const struct sk_buff *skb) +{ + const struct sk_buff *fclone = skb + 1; + + if (unlikely(skb->fclone == SKB_FCLONE_ORIG && + fclone->fclone == SKB_FCLONE_CLONE)) { + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); + return true; + } + return false; +} + /* When probe timeout (PTO) fires, send a new segment if one exists, else * retransmit the last segment. */ @@ -2018,12 +2112,16 @@ void tcp_send_loss_probe(struct sock *sk) if (WARN_ON(!skb)) goto rearm_timer; + if (skb_still_in_host_queue(sk, skb)) + goto rearm_timer; + pcount = tcp_skb_pcount(skb); if (WARN_ON(!pcount)) goto rearm_timer; if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { - if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss))) + if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss, + GFP_ATOMIC))) goto rearm_timer; skb = tcp_write_queue_tail(sk); } @@ -2031,9 +2129,7 @@ void tcp_send_loss_probe(struct sock *sk) if (WARN_ON(!skb || !tcp_skb_pcount(skb))) goto rearm_timer; - /* Probe with zero data doesn't trigger fast recovery. */ - if (skb->len > 0) - err = __tcp_retransmit_skb(sk, skb); + err = __tcp_retransmit_skb(sk, skb); /* Record snd_nxt for loss detection. */ if (likely(!err)) @@ -2047,7 +2143,6 @@ rearm_timer: if (likely(!err)) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); - return; } /* Push out any pending frames which were held back due to @@ -2145,7 +2240,8 @@ u32 __tcp_select_window(struct sock *sk) */ int mss = icsk->icsk_ack.rcv_mss; int free_space = tcp_space(sk); - int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); + int allowed_space = tcp_full_space(sk); + int full_space = min_t(int, tp->window_clamp, allowed_space); int window; if (mss > full_space) @@ -2158,7 +2254,19 @@ u32 __tcp_select_window(struct sock *sk) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); - if (free_space < mss) + /* free_space might become our new window, make sure we don't + * increase it due to wscale. 
+ */ + free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); + + /* if free space is less than mss estimate, or is below 1/16th + * of the maximum allowed, try to move to zero-window, else + * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and + * new incoming data is dropped due to memory limits. + * With large window, mss test triggers way too late in order + * to announce zero window in time before rmem limit kicks in. + */ + if (free_space < (allowed_space >> 4) || free_space < mss) return 0; } @@ -2313,6 +2421,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); unsigned int cur_mss; + int err; /* Inconslusive MTU probe */ if (icsk->icsk_mtup.probe_size) { @@ -2326,6 +2435,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) return -EAGAIN; + if (skb_still_in_host_queue(sk, skb)) + return -EBUSY; + if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) BUG(); @@ -2348,7 +2460,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) return -EAGAIN; if (skb->len > cur_mss) { - if (tcp_fragment(sk, skb, cur_mss, cur_mss)) + if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { int oldpcount = tcp_skb_pcount(skb); @@ -2376,11 +2488,21 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); - return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : - -ENOBUFS; + err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : + -ENOBUFS; } else { - return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } + + if (likely(!err)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; + /* Update global TCP statistics. */ + TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + tp->total_retrans++; + } + return err; } int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) @@ -2389,11 +2511,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) int err = __tcp_retransmit_skb(sk, skb); if (err == 0) { - /* Update global TCP statistics. */ - TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); - - tp->total_retrans++; - #if FASTRETRANS_DEBUG > 0 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { net_dbg_ratelimited("retrans_out leaked\n"); @@ -2408,15 +2525,17 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (!tp->retrans_stamp) tp->retrans_stamp = TCP_SKB_CB(skb)->when; - tp->undo_retrans += tcp_skb_pcount(skb); - /* snd_nxt is stored to detect loss of retransmitted segment, * see tcp_input.c tcp_sacktag_write_queue(). 
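The __tcp_select_window() change just above adds two guards before a zero window is announced. A small model of only those guards (the function's other clamping, such as rcv_ssthresh handling, is omitted):

#include <stdint.h>
#include <stdio.h>

/* free_space: bytes currently free in the receive buffer,
 * allowed_space: tcp_full_space(), mss: the sender's MSS estimate,
 * rcv_wscale: the window scale advertised at connection setup.
 */
static uint32_t select_rcv_window(uint32_t free_space, uint32_t allowed_space,
				  uint32_t mss, unsigned int rcv_wscale)
{
	/* round_down to a wscale multiple so the advertised value cannot
	 * end up larger than what is really free
	 */
	free_space &= ~((1u << rcv_wscale) - 1);

	/* go to zero window early: below one MSS, or below 1/16th of the
	 * allowed space, before rmem limits start dropping incoming data
	 */
	if (free_space < (allowed_space >> 4) || free_space < mss)
		return 0;

	return free_space;
}

int main(void)
{
	/* 1 MiB of allowed space, 1448-byte MSS, window scale 7 */
	printf("%u\n", select_rcv_window(900 * 1024, 1024 * 1024, 1448, 7));
	printf("%u\n", select_rcv_window(40 * 1024, 1024 * 1024, 1448, 7)); /* -> 0 */
	return 0;
}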
*/ TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; - } else { + } else if (err != -EBUSY) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } + + if (tp->undo_retrans < 0) + tp->undo_retrans = 0; + tp->undo_retrans += tcp_skb_pcount(skb); return err; } @@ -2677,7 +2796,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, int tcp_header_size; int mss; - skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); + skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC); if (unlikely(!skb)) { dst_release(dst); return NULL; @@ -2692,27 +2811,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) mss = tp->rx_opt.user_mss; - if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ - __u8 rcv_wscale; - /* Set this up on the first call only */ - req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); - - /* limit the window selection if the user enforce a smaller rx buffer */ - if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && - (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) - req->window_clamp = tcp_full_space(sk); - - /* tcp_full_space because it is guaranteed to be the first packet */ - tcp_select_initial_window(tcp_full_space(sk), - mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), - &req->rcv_wnd, - &req->window_clamp, - ireq->wscale_ok, - &rcv_wscale, - dst_metric(dst, RTAX_INITRWND)); - ireq->rcv_wscale = rcv_wscale; - } - memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) @@ -2747,7 +2845,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, th->window = htons(min(req->rcv_wnd, 65535U)); tcp_options_write((__be32 *)(th + 1), tp, &opts); th->doff = (tcp_header_size >> 2); - TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS); #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if needed */ @@ -2762,7 +2860,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, EXPORT_SYMBOL(tcp_make_synack); /* Do all connect socket setups that can be done AF independent. 
*/ -void tcp_connect_init(struct sock *sk) +static void tcp_connect_init(struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -2884,7 +2982,12 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - MAX_TCP_OPTION_SPACE; - syn_data = skb_copy_expand(syn, skb_headroom(syn), space, + space = min_t(size_t, space, fo->size); + + /* limit to order-0 allocations */ + space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER)); + + syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space, sk->sk_allocation); if (syn_data == NULL) goto fallback; @@ -2914,9 +3017,15 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) tcp_connect_queue_skb(sk, data); fo->copied = data->len; + /* syn_data is about to be sent, we need to take current time stamps + * for the packets that are in write queue : SYN packet and DATA + */ + skb_mstamp_get(&syn->skb_mstamp); + data->skb_mstamp = syn->skb_mstamp; + if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { tp->syn_data = (fo->copied > 0); - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; } syn_data = NULL; @@ -3004,8 +3113,9 @@ void tcp_send_delayed_ack(struct sock *sk) * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements * directly. */ - if (tp->srtt) { - int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN); + if (tp->srtt_us) { + int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3), + TCP_DELACK_MIN); if (rtt < max_ato) max_ato = rtt; @@ -3133,7 +3243,7 @@ int tcp_write_wakeup(struct sock *sk) skb->len > mss) { seg_size = min(seg_size, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; - if (tcp_fragment(sk, skb, seg_size, mss)) + if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(sk, skb, mss); diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 8b97d71e193..3b66610d415 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -38,7 +38,7 @@ MODULE_DESCRIPTION("TCP cwnd snooper"); MODULE_LICENSE("GPL"); MODULE_VERSION("1.1"); -static int port __read_mostly = 0; +static int port __read_mostly; MODULE_PARM_DESC(port, "Port to match (0=all)"); module_param(port, int, 0); @@ -46,7 +46,7 @@ static unsigned int bufsize __read_mostly = 4096; MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); module_param(bufsize, uint, 0); -static unsigned int fwmark __read_mostly = 0; +static unsigned int fwmark __read_mostly; MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)"); module_param(fwmark, uint, 0); @@ -154,7 +154,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, p->snd_wnd = tp->snd_wnd; p->rcv_wnd = tp->rcv_wnd; p->ssthresh = tcp_current_ssthresh(sk); - p->srtt = tp->srtt >> 3; + p->srtt = tp->srtt_us >> 3; tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1); } diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 19ea6c2951f..8250949b885 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -15,12 +15,11 @@ #define TCP_SCALABLE_AI_CNT 50U #define TCP_SCALABLE_MD_SCALE 3 -static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= 
tp->snd_ssthresh) @@ -39,7 +38,6 @@ static u32 tcp_scalable_ssthresh(struct sock *sk) static struct tcp_congestion_ops tcp_scalable __read_mostly = { .ssthresh = tcp_scalable_ssthresh, .cong_avoid = tcp_scalable_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .owner = THIS_MODULE, .name = "scalable", diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 64f0354c84c..286227abed1 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -165,6 +165,9 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); if (tp->syn_fastopen || tp->syn_data) tcp_fastopen_cache_set(sk, 0, NULL, true); + if (tp->syn_data) + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENACTIVEFAIL); } retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; syn_set = true; diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 06cae62bf20..9a5e05f27f4 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -163,14 +163,13 @@ static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp) return min(tp->snd_ssthresh, tp->snd_cwnd-1); } -static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct vegas *vegas = inet_csk_ca(sk); if (!vegas->doing_vegas_now) { - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); return; } @@ -195,7 +194,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked, /* We don't have enough RTT samples to do the Vegas * calculation, so we'll behave like Reno. */ - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); } else { u32 rtt, diff; u64 target_cwnd; @@ -306,11 +305,9 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) EXPORT_SYMBOL_GPL(tcp_vegas_get_info); static struct tcp_congestion_ops tcp_vegas __read_mostly = { - .flags = TCP_CONG_RTT_STAMP, .init = tcp_vegas_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_vegas_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .pkts_acked = tcp_vegas_pkts_acked, .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 326475a9486..27b9825753d 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -114,19 +114,18 @@ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event) tcp_veno_init(sk); } -static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct veno *veno = inet_csk_ca(sk); if (!veno->doing_veno_now) { - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); return; } /* limited by applications */ - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; /* We do the Veno calculations only if we got enough rtt samples */ @@ -134,7 +133,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked, /* We don't have enough rtt samples to do the Veno * calculation, so we'll behave like Reno. 
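The congestion-control hunks in this region all drop the in_flight argument from .cong_avoid and ask tcp_is_cwnd_limited(sk) instead. A hedged, kernel-side sketch of a minimal Reno-like module under the new hook signature (not part of this patch; the slow-start and AI steps are written out inline rather than via the core helpers):

/* Illustrative only. */
static void example_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tcp_is_cwnd_limited(sk))	/* application or receiver limited */
		return;

	if (tp->snd_cwnd <= tp->snd_ssthresh) {
		/* slow start: grow by roughly one segment per ACKed segment */
		tp->snd_cwnd = min(tp->snd_cwnd + acked, tp->snd_cwnd_clamp);
	} else if (++tp->snd_cwnd_cnt >= tp->snd_cwnd) {
		/* congestion avoidance: about one extra segment per RTT */
		tp->snd_cwnd_cnt = 0;
		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
			tp->snd_cwnd++;
	}
}

static struct tcp_congestion_ops example_cong_ops __read_mostly = {
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= example_cong_avoid,	/* .min_cwnd is gone from the ops */
	.owner		= THIS_MODULE,
	.name		= "example",
};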
*/ - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); } else { u64 target_cwnd; u32 rtt; @@ -203,7 +202,6 @@ static u32 tcp_veno_ssthresh(struct sock *sk) } static struct tcp_congestion_ops tcp_veno __read_mostly = { - .flags = TCP_CONG_RTT_STAMP, .init = tcp_veno_init, .ssthresh = tcp_veno_ssthresh, .cong_avoid = tcp_veno_cong_avoid, diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 76a1e23259e..b94a04ae2ed 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -276,7 +276,6 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = { .init = tcp_westwood_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, - .min_cwnd = tcp_westwood_bw_rttmin, .cwnd_event = tcp_westwood_event, .get_info = tcp_westwood_info, .pkts_acked = tcp_westwood_pkts_acked, diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index a347a078ee0..599b79b8eac 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -3,7 +3,7 @@ * YeAH TCP * * For further details look at: - * http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf + * https://web.archive.org/web/20080316215752/http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf * */ #include <linux/mm.h> @@ -15,13 +15,13 @@ #include "tcp_vegas.h" -#define TCP_YEAH_ALPHA 80 //lin number of packets queued at the bottleneck -#define TCP_YEAH_GAMMA 1 //lin fraction of queue to be removed per rtt -#define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss -#define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion -#define TCP_YEAH_PHY 8 //lin maximum delta from base -#define TCP_YEAH_RHO 16 //lin minimum number of consecutive rtt to consider competition on loss -#define TCP_YEAH_ZETA 50 //lin minimum number of state switchs to reset reno_count +#define TCP_YEAH_ALPHA 80 /* number of packets queued at the bottleneck */ +#define TCP_YEAH_GAMMA 1 /* fraction of queue to be removed per rtt */ +#define TCP_YEAH_DELTA 3 /* log minimum fraction of cwnd to be removed on loss */ +#define TCP_YEAH_EPSILON 1 /* log maximum fraction to be removed on early decongestion */ +#define TCP_YEAH_PHY 8 /* maximum delta from base */ +#define TCP_YEAH_RHO 16 /* minimum number of consecutive rtt to consider competition on loss */ +#define TCP_YEAH_ZETA 50 /* minimum number of state switches to reset reno_count */ #define TCP_SCALABLE_AI_CNT 100U @@ -69,13 +69,12 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us) tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us); } -static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct yeah *yeah = inet_csk_ca(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) @@ -214,9 +213,9 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) { if (yeah->doing_reno_now < TCP_YEAH_RHO) { reduction = yeah->lastQ; - reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) ); + reduction = min(reduction, max(tp->snd_cwnd>>1, 2U)); - reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA); + reduction = max(reduction, tp->snd_cwnd >> TCP_YEAH_DELTA); } else reduction = max(tp->snd_cwnd>>1, 2U); @@ -227,11 +226,9 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) { } static struct tcp_congestion_ops tcp_yeah __read_mostly = { - .flags = TCP_CONG_RTT_STAMP, .init = tcp_yeah_init, .ssthresh = 
tcp_yeah_ssthresh, .cong_avoid = tcp_yeah_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, .get_info = tcp_vegas_get_info, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 44f6a20fa29..7d5a8661df7 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -223,7 +223,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - rand = net_random(); + rand = prandom_u32(); first = (((u64)rand * remaining) >> 32) + low; /* * force rand to be an odd multiple of UDP_HTABLE_SIZE @@ -246,7 +246,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, do { if (low <= snum && snum <= high && !test_bit(snum >> udptable->log, bitmap) && - !inet_is_reserved_local_port(snum)) + !inet_is_local_reserved_port(net, snum)) goto found; snum += rand; } while (snum != first); @@ -560,15 +560,11 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport, struct udp_table *udptable) { - struct sock *sk; const struct iphdr *iph = ip_hdr(skb); - if (unlikely(sk = skb_steal_sock(skb))) - return sk; - else - return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport, - iph->daddr, dport, inet_iif(skb), - udptable); + return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport, + iph->daddr, dport, inet_iif(skb), + udptable); } struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, @@ -731,13 +727,12 @@ EXPORT_SYMBOL(udp_flush_pending_frames); void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) { struct udphdr *uh = udp_hdr(skb); - struct sk_buff *frags = skb_shinfo(skb)->frag_list; int offset = skb_transport_offset(skb); int len = skb->len - offset; int hlen = len; __wsum csum = 0; - if (!frags) { + if (!skb_has_frag_list(skb)) { /* * Only one fragment on the socket. */ @@ -746,15 +741,17 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0); } else { + struct sk_buff *frags; + /* * HW-checksum won't work as there are two or more * fragments on the socket so that all csums of sk_buffs * should be together */ - do { + skb_walk_frags(skb, frags) { csum = csum_add(csum, frags->csum); hlen -= frags->len; - } while ((frags = frags->next)); + } csum = skb_checksum(skb, offset, hlen, csum); skb->ip_summed = CHECKSUM_NONE; @@ -766,6 +763,43 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) } EXPORT_SYMBOL_GPL(udp4_hwcsum); +/* Function to set UDP checksum for an IPv4 UDP packet. This is intended + * for the simple case like when setting the checksum for a UDP tunnel. 
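udp_set_csum(), introduced just below, centralises IPv4 UDP checksum setup on transmit. A hedged sketch of how a hypothetical tunnel transmit path could call it once the outer UDP header has been built (the helper and its caller here are illustrative; only udp_set_csum() itself comes from this patch):

/* Assumes skb->data (and the transport header) point at the freshly built
 * outer UDP header, i.e. the outer IP header has not been pushed yet, so
 * skb->len is exactly the UDP length (header plus payload) that
 * udp_set_csum() expects.
 */
static void example_tunnel_udp_csum(struct sock *sk, struct sk_buff *skb,
				    __be32 saddr, __be32 daddr)
{
	struct udphdr *uh = udp_hdr(skb);

	uh->len = htons(skb->len);

	/* sk_no_check_tx is the new per-socket "send zero checksum" state */
	udp_set_csum(sk->sk_no_check_tx, skb, saddr, daddr, skb->len);
}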
+ */ +void udp_set_csum(bool nocheck, struct sk_buff *skb, + __be32 saddr, __be32 daddr, int len) +{ + struct udphdr *uh = udp_hdr(skb); + + if (nocheck) + uh->check = 0; + else if (skb_is_gso(skb)) + uh->check = ~udp_v4_check(len, saddr, daddr, 0); + else if (skb_dst(skb) && skb_dst(skb)->dev && + (skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) { + + BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + uh->check = ~udp_v4_check(len, saddr, daddr, 0); + } else { + __wsum csum; + + BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + + uh->check = 0; + csum = skb_checksum(skb, 0, len, 0); + uh->check = udp_v4_check(len, saddr, daddr, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + } +} +EXPORT_SYMBOL(udp_set_csum); + static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) { struct sock *sk = skb->sk; @@ -789,7 +823,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) if (is_udplite) /* UDP-Lite */ csum = udplite_csum(skb); - else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */ + else if (sk->sk_no_check_tx) { /* UDP csum disabled */ skb->ip_summed = CHECKSUM_NONE; goto send; @@ -906,7 +940,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, * Get and verify the address. */ if (msg->msg_name) { - struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) { @@ -935,7 +969,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, sock_tx_timestamp(sk, &ipc.tx_flags); if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc); + err = ip_cmsg_send(sock_net(sk), msg, &ipc, + sk->sk_family == AF_INET6); if (err) return err; if (ipc.opt) @@ -990,7 +1025,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, fl4 = &fl4_stack; flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, - inet_sk_flowi_flags(sk)|FLOWI_FLAG_CAN_SLEEP, + inet_sk_flowi_flags(sk), faddr, saddr, dport, inet->inet_sport); security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); @@ -1230,7 +1265,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); - struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; unsigned int ulen, copied; int peeked, off = 0; @@ -1498,6 +1533,10 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) { int ret; + /* Verify checksum before giving to encap */ + if (udp_lib_checksum_complete(skb)) + goto csum_error; + ret = encap_rcv(sk, skb); if (ret <= 0) { UDP_INC_STATS_BH(sock_net(sk), @@ -1549,8 +1588,11 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) goto csum_error; - if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) + if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, + is_udplite); goto drop; + } rc = 0; @@ -1603,12 +1645,16 @@ static void flush_stack(struct sock **stack, unsigned int count, kfree_skb(skb1); } -static void udp_sk_rx_dst_set(struct sock *sk, const struct 
sk_buff *skb) +/* For TCP sockets, sk_rx_dst is protected by socket lock + * For UDP, we use xchg() to guard against concurrent changes. + */ +static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { - struct dst_entry *dst = skb_dst(skb); + struct dst_entry *old; dst_hold(dst); - sk->sk_rx_dst = dst; + old = xchg(&sk->sk_rx_dst, dst); + dst_release(old); } /* @@ -1671,7 +1717,6 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) { - const struct iphdr *iph; int err; UDP_SKB_CB(skb)->partial_cov = 0; @@ -1683,22 +1728,8 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, return err; } - iph = ip_hdr(skb); - if (uh->check == 0) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else if (skb->ip_summed == CHECKSUM_COMPLETE) { - if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, - proto, skb->csum)) - skb->ip_summed = CHECKSUM_UNNECESSARY; - } - if (!skb_csum_unnecessary(skb)) - skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, - skb->len, proto, 0); - /* Probably, we should checksum udp header (it should be in cache - * in any case) and data in tiny packets (< rx copybreak). - */ - - return 0; + return skb_checksum_init_zero_check(skb, proto, uh->check, + inet_compute_pseudo); } /* @@ -1739,15 +1770,16 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (udp4_csum_init(skb, uh, proto)) goto csum_error; - if (skb->sk) { + sk = skb_steal_sock(skb); + if (sk) { + struct dst_entry *dst = skb_dst(skb); int ret; - sk = skb->sk; - if (unlikely(sk->sk_rx_dst == NULL)) - udp_sk_rx_dst_set(sk, skb); + if (unlikely(sk->sk_rx_dst != dst)) + udp_sk_rx_dst_set(sk, dst); ret = udp_queue_rcv_skb(sk, skb); - + sock_put(sk); /* a return value > 0 means to resubmit the input, but * it wants the return to be -protocol, or 0 */ @@ -1832,6 +1864,10 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask); struct udp_hslot *hslot = &udp_table.hash[slot]; + /* Do not bother scanning a too big list */ + if (hslot->count > 10) + return NULL; + rcu_read_lock(); begin: count = 0; @@ -1884,7 +1920,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); unsigned int slot2 = hash2 & udp_table.mask; struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; - INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr) + INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); rcu_read_lock(); @@ -1913,17 +1949,20 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, void udp_v4_early_demux(struct sk_buff *skb) { - const struct iphdr *iph = ip_hdr(skb); - const struct udphdr *uh = udp_hdr(skb); + struct net *net = dev_net(skb->dev); + const struct iphdr *iph; + const struct udphdr *uh; struct sock *sk; struct dst_entry *dst; - struct net *net = dev_net(skb->dev); int dif = skb->dev->ifindex; /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) return; + iph = ip_hdr(skb); + uh = udp_hdr(skb); + if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, @@ -1974,7 +2013,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, int (*push_pending_frames)(struct sock *)) { struct udp_sock *up = udp_sk(sk); - int val; + int val, 
valbool; int err = 0; int is_udplite = IS_UDPLITE(sk); @@ -1984,6 +2023,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, if (get_user(val, (int __user *)optval)) return -EFAULT; + valbool = val ? 1 : 0; + switch (optname) { case UDP_CORK: if (val != 0) { @@ -2013,6 +2054,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, } break; + case UDP_NO_CHECK6_TX: + up->no_check6_tx = valbool; + break; + + case UDP_NO_CHECK6_RX: + up->no_check6_rx = valbool; + break; + /* * UDP-Lite's partial checksum coverage (RFC 3828). */ @@ -2095,6 +2144,14 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, val = up->encap_type; break; + case UDP_NO_CHECK6_TX: + val = up->no_check6_tx; + break; + + case UDP_NO_CHECK6_RX: + val = up->no_check6_rx; + break; + /* The following two cannot be changed on UDP sockets, the return is * always 0 (which corresponds to the full checksum coverage of UDP). */ case UDPLITE_SEND_CSCOV: @@ -2474,11 +2531,16 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); + u16 mac_offset = skb->mac_header; int mac_len = skb->mac_len; int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); __be16 protocol = skb->protocol; netdev_features_t enc_features; - int outer_hlen; + int udp_offset, outer_hlen; + unsigned int oldlen; + bool need_csum; + + oldlen = (u16)~skb->len; if (unlikely(!pskb_may_pull(skb, tnl_hlen))) goto out; @@ -2490,17 +2552,25 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, skb->mac_len = skb_inner_network_offset(skb); skb->protocol = htons(ETH_P_TEB); + need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); + if (need_csum) + skb->encap_hdr_csum = 1; + /* segment inner packet. */ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); segs = skb_mac_gso_segment(skb, enc_features); - if (!segs || IS_ERR(segs)) + if (!segs || IS_ERR(segs)) { + skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, + mac_len); goto out; + } outer_hlen = skb_tnl_header_len(skb); + udp_offset = outer_hlen - tnl_hlen; skb = segs; do { struct udphdr *uh; - int udp_offset = outer_hlen - tnl_hlen; + int len; skb_reset_inner_headers(skb); skb->encapsulation = 1; @@ -2511,31 +2581,20 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); skb_set_transport_header(skb, udp_offset); + len = skb->len - udp_offset; uh = udp_hdr(skb); - uh->len = htons(skb->len - udp_offset); + uh->len = htons(len); - /* csum segment if tunnel sets skb with csum. 
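The new UDP_NO_CHECK6_TX / UDP_NO_CHECK6_RX socket options above are toggled from userspace like any other SOL_UDP option. A small usage sketch; the fallback numeric values are assumed to match the uapi header added in this series:

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef UDP_NO_CHECK6_TX
#define UDP_NO_CHECK6_TX 101	/* assumed value, see include/uapi/linux/udp.h */
#define UDP_NO_CHECK6_RX 102
#endif

int main(void)
{
	int one = 1;
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* allow zero UDP checksums on transmit, and accept them on receive;
	 * intended for IPv6 tunnel endpoints (RFC 6935 / RFC 6936)
	 */
	if (setsockopt(fd, IPPROTO_UDP, UDP_NO_CHECK6_TX, &one, sizeof(one)) < 0)
		perror("UDP_NO_CHECK6_TX");
	if (setsockopt(fd, IPPROTO_UDP, UDP_NO_CHECK6_RX, &one, sizeof(one)) < 0)
		perror("UDP_NO_CHECK6_RX");
	return 0;
}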
*/ - if (protocol == htons(ETH_P_IP) && unlikely(uh->check)) { - struct iphdr *iph = ip_hdr(skb); - - uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, - skb->len - udp_offset, - IPPROTO_UDP, 0); - uh->check = csum_fold(skb_checksum(skb, udp_offset, - skb->len - udp_offset, 0)); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; + if (need_csum) { + __be32 delta = htonl(oldlen + len); - } else if (protocol == htons(ETH_P_IPV6)) { - struct ipv6hdr *ipv6h = ipv6_hdr(skb); - u32 len = skb->len - udp_offset; + uh->check = ~csum_fold((__force __wsum) + ((__force u32)uh->check + + (__force u32)delta)); + uh->check = gso_make_checksum(skb, ~uh->check); - uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, - len, IPPROTO_UDP, 0); - uh->check = csum_fold(skb_checksum(skb, udp_offset, len, 0)); if (uh->check == 0) uh->check = CSUM_MANGLED_0; - skb->ip_summed = CHECKSUM_NONE; } skb->protocol = protocol; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 83206de2bc7..546d2d439dd 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -14,6 +14,17 @@ #include <net/udp.h> #include <net/protocol.h> +static DEFINE_SPINLOCK(udp_offload_lock); +static struct udp_offload_priv __rcu *udp_offload_base __read_mostly; + +#define udp_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&udp_offload_lock)) + +struct udp_offload_priv { + struct udp_offload *offload; + struct rcu_head rcu; + struct udp_offload_priv __rcu *next; +}; + static int udp4_ufo_send_check(struct sk_buff *skb) { if (!pskb_may_pull(skb, sizeof(struct udphdr))) @@ -41,6 +52,15 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, { struct sk_buff *segs = ERR_PTR(-EINVAL); unsigned int mss; + int offset; + __wsum csum; + + if (skb->encapsulation && + (skb_shinfo(skb)->gso_type & + (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { + segs = skb_udp_tunnel_segment(skb, features); + goto out; + } mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) @@ -52,8 +72,10 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_IPIP | - SKB_GSO_GRE | SKB_GSO_MPLS) || + SKB_GSO_GRE | SKB_GSO_GRE_CSUM | + SKB_GSO_MPLS) || !(type & (SKB_GSO_UDP)))) goto out; @@ -63,35 +85,162 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, goto out; } + /* Do software UFO. Complete and fill in the UDP checksum as + * HW cannot do checksum of UDP packets sent as multiple + * IP fragments. + */ + offset = skb_checksum_start_offset(skb); + csum = skb_checksum(skb, offset, skb->len - offset, 0); + offset += skb->csum_offset; + *(__sum16 *)(skb->data + offset) = csum_fold(csum); + skb->ip_summed = CHECKSUM_NONE; + /* Fragment the skb. IP headers of the fragments are updated in * inet_gso_segment() */ - if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) - segs = skb_udp_tunnel_segment(skb, features); - else { - int offset; - __wsum csum; - - /* Do software UFO. Complete and fill in the UDP checksum as - * HW cannot do checksum of UDP packets sent as multiple - * IP fragments. 
- */ - offset = skb_checksum_start_offset(skb); - csum = skb_checksum(skb, offset, skb->len - offset, 0); - offset += skb->csum_offset; - *(__sum16 *)(skb->data + offset) = csum_fold(csum); - skb->ip_summed = CHECKSUM_NONE; - - segs = skb_segment(skb, features); - } + segs = skb_segment(skb, features); out: return segs; } +int udp_add_offload(struct udp_offload *uo) +{ + struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC); + + if (!new_offload) + return -ENOMEM; + + new_offload->offload = uo; + + spin_lock(&udp_offload_lock); + new_offload->next = udp_offload_base; + rcu_assign_pointer(udp_offload_base, new_offload); + spin_unlock(&udp_offload_lock); + + return 0; +} +EXPORT_SYMBOL(udp_add_offload); + +static void udp_offload_free_routine(struct rcu_head *head) +{ + struct udp_offload_priv *ou_priv = container_of(head, struct udp_offload_priv, rcu); + kfree(ou_priv); +} + +void udp_del_offload(struct udp_offload *uo) +{ + struct udp_offload_priv __rcu **head = &udp_offload_base; + struct udp_offload_priv *uo_priv; + + spin_lock(&udp_offload_lock); + + uo_priv = udp_deref_protected(*head); + for (; uo_priv != NULL; + uo_priv = udp_deref_protected(*head)) { + if (uo_priv->offload == uo) { + rcu_assign_pointer(*head, + udp_deref_protected(uo_priv->next)); + goto unlock; + } + head = &uo_priv->next; + } + pr_warn("udp_del_offload: didn't find offload for port %d\n", ntohs(uo->port)); +unlock: + spin_unlock(&udp_offload_lock); + if (uo_priv != NULL) + call_rcu(&uo_priv->rcu, udp_offload_free_routine); +} +EXPORT_SYMBOL(udp_del_offload); + +static struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +{ + struct udp_offload_priv *uo_priv; + struct sk_buff *p, **pp = NULL; + struct udphdr *uh, *uh2; + unsigned int hlen, off; + int flush = 1; + + if (NAPI_GRO_CB(skb)->udp_mark || + (!skb->encapsulation && skb->ip_summed != CHECKSUM_COMPLETE)) + goto out; + + /* mark that this skb passed once through the udp gro layer */ + NAPI_GRO_CB(skb)->udp_mark = 1; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*uh); + uh = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + uh = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!uh)) + goto out; + } + + rcu_read_lock(); + uo_priv = rcu_dereference(udp_offload_base); + for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { + if (uo_priv->offload->port == uh->dest && + uo_priv->offload->callbacks.gro_receive) + goto unflush; + } + goto out_unlock; + +unflush: + flush = 0; + + for (p = *head; p; p = p->next) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + uh2 = (struct udphdr *)(p->data + off); + if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + + skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ + skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); + pp = uo_priv->offload->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + return pp; +} + +static int udp_gro_complete(struct sk_buff *skb, int nhoff) +{ + struct udp_offload_priv *uo_priv; + __be16 newlen = htons(skb->len - nhoff); + struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); + int err = -ENOSYS; + + uh->len = newlen; + + rcu_read_lock(); + + uo_priv = rcu_dereference(udp_offload_base); + for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { + if (uo_priv->offload->port == uh->dest && + uo_priv->offload->callbacks.gro_complete) + break; + 
} + + if (uo_priv != NULL) + err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr)); + + rcu_read_unlock(); + return err; +} + static const struct net_offload udpv4_offload = { .callbacks = { .gso_send_check = udp4_ufo_send_check, .gso_segment = udp4_ufo_fragment, + .gro_receive = udp_gro_receive, + .gro_complete = udp_gro_complete, }, }; diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 2c46acd4cc3..3b3efbda48e 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -70,7 +70,6 @@ static struct inet_protosw udplite4_protosw = { .protocol = IPPROTO_UDPLITE, .prot = &udplite_prot, .ops = &inet_dgram_ops, - .no_check = 0, /* must checksum (RFC 3828) */ .flags = INET_PROTOSW_PERMANENT, }; diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 1f12c8b4586..aac6197b7a7 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -37,15 +37,6 @@ drop: return NET_RX_DROP; } -int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, - int encap_type) -{ - XFRM_SPI_SKB_CB(skb)->family = AF_INET; - XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); - return xfrm_input(skb, nexthdr, spi, encap_type); -} -EXPORT_SYMBOL(xfrm4_rcv_encap); - int xfrm4_transport_finish(struct sk_buff *skb, int async) { struct iphdr *iph = ip_hdr(skb); diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index e3db3f91511..71acd0014f2 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -48,7 +48,7 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4); skb_set_network_header(skb, -x->props.header_len - - hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph))); + hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph))); if (x->sel.family != AF_INET6) skb->network_header += IPV4_BEET_PHMAXLEN; skb->mac_header = skb->network_header + diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 31b18152528..91771a7c802 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -15,65 +15,6 @@ #include <net/ip.h> #include <net/xfrm.h> -/* Informational hook. The decap is still done here. 
*/ -static struct xfrm_tunnel_notifier __rcu *rcv_notify_handlers __read_mostly; -static DEFINE_MUTEX(xfrm4_mode_tunnel_input_mutex); - -int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel_notifier *handler) -{ - struct xfrm_tunnel_notifier __rcu **pprev; - struct xfrm_tunnel_notifier *t; - int ret = -EEXIST; - int priority = handler->priority; - - mutex_lock(&xfrm4_mode_tunnel_input_mutex); - - for (pprev = &rcv_notify_handlers; - (t = rcu_dereference_protected(*pprev, - lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL; - pprev = &t->next) { - if (t->priority > priority) - break; - if (t->priority == priority) - goto err; - - } - - handler->next = *pprev; - rcu_assign_pointer(*pprev, handler); - - ret = 0; - -err: - mutex_unlock(&xfrm4_mode_tunnel_input_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_register); - -int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel_notifier *handler) -{ - struct xfrm_tunnel_notifier __rcu **pprev; - struct xfrm_tunnel_notifier *t; - int ret = -ENOENT; - - mutex_lock(&xfrm4_mode_tunnel_input_mutex); - for (pprev = &rcv_notify_handlers; - (t = rcu_dereference_protected(*pprev, - lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL; - pprev = &t->next) { - if (t == handler) { - *pprev = handler->next; - ret = 0; - break; - } - } - mutex_unlock(&xfrm4_mode_tunnel_input_mutex); - synchronize_net(); - - return ret; -} -EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_deregister); - static inline void ipip_ecn_decapsulate(struct sk_buff *skb) { struct iphdr *inner_iph = ipip_hdr(skb); @@ -117,24 +58,18 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); - ip_select_ident(skb, dst->child, NULL); top_iph->ttl = ip4_dst_hoplimit(dst->child); top_iph->saddr = x->props.saddr.a4; top_iph->daddr = x->id.daddr.a4; + ip_select_ident(skb, NULL); return 0; } -#define for_each_input_rcu(head, handler) \ - for (handler = rcu_dereference(head); \ - handler != NULL; \ - handler = rcu_dereference(handler->next)) - static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { - struct xfrm_tunnel_notifier *handler; int err = -EINVAL; if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) @@ -143,9 +78,6 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto out; - for_each_input_rcu(rcv_notify_handlers, handler) - handler->handler(skb); - err = skb_unclone(skb, GFP_ATOMIC); if (err) goto out; diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index baa0f63731f..d5f6bd9a210 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -25,7 +25,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) goto out; - if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) + if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df) goto out; mtu = dst_mtu(skb_dst(skb)); @@ -62,10 +62,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) if (err) return err; - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED; - - skb->protocol = htons(ETH_P_IP); + IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; return x->outer_mode->output2(x, skb); } @@ -73,27 +70,34 @@ EXPORT_SYMBOL(xfrm4_prepare_output); int xfrm4_output_finish(struct sk_buff *skb) { -#ifdef CONFIG_NETFILTER - if 
(!skb_dst(skb)->xfrm) { - IPCB(skb)->flags |= IPSKB_REROUTED; - return dst_output(skb); - } + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + skb->protocol = htons(ETH_P_IP); +#ifdef CONFIG_NETFILTER IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; #endif - skb->protocol = htons(ETH_P_IP); return xfrm_output(skb); } -int xfrm4_output(struct sk_buff *skb) +static int __xfrm4_output(struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); - struct xfrm_state *x = dst->xfrm; + struct xfrm_state *x = skb_dst(skb)->xfrm; + +#ifdef CONFIG_NETFILTER + if (!x) { + IPCB(skb)->flags |= IPSKB_REROUTED; + return dst_output(skb); + } +#endif + return x->outer_mode->afinfo->output_finish(skb); +} + +int xfrm4_output(struct sock *sk, struct sk_buff *skb) +{ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, - NULL, dst->dev, - x->outer_mode->afinfo->output_finish, + NULL, skb_dst(skb)->dev, __xfrm4_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index e1a63930a96..6156f68a1e9 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -325,6 +325,7 @@ void __init xfrm4_init(void) xfrm4_state_init(); xfrm4_policy_init(); + xfrm4_protocol_init(); #ifdef CONFIG_SYSCTL register_pernet_subsys(&xfrm4_net_ops); #endif diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c new file mode 100644 index 00000000000..a2ce0101eaa --- /dev/null +++ b/net/ipv4/xfrm4_protocol.c @@ -0,0 +1,301 @@ +/* xfrm4_protocol.c - Generic xfrm protocol multiplexer. + * + * Copyright (C) 2013 secunet Security Networks AG + * + * Author: + * Steffen Klassert <steffen.klassert@secunet.com> + * + * Based on: + * net/ipv4/tunnel4.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/skbuff.h> +#include <net/icmp.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/xfrm.h> + +static struct xfrm4_protocol __rcu *esp4_handlers __read_mostly; +static struct xfrm4_protocol __rcu *ah4_handlers __read_mostly; +static struct xfrm4_protocol __rcu *ipcomp4_handlers __read_mostly; +static DEFINE_MUTEX(xfrm4_protocol_mutex); + +static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol) +{ + switch (protocol) { + case IPPROTO_ESP: + return &esp4_handlers; + case IPPROTO_AH: + return &ah4_handlers; + case IPPROTO_COMP: + return &ipcomp4_handlers; + } + + return NULL; +} + +#define for_each_protocol_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) \ + +int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err) +{ + int ret; + struct xfrm4_protocol *handler; + struct xfrm4_protocol __rcu **head = proto_handlers(protocol); + + if (!head) + return 0; + + for_each_protocol_rcu(*head, handler) + if ((ret = handler->cb_handler(skb, err)) <= 0) + return ret; + + return 0; +} +EXPORT_SYMBOL(xfrm4_rcv_cb); + +int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) +{ + int ret; + struct xfrm4_protocol *handler; + struct xfrm4_protocol __rcu **head = proto_handlers(nexthdr); + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + + if (!head) + goto out; + + for_each_protocol_rcu(*head, handler) + if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL) + return ret; + +out: + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL(xfrm4_rcv_encap); + +static int xfrm4_esp_rcv(struct sk_buff *skb) +{ + int ret; + struct xfrm4_protocol *handler; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + + for_each_protocol_rcu(esp4_handlers, handler) + if ((ret = handler->handler(skb)) != -EINVAL) + return ret; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} + +static void xfrm4_esp_err(struct sk_buff *skb, u32 info) +{ + struct xfrm4_protocol *handler; + + for_each_protocol_rcu(esp4_handlers, handler) + if (!handler->err_handler(skb, info)) + break; +} + +static int xfrm4_ah_rcv(struct sk_buff *skb) +{ + int ret; + struct xfrm4_protocol *handler; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + + for_each_protocol_rcu(ah4_handlers, handler) + if ((ret = handler->handler(skb)) != -EINVAL) + return ret;; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} + +static void xfrm4_ah_err(struct sk_buff *skb, u32 info) +{ + struct xfrm4_protocol *handler; + + for_each_protocol_rcu(ah4_handlers, handler) + if (!handler->err_handler(skb, info)) + break; +} + +static int xfrm4_ipcomp_rcv(struct sk_buff *skb) +{ + int ret; + struct xfrm4_protocol *handler; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + + for_each_protocol_rcu(ipcomp4_handlers, handler) + if ((ret = handler->handler(skb)) != -EINVAL) + return ret; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} + +static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info) +{ + struct xfrm4_protocol *handler; + + for_each_protocol_rcu(ipcomp4_handlers, handler) + if (!handler->err_handler(skb, info)) + break; +} + +static const struct net_protocol 
esp4_protocol = { + .handler = xfrm4_esp_rcv, + .err_handler = xfrm4_esp_err, + .no_policy = 1, + .netns_ok = 1, +}; + +static const struct net_protocol ah4_protocol = { + .handler = xfrm4_ah_rcv, + .err_handler = xfrm4_ah_err, + .no_policy = 1, + .netns_ok = 1, +}; + +static const struct net_protocol ipcomp4_protocol = { + .handler = xfrm4_ipcomp_rcv, + .err_handler = xfrm4_ipcomp_err, + .no_policy = 1, + .netns_ok = 1, +}; + +static struct xfrm_input_afinfo xfrm4_input_afinfo = { + .family = AF_INET, + .owner = THIS_MODULE, + .callback = xfrm4_rcv_cb, +}; + +static inline const struct net_protocol *netproto(unsigned char protocol) +{ + switch (protocol) { + case IPPROTO_ESP: + return &esp4_protocol; + case IPPROTO_AH: + return &ah4_protocol; + case IPPROTO_COMP: + return &ipcomp4_protocol; + } + + return NULL; +} + +int xfrm4_protocol_register(struct xfrm4_protocol *handler, + unsigned char protocol) +{ + struct xfrm4_protocol __rcu **pprev; + struct xfrm4_protocol *t; + bool add_netproto = false; + int ret = -EEXIST; + int priority = handler->priority; + + if (!proto_handlers(protocol) || !netproto(protocol)) + return -EINVAL; + + mutex_lock(&xfrm4_protocol_mutex); + + if (!rcu_dereference_protected(*proto_handlers(protocol), + lockdep_is_held(&xfrm4_protocol_mutex))) + add_netproto = true; + + for (pprev = proto_handlers(protocol); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&xfrm4_protocol_mutex))) != NULL; + pprev = &t->next) { + if (t->priority < priority) + break; + if (t->priority == priority) + goto err; + } + + handler->next = *pprev; + rcu_assign_pointer(*pprev, handler); + + ret = 0; + +err: + mutex_unlock(&xfrm4_protocol_mutex); + + if (add_netproto) { + if (inet_add_protocol(netproto(protocol), protocol)) { + pr_err("%s: can't add protocol\n", __func__); + ret = -EAGAIN; + } + } + + return ret; +} +EXPORT_SYMBOL(xfrm4_protocol_register); + +int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, + unsigned char protocol) +{ + struct xfrm4_protocol __rcu **pprev; + struct xfrm4_protocol *t; + int ret = -ENOENT; + + if (!proto_handlers(protocol) || !netproto(protocol)) + return -EINVAL; + + mutex_lock(&xfrm4_protocol_mutex); + + for (pprev = proto_handlers(protocol); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&xfrm4_protocol_mutex))) != NULL; + pprev = &t->next) { + if (t == handler) { + *pprev = handler->next; + ret = 0; + break; + } + } + + if (!rcu_dereference_protected(*proto_handlers(protocol), + lockdep_is_held(&xfrm4_protocol_mutex))) { + if (inet_del_protocol(netproto(protocol), protocol) < 0) { + pr_err("%s: can't remove protocol\n", __func__); + ret = -EAGAIN; + } + } + + mutex_unlock(&xfrm4_protocol_mutex); + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(xfrm4_protocol_deregister); + +void __init xfrm4_protocol_init(void) +{ + xfrm_input_register_afinfo(&xfrm4_input_afinfo); +} +EXPORT_SYMBOL(xfrm4_protocol_init); diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 0b2a0641526..542074c00c7 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -16,7 +16,7 @@ static int xfrm4_init_flags(struct xfrm_state *x) { - if (ipv4_config.no_pmtu_disc) + if (xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc) x->props.flags |= XFRM_STATE_NOPMTUDISC; return 0; } diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index d92e5586783..438a73aa777 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -138,6 +138,7 @@ config INET6_XFRM_MODE_ROUTEOPTIMIZATION config IPV6_VTI tristate "Virtual (secure) IPv6: tunneling" select 
IPV6_TUNNEL + select NET_IP_TUNNEL depends on INET6_XFRM_MODE_TUNNEL ---help--- Tunneling means encapsulating data of one protocol type within diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 17bb830872d..2fe68364bb2 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -16,7 +16,7 @@ ipv6-$(CONFIG_SYSCTL) = sysctl_net_ipv6.o ipv6-$(CONFIG_IPV6_MROUTE) += ip6mr.o ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ - xfrm6_output.o + xfrm6_output.o xfrm6_protocol.o ipv6-$(CONFIG_NETFILTER) += netfilter.o ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o ipv6-$(CONFIG_PROC_FS) += proc.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index be4dbbd17d3..5667b3003af 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -133,10 +133,12 @@ static int ipv6_count_addresses(struct inet6_dev *idev); static struct hlist_head inet6_addr_lst[IN6_ADDR_HSIZE]; static DEFINE_SPINLOCK(addrconf_hash_lock); -static void addrconf_verify(unsigned long); +static void addrconf_verify(void); +static void addrconf_verify_rtnl(void); +static void addrconf_verify_work(struct work_struct *); -static DEFINE_TIMER(addr_chk_timer, addrconf_verify, 0, 0); -static DEFINE_SPINLOCK(addrconf_verify_lock); +static struct workqueue_struct *addrconf_wq; +static DECLARE_DELAYED_WORK(addr_chk_work, addrconf_verify_work); static void addrconf_join_anycast(struct inet6_ifaddr *ifp); static void addrconf_leave_anycast(struct inet6_ifaddr *ifp); @@ -151,7 +153,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, u32 flags, u32 noflags); static void addrconf_dad_start(struct inet6_ifaddr *ifp); -static void addrconf_dad_timer(unsigned long data); +static void addrconf_dad_work(struct work_struct *w); static void addrconf_dad_completed(struct inet6_ifaddr *ifp); static void addrconf_dad_run(struct inet6_dev *idev); static void addrconf_rs_timer(unsigned long data); @@ -247,9 +249,9 @@ static void addrconf_del_rs_timer(struct inet6_dev *idev) __in6_dev_put(idev); } -static void addrconf_del_dad_timer(struct inet6_ifaddr *ifp) +static void addrconf_del_dad_work(struct inet6_ifaddr *ifp) { - if (del_timer(&ifp->dad_timer)) + if (cancel_delayed_work(&ifp->dad_work)) __in6_ifa_put(ifp); } @@ -261,31 +263,26 @@ static void addrconf_mod_rs_timer(struct inet6_dev *idev, mod_timer(&idev->rs_timer, jiffies + when); } -static void addrconf_mod_dad_timer(struct inet6_ifaddr *ifp, - unsigned long when) +static void addrconf_mod_dad_work(struct inet6_ifaddr *ifp, + unsigned long delay) { - if (!timer_pending(&ifp->dad_timer)) + if (!delayed_work_pending(&ifp->dad_work)) in6_ifa_hold(ifp); - mod_timer(&ifp->dad_timer, jiffies + when); + mod_delayed_work(addrconf_wq, &ifp->dad_work, delay); } static int snmp6_alloc_dev(struct inet6_dev *idev) { int i; - if (snmp_mib_init((void __percpu **)idev->stats.ipv6, - sizeof(struct ipstats_mib), - __alignof__(struct ipstats_mib)) < 0) + idev->stats.ipv6 = alloc_percpu(struct ipstats_mib); + if (!idev->stats.ipv6) goto err_ip; for_each_possible_cpu(i) { struct ipstats_mib *addrconf_stats; - addrconf_stats = per_cpu_ptr(idev->stats.ipv6[0], i); + addrconf_stats = per_cpu_ptr(idev->stats.ipv6, i); u64_stats_init(&addrconf_stats->syncp); -#if SNMP_ARRAY_SZ == 2 - addrconf_stats = per_cpu_ptr(idev->stats.ipv6[1], i); - u64_stats_init(&addrconf_stats->syncp); -#endif } @@ -303,7 +300,7 @@ static int snmp6_alloc_dev(struct inet6_dev *idev) err_icmpmsg: kfree(idev->stats.icmpv6dev); err_icmp: - snmp_mib_free((void __percpu **)idev->stats.ipv6); 
+ free_percpu(idev->stats.ipv6); err_ip: return -ENOMEM; } @@ -442,6 +439,8 @@ static int inet6_netconf_msgsize_devconf(int type) if (type == -1 || type == NETCONFA_MC_FORWARDING) size += nla_total_size(4); #endif + if (type == -1 || type == NETCONFA_PROXY_NEIGH) + size += nla_total_size(4); return size; } @@ -475,6 +474,10 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, devconf->mc_forwarding) < 0) goto nla_put_failure; #endif + if ((type == -1 || type == NETCONFA_PROXY_NEIGH) && + nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0) + goto nla_put_failure; + return nlmsg_end(skb, nlh); nla_put_failure: @@ -509,6 +512,7 @@ errout: static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = { [NETCONFA_IFINDEX] = { .len = sizeof(int) }, [NETCONFA_FORWARDING] = { .len = sizeof(int) }, + [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) }, }; static int inet6_netconf_get_devconf(struct sk_buff *in_skb, @@ -744,8 +748,9 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) in6_dev_put(ifp->idev); - if (del_timer(&ifp->dad_timer)) - pr_notice("Timer is still running, when freeing ifa=%p\n", ifp); + if (cancel_delayed_work(&ifp->dad_work)) + pr_notice("delayed DAD work was pending while freeing ifa=%p\n", + ifp); if (ifp->state != INET6_IFADDR_STATE_DEAD) { pr_warn("Freeing alive inet6 address %p\n", ifp); @@ -842,8 +847,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, spin_lock_init(&ifa->lock); spin_lock_init(&ifa->state_lock); - setup_timer(&ifa->dad_timer, addrconf_dad_timer, - (unsigned long)ifa); + INIT_DELAYED_WORK(&ifa->dad_work, addrconf_dad_work); INIT_HLIST_NODE(&ifa->addr_lst); ifa->scope = scope; ifa->prefix_len = pfxlen; @@ -893,15 +897,97 @@ out: goto out2; } +enum cleanup_prefix_rt_t { + CLEANUP_PREFIX_RT_NOP, /* no cleanup action for prefix route */ + CLEANUP_PREFIX_RT_DEL, /* delete the prefix route */ + CLEANUP_PREFIX_RT_EXPIRE, /* update the lifetime of the prefix route */ +}; + +/* + * Check, whether the prefix for ifp would still need a prefix route + * after deleting ifp. The function returns one of the CLEANUP_PREFIX_RT_* + * constants. + * + * 1) we don't purge prefix if address was not permanent. + * prefix is managed by its own lifetime. + * 2) we also don't purge, if the address was IFA_F_NOPREFIXROUTE. + * 3) if there are no addresses, delete prefix. + * 4) if there are still other permanent address(es), + * corresponding prefix is still permanent. + * 5) if there are still other addresses with IFA_F_NOPREFIXROUTE, + * don't purge the prefix, assume user space is managing it. + * 6) otherwise, update prefix lifetime to the + * longest valid lifetime among the corresponding + * addresses on the device. + * Note: subsequent RA will update lifetime. 
+ **/ +static enum cleanup_prefix_rt_t +check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires) +{ + struct inet6_ifaddr *ifa; + struct inet6_dev *idev = ifp->idev; + unsigned long lifetime; + enum cleanup_prefix_rt_t action = CLEANUP_PREFIX_RT_DEL; + + *expires = jiffies; + + list_for_each_entry(ifa, &idev->addr_list, if_list) { + if (ifa == ifp) + continue; + if (!ipv6_prefix_equal(&ifa->addr, &ifp->addr, + ifp->prefix_len)) + continue; + if (ifa->flags & (IFA_F_PERMANENT | IFA_F_NOPREFIXROUTE)) + return CLEANUP_PREFIX_RT_NOP; + + action = CLEANUP_PREFIX_RT_EXPIRE; + + spin_lock(&ifa->lock); + + lifetime = addrconf_timeout_fixup(ifa->valid_lft, HZ); + /* + * Note: Because this address is + * not permanent, lifetime < + * LONG_MAX / HZ here. + */ + if (time_before(*expires, ifa->tstamp + lifetime * HZ)) + *expires = ifa->tstamp + lifetime * HZ; + spin_unlock(&ifa->lock); + } + + return action; +} + +static void +cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt) +{ + struct rt6_info *rt; + + rt = addrconf_get_prefix_route(&ifp->addr, + ifp->prefix_len, + ifp->idev->dev, + 0, RTF_GATEWAY | RTF_DEFAULT); + if (rt) { + if (del_rt) + ip6_del_rt(rt); + else { + if (!(rt->rt6i_flags & RTF_EXPIRES)) + rt6_set_expires(rt, expires); + ip6_rt_put(rt); + } + } +} + + /* This function wants to get referenced ifp and releases it before return */ static void ipv6_del_addr(struct inet6_ifaddr *ifp) { - struct inet6_ifaddr *ifa, *ifn; - struct inet6_dev *idev = ifp->idev; int state; - int deleted = 0, onlink = 0; - unsigned long expires = jiffies; + enum cleanup_prefix_rt_t action = CLEANUP_PREFIX_RT_NOP; + unsigned long expires; + + ASSERT_RTNL(); spin_lock_bh(&ifp->state_lock); state = ifp->state; @@ -915,7 +1001,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) hlist_del_init_rcu(&ifp->addr_lst); spin_unlock_bh(&addrconf_hash_lock); - write_lock_bh(&idev->lock); + write_lock_bh(&ifp->idev->lock); if (ifp->flags&IFA_F_TEMPORARY) { list_del(&ifp->tmp_list); @@ -926,87 +1012,23 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) __in6_ifa_put(ifp); } - list_for_each_entry_safe(ifa, ifn, &idev->addr_list, if_list) { - if (ifa == ifp) { - list_del_init(&ifp->if_list); - __in6_ifa_put(ifp); + if (ifp->flags & IFA_F_PERMANENT && !(ifp->flags & IFA_F_NOPREFIXROUTE)) + action = check_cleanup_prefix_route(ifp, &expires); - if (!(ifp->flags & IFA_F_PERMANENT) || onlink > 0) - break; - deleted = 1; - continue; - } else if (ifp->flags & IFA_F_PERMANENT) { - if (ipv6_prefix_equal(&ifa->addr, &ifp->addr, - ifp->prefix_len)) { - if (ifa->flags & IFA_F_PERMANENT) { - onlink = 1; - if (deleted) - break; - } else { - unsigned long lifetime; - - if (!onlink) - onlink = -1; - - spin_lock(&ifa->lock); - - lifetime = addrconf_timeout_fixup(ifa->valid_lft, HZ); - /* - * Note: Because this address is - * not permanent, lifetime < - * LONG_MAX / HZ here. - */ - if (time_before(expires, - ifa->tstamp + lifetime * HZ)) - expires = ifa->tstamp + lifetime * HZ; - spin_unlock(&ifa->lock); - } - } - } - } - write_unlock_bh(&idev->lock); + list_del_init(&ifp->if_list); + __in6_ifa_put(ifp); + + write_unlock_bh(&ifp->idev->lock); - addrconf_del_dad_timer(ifp); + addrconf_del_dad_work(ifp); ipv6_ifa_notify(RTM_DELADDR, ifp); inet6addr_notifier_call_chain(NETDEV_DOWN, ifp); - /* - * Purge or update corresponding prefix - * - * 1) we don't purge prefix here if address was not permanent. - * prefix is managed by its own lifetime. - * 2) if there're no addresses, delete prefix. 
- * 3) if there're still other permanent address(es), - * corresponding prefix is still permanent. - * 4) otherwise, update prefix lifetime to the - * longest valid lifetime among the corresponding - * addresses on the device. - * Note: subsequent RA will update lifetime. - * - * --yoshfuji - */ - if ((ifp->flags & IFA_F_PERMANENT) && onlink < 1) { - struct in6_addr prefix; - struct rt6_info *rt; - - ipv6_addr_prefix(&prefix, &ifp->addr, ifp->prefix_len); - - rt = addrconf_get_prefix_route(&prefix, - ifp->prefix_len, - ifp->idev->dev, - 0, RTF_GATEWAY | RTF_DEFAULT); - - if (rt) { - if (onlink == 0) { - ip6_del_rt(rt); - rt = NULL; - } else if (!(rt->rt6i_flags & RTF_EXPIRES)) { - rt6_set_expires(rt, expires); - } - } - ip6_rt_put(rt); + if (action != CLEANUP_PREFIX_RT_NOP) { + cleanup_prefix_route(ifp, expires, + action == CLEANUP_PREFIX_RT_DEL); } /* clean up prefsrc entries */ @@ -1080,8 +1102,11 @@ retry: * Lifetime is greater than REGEN_ADVANCE time units. In particular, * an implementation must not create a temporary address with a zero * Preferred Lifetime. + * Use age calculation as in addrconf_verify to avoid unnecessary + * temporary addresses being generated. */ - if (tmp_prefered_lft <= regen_advance) { + age = (now - tmp_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; + if (tmp_prefered_lft <= regen_advance + age) { in6_ifa_put(ifp); in6_dev_put(idev); ret = -1; @@ -1202,7 +1227,7 @@ static int ipv6_get_saddr_eval(struct net *net, * | d is scope of the destination. * B-d | \ * | \ <- smaller scope is better if - * B-15 | \ if scope is enough for destinaion. + * B-15 | \ if scope is enough for destination. * | ret = B - scope (-1 <= scope >= d <= 15). * d-C-1 | / * |/ <- greater is better @@ -1414,7 +1439,9 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, struct inet6_ifaddr *ifp; int err = -EADDRNOTAVAIL; - list_for_each_entry(ifp, &idev->addr_list, if_list) { + list_for_each_entry_reverse(ifp, &idev->addr_list, if_list) { + if (ifp->scope > IFA_LINK) + break; if (ifp->scope == IFA_LINK && !(ifp->flags & banned_flags)) { *addr = ifp->addr; @@ -1576,7 +1603,7 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed) { if (ifp->flags&IFA_F_PERMANENT) { spin_lock_bh(&ifp->lock); - addrconf_del_dad_timer(ifp); + addrconf_del_dad_work(ifp); ifp->flags |= IFA_F_TENTATIVE; if (dad_failed) ifp->flags |= IFA_F_DADFAILED; @@ -1597,20 +1624,21 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed) spin_unlock_bh(&ifp->lock); } ipv6_del_addr(ifp); - } else + } else { ipv6_del_addr(ifp); + } } static int addrconf_dad_end(struct inet6_ifaddr *ifp) { int err = -ENOENT; - spin_lock(&ifp->state_lock); + spin_lock_bh(&ifp->state_lock); if (ifp->state == INET6_IFADDR_STATE_DAD) { ifp->state = INET6_IFADDR_STATE_POSTDAD; err = 0; } - spin_unlock(&ifp->state_lock); + spin_unlock_bh(&ifp->state_lock); return err; } @@ -1643,7 +1671,12 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) } } - addrconf_dad_stop(ifp, 1); + spin_lock_bh(&ifp->state_lock); + /* transition from _POSTDAD to _ERRDAD */ + ifp->state = INET6_IFADDR_STATE_ERRDAD; + spin_unlock_bh(&ifp->state_lock); + + addrconf_mod_dad_work(ifp, 0); } /* Join to solicited addr multicast group. 
*/ @@ -1652,6 +1685,8 @@ void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr) { struct in6_addr maddr; + ASSERT_RTNL(); + if (dev->flags&(IFF_LOOPBACK|IFF_NOARP)) return; @@ -1663,6 +1698,8 @@ void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr) { struct in6_addr maddr; + ASSERT_RTNL(); + if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP)) return; @@ -1673,7 +1710,10 @@ void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr) static void addrconf_join_anycast(struct inet6_ifaddr *ifp) { struct in6_addr addr; - if (ifp->prefix_len == 127) /* RFC 6164 */ + + ASSERT_RTNL(); + + if (ifp->prefix_len >= 127) /* RFC 6164 */ return; ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); if (ipv6_addr_any(&addr)) @@ -1684,7 +1724,10 @@ static void addrconf_join_anycast(struct inet6_ifaddr *ifp) static void addrconf_leave_anycast(struct inet6_ifaddr *ifp) { struct in6_addr addr; - if (ifp->prefix_len == 127) /* RFC 6164 */ + + ASSERT_RTNL(); + + if (ifp->prefix_len >= 127) /* RFC 6164 */ return; ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); if (ipv6_addr_any(&addr)) @@ -1818,6 +1861,7 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev) return addrconf_ifid_sit(eui, dev); case ARPHRD_IPGRE: return addrconf_ifid_gre(eui, dev); + case ARPHRD_6LOWPAN: case ARPHRD_IEEE802154: return addrconf_ifid_eui64(eui, dev); case ARPHRD_IEEE1394: @@ -1834,7 +1878,9 @@ static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev) struct inet6_ifaddr *ifp; read_lock_bh(&idev->lock); - list_for_each_entry(ifp, &idev->addr_list, if_list) { + list_for_each_entry_reverse(ifp, &idev->addr_list, if_list) { + if (ifp->scope > IFA_LINK) + break; if (ifp->scope == IFA_LINK && !(ifp->flags&IFA_F_TENTATIVE)) { memcpy(eui, ifp->addr.s6_addr+8, 8); err = 0; @@ -2240,11 +2286,13 @@ ok: return; } - ifp->flags |= IFA_F_MANAGETEMPADDR; update_lft = 0; create = 1; + spin_lock_bh(&ifp->lock); + ifp->flags |= IFA_F_MANAGETEMPADDR; ifp->cstamp = jiffies; ifp->tokenized = tokenized; + spin_unlock_bh(&ifp->lock); addrconf_dad_start(ifp); } @@ -2295,7 +2343,7 @@ ok: create, now); in6_ifa_put(ifp); - addrconf_verify(0); + addrconf_verify(); } } inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo); @@ -2429,8 +2477,11 @@ static int inet6_addr_add(struct net *net, int ifindex, valid_lft, prefered_lft); if (!IS_ERR(ifp)) { - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, - expires, flags); + if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, + expires, flags); + } + /* * Note that section 3.1 of RFC 4429 indicates * that the Optimistic flag should not be set for @@ -2441,15 +2492,15 @@ static int inet6_addr_add(struct net *net, int ifindex, manage_tempaddrs(idev, ifp, valid_lft, prefered_lft, true, jiffies); in6_ifa_put(ifp); - addrconf_verify(0); + addrconf_verify_rtnl(); return 0; } return PTR_ERR(ifp); } -static int inet6_addr_del(struct net *net, int ifindex, const struct in6_addr *pfx, - unsigned int plen) +static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, + const struct in6_addr *pfx, unsigned int plen) { struct inet6_ifaddr *ifp; struct inet6_dev *idev; @@ -2472,7 +2523,12 @@ static int inet6_addr_del(struct net *net, int ifindex, const struct in6_addr *p in6_ifa_hold(ifp); read_unlock_bh(&idev->lock); + if (!(ifp->flags & IFA_F_TEMPORARY) && + (ifa_flags & IFA_F_MANAGETEMPADDR)) + manage_tempaddrs(idev, ifp, 0, 0, false, + jiffies); ipv6_del_addr(ifp); + 
addrconf_verify_rtnl(); return 0; } } @@ -2512,7 +2568,7 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg) return -EFAULT; rtnl_lock(); - err = inet6_addr_del(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, + err = inet6_addr_del(net, ireq.ifr6_ifindex, 0, &ireq.ifr6_addr, ireq.ifr6_prefixlen); rtnl_unlock(); return err; @@ -2524,7 +2580,8 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, struct inet6_ifaddr *ifp; ifp = ipv6_add_addr(idev, addr, NULL, plen, - scope, IFA_F_PERMANENT, 0, 0); + scope, IFA_F_PERMANENT, + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); if (!IS_ERR(ifp)) { spin_lock_bh(&ifp->lock); ifp->flags &= ~IFA_F_TENTATIVE; @@ -2625,8 +2682,18 @@ static void init_loopback(struct net_device *dev) if (sp_ifa->flags & (IFA_F_DADFAILED | IFA_F_TENTATIVE)) continue; - if (sp_ifa->rt) - continue; + if (sp_ifa->rt) { + /* This dst has been added to garbage list when + * lo device down, release this obsolete dst and + * reallocate a new router for ifa. + */ + if (sp_ifa->rt->dst.obsolete > 0) { + ip6_rt_put(sp_ifa->rt); + sp_ifa->rt = NULL; + } else { + continue; + } + } sp_rt = addrconf_dst_alloc(idev, &sp_ifa->addr, false); @@ -2652,7 +2719,8 @@ static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr #endif - ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags, 0, 0); + ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags, + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); if (!IS_ERR(ifp)) { addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0); addrconf_dad_start(ifp); @@ -2673,7 +2741,8 @@ static void addrconf_dev_config(struct net_device *dev) (dev->type != ARPHRD_INFINIBAND) && (dev->type != ARPHRD_IEEE802154) && (dev->type != ARPHRD_IEEE1394) && - (dev->type != ARPHRD_TUNNEL6)) { + (dev->type != ARPHRD_TUNNEL6) && + (dev->type != ARPHRD_6LOWPAN)) { /* Alas, we support only Ethernet autoconfiguration. */ return; } @@ -2739,21 +2808,11 @@ static void addrconf_gre_config(struct net_device *dev) ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); if (!ipv6_generate_eui64(addr.s6_addr + 8, dev)) addrconf_add_linklocal(idev, &addr); + else + addrconf_prefix_route(&addr, 64, dev, 0, 0); } #endif -static inline int -ipv6_inherit_linklocal(struct inet6_dev *idev, struct net_device *link_dev) -{ - struct in6_addr lladdr; - - if (!ipv6_get_lladdr(link_dev, &lladdr, IFA_F_TENTATIVE)) { - addrconf_add_linklocal(idev, &lladdr); - return 0; - } - return -1; -} - static int addrconf_notify(struct notifier_block *this, unsigned long event, void *ptr) { @@ -2870,7 +2929,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, } /* - * MTU falled under IPV6_MIN_MTU. + * if MTU under IPV6_MIN_MTU. * Stop IPv6 on this interface. */ @@ -2962,7 +3021,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) hlist_for_each_entry_rcu(ifa, h, addr_lst) { if (ifa->idev == idev) { hlist_del_init_rcu(&ifa->addr_lst); - addrconf_del_dad_timer(ifa); + addrconf_del_dad_work(ifa); goto restart; } } @@ -3000,7 +3059,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) while (!list_empty(&idev->addr_list)) { ifa = list_first_entry(&idev->addr_list, struct inet6_ifaddr, if_list); - addrconf_del_dad_timer(ifa); + addrconf_del_dad_work(ifa); list_del(&ifa->if_list); @@ -3096,20 +3155,20 @@ static void addrconf_dad_kick(struct inet6_ifaddr *ifp) if (ifp->flags & IFA_F_OPTIMISTIC) rand_num = 0; else - rand_num = net_random() % (idev->cnf.rtr_solicit_delay ? 
: 1); + rand_num = prandom_u32() % (idev->cnf.rtr_solicit_delay ? : 1); ifp->dad_probes = idev->cnf.dad_transmits; - addrconf_mod_dad_timer(ifp, rand_num); + addrconf_mod_dad_work(ifp, rand_num); } -static void addrconf_dad_start(struct inet6_ifaddr *ifp) +static void addrconf_dad_begin(struct inet6_ifaddr *ifp) { struct inet6_dev *idev = ifp->idev; struct net_device *dev = idev->dev; addrconf_join_solict(dev, &ifp->addr); - net_srandom(ifp->addr.s6_addr32[3]); + prandom_seed((__force u32) ifp->addr.s6_addr32[3]); read_lock_bh(&idev->lock); spin_lock(&ifp->lock); @@ -3154,25 +3213,68 @@ out: read_unlock_bh(&idev->lock); } -static void addrconf_dad_timer(unsigned long data) +static void addrconf_dad_start(struct inet6_ifaddr *ifp) +{ + bool begin_dad = false; + + spin_lock_bh(&ifp->state_lock); + if (ifp->state != INET6_IFADDR_STATE_DEAD) { + ifp->state = INET6_IFADDR_STATE_PREDAD; + begin_dad = true; + } + spin_unlock_bh(&ifp->state_lock); + + if (begin_dad) + addrconf_mod_dad_work(ifp, 0); +} + +static void addrconf_dad_work(struct work_struct *w) { - struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; + struct inet6_ifaddr *ifp = container_of(to_delayed_work(w), + struct inet6_ifaddr, + dad_work); struct inet6_dev *idev = ifp->idev; struct in6_addr mcaddr; + enum { + DAD_PROCESS, + DAD_BEGIN, + DAD_ABORT, + } action = DAD_PROCESS; + + rtnl_lock(); + + spin_lock_bh(&ifp->state_lock); + if (ifp->state == INET6_IFADDR_STATE_PREDAD) { + action = DAD_BEGIN; + ifp->state = INET6_IFADDR_STATE_DAD; + } else if (ifp->state == INET6_IFADDR_STATE_ERRDAD) { + action = DAD_ABORT; + ifp->state = INET6_IFADDR_STATE_POSTDAD; + } + spin_unlock_bh(&ifp->state_lock); + + if (action == DAD_BEGIN) { + addrconf_dad_begin(ifp); + goto out; + } else if (action == DAD_ABORT) { + addrconf_dad_stop(ifp, 1); + goto out; + } + if (!ifp->dad_probes && addrconf_dad_end(ifp)) goto out; - write_lock(&idev->lock); + write_lock_bh(&idev->lock); if (idev->dead || !(idev->if_flags & IF_READY)) { - write_unlock(&idev->lock); + write_unlock_bh(&idev->lock); goto out; } spin_lock(&ifp->lock); if (ifp->state == INET6_IFADDR_STATE_DEAD) { spin_unlock(&ifp->lock); - write_unlock(&idev->lock); + write_unlock_bh(&idev->lock); goto out; } @@ -3183,7 +3285,7 @@ static void addrconf_dad_timer(unsigned long data) ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); spin_unlock(&ifp->lock); - write_unlock(&idev->lock); + write_unlock_bh(&idev->lock); addrconf_dad_completed(ifp); @@ -3191,16 +3293,35 @@ static void addrconf_dad_timer(unsigned long data) } ifp->dad_probes--; - addrconf_mod_dad_timer(ifp, - NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME)); + addrconf_mod_dad_work(ifp, + NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME)); spin_unlock(&ifp->lock); - write_unlock(&idev->lock); + write_unlock_bh(&idev->lock); /* send a neighbour solicitation for our addr */ addrconf_addr_solict_mult(&ifp->addr, &mcaddr); ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any); out: in6_ifa_put(ifp); + rtnl_unlock(); +} + +/* ifp->idev must be at least read locked */ +static bool ipv6_lonely_lladdr(struct inet6_ifaddr *ifp) +{ + struct inet6_ifaddr *ifpiter; + struct inet6_dev *idev = ifp->idev; + + list_for_each_entry_reverse(ifpiter, &idev->addr_list, if_list) { + if (ifpiter->scope > IFA_LINK) + break; + if (ifp != ifpiter && ifpiter->scope == IFA_LINK && + (ifpiter->flags & (IFA_F_PERMANENT|IFA_F_TENTATIVE| + IFA_F_OPTIMISTIC|IFA_F_DADFAILED)) == + IFA_F_PERMANENT) + return false; + } + return true; } static void 
addrconf_dad_completed(struct inet6_ifaddr *ifp) @@ -3209,7 +3330,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) struct in6_addr lladdr; bool send_rs, send_mld; - addrconf_del_dad_timer(ifp); + addrconf_del_dad_work(ifp); /* * Configure the address for reception. Now it is valid. @@ -3222,14 +3343,11 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) */ read_lock_bh(&ifp->idev->lock); - spin_lock(&ifp->lock); - send_mld = ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL && - ifp->idev->valid_ll_addr_cnt == 1; + send_mld = ifp->scope == IFA_LINK && ipv6_lonely_lladdr(ifp); send_rs = send_mld && ipv6_accept_ra(ifp->idev) && ifp->idev->cnf.rtr_solicits > 0 && (dev->flags&IFF_LOOPBACK) == 0; - spin_unlock(&ifp->lock); read_unlock_bh(&ifp->idev->lock); /* While dad is in progress mld report's source address is in6_addrany. @@ -3367,12 +3485,12 @@ static void if6_seq_stop(struct seq_file *seq, void *v) static int if6_seq_show(struct seq_file *seq, void *v) { struct inet6_ifaddr *ifp = (struct inet6_ifaddr *)v; - seq_printf(seq, "%pi6 %02x %02x %02x %03x %8s\n", + seq_printf(seq, "%pi6 %02x %02x %02x %02x %8s\n", &ifp->addr, ifp->idev->dev->ifindex, ifp->prefix_len, ifp->scope, - ifp->flags, + (u8) ifp->flags, ifp->idev->dev->name); return 0; } @@ -3453,26 +3571,31 @@ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr) * Periodic address status verification */ -static void addrconf_verify(unsigned long foo) +static void addrconf_verify_rtnl(void) { unsigned long now, next, next_sec, next_sched; struct inet6_ifaddr *ifp; int i; + ASSERT_RTNL(); + rcu_read_lock_bh(); - spin_lock(&addrconf_verify_lock); now = jiffies; next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); - del_timer(&addr_chk_timer); + cancel_delayed_work(&addr_chk_work); for (i = 0; i < IN6_ADDR_HSIZE; i++) { restart: - hlist_for_each_entry_rcu_bh(ifp, - &inet6_addr_lst[i], addr_lst) { + hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[i], addr_lst) { unsigned long age; - if (ifp->flags & IFA_F_PERMANENT) + /* When setting preferred_lft to a value not zero or + * infinity, while valid_lft is infinity + * IFA_F_PERMANENT has a non-infinity life time. 
+ */ + if ((ifp->flags & IFA_F_PERMANENT) && + (ifp->prefered_lft == INFINITY_LIFE_TIME)) continue; spin_lock(&ifp->lock); @@ -3497,7 +3620,8 @@ restart: ifp->flags |= IFA_F_DEPRECATED; } - if (time_before(ifp->tstamp + ifp->valid_lft * HZ, next)) + if ((ifp->valid_lft != INFINITY_LIFE_TIME) && + (time_before(ifp->tstamp + ifp->valid_lft * HZ, next))) next = ifp->tstamp + ifp->valid_lft * HZ; spin_unlock(&ifp->lock); @@ -3558,13 +3682,22 @@ restart: ADBG(KERN_DEBUG "now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n", now, next, next_sec, next_sched); - - addr_chk_timer.expires = next_sched; - add_timer(&addr_chk_timer); - spin_unlock(&addrconf_verify_lock); + mod_delayed_work(addrconf_wq, &addr_chk_work, next_sched - now); rcu_read_unlock_bh(); } +static void addrconf_verify_work(struct work_struct *w) +{ + rtnl_lock(); + addrconf_verify_rtnl(); + rtnl_unlock(); +} + +static void addrconf_verify(void) +{ + mod_delayed_work(addrconf_wq, &addr_chk_work, 0); +} + static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local, struct in6_addr **peer_pfx) { @@ -3598,6 +3731,7 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; struct in6_addr *pfx, *peer_pfx; + u32 ifa_flags; int err; err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); @@ -3609,7 +3743,13 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) if (pfx == NULL) return -EINVAL; - return inet6_addr_del(net, ifm->ifa_index, pfx, ifm->ifa_prefixlen); + ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags; + + /* We ignore other flags so far. */ + ifa_flags &= IFA_F_MANAGETEMPADDR; + + return inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx, + ifm->ifa_prefixlen); } static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags, @@ -3619,6 +3759,9 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags, clock_t expires; unsigned long timeout; bool was_managetempaddr; + bool had_prefixroute; + + ASSERT_RTNL(); if (!valid_lft || (prefered_lft > valid_lft)) return -EINVAL; @@ -3647,8 +3790,11 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags, spin_lock_bh(&ifp->lock); was_managetempaddr = ifp->flags & IFA_F_MANAGETEMPADDR; + had_prefixroute = ifp->flags & IFA_F_PERMANENT && + !(ifp->flags & IFA_F_NOPREFIXROUTE); ifp->flags &= ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD | - IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR); + IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR | + IFA_F_NOPREFIXROUTE); ifp->flags |= ifa_flags; ifp->tstamp = jiffies; ifp->valid_lft = valid_lft; @@ -3658,8 +3804,22 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags, if (!(ifp->flags&IFA_F_TENTATIVE)) ipv6_ifa_notify(0, ifp); - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev, - expires, flags); + if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev, + expires, flags); + } else if (had_prefixroute) { + enum cleanup_prefix_rt_t action; + unsigned long rt_expires; + + write_lock_bh(&ifp->idev->lock); + action = check_cleanup_prefix_route(ifp, &rt_expires); + write_unlock_bh(&ifp->idev->lock); + + if (action != CLEANUP_PREFIX_RT_NOP) { + cleanup_prefix_route(ifp, rt_expires, + action == CLEANUP_PREFIX_RT_DEL); + } + } if (was_managetempaddr || ifp->flags & IFA_F_MANAGETEMPADDR) { if (was_managetempaddr && !(ifp->flags & IFA_F_MANAGETEMPADDR)) @@ -3668,7 +3828,7 @@ static int inet6_addr_modify(struct 
inet6_ifaddr *ifp, u32 ifa_flags, !was_managetempaddr, jiffies); } - addrconf_verify(0); + addrconf_verify_rtnl(); return 0; } @@ -3713,13 +3873,14 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags; /* We ignore other flags so far. */ - ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR; + ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR | + IFA_F_NOPREFIXROUTE; ifa = ipv6_get_ifaddr(net, pfx, dev, 1); if (ifa == NULL) { /* * It would be best to check for !NLM_F_CREATE here but - * userspace alreay relies on not having to provide this. + * userspace already relies on not having to provide this. */ return inet6_addr_add(net, ifm->ifa_index, pfx, peer_pfx, ifm->ifa_prefixlen, ifa_flags, @@ -3797,7 +3958,8 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, put_ifaddrmsg(nlh, ifa->prefix_len, ifa->flags, rt_scope(ifa->scope), ifa->idev->dev->ifindex); - if (!(ifa->flags&IFA_F_PERMANENT)) { + if (!((ifa->flags&IFA_F_PERMANENT) && + (ifa->prefered_lft == INFINITY_LIFE_TIME))) { preferred = ifa->prefered_lft; valid = ifa->valid_lft; if (preferred != INFINITY_LIFE_TIME) { @@ -4196,7 +4358,7 @@ static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib, memset(&stats[items], 0, pad); } -static inline void __snmp6_fill_stats64(u64 *stats, void __percpu **mib, +static inline void __snmp6_fill_stats64(u64 *stats, void __percpu *mib, int items, int bytes, size_t syncpoff) { int i; @@ -4216,7 +4378,7 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, { switch (attrtype) { case IFLA_INET6_STATS: - __snmp6_fill_stats64(stats, (void __percpu **)idev->stats.ipv6, + __snmp6_fill_stats64(stats, idev->stats.ipv6, IPSTATS_MIB_MAX, bytes, offsetof(struct ipstats_mib, syncp)); break; case IFLA_INET6_ICMP6STATS: @@ -4296,6 +4458,8 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token) bool update_rs = false; struct in6_addr ll_addr; + ASSERT_RTNL(); + if (token == NULL) return -EINVAL; if (ipv6_addr_any(token)) @@ -4344,7 +4508,7 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token) } write_unlock_bh(&idev->lock); - addrconf_verify(0); + addrconf_verify_rtnl(); return 0; } @@ -4542,29 +4706,17 @@ errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err); } -static void update_valid_ll_addr_cnt(struct inet6_ifaddr *ifp, int count) -{ - write_lock_bh(&ifp->idev->lock); - spin_lock(&ifp->lock); - if (((ifp->flags & (IFA_F_PERMANENT|IFA_F_TENTATIVE|IFA_F_OPTIMISTIC| - IFA_F_DADFAILED)) == IFA_F_PERMANENT) && - (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) - ifp->idev->valid_ll_addr_cnt += count; - WARN_ON(ifp->idev->valid_ll_addr_cnt < 0); - spin_unlock(&ifp->lock); - write_unlock_bh(&ifp->idev->lock); -} - static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) { struct net *net = dev_net(ifp->idev->dev); + if (event) + ASSERT_RTNL(); + inet6_ifa_notify(event ? 
: RTM_NEWADDR, ifp); switch (event) { case RTM_NEWADDR: - update_valid_ll_addr_cnt(ifp, 1); - /* * If the address was optimistic * we inserted the route at the start of @@ -4580,8 +4732,6 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) ifp->idev->dev, 0, 0); break; case RTM_DELADDR: - update_valid_ll_addr_cnt(ifp, -1); - if (ifp->idev->cnf.forwarding) addrconf_leave_anycast(ifp); addrconf_leave_solict(ifp->idev, &ifp->addr); @@ -4728,6 +4878,46 @@ int addrconf_sysctl_disable(struct ctl_table *ctl, int write, return ret; } +static +int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = ctl->data; + int ret; + int old, new; + + old = *valp; + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + new = *valp; + + if (write && old != new) { + struct net *net = ctl->extra2; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (valp == &net->ipv6.devconf_dflt->proxy_ndp) + inet6_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, + NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt); + else if (valp == &net->ipv6.devconf_all->proxy_ndp) + inet6_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, + NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all); + else { + struct inet6_dev *idev = ctl->extra1; + + inet6_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, + idev->dev->ifindex, + &idev->cnf); + } + rtnl_unlock(); + } + + return ret; +} + + static struct addrconf_sysctl_table { struct ctl_table_header *sysctl_header; @@ -4914,7 +5104,7 @@ static struct addrconf_sysctl_table .data = &ipv6_devconf.proxy_ndp, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = addrconf_sysctl_proxy_ndp, }, { .procname = "accept_source_route", @@ -5131,6 +5321,12 @@ int __init addrconf_init(void) if (err < 0) goto out_addrlabel; + addrconf_wq = create_workqueue("ipv6_addrconf"); + if (!addrconf_wq) { + err = -ENOMEM; + goto out_nowq; + } + /* The addrconf netdev notifier requires that loopback_dev * has it's ipv6 private information allocated and setup * before it can bring up and give link-local addresses @@ -5161,11 +5357,9 @@ int __init addrconf_init(void) register_netdevice_notifier(&ipv6_dev_notf); - addrconf_verify(0); + addrconf_verify(); - err = rtnl_af_register(&inet6_ops); - if (err < 0) - goto errout_af; + rtnl_af_register(&inet6_ops); err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo, NULL); @@ -5189,9 +5383,10 @@ int __init addrconf_init(void) return 0; errout: rtnl_af_unregister(&inet6_ops); -errout_af: unregister_netdevice_notifier(&ipv6_dev_notf); errlo: + destroy_workqueue(addrconf_wq); +out_nowq: unregister_pernet_subsys(&addrconf_ops); out_addrlabel: ipv6_addr_label_cleanup(); @@ -5227,7 +5422,8 @@ void addrconf_cleanup(void) for (i = 0; i < IN6_ADDR_HSIZE; i++) WARN_ON(!hlist_empty(&inet6_addr_lst[i])); spin_unlock_bh(&addrconf_hash_lock); - - del_timer(&addr_chk_timer); + cancel_delayed_work(&addr_chk_work); rtnl_unlock(); + + destroy_workqueue(addrconf_wq); } diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 4c11cbcf830..e6960457f62 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -123,7 +123,7 @@ static void snmp6_free_dev(struct inet6_dev *idev) { kfree(idev->stats.icmpv6msgdev); kfree(idev->stats.icmpv6dev); - snmp_mib_free((void __percpu **)idev->stats.ipv6); + free_percpu(idev->stats.ipv6); } /* Nobody refers to this device, we may destroy it. 
*/ diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index b30ad3741b4..731e1e1722d 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -6,7 +6,7 @@ */ /* * Author: - * YOSHIFUJI Hideaki @ USAGI/WIDE Project <yoshfuji@linux-ipv6.org> + * YOSHIFUJI Hideaki @ USAGI/WIDE Project <yoshfuji@linux-ipv6.org> */ #include <linux/kernel.h> @@ -22,14 +22,13 @@ #if 0 #define ADDRLABEL(x...) printk(x) #else -#define ADDRLABEL(x...) do { ; } while(0) +#define ADDRLABEL(x...) do { ; } while (0) #endif /* * Policy Table */ -struct ip6addrlbl_entry -{ +struct ip6addrlbl_entry { #ifdef CONFIG_NET_NS struct net *lbl_net; #endif @@ -88,39 +87,39 @@ static const __net_initconst struct ip6addrlbl_init_table { /* ::/0 */ .prefix = &in6addr_any, .label = 1, - },{ /* fc00::/7 */ - .prefix = &(struct in6_addr){{{ 0xfc }}}, + }, { /* fc00::/7 */ + .prefix = &(struct in6_addr){ { { 0xfc } } } , .prefixlen = 7, .label = 5, - },{ /* fec0::/10 */ - .prefix = &(struct in6_addr){{{ 0xfe, 0xc0 }}}, + }, { /* fec0::/10 */ + .prefix = &(struct in6_addr){ { { 0xfe, 0xc0 } } }, .prefixlen = 10, .label = 11, - },{ /* 2002::/16 */ - .prefix = &(struct in6_addr){{{ 0x20, 0x02 }}}, + }, { /* 2002::/16 */ + .prefix = &(struct in6_addr){ { { 0x20, 0x02 } } }, .prefixlen = 16, .label = 2, - },{ /* 3ffe::/16 */ - .prefix = &(struct in6_addr){{{ 0x3f, 0xfe }}}, + }, { /* 3ffe::/16 */ + .prefix = &(struct in6_addr){ { { 0x3f, 0xfe } } }, .prefixlen = 16, .label = 12, - },{ /* 2001::/32 */ - .prefix = &(struct in6_addr){{{ 0x20, 0x01 }}}, + }, { /* 2001::/32 */ + .prefix = &(struct in6_addr){ { { 0x20, 0x01 } } }, .prefixlen = 32, .label = 6, - },{ /* 2001:10::/28 */ - .prefix = &(struct in6_addr){{{ 0x20, 0x01, 0x00, 0x10 }}}, + }, { /* 2001:10::/28 */ + .prefix = &(struct in6_addr){ { { 0x20, 0x01, 0x00, 0x10 } } }, .prefixlen = 28, .label = 7, - },{ /* ::ffff:0:0 */ - .prefix = &(struct in6_addr){{{ [10] = 0xff, [11] = 0xff }}}, + }, { /* ::ffff:0:0 */ + .prefix = &(struct in6_addr){ { { [10] = 0xff, [11] = 0xff } } }, .prefixlen = 96, .label = 4, - },{ /* ::/96 */ + }, { /* ::/96 */ .prefix = &in6addr_any, .prefixlen = 96, .label = 3, - },{ /* ::1/128 */ + }, { /* ::1/128 */ .prefix = &in6addr_loopback, .prefixlen = 128, .label = 0, @@ -441,7 +440,7 @@ static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh) if (label == IPV6_ADDR_LABEL_DEFAULT) return -EINVAL; - switch(nlh->nlmsg_type) { + switch (nlh->nlmsg_type) { case RTM_NEWADDRLABEL: if (ifal->ifal_index && !__dev_get_by_index(net, ifal->ifal_index)) @@ -505,12 +504,13 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) { if (idx >= s_idx && net_eq(ip6addrlbl_net(p), net)) { - if ((err = ip6addrlbl_fill(skb, p, - ip6addrlbl_table.seq, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWADDRLABEL, - NLM_F_MULTI)) <= 0) + err = ip6addrlbl_fill(skb, p, + ip6addrlbl_table.seq, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWADDRLABEL, + NLM_F_MULTI); + if (err <= 0) break; } idx++; @@ -527,7 +527,7 @@ static inline int ip6addrlbl_msgsize(void) + nla_total_size(4); /* IFAL_LABEL */ } -static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh) +static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh) { struct net *net = sock_net(in_skb->sk); struct ifaddrlblmsg *ifal; @@ -568,7 +568,8 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh) goto out; } - if (!(skb = nlmsg_new(ip6addrlbl_msgsize(), 
GFP_KERNEL))) { + skb = nlmsg_new(ip6addrlbl_msgsize(), GFP_KERNEL); + if (!skb) { ip6addrlbl_put(p); return -ENOBUFS; } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 4fbdb7046d2..7cb4392690d 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -106,7 +106,6 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol, struct inet_protosw *answer; struct proto *answer_prot; unsigned char answer_flags; - char answer_no_check; int try_loading_module = 0; int err; @@ -162,7 +161,6 @@ lookup_protocol: sock->ops = answer->ops; answer_prot = answer->prot; - answer_no_check = answer->no_check; answer_flags = answer->flags; rcu_read_unlock(); @@ -176,7 +174,6 @@ lookup_protocol: sock_init_data(sock, sk); err = 0; - sk->sk_no_check = answer_no_check; if (INET_PROTOSW_REUSE & answer_flags) sk->sk_reuse = SK_CAN_REUSE; @@ -213,7 +210,7 @@ lookup_protocol: inet->mc_list = NULL; inet->rcv_tos = 0; - if (ipv4_config.no_pmtu_disc) + if (net->ipv4.sysctl_ip_no_pmtu_disc) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; @@ -661,7 +658,7 @@ int inet6_sk_rebuild_header(struct sock *sk) final_p = fl6_update_dst(&fl6, np->opt, &final); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false); + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { sk->sk_route_caps = 0; sk->sk_err_soft = -PTR_ERR(dst); @@ -683,8 +680,7 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb) if (np->rxopt.all) { if ((opt->hop && (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) || - ((IPV6_FLOWINFO_MASK & - *(__be32 *)skb_network_header(skb)) && + (ip6_flowinfo((struct ipv6hdr *) skb_network_header(skb)) && np->rxopt.bits.rxflow) || (opt->srcrt && (np->rxopt.bits.srcrt || np->rxopt.bits.osrcrt)) || @@ -716,33 +712,25 @@ static int __net_init ipv6_init_mibs(struct net *net) { int i; - if (snmp_mib_init((void __percpu **)net->mib.udp_stats_in6, - sizeof(struct udp_mib), - __alignof__(struct udp_mib)) < 0) + net->mib.udp_stats_in6 = alloc_percpu(struct udp_mib); + if (!net->mib.udp_stats_in6) return -ENOMEM; - if (snmp_mib_init((void __percpu **)net->mib.udplite_stats_in6, - sizeof(struct udp_mib), - __alignof__(struct udp_mib)) < 0) + net->mib.udplite_stats_in6 = alloc_percpu(struct udp_mib); + if (!net->mib.udplite_stats_in6) goto err_udplite_mib; - if (snmp_mib_init((void __percpu **)net->mib.ipv6_statistics, - sizeof(struct ipstats_mib), - __alignof__(struct ipstats_mib)) < 0) + net->mib.ipv6_statistics = alloc_percpu(struct ipstats_mib); + if (!net->mib.ipv6_statistics) goto err_ip_mib; for_each_possible_cpu(i) { struct ipstats_mib *af_inet6_stats; - af_inet6_stats = per_cpu_ptr(net->mib.ipv6_statistics[0], i); + af_inet6_stats = per_cpu_ptr(net->mib.ipv6_statistics, i); u64_stats_init(&af_inet6_stats->syncp); -#if SNMP_ARRAY_SZ == 2 - af_inet6_stats = per_cpu_ptr(net->mib.ipv6_statistics[1], i); - u64_stats_init(&af_inet6_stats->syncp); -#endif } - if (snmp_mib_init((void __percpu **)net->mib.icmpv6_statistics, - sizeof(struct icmpv6_mib), - __alignof__(struct icmpv6_mib)) < 0) + net->mib.icmpv6_statistics = alloc_percpu(struct icmpv6_mib); + if (!net->mib.icmpv6_statistics) goto err_icmp_mib; net->mib.icmpv6msg_statistics = kzalloc(sizeof(struct icmpv6msg_mib), GFP_KERNEL); @@ -751,22 +739,22 @@ static int __net_init ipv6_init_mibs(struct net *net) return 0; err_icmpmsg_mib: - snmp_mib_free((void __percpu **)net->mib.icmpv6_statistics); + free_percpu(net->mib.icmpv6_statistics); err_icmp_mib: - snmp_mib_free((void __percpu 
**)net->mib.ipv6_statistics); + free_percpu(net->mib.ipv6_statistics); err_ip_mib: - snmp_mib_free((void __percpu **)net->mib.udplite_stats_in6); + free_percpu(net->mib.udplite_stats_in6); err_udplite_mib: - snmp_mib_free((void __percpu **)net->mib.udp_stats_in6); + free_percpu(net->mib.udp_stats_in6); return -ENOMEM; } static void ipv6_cleanup_mibs(struct net *net) { - snmp_mib_free((void __percpu **)net->mib.udp_stats_in6); - snmp_mib_free((void __percpu **)net->mib.udplite_stats_in6); - snmp_mib_free((void __percpu **)net->mib.ipv6_statistics); - snmp_mib_free((void __percpu **)net->mib.icmpv6_statistics); + free_percpu(net->mib.udp_stats_in6); + free_percpu(net->mib.udplite_stats_in6); + free_percpu(net->mib.ipv6_statistics); + free_percpu(net->mib.icmpv6_statistics); kfree(net->mib.icmpv6msg_statistics); } @@ -776,6 +764,7 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.bindv6only = 0; net->ipv6.sysctl.icmpv6_time = 1*HZ; + net->ipv6.sysctl.flowlabel_consistency = 1; atomic_set(&net->ipv6.rt_genid, 0); err = ipv6_init_mibs(net); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 81e496a2e00..72a4930bdc0 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -346,6 +346,10 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) struct ip_auth_hdr *ah; struct ah_data *ahp; struct tmp_ext *iph_ext; + int seqhi_len = 0; + __be32 *seqhi; + int sglists = 0; + struct scatterlist *seqhisg; ahp = x->data; ahash = ahp->ahash; @@ -359,15 +363,22 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) if (extlen) extlen += sizeof(*iph_ext); + if (x->props.flags & XFRM_STATE_ESN) { + sglists = 1; + seqhi_len = sizeof(*seqhi); + } err = -ENOMEM; - iph_base = ah_alloc_tmp(ahash, nfrags, IPV6HDR_BASELEN + extlen); + iph_base = ah_alloc_tmp(ahash, nfrags + sglists, IPV6HDR_BASELEN + + extlen + seqhi_len); if (!iph_base) goto out; iph_ext = ah_tmp_ext(iph_base); - icv = ah_tmp_icv(ahash, iph_ext, extlen); + seqhi = (__be32 *)((char *)iph_ext + extlen); + icv = ah_tmp_icv(ahash, seqhi, seqhi_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); + seqhisg = sg + nfrags; ah = ip_auth_hdr(skb); memset(ah->auth_data, 0, ahp->icv_trunc_len); @@ -411,10 +422,15 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) ah->spi = x->id.spi; ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); - sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, 0, skb->len); + sg_init_table(sg, nfrags + sglists); + skb_to_sgvec_nomark(skb, sg, 0, skb->len); - ahash_request_set_crypt(req, sg, icv, skb->len); + if (x->props.flags & XFRM_STATE_ESN) { + /* Attach seqhi sg right after packet payload */ + *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + sg_set_buf(seqhisg, seqhi, seqhi_len); + } + ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah6_output_done, skb); AH_SKB_CB(skb)->tmp = iph_base; @@ -514,6 +530,10 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) int nexthdr; int nfrags; int err = -ENOMEM; + int seqhi_len = 0; + __be32 *seqhi; + int sglists = 0; + struct scatterlist *seqhisg; if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr))) goto out; @@ -550,14 +570,22 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) skb_push(skb, hdr_len); - work_iph = ah_alloc_tmp(ahash, nfrags, hdr_len + ahp->icv_trunc_len); + if (x->props.flags & XFRM_STATE_ESN) { + sglists = 1; + seqhi_len = sizeof(*seqhi); + } + + work_iph = ah_alloc_tmp(ahash, nfrags + sglists, hdr_len + + 
ahp->icv_trunc_len + seqhi_len); if (!work_iph) goto out; - auth_data = ah_tmp_auth(work_iph, hdr_len); - icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len); + auth_data = ah_tmp_auth((u8 *)work_iph, hdr_len); + seqhi = (__be32 *)(auth_data + ahp->icv_trunc_len); + icv = ah_tmp_icv(ahash, seqhi, seqhi_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); + seqhisg = sg + nfrags; memcpy(work_iph, ip6h, hdr_len); memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); @@ -572,10 +600,16 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) ip6h->flow_lbl[2] = 0; ip6h->hop_limit = 0; - sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, 0, skb->len); + sg_init_table(sg, nfrags + sglists); + skb_to_sgvec_nomark(skb, sg, 0, skb->len); + + if (x->props.flags & XFRM_STATE_ESN) { + /* Attach seqhi sg right after packet payload */ + *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; + sg_set_buf(seqhisg, seqhi, seqhi_len); + } - ahash_request_set_crypt(req, sg, icv, skb->len); + ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah6_input_done, skb); AH_SKB_CB(skb)->tmp = work_iph; @@ -609,8 +643,8 @@ out: return err; } -static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info) +static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) { struct net *net = dev_net(skb->dev); struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; @@ -619,17 +653,19 @@ static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type != ICMPV6_PKT_TOOBIG && type != NDISC_REDIRECT) - return; + return 0; x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET6); if (!x) - return; + return 0; if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0); else ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); + + return 0; } static int ah6_init_state(struct xfrm_state *x) @@ -714,6 +750,11 @@ static void ah6_destroy(struct xfrm_state *x) kfree(ahp); } +static int ah6_rcv_cb(struct sk_buff *skb, int err) +{ + return 0; +} + static const struct xfrm_type ah6_type = { .description = "AH6", @@ -727,10 +768,11 @@ static const struct xfrm_type ah6_type = .hdr_offset = xfrm6_find_1stfragopt, }; -static const struct inet6_protocol ah6_protocol = { +static struct xfrm6_protocol ah6_protocol = { .handler = xfrm6_rcv, + .cb_handler = ah6_rcv_cb, .err_handler = ah6_err, - .flags = INET6_PROTO_NOPOLICY, + .priority = 0, }; static int __init ah6_init(void) @@ -740,7 +782,7 @@ static int __init ah6_init(void) return -EAGAIN; } - if (inet6_add_protocol(&ah6_protocol, IPPROTO_AH) < 0) { + if (xfrm6_protocol_register(&ah6_protocol, IPPROTO_AH) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ah6_type, AF_INET6); return -EAGAIN; @@ -751,7 +793,7 @@ static int __init ah6_init(void) static void __exit ah6_fini(void) { - if (inet6_del_protocol(&ah6_protocol, IPPROTO_AH) < 0) + if (xfrm6_protocol_deregister(&ah6_protocol, IPPROTO_AH) < 0) pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&ah6_type, AF_INET6) < 0) diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 5a80f15a9de..21018324468 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -383,6 +383,17 @@ bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, return found; } +/* check if this anycast address is link-local on given interface or + * is global + */ +bool 
ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev, + const struct in6_addr *addr) +{ + return ipv6_chk_acast_addr(net, + (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL ? + dev : NULL), + addr); +} #ifdef CONFIG_PROC_FS struct ac6_iter_state { diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 8dfe1f4d3c1..c3bf2d2e519 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -73,7 +73,6 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - usin->sin6_addr = flowlabel->dst; } } @@ -171,7 +170,7 @@ ipv4_connected: opt = flowlabel ? flowlabel->opt : np->opt; final_p = fl6_update_dst(&fl6, opt, &final); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true); + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); err = 0; if (IS_ERR(dst)) { err = PTR_ERR(dst); @@ -206,6 +205,16 @@ out: } EXPORT_SYMBOL_GPL(ip6_datagram_connect); +int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, uaddr); + if (sin6->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + return ip6_datagram_connect(sk, uaddr, addr_len); +} +EXPORT_SYMBOL_GPL(ip6_datagram_connect_v6_only); + void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { @@ -323,7 +332,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) struct ipv6_pinfo *np = inet6_sk(sk); struct sock_exterr_skb *serr; struct sk_buff *skb, *skb2; - struct sockaddr_in6 *sin; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name); struct { struct sock_extended_err ee; struct sockaddr_in6 offender; @@ -349,7 +358,6 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr = SKB_EXT_ERR(skb); - sin = (struct sockaddr_in6 *)msg->msg_name; if (sin) { const unsigned char *nh = skb_network_header(skb); sin->sin6_family = AF_INET6; @@ -379,10 +387,12 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_port = 0; + if (np->rxopt.all) + ip6_datagram_recv_common_ctl(sk, msg, skb); if (skb->protocol == htons(ETH_P_IPV6)) { sin->sin6_addr = ipv6_hdr(skb)->saddr; if (np->rxopt.all) - ip6_datagram_recv_ctl(sk, msg, skb); + ip6_datagram_recv_specific_ctl(sk, msg, skb); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, IP6CB(skb)->iif); @@ -430,8 +440,8 @@ int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, { struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff *skb; - struct sockaddr_in6 *sin; struct ip6_mtuinfo mtu_info; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name); int err; int copied; @@ -453,7 +463,6 @@ int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, memcpy(&mtu_info, IP6CBMTU(skb), sizeof(mtu_info)); - sin = (struct sockaddr_in6 *)msg->msg_name; if (sin) { sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; @@ -474,20 +483,34 @@ out: } -int ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, - struct sk_buff *skb) +void ip6_datagram_recv_common_ctl(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); - struct inet6_skb_parm *opt = IP6CB(skb); - unsigned char *nh = skb_network_header(skb); + bool is_ipv6 = skb->protocol == htons(ETH_P_IPV6); if (np->rxopt.bits.rxinfo) { struct in6_pktinfo src_info; - src_info.ipi6_ifindex = opt->iif; - 
src_info.ipi6_addr = ipv6_hdr(skb)->daddr; + if (is_ipv6) { + src_info.ipi6_ifindex = IP6CB(skb)->iif; + src_info.ipi6_addr = ipv6_hdr(skb)->daddr; + } else { + src_info.ipi6_ifindex = + PKTINFO_SKB_CB(skb)->ipi_ifindex; + ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, + &src_info.ipi6_addr); + } put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } +} + +void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_skb_parm *opt = IP6CB(skb); + unsigned char *nh = skb_network_header(skb); if (np->rxopt.bits.rxhlim) { int hlim = ipv6_hdr(skb)->hop_limit; @@ -605,7 +628,13 @@ int ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6); } } - return 0; +} + +void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb) +{ + ip6_datagram_recv_common_ctl(sk, msg, skb); + ip6_datagram_recv_specific_ctl(sk, msg, skb); } EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl); @@ -670,7 +699,9 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL; if (!(inet_sk(sk)->freebind || inet_sk(sk)->transparent) && !ipv6_chk_addr(net, &src_info->ipi6_addr, - strict ? dev : NULL, 0)) + strict ? dev : NULL, 0) && + !ipv6_chk_acast_addr_src(net, dev, + &src_info->ipi6_addr)) err = -EINVAL; else fl6->saddr = src_info->ipi6_addr; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 6eef8a7e35f..d15da137714 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -421,8 +421,8 @@ static u32 esp6_get_mtu(struct xfrm_state *x, int mtu) net_adj) & ~(blksize - 1)) + net_adj - 2; } -static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info) +static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) { struct net *net = dev_net(skb->dev); const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; @@ -431,18 +431,20 @@ static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type != ICMPV6_PKT_TOOBIG && type != NDISC_REDIRECT) - return; + return 0; x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET6); if (!x) - return; + return 0; if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0); else ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); + + return 0; } static void esp6_destroy(struct xfrm_state *x) @@ -614,6 +616,11 @@ error: return err; } +static int esp6_rcv_cb(struct sk_buff *skb, int err) +{ + return 0; +} + static const struct xfrm_type esp6_type = { .description = "ESP6", @@ -628,10 +635,11 @@ static const struct xfrm_type esp6_type = .hdr_offset = xfrm6_find_1stfragopt, }; -static const struct inet6_protocol esp6_protocol = { - .handler = xfrm6_rcv, +static struct xfrm6_protocol esp6_protocol = { + .handler = xfrm6_rcv, + .cb_handler = esp6_rcv_cb, .err_handler = esp6_err, - .flags = INET6_PROTO_NOPOLICY, + .priority = 0, }; static int __init esp6_init(void) @@ -640,7 +648,7 @@ static int __init esp6_init(void) pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } - if (inet6_add_protocol(&esp6_protocol, IPPROTO_ESP) < 0) { + if (xfrm6_protocol_register(&esp6_protocol, IPPROTO_ESP) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&esp6_type, AF_INET6); return -EAGAIN; @@ -651,7 +659,7 @@ static int __init 
esp6_init(void) static void __exit esp6_fini(void) { - if (inet6_del_protocol(&esp6_protocol, IPPROTO_ESP) < 0) + if (xfrm6_protocol_deregister(&esp6_protocol, IPPROTO_ESP) < 0) pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&esp6_type, AF_INET6) < 0) pr_info("%s: can't remove xfrm type\n", __func__); diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 140748debc4..8af3eb57f43 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -212,7 +212,7 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, found = (nexthdr == target); if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) { - if (target < 0) + if (target < 0 || found) break; return -ENOENT; } diff --git a/net/ipv6/exthdrs_offload.c b/net/ipv6/exthdrs_offload.c index cf77f3abfd0..447a7fbd1bb 100644 --- a/net/ipv6/exthdrs_offload.c +++ b/net/ipv6/exthdrs_offload.c @@ -25,11 +25,11 @@ int __init ipv6_exthdrs_offload_init(void) int ret; ret = inet6_add_offload(&rthdr_offload, IPPROTO_ROUTING); - if (!ret) + if (ret) goto out; ret = inet6_add_offload(&dstopt_offload, IPPROTO_DSTOPTS); - if (!ret) + if (ret) goto out_rt; out: diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index e27591635f9..b4d5e1d97c1 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -122,7 +122,11 @@ out: static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) { struct rt6_info *rt = (struct rt6_info *) arg->result; - struct net_device *dev = rt->rt6i_idev->dev; + struct net_device *dev = NULL; + + if (rt->rt6i_idev) + dev = rt->rt6i_idev->dev; + /* do not accept result if the route does * not meet the required prefix length */ @@ -165,7 +169,7 @@ static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) return 0; } - if (r->tclass && r->tclass != ((ntohl(fl6->flowlabel) >> 20) & 0xff)) + if (r->tclass && r->tclass != ip6_tclass(fl6->flowlabel)) return 0; return 1; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index eef8d945b36..f6c84a6eb23 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -67,6 +67,7 @@ #include <net/icmp.h> #include <net/xfrm.h> #include <net/inet_common.h> +#include <net/dsfield.h> #include <asm/uaccess.h> @@ -315,8 +316,10 @@ static void mip6_addr_swap(struct sk_buff *skb) static inline void mip6_addr_swap(struct sk_buff *skb) {} #endif -struct dst_entry *icmpv6_route_lookup(struct net *net, struct sk_buff *skb, - struct sock *sk, struct flowi6 *fl6) +static struct dst_entry *icmpv6_route_lookup(struct net *net, + struct sk_buff *skb, + struct sock *sk, + struct flowi6 *fl6) { struct dst_entry *dst, *dst2; struct flowi6 fl2; @@ -397,6 +400,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) int len; int hlimit; int err = 0; + u32 mark = IP6_REPLY_MARK(net, skb->mark); if ((u8 *)hdr < skb->head || (skb_network_header(skb) + sizeof(*hdr)) > skb_tail_pointer(skb)) @@ -410,7 +414,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) */ addr_type = ipv6_addr_type(&hdr->daddr); - if (ipv6_chk_addr(net, &hdr->daddr, skb->dev, 0)) + if (ipv6_chk_addr(net, &hdr->daddr, skb->dev, 0) || + ipv6_chk_acast_addr_src(net, skb->dev, &hdr->daddr)) saddr = &hdr->daddr; /* @@ -462,6 +467,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) fl6.daddr = hdr->saddr; if (saddr) fl6.saddr = *saddr; + fl6.flowi6_mark = mark; fl6.flowi6_oif = iif; fl6.fl6_icmp_type = type; fl6.fl6_icmp_code = code; @@ -470,6 +476,7 @@ static void 
icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) sk = icmpv6_xmit_lock(net); if (sk == NULL) return; + sk->sk_mark = mark; np = inet6_sk(sk); if (!icmpv6_xrlim_allow(sk, type, &fl6)) @@ -489,12 +496,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) if (IS_ERR(dst)) goto out; - if (ipv6_addr_is_multicast(&fl6.daddr)) - hlimit = np->mcast_hops; - else - hlimit = np->hop_limit; - if (hlimit < 0) - hlimit = ip6_dst_hoplimit(dst); + hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); msg.skb = skb; msg.offset = skb_network_offset(skb); @@ -516,7 +518,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) np->tclass, NULL, &fl6, (struct rt6_info *)dst, MSG_DONTWAIT, np->dontfrag); if (err) { - ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTERRORS); + ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, @@ -551,10 +553,14 @@ static void icmpv6_echo_reply(struct sk_buff *skb) struct dst_entry *dst; int err = 0; int hlimit; + u8 tclass; + u32 mark = IP6_REPLY_MARK(net, skb->mark); saddr = &ipv6_hdr(skb)->daddr; - if (!ipv6_unicast_destination(skb)) + if (!ipv6_unicast_destination(skb) && + !(net->ipv6.sysctl.anycast_src_echo_reply && + ipv6_anycast_destination(skb))) saddr = NULL; memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr)); @@ -567,11 +573,13 @@ static void icmpv6_echo_reply(struct sk_buff *skb) fl6.saddr = *saddr; fl6.flowi6_oif = skb->dev->ifindex; fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; + fl6.flowi6_mark = mark; security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); sk = icmpv6_xmit_lock(net); if (sk == NULL) return; + sk->sk_mark = mark; np = inet6_sk(sk); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) @@ -586,12 +594,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) if (IS_ERR(dst)) goto out; - if (ipv6_addr_is_multicast(&fl6.daddr)) - hlimit = np->mcast_hops; - else - hlimit = np->hop_limit; - if (hlimit < 0) - hlimit = ip6_dst_hoplimit(dst); + hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); idev = __in6_dev_get(skb->dev); @@ -599,8 +602,9 @@ static void icmpv6_echo_reply(struct sk_buff *skb) msg.offset = 0; msg.type = ICMPV6_ECHO_REPLY; + tclass = ipv6_get_dsfield(ipv6_hdr(skb)); err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), - sizeof(struct icmp6hdr), hlimit, np->tclass, NULL, &fl6, + sizeof(struct icmp6hdr), hlimit, tclass, NULL, &fl6, (struct rt6_info *)dst, MSG_DONTWAIT, np->dontfrag); @@ -694,22 +698,11 @@ static int icmpv6_rcv(struct sk_buff *skb) saddr = &ipv6_hdr(skb)->saddr; daddr = &ipv6_hdr(skb)->daddr; - /* Perform checksum. 
*/ - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (!csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, - skb->csum)) - break; - /* fall through */ - case CHECKSUM_NONE: - skb->csum = ~csum_unfold(csum_ipv6_magic(saddr, daddr, skb->len, - IPPROTO_ICMPV6, 0)); - if (__skb_checksum_complete(skb)) { - LIMIT_NETDEBUG(KERN_DEBUG - "ICMPv6 checksum failed [%pI6c > %pI6c]\n", - saddr, daddr); - goto csum_error; - } + if (skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo)) { + LIMIT_NETDEBUG(KERN_DEBUG + "ICMPv6 checksum failed [%pI6c > %pI6c]\n", + saddr, daddr); + goto csum_error; } if (!pskb_pull(skb, sizeof(*hdr))) @@ -984,7 +977,7 @@ int icmpv6_err_convert(u8 type, u8 code, int *err) EXPORT_SYMBOL(icmpv6_err_convert); #ifdef CONFIG_SYSCTL -struct ctl_table ipv6_icmp_table_template[] = { +static struct ctl_table ipv6_icmp_table_template[] = { { .procname = "ratelimit", .data = &init_net.ipv6.sysctl.icmpv6_time, diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 77bb8afb141..a245e5ddffb 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -81,12 +81,12 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk, final_p = fl6_update_dst(fl6, np->opt, &final); fl6->saddr = ireq->ir_v6_loc_addr; fl6->flowi6_oif = ireq->ir_iif; - fl6->flowi6_mark = sk->sk_mark; + fl6->flowi6_mark = ireq->ir_mark; fl6->fl6_dport = ireq->ir_rmt_port; fl6->fl6_sport = htons(ireq->ir_num); security_req_classify_flow(req, flowi6_to_flowi(fl6)); - dst = ip6_dst_lookup_flow(sk, fl6, final_p, false); + dst = ip6_dst_lookup_flow(sk, fl6, final_p); if (IS_ERR(dst)) return NULL; @@ -216,7 +216,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, dst = __inet6_csk_dst_check(sk, np->dst_cookie); if (!dst) { - dst = ip6_dst_lookup_flow(sk, fl6, final_p, false); + dst = ip6_dst_lookup_flow(sk, fl6, final_p); if (!IS_ERR(dst)) __inet6_csk_dst_store(sk, dst, NULL, NULL); @@ -224,9 +224,8 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, return dst; } -int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl_unused) +int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused) { - struct sock *sk = skb->sk; struct ipv6_pinfo *np = inet6_sk(sk); struct flowi6 fl6; struct dst_entry *dst; diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c index 72d198b8e4d..9a4d7322fb2 100644 --- a/net/ipv6/ip6_checksum.c +++ b/net/ipv6/ip6_checksum.c @@ -75,23 +75,50 @@ int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) return err; } - if (uh->check == 0) { - /* RFC 2460 section 8.1 says that we SHOULD log - this error. Well, it is reasonable. - */ - LIMIT_NETDEBUG(KERN_INFO "IPv6: udp checksum is 0\n"); - return 1; - } - if (skb->ip_summed == CHECKSUM_COMPLETE && - !csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, - skb->len, proto, skb->csum)) - skb->ip_summed = CHECKSUM_UNNECESSARY; + /* To support RFC 6936 (allow zero checksum in UDP/IPV6 for tunnels) + * we accept a checksum of zero here. When we find the socket + * for the UDP packet we'll check if that socket allows zero checksum + * for IPv6 (set by socket option). + */ + return skb_checksum_init_zero_check(skb, proto, uh->check, + ip6_compute_pseudo); +} +EXPORT_SYMBOL(udp6_csum_init); + +/* Function to set UDP checksum for an IPv6 UDP packet. This is intended + * for the simple case like when setting the checksum for a UDP tunnel. 
+ */ +void udp6_set_csum(bool nocheck, struct sk_buff *skb, + const struct in6_addr *saddr, + const struct in6_addr *daddr, int len) +{ + struct udphdr *uh = udp_hdr(skb); + + if (nocheck) + uh->check = 0; + else if (skb_is_gso(skb)) + uh->check = ~udp_v6_check(len, saddr, daddr, 0); + else if (skb_dst(skb) && skb_dst(skb)->dev && + (skb_dst(skb)->dev->features & NETIF_F_IPV6_CSUM)) { - if (!skb_csum_unnecessary(skb)) - skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, - skb->len, proto, 0)); + BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); - return 0; + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + uh->check = ~udp_v6_check(len, saddr, daddr, 0); + } else { + __wsum csum; + + BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + + uh->check = 0; + csum = skb_checksum(skb, 0, len, 0); + uh->check = udp_v6_check(len, saddr, daddr, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + } } -EXPORT_SYMBOL(udp6_csum_init); +EXPORT_SYMBOL(udp6_set_csum); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 5550a8113a6..cb4459bd1d2 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -9,14 +9,12 @@ * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. - */ - -/* - * Changes: - * Yuji SEKIYA @USAGI: Support default route on router node; - * remove ip6_null_entry from the top of - * routing table. - * Ville Nuorvala: Fixed routing subtrees. + * + * Changes: + * Yuji SEKIYA @USAGI: Support default route on router node; + * remove ip6_null_entry from the top of + * routing table. + * Ville Nuorvala: Fixed routing subtrees. */ #define pr_fmt(fmt) "IPv6: " fmt @@ -46,10 +44,9 @@ #define RT6_TRACE(x...) do { ; } while (0) #endif -static struct kmem_cache * fib6_node_kmem __read_mostly; +static struct kmem_cache *fib6_node_kmem __read_mostly; -enum fib_walk_state_t -{ +enum fib_walk_state_t { #ifdef CONFIG_IPV6_SUBTREES FWS_S, #endif @@ -59,8 +56,7 @@ enum fib_walk_state_t FWS_U }; -struct fib6_cleaner_t -{ +struct fib6_cleaner_t { struct fib6_walker_t w; struct net *net; int (*func)(struct rt6_info *, void *arg); @@ -75,8 +71,7 @@ static DEFINE_RWLOCK(fib6_walker_lock); #define FWS_INIT FWS_L #endif -static void fib6_prune_clones(struct net *net, struct fib6_node *fn, - struct rt6_info *rt); +static void fib6_prune_clones(struct net *net, struct fib6_node *fn); static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); static int fib6_walk(struct fib6_walker_t *w); @@ -138,7 +133,7 @@ static __inline__ __be32 addr_bit_set(const void *token, int fn_bit) const __be32 *addr = token; /* * Here, - * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) + * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) * is optimized version of * htonl(1 << ((~fn_bit)&0x1F)) * See include/asm-generic/bitops/le.h. 
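The comment in the ip6_fib.c hunk above states that 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f), taken as a __be32, is an optimized form of htonl(1 << ((~fn_bit) & 0x1F)). A minimal user-space check of that identity is sketched below; it assumes a little-endian host and hard-codes the little-endian value of BITOP_BE32_SWIZZLE defined in ip6_fib.c, so it is an illustration rather than kernel code.

#include <arpa/inet.h>
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Little-endian value of BITOP_BE32_SWIZZLE in net/ipv6/ip6_fib.c (0x1F & ~7). */
#define BITOP_BE32_SWIZZLE 0x18

int main(void)
{
	int fn_bit;

	for (fn_bit = 0; fn_bit < 32; fn_bit++) {
		uint32_t swizzled = 1u << ((~(uint32_t)fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f);
		uint32_t portable = htonl(1u << ((~(uint32_t)fn_bit) & 0x1F));

		/* Both expressions select the same bit of the big-endian word. */
		assert(swizzled == portable);
	}
	printf("swizzle identity holds for all 32 bit positions\n");
	return 0;
}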
@@ -147,7 +142,7 @@ static __inline__ __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } -static __inline__ struct fib6_node * node_alloc(void) +static __inline__ struct fib6_node *node_alloc(void) { struct fib6_node *fn; @@ -156,7 +151,7 @@ static __inline__ struct fib6_node * node_alloc(void) return fn; } -static __inline__ void node_free(struct fib6_node * fn) +static __inline__ void node_free(struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); } @@ -292,7 +287,7 @@ static int fib6_dump_node(struct fib6_walker_t *w) static void fib6_dump_end(struct netlink_callback *cb) { - struct fib6_walker_t *w = (void*)cb->args[2]; + struct fib6_walker_t *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { @@ -302,7 +297,7 @@ static void fib6_dump_end(struct netlink_callback *cb) cb->args[2] = 0; kfree(w); } - cb->done = (void*)cb->args[3]; + cb->done = (void *)cb->args[3]; cb->args[1] = 3; } @@ -485,7 +480,7 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, fn->fn_sernum = sernum; dir = addr_bit_set(addr, fn->fn_bit); pn = fn; - fn = dir ? fn->right: fn->left; + fn = dir ? fn->right : fn->left; } while (fn); if (!allow_create) { @@ -638,12 +633,41 @@ static inline bool rt6_qualify_for_ecmp(struct rt6_info *rt) RTF_GATEWAY; } +static int fib6_commit_metrics(struct dst_entry *dst, + struct nlattr *mx, int mx_len) +{ + struct nlattr *nla; + int remaining; + u32 *mp; + + if (dst->flags & DST_HOST) { + mp = dst_metrics_write_ptr(dst); + } else { + mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); + if (!mp) + return -ENOMEM; + dst_init_metrics(dst, mp, 0); + } + + nla_for_each_attr(nla, mx, mx_len, remaining) { + int type = nla_type(nla); + + if (type) { + if (type > RTAX_MAX) + return -EINVAL; + + mp[type - 1] = nla_get_u32(nla); + } + } + return 0; +} + /* * Insert routing information in a node. 
*/ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, - struct nl_info *info) + struct nl_info *info, struct nlattr *mx, int mx_len) { struct rt6_info *iter = NULL; struct rt6_info **ins; @@ -653,6 +677,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); + int err; ins = &fn->leaf; @@ -751,6 +776,11 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: + if (mx) { + err = fib6_commit_metrics(&rt->dst, mx, mx_len); + if (err) + return err; + } rt->dst.rt6_next = iter; *ins = rt; rt->rt6i_node = fn; @@ -770,6 +800,11 @@ add: pr_warn("NLM_F_REPLACE set, but no existing node found!\n"); return -ENOENT; } + if (mx) { + err = fib6_commit_metrics(&rt->dst, mx, mx_len); + if (err) + return err; + } *ins = rt; rt->rt6i_node = fn; rt->dst.rt6_next = iter->dst.rt6_next; @@ -806,7 +841,8 @@ void fib6_force_start_gc(struct net *net) * with source addr info in sub-trees */ -int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) +int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, + struct nlattr *mx, int mx_len) { struct fib6_node *fn, *pn = NULL; int err = -ENOMEM; @@ -900,11 +936,11 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) } #endif - err = fib6_add_rt2node(fn, rt, info); + err = fib6_add_rt2node(fn, rt, info, mx, mx_len); if (!err) { fib6_start_gc(info->nl_net, rt); if (!(rt->rt6i_flags & RTF_CACHE)) - fib6_prune_clones(info->nl_net, pn, rt); + fib6_prune_clones(info->nl_net, pn); } out: @@ -955,8 +991,8 @@ struct lookup_args { const struct in6_addr *addr; /* search key */ }; -static struct fib6_node * fib6_lookup_1(struct fib6_node *root, - struct lookup_args *args) +static struct fib6_node *fib6_lookup_1(struct fib6_node *root, + struct lookup_args *args) { struct fib6_node *fn; __be32 dir; @@ -1018,8 +1054,8 @@ backtrack: return NULL; } -struct fib6_node * fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr, - const struct in6_addr *saddr) +struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr, + const struct in6_addr *saddr) { struct fib6_node *fn; struct lookup_args args[] = { @@ -1051,9 +1087,9 @@ struct fib6_node * fib6_lookup(struct fib6_node *root, const struct in6_addr *da */ -static struct fib6_node * fib6_locate_1(struct fib6_node *root, - const struct in6_addr *addr, - int plen, int offset) +static struct fib6_node *fib6_locate_1(struct fib6_node *root, + const struct in6_addr *addr, + int plen, int offset) { struct fib6_node *fn; @@ -1081,9 +1117,9 @@ static struct fib6_node * fib6_locate_1(struct fib6_node *root, return NULL; } -struct fib6_node * fib6_locate(struct fib6_node *root, - const struct in6_addr *daddr, int dst_len, - const struct in6_addr *saddr, int src_len) +struct fib6_node *fib6_locate(struct fib6_node *root, + const struct in6_addr *daddr, int dst_len, + const struct in6_addr *saddr, int src_len) { struct fib6_node *fn; @@ -1151,8 +1187,10 @@ static struct fib6_node *fib6_repair_tree(struct net *net, children = 0; child = NULL; - if (fn->right) child = fn->right, children |= 1; - if (fn->left) child = fn->left, children |= 2; + if (fn->right) + child = fn->right, children |= 1; + if (fn->left) + child = fn->left, children |= 2; if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES @@ -1180,8 
+1218,10 @@ static struct fib6_node *fib6_repair_tree(struct net *net, } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif - if (pn->right == fn) pn->right = child; - else if (pn->left == fn) pn->left = child; + if (pn->right == fn) + pn->right = child; + else if (pn->left == fn) + pn->left = child; #if RT6_DEBUG >= 2 else WARN_ON(1); @@ -1213,10 +1253,10 @@ static struct fib6_node *fib6_repair_tree(struct net *net, w->node = child; if (children&2) { RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); - w->state = w->state>=FWS_R ? FWS_U : FWS_INIT; + w->state = w->state >= FWS_R ? FWS_U : FWS_INIT; } else { RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); - w->state = w->state>=FWS_C ? FWS_U : FWS_INIT; + w->state = w->state >= FWS_C ? FWS_U : FWS_INIT; } } } @@ -1314,7 +1354,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) struct rt6_info **rtp; #if RT6_DEBUG >= 2 - if (rt->dst.obsolete>0) { + if (rt->dst.obsolete > 0) { WARN_ON(fn != NULL); return -ENOENT; } @@ -1334,7 +1374,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) pn = pn->parent; } #endif - fib6_prune_clones(info->nl_net, pn, rt); + fib6_prune_clones(info->nl_net, pn); } /* @@ -1418,7 +1458,7 @@ static int fib6_walk_continue(struct fib6_walker_t *w) if (w->skip) { w->skip--; - continue; + goto skip; } err = w->func(w); @@ -1428,6 +1468,7 @@ static int fib6_walk_continue(struct fib6_walker_t *w) w->count++; continue; } +skip: w->state = FWS_U; case FWS_U: if (fn == w->root) @@ -1530,7 +1571,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root, } void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), - int prune, void *arg) + void *arg) { struct fib6_table *table; struct hlist_head *head; @@ -1542,7 +1583,7 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), hlist_for_each_entry_rcu(table, head, tb6_hlist) { write_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, - func, prune, arg); + func, 0, arg); write_unlock_bh(&table->tb6_lock); } } @@ -1559,10 +1600,9 @@ static int fib6_prune_clone(struct rt6_info *rt, void *arg) return 0; } -static void fib6_prune_clones(struct net *net, struct fib6_node *fn, - struct rt6_info *rt) +static void fib6_prune_clones(struct net *net, struct fib6_node *fn) { - fib6_clean_tree(net, fn, fib6_prune_clone, 1, rt); + fib6_clean_tree(net, fn, fib6_prune_clone, 1, NULL); } /* @@ -1636,7 +1676,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) gc_args.more = icmp6_dst_gc(); - fib6_clean_all(net, fib6_age, 0, NULL); + fib6_clean_all(net, fib6_age, NULL); now = jiffies; net->ipv6.ip6_rt_last_gc = now; @@ -1707,7 +1747,7 @@ out_rt6_stats: kfree(net->ipv6.rt6_stats); out_timer: return -ENOMEM; - } +} static void fib6_net_exit(struct net *net) { diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index e7fb7106550..4052694c6f2 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -15,9 +15,7 @@ #include <linux/socket.h> #include <linux/net.h> #include <linux/netdevice.h> -#include <linux/if_arp.h> #include <linux/in6.h> -#include <linux/route.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> @@ -28,12 +26,7 @@ #include <net/sock.h> #include <net/ipv6.h> -#include <net/ndisc.h> -#include <net/protocol.h> -#include <net/ip6_route.h> -#include <net/addrconf.h> #include <net/rawv6.h> -#include <net/icmp.h> #include <net/transp_v6.h> #include <asm/uaccess.h> @@ -210,7 +203,7 @@ static struct 
ip6_flowlabel *fl_intern(struct net *net, spin_lock_bh(&ip6_fl_lock); if (label == 0) { for (;;) { - fl->label = htonl(net_random())&IPV6_FLOWLABEL_MASK; + fl->label = htonl(prandom_u32())&IPV6_FLOWLABEL_MASK; if (fl->label) { lfl = __fl_lookup(net, fl->label); if (lfl == NULL) @@ -481,11 +474,22 @@ static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl, spin_unlock_bh(&ip6_sk_fl_lock); } -int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq) +int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, + int flags) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist *sfl; + if (flags & IPV6_FL_F_REMOTE) { + freq->flr_label = np->rcv_flowinfo & IPV6_FLOWLABEL_MASK; + return 0; + } + + if (np->repflow) { + freq->flr_label = np->flow_label; + return 0; + } + rcu_read_lock_bh(); for_each_sk_fl_rcu(np, sfl) { @@ -527,6 +531,15 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) switch (freq.flr_action) { case IPV6_FL_A_PUT: + if (freq.flr_flags & IPV6_FL_F_REFLECT) { + if (sk->sk_protocol != IPPROTO_TCP) + return -ENOPROTOOPT; + if (!np->repflow) + return -ESRCH; + np->flow_label = 0; + np->repflow = 0; + return 0; + } spin_lock_bh(&ip6_sk_fl_lock); for (sflp = &np->ipv6_fl_list; (sfl = rcu_dereference(*sflp))!=NULL; @@ -567,6 +580,20 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) return -ESRCH; case IPV6_FL_A_GET: + if (freq.flr_flags & IPV6_FL_F_REFLECT) { + struct net *net = sock_net(sk); + if (net->ipv6.sysctl.flowlabel_consistency) { + net_info_ratelimited("Can not set IPV6_FL_F_REFLECT if flowlabel_consistency sysctl is enable\n"); + return -EPERM; + } + + if (sk->sk_protocol != IPPROTO_TCP) + return -ENOPROTOOPT; + + np->repflow = 1; + return 0; + } + if (freq.flr_label & ~IPV6_FLOWLABEL_MASK) return -EINVAL; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 8acb28621f9..3873181ed85 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -61,9 +61,6 @@ static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); -#define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK) -#define IPV6_TCLASS_SHIFT 20 - #define HASH_SIZE_SHIFT 5 #define HASH_SIZE (1 << HASH_SIZE_SHIFT) @@ -75,6 +72,7 @@ struct ip6gre_net { }; static struct rtnl_link_ops ip6gre_link_ops __read_mostly; +static struct rtnl_link_ops ip6gre_tap_ops __read_mostly; static int ip6gre_tunnel_init(struct net_device *dev); static void ip6gre_tunnel_setup(struct net_device *dev); static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t); @@ -356,10 +354,10 @@ failed_free: static void ip6gre_tunnel_uninit(struct net_device *dev) { - struct net *net = dev_net(dev); - struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + struct ip6_tnl *t = netdev_priv(dev); + struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id); - ip6gre_tunnel_unlink(ign, netdev_priv(dev)); + ip6gre_tunnel_unlink(ign, t); dev_put(dev); } @@ -470,17 +468,7 @@ static int ip6gre_rcv(struct sk_buff *skb) goto drop; if (flags&GRE_CSUM) { - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - csum = csum_fold(skb->csum); - if (!csum) - break; - /* fall through */ - case CHECKSUM_NONE: - skb->csum = 0; - csum = __skb_checksum_complete(skb); - skb->ip_summed = CHECKSUM_COMPLETE; - } + csum = skb_checksum_simple_validate(skb); offset += 4; } if (flags&GRE_KEY) { @@ -499,7 +487,7 @@ static int ip6gre_rcv(struct 
sk_buff *skb) &ipv6h->saddr, &ipv6h->daddr, key, gre_proto); if (tunnel) { - struct pcpu_tstats *tstats; + struct pcpu_sw_netstats *tstats; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; @@ -614,8 +602,8 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, int encap_limit, __u32 *pmtu) { - struct net *net = dev_net(dev); struct ip6_tnl *tunnel = netdev_priv(dev); + struct net *net = tunnel->net; struct net_device *tdev; /* Device to other host */ struct ipv6hdr *ipv6h; /* Our new IP header */ unsigned int max_headroom = 0; /* The extra header space needed */ @@ -846,7 +834,7 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) - fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_FLOWLABEL_MASK); + fl6.flowlabel |= ip6_flowlabel(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; @@ -982,7 +970,7 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); - struct rt6_info *rt = rt6_lookup(dev_net(dev), + struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, strict); @@ -1066,13 +1054,12 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, int err = 0; struct ip6_tnl_parm2 p; struct __ip6_tnl_parm p1; - struct ip6_tnl *t; - struct net *net = dev_net(dev); + struct ip6_tnl *t = netdev_priv(dev); + struct net *net = t->net; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); switch (cmd) { case SIOCGETTUNNEL: - t = NULL; if (dev == ign->fb_tunnel_dev) { if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { err = -EFAULT; @@ -1080,9 +1067,9 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, } ip6gre_tnl_parm_from_user(&p1, &p); t = ip6gre_tunnel_locate(net, &p1, 0); + if (t == NULL) + t = netdev_priv(dev); } - if (t == NULL) - t = netdev_priv(dev); memset(&p, 0, sizeof(p)); ip6gre_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) @@ -1245,7 +1232,6 @@ static void ip6gre_tunnel_setup(struct net_device *dev) dev->flags |= IFF_NOARP; dev->iflink = 0; dev->addr_len = sizeof(struct in6_addr); - dev->features |= NETIF_F_NETNS_LOCAL; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; } @@ -1266,12 +1252,12 @@ static int ip6gre_tunnel_init(struct net_device *dev) if (ipv6_addr_any(&tunnel->parms.raddr)) dev->header_ops = &ip6gre_header_ops; - dev->tstats = alloc_percpu(struct pcpu_tstats); + dev->tstats = alloc_percpu(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; for_each_possible_cpu(i) { - struct pcpu_tstats *ip6gre_tunnel_stats; + struct pcpu_sw_netstats *ip6gre_tunnel_stats; ip6gre_tunnel_stats = per_cpu_ptr(dev->tstats, i); u64_stats_init(&ip6gre_tunnel_stats->syncp); } @@ -1300,11 +1286,17 @@ static struct inet6_protocol ip6gre_protocol __read_mostly = { .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; -static void ip6gre_destroy_tunnels(struct ip6gre_net *ign, - struct list_head *head) +static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) { + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + struct net_device *dev, *aux; int prio; + for_each_netdev_safe(net, dev, aux) + if (dev->rtnl_link_ops == &ip6gre_link_ops || + dev->rtnl_link_ops == &ip6gre_tap_ops) + unregister_netdevice_queue(dev, head); + for (prio = 0; prio < 4; prio++) { int h; for (h = 0; 
h < HASH_SIZE; h++) { @@ -1313,7 +1305,12 @@ static void ip6gre_destroy_tunnels(struct ip6gre_net *ign, t = rtnl_dereference(ign->tunnels[prio][h]); while (t != NULL) { - unregister_netdevice_queue(t->dev, head); + /* If dev is in the same netns, it has already + * been added to the list by the previous loop. + */ + if (!net_eq(dev_net(t->dev), net)) + unregister_netdevice_queue(t->dev, + head); t = rtnl_dereference(t->next); } } @@ -1332,6 +1329,11 @@ static int __net_init ip6gre_init_net(struct net *net) goto err_alloc_dev; } dev_net_set(ign->fb_tunnel_dev, net); + /* FB netdevice is special: we have one, and only one per netns. + * Allowing to move it to another netns is clearly unsafe. + */ + ign->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; + ip6gre_fb_tunnel_init(ign->fb_tunnel_dev); ign->fb_tunnel_dev->rtnl_link_ops = &ip6gre_link_ops; @@ -1352,12 +1354,10 @@ err_alloc_dev: static void __net_exit ip6gre_exit_net(struct net *net) { - struct ip6gre_net *ign; LIST_HEAD(list); - ign = net_generic(net, ip6gre_net_id); rtnl_lock(); - ip6gre_destroy_tunnels(ign, &list); + ip6gre_destroy_tunnels(net, &list); unregister_netdevice_many(&list); rtnl_unlock(); } @@ -1457,7 +1457,6 @@ static void ip6gre_netlink_parms(struct nlattr *data[], static int ip6gre_tap_init(struct net_device *dev) { struct ip6_tnl *tunnel; - int i; tunnel = netdev_priv(dev); @@ -1467,16 +1466,10 @@ static int ip6gre_tap_init(struct net_device *dev) ip6gre_tnl_link_config(tunnel, 1); - dev->tstats = alloc_percpu(struct pcpu_tstats); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; - for_each_possible_cpu(i) { - struct pcpu_tstats *ip6gre_tap_stats; - ip6gre_tap_stats = per_cpu_ptr(dev->tstats, i); - u64_stats_init(&ip6gre_tap_stats->syncp); - } - return 0; } @@ -1541,15 +1534,14 @@ out: static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { - struct ip6_tnl *t, *nt; - struct net *net = dev_net(dev); + struct ip6_tnl *t, *nt = netdev_priv(dev); + struct net *net = nt->net; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); struct __ip6_tnl_parm p; if (dev == ign->fb_tunnel_dev) return -EINVAL; - nt = netdev_priv(dev); ip6gre_netlink_parms(data, &p); t = ip6gre_tunnel_locate(net, &p, 0); @@ -1569,6 +1561,15 @@ static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[], return 0; } +static void ip6gre_dellink(struct net_device *dev, struct list_head *head) +{ + struct net *net = dev_net(dev); + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + + if (dev != ign->fb_tunnel_dev) + unregister_netdevice_queue(dev, head); +} + static size_t ip6gre_get_size(const struct net_device *dev) { return @@ -1646,6 +1647,7 @@ static struct rtnl_link_ops ip6gre_link_ops __read_mostly = { .validate = ip6gre_tunnel_validate, .newlink = ip6gre_newlink, .changelink = ip6gre_changelink, + .dellink = ip6gre_dellink, .get_size = ip6gre_get_size, .fill_info = ip6gre_fill_info, }; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 302d6fb1ff2..51d54dc376f 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -49,7 +49,7 @@ int ip6_rcv_finish(struct sk_buff *skb) { - if (sysctl_ip_early_demux && !skb_dst(skb)) { + if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { const struct inet6_protocol *ipprot; ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 4b851692b1f..65eda2a8af4 100644 --- a/net/ipv6/ip6_offload.c 
+++ b/net/ipv6/ip6_offload.c @@ -89,7 +89,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, unsigned int unfrag_ip6hlen; u8 *prevhdr; int offset = 0; - bool tunnel; + bool encap, udpfrag; int nhoff; if (unlikely(skb_shinfo(skb)->gso_type & @@ -97,9 +97,11 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_SIT | SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_MPLS | SKB_GSO_TCPV6 | 0))) @@ -110,8 +112,8 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) goto out; - tunnel = SKB_GSO_CB(skb)->encap_level > 0; - if (tunnel) + encap = SKB_GSO_CB(skb)->encap_level > 0; + if (encap) features = skb->dev->hw_enc_features & netif_skb_features(skb); SKB_GSO_CB(skb)->encap_level += sizeof(*ipv6h); @@ -121,6 +123,12 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr); + if (skb->encapsulation && + skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP)) + udpfrag = proto == IPPROTO_UDP && encap; + else + udpfrag = proto == IPPROTO_UDP && !skb->encapsulation; + ops = rcu_dereference(inet6_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) { skb_reset_transport_header(skb); @@ -133,13 +141,9 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, for (skb = segs; skb; skb = skb->next) { ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); ipv6h->payload_len = htons(skb->len - nhoff - sizeof(*ipv6h)); - if (tunnel) { - skb_reset_inner_headers(skb); - skb->encapsulation = 1; - } skb->network_header = (u8 *)ipv6h - skb->head; - if (!tunnel && proto == IPPROTO_UDP) { + if (udpfrag) { unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); fptr = (struct frag_hdr *)((u8 *)ipv6h + unfrag_ip6hlen); fptr->frag_off = htons(offset); @@ -148,12 +152,40 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, offset += (ntohs(ipv6h->payload_len) - sizeof(struct frag_hdr)); } + if (encap) + skb_reset_inner_headers(skb); } out: return segs; } +/* Return the total length of all the extension hdrs, following the same + * logic in ipv6_gso_pull_exthdrs() when parsing ext-hdrs. 
+ */ +static int ipv6_exthdrs_len(struct ipv6hdr *iph, + const struct net_offload **opps) +{ + struct ipv6_opt_hdr *opth = (void *)iph; + int len = 0, proto, optlen = sizeof(*iph); + + proto = iph->nexthdr; + for (;;) { + if (proto != NEXTHDR_HOP) { + *opps = rcu_dereference(inet6_offloads[proto]); + if (unlikely(!(*opps))) + break; + if (!((*opps)->flags & INET6_PROTO_GSO_EXTHDR)) + break; + } + opth = (void *)opth + optlen; + optlen = ipv6_optlen(opth); + len += optlen; + proto = opth->nexthdr; + } + return len; +} + static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -164,9 +196,8 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, unsigned int nlen; unsigned int hlen; unsigned int off; - int flush = 1; + u16 flush = 1; int proto; - __wsum csum; off = skb_gro_offset(skb); hlen = off + sizeof(*iph); @@ -177,6 +208,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, goto out; } + skb_set_network_header(skb, off); skb_gro_pull(skb, sizeof(*iph)); skb_set_transport_header(skb, skb_gro_offset(skb)); @@ -211,12 +243,16 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, if (!NAPI_GRO_CB(p)->same_flow) continue; - iph2 = ipv6_hdr(p); + iph2 = (struct ipv6hdr *)(p->data + off); first_word = *(__be32 *)iph ^ *(__be32 *)iph2 ; - /* All fields must match except length and Traffic Class. */ - if (nlen != skb_network_header_len(p) || - (first_word & htonl(0xF00FFFFF)) || + /* All fields must match except length and Traffic Class. + * XXX skbs on the gro_list have all been parsed and pulled + * already so we don't need to compare nlen + * (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops))) + * memcmp() alone below is suffcient, right? + */ + if ((first_word & htonl(0xF00FFFFF)) || memcmp(&iph->nexthdr, &iph2->nexthdr, nlen - offsetof(struct ipv6hdr, nexthdr))) { NAPI_GRO_CB(p)->same_flow = 0; @@ -229,13 +265,10 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, NAPI_GRO_CB(skb)->flush |= flush; - csum = skb->csum; - skb_postpull_rcsum(skb, iph, skb_network_header_len(skb)); + skb_gro_postpull_rcsum(skb, iph, nlen); pp = ops->callbacks.gro_receive(head, skb); - skb->csum = csum; - out_unlock: rcu_read_unlock(); @@ -245,21 +278,21 @@ out: return pp; } -static int ipv6_gro_complete(struct sk_buff *skb) +static int ipv6_gro_complete(struct sk_buff *skb, int nhoff) { const struct net_offload *ops; - struct ipv6hdr *iph = ipv6_hdr(skb); + struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff); int err = -ENOSYS; - iph->payload_len = htons(skb->len - skb_network_offset(skb) - - sizeof(*iph)); + iph->payload_len = htons(skb->len - nhoff - sizeof(*iph)); rcu_read_lock(); - ops = rcu_dereference(inet6_offloads[NAPI_GRO_CB(skb)->proto]); + + nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out_unlock; - err = ops->callbacks.gro_complete(skb); + err = ops->callbacks.gro_complete(skb, nhoff); out_unlock: rcu_read_unlock(); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 9a311cc7967..45702b8cd14 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -132,7 +132,7 @@ static int ip6_finish_output(struct sk_buff *skb) return ip6_finish_output2(skb); } -int ip6_output(struct sk_buff *skb) +int ip6_output(struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); @@ -219,7 +219,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 
*fl6, skb->mark = sk->sk_mark; mtu = dst_mtu(dst); - if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) { + if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUT, skb->len); return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, @@ -321,6 +321,45 @@ static inline int ip6_forward_finish(struct sk_buff *skb) return dst_output(skb); } +static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) +{ + unsigned int mtu; + struct inet6_dev *idev; + + if (dst_metric_locked(dst, RTAX_MTU)) { + mtu = dst_metric_raw(dst, RTAX_MTU); + if (mtu) + return mtu; + } + + mtu = IPV6_MIN_MTU; + rcu_read_lock(); + idev = __in6_dev_get(dst->dev); + if (idev) + mtu = idev->cnf.mtu6; + rcu_read_unlock(); + + return mtu; +} + +static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu) + return false; + + /* ipv6 conntrack defrag sets max_frag_size + ignore_df */ + if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu) + return true; + + if (skb->ignore_df) + return false; + + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + return false; + + return true; +} + int ip6_forward(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -332,6 +371,9 @@ int ip6_forward(struct sk_buff *skb) if (net->ipv6.devconf_all->forwarding == 0) goto error; + if (skb->pkt_type != PACKET_HOST) + goto drop; + if (skb_warn_if_lro(skb)) goto drop; @@ -341,9 +383,6 @@ int ip6_forward(struct sk_buff *skb) goto drop; } - if (skb->pkt_type != PACKET_HOST) - goto drop; - skb_forward_csum(skb); /* @@ -441,12 +480,11 @@ int ip6_forward(struct sk_buff *skb) } } - mtu = dst_mtu(dst); + mtu = ip6_dst_mtu_forward(dst); if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) || - (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) { + if (ip6_pkt_too_big(skb, mtu)) { /* Again, force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); @@ -496,12 +534,23 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif nf_copy(to, from); -#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) - to->nf_trace = from->nf_trace; -#endif skb_copy_secmark(to, from); } +static void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) +{ + static u32 ip6_idents_hashrnd __read_mostly; + u32 hash, id; + + net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd)); + + hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd); + hash = __ipv6_addr_jhash(&rt->rt6i_src.addr, hash); + + id = ip_idents_reserve(hash, 1); + fhdr->identification = htonl(id); +} + int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { struct sk_buff *frag; @@ -524,7 +573,7 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) /* We must not fragment if the socket is set to force MTU discovery * or if the skb it not generated by a local socket. 
*/ - if (unlikely(!skb->local_df && skb->len > mtu) || + if (unlikely(!skb->ignore_df && skb->len > mtu) || (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) { if (skb->sk && dst_allfrag(skb_dst(skb))) @@ -941,7 +990,6 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup); * @sk: socket which provides route info * @fl6: flow to lookup * @final_dst: final destination address for ipsec lookup - * @can_sleep: we are in a sleepable context * * This function performs a route lookup on the given flow. * @@ -949,8 +997,7 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup); * error code. */ struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, - const struct in6_addr *final_dst, - bool can_sleep) + const struct in6_addr *final_dst) { struct dst_entry *dst = NULL; int err; @@ -960,8 +1007,6 @@ struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, return ERR_PTR(err); if (final_dst) fl6->daddr = *final_dst; - if (can_sleep) - fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP; return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); } @@ -972,7 +1017,6 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); * @sk: socket which provides the dst cache and route info * @fl6: flow to lookup * @final_dst: final destination address for ipsec lookup - * @can_sleep: we are in a sleepable context * * This function performs a route lookup on the given flow with the * possibility of using the cached route in the socket if it is valid. @@ -983,8 +1027,7 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); * error code. */ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, - const struct in6_addr *final_dst, - bool can_sleep) + const struct in6_addr *final_dst) { struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); int err; @@ -996,8 +1039,6 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, return ERR_PTR(err); if (final_dst) fl6->daddr = *final_dst; - if (can_sleep) - fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP; return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); } @@ -1078,21 +1119,19 @@ static void ip6_append_data_mtu(unsigned int *mtu, unsigned int fragheaderlen, struct sk_buff *skb, struct rt6_info *rt, - bool pmtuprobe) + unsigned int orig_mtu) { if (!(rt->dst.flags & DST_XFRM_TUNNEL)) { if (skb == NULL) { /* first fragment, reserve header_len */ - *mtu = *mtu - rt->dst.header_len; + *mtu = orig_mtu - rt->dst.header_len; } else { /* * this fragment is not first, the headers * space is regarded as data space. */ - *mtu = min(*mtu, pmtuprobe ? - rt->dst.dev->mtu : - dst_mtu(rt->dst.path)); + *mtu = orig_mtu; } *maxfraglen = ((*mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr); @@ -1109,7 +1148,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, struct ipv6_pinfo *np = inet6_sk(sk); struct inet_cork *cork; struct sk_buff *skb, *skb_prev = NULL; - unsigned int maxfraglen, fragheaderlen, mtu; + unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu; int exthdrlen; int dst_exthdrlen; int hh_len; @@ -1165,10 +1204,10 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, np->cork.hop_limit = hlimit; np->cork.tclass = tclass; if (rt->dst.flags & DST_XFRM_TUNNEL) - mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ? + mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? rt->dst.dev->mtu : dst_mtu(&rt->dst); else - mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ? + mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? 
rt->dst.dev->mtu : dst_mtu(rt->dst.path); if (np->frag_size < mtu) { if (np->frag_size) @@ -1191,16 +1230,43 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, dst_exthdrlen = 0; mtu = cork->fragsize; } + orig_mtu = mtu; hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + (opt ? opt->opt_nflen : 0); - maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr); + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - + sizeof(struct frag_hdr); if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) { - if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) { - ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen); + unsigned int maxnonfragsize, headersize; + + headersize = sizeof(struct ipv6hdr) + + (opt ? opt->opt_flen + opt->opt_nflen : 0) + + (dst_allfrag(&rt->dst) ? + sizeof(struct frag_hdr) : 0) + + rt->rt6i_nfheader_len; + + if (ip6_sk_ignore_df(sk)) + maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN; + else + maxnonfragsize = mtu; + + /* dontfrag active */ + if ((cork->length + length > mtu - headersize) && dontfrag && + (sk->sk_protocol == IPPROTO_UDP || + sk->sk_protocol == IPPROTO_RAW)) { + ipv6_local_rxpmtu(sk, fl6, mtu - headersize + + sizeof(struct ipv6hdr)); + goto emsgsize; + } + + if (cork->length + length > maxnonfragsize - headersize) { +emsgsize: + ipv6_local_error(sk, EMSGSIZE, fl6, + mtu - headersize + + sizeof(struct ipv6hdr)); return -EMSGSIZE; } } @@ -1225,12 +1291,6 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, * --yoshfuji */ - if ((length > mtu) && dontfrag && (sk->sk_protocol == IPPROTO_UDP || - sk->sk_protocol == IPPROTO_RAW)) { - ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen); - return -EMSGSIZE; - } - skb = skb_peek_tail(&sk->sk_write_queue); cork->length += length; if (((length > mtu) || @@ -1270,8 +1330,7 @@ alloc_new_skb: if (skb == NULL || skb_prev == NULL) ip6_append_data_mtu(&mtu, &maxfraglen, fragheaderlen, skb, rt, - np->pmtudisc == - IPV6_PMTUDISC_PROBE); + orig_mtu); skb_prev = skb; @@ -1499,8 +1558,7 @@ int ip6_push_pending_frames(struct sock *sk) } /* Allow local fragmentation. */ - if (np->pmtudisc < IPV6_PMTUDISC_DO) - skb->local_df = 1; + skb->ignore_df = ip6_sk_ignore_df(sk); *final_dst = fl6->daddr; __skb_pull(skb, skb_network_header_len(skb)); @@ -1527,8 +1585,8 @@ int ip6_push_pending_frames(struct sock *sk) if (proto == IPPROTO_ICMPV6) { struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); - ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type); - ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS); + ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type); + ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } err = ip6_local_out(skb); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index d6062325db0..afa08245836 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -29,7 +29,6 @@ #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> -#include <linux/if_tunnel.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> @@ -62,6 +61,7 @@ MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("ip6tnl"); MODULE_ALIAS_NETDEV("ip6tnl0"); #ifdef IP6_TNL_DEBUG @@ -70,9 +70,6 @@ MODULE_ALIAS_NETDEV("ip6tnl0"); #define IP6_TNL_TRACE(x...) 
do {;} while(0) #endif -#define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK) -#define IPV6_TCLASS_SHIFT 20 - #define HASH_SIZE_SHIFT 5 #define HASH_SIZE (1 << HASH_SIZE_SHIFT) @@ -103,16 +100,26 @@ struct ip6_tnl_net { static struct net_device_stats *ip6_get_stats(struct net_device *dev) { - struct pcpu_tstats sum = { 0 }; + struct pcpu_sw_netstats tmp, sum = { 0 }; int i; for_each_possible_cpu(i) { - const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); - - sum.rx_packets += tstats->rx_packets; - sum.rx_bytes += tstats->rx_bytes; - sum.tx_packets += tstats->tx_packets; - sum.tx_bytes += tstats->tx_bytes; + unsigned int start; + const struct pcpu_sw_netstats *tstats = + per_cpu_ptr(dev->tstats, i); + + do { + start = u64_stats_fetch_begin_irq(&tstats->syncp); + tmp.rx_packets = tstats->rx_packets; + tmp.rx_bytes = tstats->rx_bytes; + tmp.tx_packets = tstats->tx_packets; + tmp.tx_bytes = tstats->tx_bytes; + } while (u64_stats_fetch_retry_irq(&tstats->syncp, start)); + + sum.rx_packets += tmp.rx_packets; + sum.rx_bytes += tmp.rx_bytes; + sum.tx_packets += tmp.tx_packets; + sum.tx_bytes += tmp.tx_bytes; } dev->stats.rx_packets = sum.rx_packets; dev->stats.rx_bytes = sum.rx_bytes; @@ -785,7 +792,7 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, if ((t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr)) != NULL) { - struct pcpu_tstats *tstats; + struct pcpu_sw_netstats *tstats; if (t->parms.proto != ipproto && t->parms.proto != 0) { rcu_read_unlock(); @@ -824,8 +831,10 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, } tstats = this_cpu_ptr(t->dev->tstats); + u64_stats_update_begin(&tstats->syncp); tstats->rx_packets++; tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); netif_rx(skb); @@ -1131,7 +1140,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) - fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_FLOWLABEL_MASK); + fl6.flowlabel |= ip6_flowlabel(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; @@ -1332,8 +1341,8 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) int err = 0; struct ip6_tnl_parm p; struct __ip6_tnl_parm p1; - struct ip6_tnl *t = NULL; - struct net *net = dev_net(dev); + struct ip6_tnl *t = netdev_priv(dev); + struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); switch (cmd) { @@ -1345,11 +1354,11 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); + if (t == NULL) + t = netdev_priv(dev); } else { memset(&p, 0, sizeof(p)); } - if (t == NULL) - t = netdev_priv(dev); ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof (p))) { err = -EFAULT; @@ -1494,19 +1503,12 @@ static inline int ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - int i; t->dev = dev; t->net = dev_net(dev); - dev->tstats = alloc_percpu(struct pcpu_tstats); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; - - for_each_possible_cpu(i) { - struct pcpu_tstats *ip6_tnl_stats; - ip6_tnl_stats = per_cpu_ptr(dev->tstats, i); - u64_stats_init(&ip6_tnl_stats->syncp); - } return 0; } @@ -1556,7 +1558,7 @@ static int ip6_tnl_validate(struct nlattr *tb[], struct nlattr *data[]) { 
u8 proto; - if (!data) + if (!data || !data[IFLA_IPTUN_PROTO]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index ed94ba61dda..9aaa6bb229e 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -24,7 +24,6 @@ #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> -#include <linux/if_tunnel.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> @@ -75,26 +74,6 @@ struct vti6_net { struct ip6_tnl __rcu **tnls[2]; }; -static struct net_device_stats *vti6_get_stats(struct net_device *dev) -{ - struct pcpu_tstats sum = { 0 }; - int i; - - for_each_possible_cpu(i) { - const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); - - sum.rx_packets += tstats->rx_packets; - sum.rx_bytes += tstats->rx_bytes; - sum.tx_packets += tstats->tx_packets; - sum.tx_bytes += tstats->tx_bytes; - } - dev->stats.rx_packets = sum.rx_packets; - dev->stats.rx_bytes = sum.rx_bytes; - dev->stats.tx_packets = sum.tx_packets; - dev->stats.tx_bytes = sum.tx_bytes; - return &dev->stats; -} - #define for_each_vti6_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) @@ -299,7 +278,6 @@ static void vti6_dev_uninit(struct net_device *dev) RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); else vti6_tnl_unlink(ip6n, t); - ip6_tnl_dst_reset(t); dev_put(dev); } @@ -309,11 +287,8 @@ static int vti6_rcv(struct sk_buff *skb) const struct ipv6hdr *ipv6h = ipv6_hdr(skb); rcu_read_lock(); - if ((t = vti6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr)) != NULL) { - struct pcpu_tstats *tstats; - if (t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) { rcu_read_unlock(); goto discard; @@ -330,25 +305,58 @@ static int vti6_rcv(struct sk_buff *skb) goto discard; } - tstats = this_cpu_ptr(t->dev->tstats); - tstats->rx_packets++; - tstats->rx_bytes += skb->len; - - skb->mark = 0; - secpath_reset(skb); - skb->dev = t->dev; + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; + skb->mark = be32_to_cpu(t->parms.i_key); rcu_read_unlock(); - return 0; + + return xfrm6_rcv(skb); } rcu_read_unlock(); - return 1; - + return -EINVAL; discard: kfree_skb(skb); return 0; } +static int vti6_rcv_cb(struct sk_buff *skb, int err) +{ + unsigned short family; + struct net_device *dev; + struct pcpu_sw_netstats *tstats; + struct xfrm_state *x; + struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6; + + if (!t) + return 1; + + dev = t->dev; + + if (err) { + dev->stats.rx_errors++; + dev->stats.rx_dropped++; + + return 0; + } + + x = xfrm_input_state(skb); + family = x->inner_mode->afinfo->family; + + if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + return -EPERM; + + skb_scrub_packet(skb, !net_eq(t->net, dev_net(skb->dev))); + skb->dev = dev; + + tstats = this_cpu_ptr(dev->tstats); + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); + + return 0; +} + /** * vti6_addr_conflict - compare packet addresses to tunnel's own * @t: the outgoing tunnel device @@ -368,44 +376,56 @@ vti6_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr) return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); } +static bool vti6_state_check(const struct xfrm_state *x, + const struct in6_addr *dst, + const struct in6_addr *src) +{ + xfrm_address_t *daddr = (xfrm_address_t *)dst; + xfrm_address_t *saddr = (xfrm_address_t *)src; + + /* if there is no transform then this tunnel is not functional. + * Or if the xfrm is not mode tunnel. 
+ */ + if (!x || x->props.mode != XFRM_MODE_TUNNEL || + x->props.family != AF_INET6) + return false; + + if (ipv6_addr_any(dst)) + return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET6); + + if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET6)) + return false; + + return true; +} + /** * vti6_xmit - send a packet * @skb: the outgoing socket buffer * @dev: the outgoing tunnel device + * @fl: the flow informations for the xfrm_lookup **/ -static int vti6_xmit(struct sk_buff *skb, struct net_device *dev) +static int +vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { - struct net *net = dev_net(dev); struct ip6_tnl *t = netdev_priv(dev); struct net_device_stats *stats = &t->dev->stats; - struct dst_entry *dst = NULL, *ndst = NULL; - struct flowi6 fl6; - struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; int err = -1; - if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) || - !ip6_tnl_xmit_ctl(t) || vti6_addr_conflict(t, ipv6h)) - return err; - - dst = ip6_tnl_dst_check(t); - if (!dst) { - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - - ndst = ip6_route_output(net, NULL, &fl6); + if (!dst) + goto tx_err_link_failure; - if (ndst->error) - goto tx_err_link_failure; - ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(&fl6), NULL, 0); - if (IS_ERR(ndst)) { - err = PTR_ERR(ndst); - ndst = NULL; - goto tx_err_link_failure; - } - dst = ndst; + dst_hold(dst); + dst = xfrm_lookup(t->net, dst, fl, NULL, 0); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; + goto tx_err_link_failure; } - if (!dst->xfrm || dst->xfrm->props.mode != XFRM_MODE_TUNNEL) + if (!vti6_state_check(dst->xfrm, &t->parms.raddr, &t->parms.laddr)) goto tx_err_link_failure; tdev = dst->dev; @@ -417,14 +437,21 @@ static int vti6_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_err_dst_release; } - - skb_dst_drop(skb); - skb_dst_set_noref(skb, dst); - - ip6tunnel_xmit(skb, dev); - if (ndst) { - dev->mtu = dst_mtu(ndst); - ip6_tnl_dst_store(t, ndst); + skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); + skb_dst_set(skb, dst); + skb->dev = skb_dst(skb)->dev; + + err = dst_output(skb); + if (net_xmit_eval(err) == 0) { + struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->tx_bytes += skb->len; + tstats->tx_packets++; + u64_stats_update_end(&tstats->syncp); + } else { + stats->tx_errors++; + stats->tx_aborted_errors++; } return 0; @@ -432,7 +459,7 @@ tx_err_link_failure: stats->tx_carrier_errors++; dst_link_failure(skb); tx_err_dst_release: - dst_release(ndst); + dst_release(dst); return err; } @@ -441,16 +468,33 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net_device_stats *stats = &t->dev->stats; + struct ipv6hdr *ipv6h; + struct flowi fl; int ret; + memset(&fl, 0, sizeof(fl)); + skb->mark = be32_to_cpu(t->parms.o_key); + switch (skb->protocol) { case htons(ETH_P_IPV6): - ret = vti6_xmit(skb, dev); + ipv6h = ipv6_hdr(skb); + + if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) || + !ip6_tnl_xmit_ctl(t) || vti6_addr_conflict(t, ipv6h)) + goto tx_err; + + xfrm_decode_session(skb, &fl, AF_INET6); + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + break; + case htons(ETH_P_IP): + xfrm_decode_session(skb, &fl, AF_INET); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); break; default: goto tx_err; } + ret = vti6_xmit(skb, dev, &fl); if (ret < 0) goto tx_err; @@ -463,24 +507,69 @@ tx_err: return NETDEV_TX_OK; } +static int 
vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + __be32 spi; + __u32 mark; + struct xfrm_state *x; + struct ip6_tnl *t; + struct ip_esp_hdr *esph; + struct ip_auth_hdr *ah; + struct ip_comp_hdr *ipch; + struct net *net = dev_net(skb->dev); + const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; + int protocol = iph->nexthdr; + + t = vti6_tnl_lookup(dev_net(skb->dev), &iph->daddr, &iph->saddr); + if (!t) + return -1; + + mark = be32_to_cpu(t->parms.o_key); + + switch (protocol) { + case IPPROTO_ESP: + esph = (struct ip_esp_hdr *)(skb->data + offset); + spi = esph->spi; + break; + case IPPROTO_AH: + ah = (struct ip_auth_hdr *)(skb->data + offset); + spi = ah->spi; + break; + case IPPROTO_COMP: + ipch = (struct ip_comp_hdr *)(skb->data + offset); + spi = htonl(ntohs(ipch->cpi)); + break; + default: + return 0; + } + + if (type != ICMPV6_PKT_TOOBIG && + type != NDISC_REDIRECT) + return 0; + + x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr, + spi, protocol, AF_INET6); + if (!x) + return 0; + + if (type == NDISC_REDIRECT) + ip6_redirect(skb, net, skb->dev->ifindex, 0); + else + ip6_update_pmtu(skb, net, info, 0, 0); + xfrm_state_put(x); + + return 0; +} + static void vti6_link_config(struct ip6_tnl *t) { - struct dst_entry *dst; struct net_device *dev = t->dev; struct __ip6_tnl_parm *p = &t->parms; - struct flowi6 *fl6 = &t->fl.u.ip6; memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); - /* Set up flowi template */ - fl6->saddr = p->laddr; - fl6->daddr = p->raddr; - fl6->flowi6_oif = p->link; - fl6->flowi6_mark = be32_to_cpu(p->i_key); - fl6->flowi6_proto = p->proto; - fl6->flowlabel = 0; - p->flags &= ~(IP6_TNL_F_CAP_XMIT | IP6_TNL_F_CAP_RCV | IP6_TNL_F_CAP_PER_PACKET); p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); @@ -491,28 +580,6 @@ static void vti6_link_config(struct ip6_tnl *t) dev->flags &= ~IFF_POINTOPOINT; dev->iflink = p->link; - - if (p->flags & IP6_TNL_F_CAP_XMIT) { - - dst = ip6_route_output(dev_net(dev), NULL, fl6); - if (dst->error) - return; - - dst = xfrm_lookup(dev_net(dev), dst, flowi6_to_flowi(fl6), - NULL, 0); - if (IS_ERR(dst)) - return; - - if (dst->dev) { - dev->hard_header_len = dst->dev->hard_header_len; - - dev->mtu = dst_mtu(dst); - - if (dev->mtu < IPV6_MIN_MTU) - dev->mtu = IPV6_MIN_MTU; - } - dst_release(dst); - } } /** @@ -716,7 +783,7 @@ static const struct net_device_ops vti6_netdev_ops = { .ndo_start_xmit = vti6_tnl_xmit, .ndo_do_ioctl = vti6_ioctl, .ndo_change_mtu = vti6_change_mtu, - .ndo_get_stats = vti6_get_stats, + .ndo_get_stats64 = ip_tunnel_get_stats64, }; /** @@ -728,18 +795,14 @@ static const struct net_device_ops vti6_netdev_ops = { **/ static void vti6_dev_setup(struct net_device *dev) { - struct ip6_tnl *t; - dev->netdev_ops = &vti6_netdev_ops; dev->destructor = vti6_dev_free; dev->type = ARPHRD_TUNNEL6; dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr); dev->mtu = ETH_DATA_LEN; - t = netdev_priv(dev); dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); - dev->features |= NETIF_F_NETNS_LOCAL; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; } @@ -753,7 +816,7 @@ static inline int vti6_dev_init_gen(struct net_device *dev) t->dev = dev; t->net = dev_net(dev); - dev->tstats = alloc_percpu(struct pcpu_tstats); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; return 0; @@ -927,11 +990,6 @@ static struct rtnl_link_ops 
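Editorial note, not part of the patch: the tunnel conversions above (ip6_get_stats, ip6_tnl_rcv, vti6_rcv_cb, vti6_xmit) all follow the same pcpu_sw_netstats pattern — bump the counters under u64_stats_update_begin/end on the fast path, and fold them with the fetch/retry sequence when reporting. A minimal sketch of both halves, with hypothetical function names, assuming dev->tstats was allocated with netdev_alloc_pcpu_stats() as the converted init paths now do:

#include <linux/netdevice.h>
#include <linux/u64_stats_sync.h>

/* Sketch: fast-path update of the per-CPU counters. */
static void example_rx_account(struct net_device *dev, unsigned int len)
{
        struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);

        u64_stats_update_begin(&tstats->syncp);
        tstats->rx_packets++;
        tstats->rx_bytes += len;
        u64_stats_update_end(&tstats->syncp);
}

/* Sketch: fold all CPUs into one consistent snapshot for reporting. */
static void example_fold_stats(struct net_device *dev,
                               struct pcpu_sw_netstats *sum)
{
        int i;

        for_each_possible_cpu(i) {
                const struct pcpu_sw_netstats *tstats =
                        per_cpu_ptr(dev->tstats, i);
                struct pcpu_sw_netstats tmp;
                unsigned int start;

                do {
                        start = u64_stats_fetch_begin_irq(&tstats->syncp);
                        tmp.rx_packets = tstats->rx_packets;
                        tmp.rx_bytes   = tstats->rx_bytes;
                        tmp.tx_packets = tstats->tx_packets;
                        tmp.tx_bytes   = tstats->tx_bytes;
                } while (u64_stats_fetch_retry_irq(&tstats->syncp, start));

                sum->rx_packets += tmp.rx_packets;
                sum->rx_bytes   += tmp.rx_bytes;
                sum->tx_packets += tmp.tx_packets;
                sum->tx_bytes   += tmp.tx_bytes;
        }
}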
vti6_link_ops __read_mostly = { .fill_info = vti6_fill_info, }; -static struct xfrm_tunnel_notifier vti6_handler __read_mostly = { - .handler = vti6_rcv, - .priority = 1, -}; - static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n) { int h; @@ -1003,6 +1061,27 @@ static struct pernet_operations vti6_net_ops = { .size = sizeof(struct vti6_net), }; +static struct xfrm6_protocol vti_esp6_protocol __read_mostly = { + .handler = vti6_rcv, + .cb_handler = vti6_rcv_cb, + .err_handler = vti6_err, + .priority = 100, +}; + +static struct xfrm6_protocol vti_ah6_protocol __read_mostly = { + .handler = vti6_rcv, + .cb_handler = vti6_rcv_cb, + .err_handler = vti6_err, + .priority = 100, +}; + +static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = { + .handler = vti6_rcv, + .cb_handler = vti6_rcv_cb, + .err_handler = vti6_err, + .priority = 100, +}; + /** * vti6_tunnel_init - register protocol and reserve needed resources * @@ -1016,11 +1095,30 @@ static int __init vti6_tunnel_init(void) if (err < 0) goto out_pernet; - err = xfrm6_mode_tunnel_input_register(&vti6_handler); + err = xfrm6_protocol_register(&vti_esp6_protocol, IPPROTO_ESP); if (err < 0) { - pr_err("%s: can't register vti6\n", __func__); + pr_err("%s: can't register vti6 protocol\n", __func__); + + goto out; + } + + err = xfrm6_protocol_register(&vti_ah6_protocol, IPPROTO_AH); + if (err < 0) { + xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); + pr_err("%s: can't register vti6 protocol\n", __func__); + goto out; } + + err = xfrm6_protocol_register(&vti_ipcomp6_protocol, IPPROTO_COMP); + if (err < 0) { + xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); + xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); + pr_err("%s: can't register vti6 protocol\n", __func__); + + goto out; + } + err = rtnl_link_register(&vti6_link_ops); if (err < 0) goto rtnl_link_failed; @@ -1028,7 +1126,9 @@ static int __init vti6_tunnel_init(void) return 0; rtnl_link_failed: - xfrm6_mode_tunnel_input_deregister(&vti6_handler); + xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP); + xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); + xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); out: unregister_pernet_device(&vti6_net_ops); out_pernet: @@ -1041,8 +1141,12 @@ out_pernet: static void __exit vti6_tunnel_cleanup(void) { rtnl_link_unregister(&vti6_link_ops); - if (xfrm6_mode_tunnel_input_deregister(&vti6_handler)) - pr_info("%s: can't deregister vti6\n", __func__); + if (xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP)) + pr_info("%s: can't deregister protocol\n", __func__); + if (xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH)) + pr_info("%s: can't deregister protocol\n", __func__); + if (xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP)) + pr_info("%s: can't deregister protocol\n", __func__); unregister_pernet_device(&vti6_net_ops); } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index f365310bfcc..8250474ab7d 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -141,9 +141,12 @@ static struct mr6_table *ip6mr_get_table(struct net *net, u32 id) static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, struct mr6_table **mrt) { - struct ip6mr_result res; - struct fib_lookup_arg arg = { .result = &res, }; int err; + struct ip6mr_result res; + struct fib_lookup_arg arg = { + .result = &res, + .flags = FIB_LOOKUP_NOREF, + }; err = fib_rules_lookup(net->ipv6.mr6_rules_ops, flowi6_to_flowi(flp6), 0, &arg); @@ -697,7 +700,7 @@ static netdev_tx_t 
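Editorial note, not part of the patch: vti6_tunnel_init() above now registers one xfrm6_protocol handler per protocol (ESP, AH, IPComp) at priority 100 and deregisters the earlier ones if a later registration fails. A condensed sketch of that register/unwind shape with a single hypothetical entry, reusing the vti6_rcv/vti6_rcv_cb/vti6_err handlers and vti6_link_ops from the patch:

#include <linux/in.h>
#include <net/xfrm.h>

/* Hypothetical single entry; the patch registers three of these. */
static struct xfrm6_protocol example_vti_esp6 __read_mostly = {
        .handler     = vti6_rcv,
        .cb_handler  = vti6_rcv_cb,
        .err_handler = vti6_err,
        .priority    = 100,
};

static int __init example_init(void)
{
        int err;

        err = xfrm6_protocol_register(&example_vti_esp6, IPPROTO_ESP);
        if (err < 0)
                return err;

        err = rtnl_link_register(&vti6_link_ops);
        if (err < 0)
                xfrm6_protocol_deregister(&example_vti_esp6, IPPROTO_ESP);

        return err;
}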
reg_vif_xmit(struct sk_buff *skb, struct mr6_table *mrt; struct flowi6 fl6 = { .flowi6_oif = dev->ifindex, - .flowi6_iif = skb->skb_iif, + .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_mark = skb->mark, }; int err; @@ -1630,7 +1633,7 @@ struct sock *mroute6_socket(struct net *net, struct sk_buff *skb) { struct mr6_table *mrt; struct flowi6 fl6 = { - .flowi6_iif = skb->skb_iif, + .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_oif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; @@ -2346,13 +2349,14 @@ int ip6mr_get_route(struct net *net, } static int ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, - u32 portid, u32 seq, struct mfc6_cache *c, int cmd) + u32 portid, u32 seq, struct mfc6_cache *c, int cmd, + int flags) { struct nlmsghdr *nlh; struct rtmsg *rtm; int err; - nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), NLM_F_MULTI); + nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags); if (nlh == NULL) return -EMSGSIZE; @@ -2420,7 +2424,7 @@ static void mr6_netlink_event(struct mr6_table *mrt, struct mfc6_cache *mfc, if (skb == NULL) goto errout; - err = ip6mr_fill_mroute(mrt, skb, 0, 0, mfc, cmd); + err = ip6mr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0); if (err < 0) goto errout; @@ -2459,7 +2463,8 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) if (ip6mr_fill_mroute(mrt, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - mfc, RTM_NEWROUTE) < 0) + mfc, RTM_NEWROUTE, + NLM_F_MULTI) < 0) goto done; next_entry: e++; @@ -2473,7 +2478,8 @@ next_entry: if (ip6mr_fill_mroute(mrt, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - mfc, RTM_NEWROUTE) < 0) { + mfc, RTM_NEWROUTE, + NLM_F_MULTI) < 0) { spin_unlock_bh(&mfc_unres_lock); goto done; } diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index da9becb42e8..d1c793cffcb 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -53,7 +53,7 @@ #include <linux/icmpv6.h> #include <linux/mutex.h> -static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct net *net = dev_net(skb->dev); @@ -65,19 +65,21 @@ static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type != ICMPV6_PKT_TOOBIG && type != NDISC_REDIRECT) - return; + return 0; spi = htonl(ntohs(ipcomph->cpi)); x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET6); if (!x) - return; + return 0; if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0); else ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); + + return 0; } static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) @@ -174,6 +176,11 @@ out: return err; } +static int ipcomp6_rcv_cb(struct sk_buff *skb, int err) +{ + return 0; +} + static const struct xfrm_type ipcomp6_type = { .description = "IPCOMP6", @@ -186,11 +193,12 @@ static const struct xfrm_type ipcomp6_type = .hdr_offset = xfrm6_find_1stfragopt, }; -static const struct inet6_protocol ipcomp6_protocol = +static struct xfrm6_protocol ipcomp6_protocol = { .handler = xfrm6_rcv, + .cb_handler = ipcomp6_rcv_cb, .err_handler = ipcomp6_err, - .flags = INET6_PROTO_NOPOLICY, + .priority = 0, }; static int __init ipcomp6_init(void) @@ -199,7 +207,7 @@ static int __init ipcomp6_init(void) pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } - if (inet6_add_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0) { + if 
(xfrm6_protocol_register(&ipcomp6_protocol, IPPROTO_COMP) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ipcomp6_type, AF_INET6); return -EAGAIN; @@ -209,7 +217,7 @@ static int __init ipcomp6_init(void) static void __exit ipcomp6_fini(void) { - if (inet6_del_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0) + if (xfrm6_protocol_deregister(&ipcomp6_protocol, IPPROTO_COMP) < 0) pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&ipcomp6_type, AF_INET6) < 0) pr_info("%s: can't remove xfrm type\n", __func__); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 1c6ce3119ff..edb58aff4ae 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -722,7 +722,7 @@ done: case IPV6_MTU_DISCOVER: if (optlen < sizeof(int)) goto e_inval; - if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE) + if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT) goto e_inval; np->pmtudisc = val; retv = 0; @@ -1002,10 +1002,8 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, release_sock(sk); if (skb) { - int err = ip6_datagram_recv_ctl(sk, &msg, skb); + ip6_datagram_recv_ctl(sk, &msg, skb); kfree_skb(skb); - if (err) - return err; } else { if (np->rxopt.bits.rxinfo) { struct in6_pktinfo src_info; @@ -1019,7 +1017,8 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxtclass) { - int tclass = np->rcv_tclass; + int tclass = (int)ip6_tclass(np->rcv_flowinfo); + put_cmsg(&msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); } if (np->rxopt.bits.rxoinfo) { @@ -1034,6 +1033,11 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, int hlim = np->mcast_hops; put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } + if (np->rxopt.bits.rxflow) { + __be32 flowinfo = np->rcv_flowinfo; + + put_cmsg(&msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); + } } len -= msg.msg_controllen; return put_user(len, optlen); @@ -1215,6 +1219,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, case IPV6_FLOWLABEL_MGR: { struct in6_flowlabel_req freq; + int flags; if (len < sizeof(freq)) return -EINVAL; @@ -1226,9 +1231,11 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, return -EINVAL; len = sizeof(freq); + flags = freq.flr_flags; + memset(&freq, 0, sizeof(freq)); - val = ipv6_flowlabel_opt_get(sk, &freq); + val = ipv6_flowlabel_opt_get(sk, &freq, flags); if (val < 0) return val; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index d18f9f903db..617f0958e16 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -999,7 +999,7 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, static void mld_gq_start_timer(struct inet6_dev *idev) { - unsigned long tv = net_random() % idev->mc_maxdelay; + unsigned long tv = prandom_u32() % idev->mc_maxdelay; idev->mc_gq_running = 1; if (!mod_timer(&idev->mc_gq_timer, jiffies+tv+2)) @@ -1015,7 +1015,7 @@ static void mld_gq_stop_timer(struct inet6_dev *idev) static void mld_ifc_start_timer(struct inet6_dev *idev, unsigned long delay) { - unsigned long tv = net_random() % delay; + unsigned long tv = prandom_u32() % delay; if (!mod_timer(&idev->mc_ifc_timer, jiffies+tv+2)) in6_dev_hold(idev); @@ -1030,7 +1030,7 @@ static void mld_ifc_stop_timer(struct inet6_dev *idev) static void mld_dad_start_timer(struct inet6_dev *idev, unsigned long delay) { - unsigned long tv = net_random() % 
delay; + unsigned long tv = prandom_u32() % delay; if (!mod_timer(&idev->mc_dad_timer, jiffies+tv+2)) in6_dev_hold(idev); @@ -1061,7 +1061,7 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) } if (delay >= resptime) - delay = net_random() % resptime; + delay = prandom_u32() % resptime; ma->mca_timer.expires = jiffies + delay; if (!mod_timer(&ma->mca_timer, jiffies + delay)) @@ -1301,8 +1301,17 @@ int igmp6_event_query(struct sk_buff *skb) len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); len -= skb_network_header_len(skb); - /* Drop queries with not link local source */ - if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) + /* RFC3810 6.2 + * Upon reception of an MLD message that contains a Query, the node + * checks if the source address of the message is a valid link-local + * address, if the Hop Limit is set to 1, and if the Router Alert + * option is present in the Hop-By-Hop Options header of the IPv6 + * packet. If any of these checks fails, the packet is dropped. + */ + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL) || + ipv6_hdr(skb)->hop_limit != 1 || + !(IP6CB(skb)->flags & IP6SKB_ROUTERALERT) || + IP6CB(skb)->ra != htons(IPV6_OPT_ROUTERALERT_MLD)) return -EINVAL; idev = __in6_dev_get(skb->dev); @@ -1620,11 +1629,12 @@ static void mld_sendpack(struct sk_buff *skb) dst_output); out: if (!err) { - ICMP6MSGOUT_INC_STATS_BH(net, idev, ICMPV6_MLD2_REPORT); - ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS); - IP6_UPD_PO_STATS_BH(net, idev, IPSTATS_MIB_OUTMCAST, payload_len); - } else - IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_OUTDISCARDS); + ICMP6MSGOUT_INC_STATS(net, idev, ICMPV6_MLD2_REPORT); + ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, payload_len); + } else { + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); + } rcu_read_unlock(); return; @@ -1665,7 +1675,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, skb_tailroom(skb)) : 0) static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, - int type, int gdeleted, int sdeleted) + int type, int gdeleted, int sdeleted, int crsend) { struct inet6_dev *idev = pmc->idev; struct net_device *dev = idev->dev; @@ -1757,7 +1767,7 @@ empty_source: if (type == MLD2_ALLOW_NEW_SOURCES || type == MLD2_BLOCK_OLD_SOURCES) return skb; - if (pmc->mca_crcount || isquery) { + if (pmc->mca_crcount || isquery || crsend) { /* make sure we have room for group header */ if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)) { mld_sendpack(skb); @@ -1789,7 +1799,7 @@ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) type = MLD2_MODE_IS_EXCLUDE; else type = MLD2_MODE_IS_INCLUDE; - skb = add_grec(skb, pmc, type, 0, 0); + skb = add_grec(skb, pmc, type, 0, 0, 0); spin_unlock_bh(&pmc->mca_lock); } } else { @@ -1798,7 +1808,7 @@ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) type = MLD2_MODE_IS_EXCLUDE; else type = MLD2_MODE_IS_INCLUDE; - skb = add_grec(skb, pmc, type, 0, 0); + skb = add_grec(skb, pmc, type, 0, 0, 0); spin_unlock_bh(&pmc->mca_lock); } read_unlock_bh(&idev->lock); @@ -1843,13 +1853,13 @@ static void mld_send_cr(struct inet6_dev *idev) if (pmc->mca_sfmode == MCAST_INCLUDE) { type = MLD2_BLOCK_OLD_SOURCES; dtype = MLD2_BLOCK_OLD_SOURCES; - skb = add_grec(skb, pmc, type, 1, 0); - skb = add_grec(skb, pmc, dtype, 1, 1); + skb = add_grec(skb, pmc, type, 1, 0, 0); + skb = add_grec(skb, pmc, dtype, 1, 1, 0); } if 
(pmc->mca_crcount) { if (pmc->mca_sfmode == MCAST_EXCLUDE) { type = MLD2_CHANGE_TO_INCLUDE; - skb = add_grec(skb, pmc, type, 1, 0); + skb = add_grec(skb, pmc, type, 1, 0, 0); } pmc->mca_crcount--; if (pmc->mca_crcount == 0) { @@ -1880,8 +1890,8 @@ static void mld_send_cr(struct inet6_dev *idev) type = MLD2_ALLOW_NEW_SOURCES; dtype = MLD2_BLOCK_OLD_SOURCES; } - skb = add_grec(skb, pmc, type, 0, 0); - skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */ + skb = add_grec(skb, pmc, type, 0, 0, 0); + skb = add_grec(skb, pmc, dtype, 0, 1, 0); /* deleted sources */ /* filter mode changes */ if (pmc->mca_crcount) { @@ -1889,7 +1899,7 @@ static void mld_send_cr(struct inet6_dev *idev) type = MLD2_CHANGE_TO_EXCLUDE; else type = MLD2_CHANGE_TO_INCLUDE; - skb = add_grec(skb, pmc, type, 0, 0); + skb = add_grec(skb, pmc, type, 0, 0, 0); pmc->mca_crcount--; } spin_unlock_bh(&pmc->mca_lock); @@ -1997,27 +2007,36 @@ err_out: goto out; } -static void mld_resend_report(struct inet6_dev *idev) +static void mld_send_initial_cr(struct inet6_dev *idev) { - if (mld_in_v1_mode(idev)) { - struct ifmcaddr6 *mcaddr; - read_lock_bh(&idev->lock); - for (mcaddr = idev->mc_list; mcaddr; mcaddr = mcaddr->next) { - if (!(mcaddr->mca_flags & MAF_NOREPORT)) - igmp6_send(&mcaddr->mca_addr, idev->dev, - ICMPV6_MGM_REPORT); - } - read_unlock_bh(&idev->lock); - } else { - mld_send_report(idev, NULL); + struct sk_buff *skb; + struct ifmcaddr6 *pmc; + int type; + + if (mld_in_v1_mode(idev)) + return; + + skb = NULL; + read_lock_bh(&idev->lock); + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + spin_lock_bh(&pmc->mca_lock); + if (pmc->mca_sfcount[MCAST_EXCLUDE]) + type = MLD2_CHANGE_TO_EXCLUDE; + else + type = MLD2_CHANGE_TO_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0, 1); + spin_unlock_bh(&pmc->mca_lock); } + read_unlock_bh(&idev->lock); + if (skb) + mld_sendpack(skb); } void ipv6_mc_dad_complete(struct inet6_dev *idev) { idev->mc_dad_count = idev->mc_qrv; if (idev->mc_dad_count) { - mld_resend_report(idev); + mld_send_initial_cr(idev); idev->mc_dad_count--; if (idev->mc_dad_count) mld_dad_start_timer(idev, idev->mc_maxdelay); @@ -2028,7 +2047,7 @@ static void mld_dad_timer_expire(unsigned long data) { struct inet6_dev *idev = (struct inet6_dev *)data; - mld_resend_report(idev); + mld_send_initial_cr(idev); if (idev->mc_dad_count) { idev->mc_dad_count--; if (idev->mc_dad_count) @@ -2328,7 +2347,7 @@ static void igmp6_join_group(struct ifmcaddr6 *ma) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); - delay = net_random() % unsolicited_report_interval(ma->idev); + delay = prandom_u32() % unsolicited_report_interval(ma->idev); spin_lock_bh(&ma->mca_lock); if (del_timer(&ma->mca_timer)) { diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 09a22f4f36c..ca8d4ea48a5 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -851,7 +851,7 @@ out: static void ndisc_recv_na(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); - const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; + struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; u8 *lladdr = NULL; u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + @@ -944,10 +944,7 @@ static void ndisc_recv_na(struct sk_buff *skb) /* * Change: router to host */ - struct rt6_info *rt; - rt = rt6_get_dflt_router(saddr, dev); - if (rt) - ip6_del_rt(rt); + rt6_clean_tohost(dev_net(dev), saddr); } out: diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 
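Editorial note, not part of the patch: the mcast.c hunks above only swap net_random() for prandom_u32(); the randomized-delay arming itself is unchanged. A minimal sketch of that pattern (hypothetical helper; max_delay must be non-zero, and the in6_dev_hold() refcounting done by the real mld_*_start_timer() helpers is omitted):

#include <linux/jiffies.h>
#include <linux/random.h>
#include <linux/timer.h>

/* Sketch: arm a timer at a random point in [0, max_delay) jiffies. */
static void example_start_timer(struct timer_list *timer,
                                unsigned long max_delay)
{
        unsigned long tv = prandom_u32() % max_delay;

        mod_timer(timer, jiffies + tv + 2);
}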
95f3f1da0d7..d38e6a8d8b9 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -30,13 +30,15 @@ int ip6_route_me_harder(struct sk_buff *skb) .daddr = iph->daddr, .saddr = iph->saddr, }; + int err; dst = ip6_route_output(net, skb->sk, &fl6); - if (dst->error) { + err = dst->error; + if (err) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n"); dst_release(dst); - return dst->error; + return err; } /* Drop old route. */ diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 7702f9e90a0..4bff1f297e3 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -28,15 +28,32 @@ config NF_CONNTRACK_IPV6 config NF_TABLES_IPV6 depends on NF_TABLES tristate "IPv6 nf_tables support" + help + This option enables the IPv6 support for nf_tables. config NFT_CHAIN_ROUTE_IPV6 depends on NF_TABLES_IPV6 tristate "IPv6 nf_tables route chain support" + help + This option enables the "route" chain for IPv6 in nf_tables. This + chain type is used to force packet re-routing after mangling header + fields such as the source, destination, flowlabel, hop-limit and + the packet mark. config NFT_CHAIN_NAT_IPV6 depends on NF_TABLES_IPV6 depends on NF_NAT_IPV6 && NFT_NAT tristate "IPv6 nf_tables nat chain support" + help + This option enables the "nat" chain for IPv6 in nf_tables. This + chain type is used to perform Network Address Translation (NAT) + packet transformations such as the source, destination address and + source and destination ports. + +config NFT_REJECT_IPV6 + depends on NF_TABLES_IPV6 + default NFT_REJECT + tristate config IP6_NF_IPTABLES tristate "IP6 tables support (required for filtering)" diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index d1b4928f34f..70d3dd66f2c 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o +obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 710238f58aa..e080fbbbc0e 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1241,8 +1241,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, - sizeof(struct xt_counters) * num_counters) != 0) - ret = -EFAULT; + sizeof(struct xt_counters) * num_counters) != 0) { + /* Silent error, can't fail, new table is already in place */ + net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n"); + } vfree(counters); xt_table_unlock(t); return ret; diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index da00a2ecde5..544b0a9da1b 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -23,181 +23,18 @@ #include <linux/skbuff.h> #include <linux/icmpv6.h> #include <linux/netdevice.h> -#include <net/ipv6.h> -#include <net/tcp.h> #include <net/icmp.h> -#include <net/ip6_checksum.h> -#include <net/ip6_fib.h> -#include <net/ip6_route.h> #include <net/flow.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter_ipv6/ip6t_REJECT.h> +#include 
<net/netfilter/ipv6/nf_reject.h> + MODULE_AUTHOR("Yasuyuki KOZAKAI <yasuyuki.kozakai@toshiba.co.jp>"); MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv6"); MODULE_LICENSE("GPL"); -/* Send RST reply */ -static void send_reset(struct net *net, struct sk_buff *oldskb, int hook) -{ - struct sk_buff *nskb; - struct tcphdr otcph, *tcph; - unsigned int otcplen, hh_len; - int tcphoff, needs_ack; - const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); - struct ipv6hdr *ip6h; -#define DEFAULT_TOS_VALUE 0x0U - const __u8 tclass = DEFAULT_TOS_VALUE; - struct dst_entry *dst = NULL; - u8 proto; - __be16 frag_off; - struct flowi6 fl6; - - if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) || - (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) { - pr_debug("addr is not unicast.\n"); - return; - } - - proto = oip6h->nexthdr; - tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto, &frag_off); - - if ((tcphoff < 0) || (tcphoff > oldskb->len)) { - pr_debug("Cannot get TCP header.\n"); - return; - } - - otcplen = oldskb->len - tcphoff; - - /* IP header checks: fragment, too short. */ - if (proto != IPPROTO_TCP || otcplen < sizeof(struct tcphdr)) { - pr_debug("proto(%d) != IPPROTO_TCP, " - "or too short. otcplen = %d\n", - proto, otcplen); - return; - } - - if (skb_copy_bits(oldskb, tcphoff, &otcph, sizeof(struct tcphdr))) - BUG(); - - /* No RST for RST. */ - if (otcph.rst) { - pr_debug("RST is set\n"); - return; - } - - /* Check checksum. */ - if (nf_ip6_checksum(oldskb, hook, tcphoff, IPPROTO_TCP)) { - pr_debug("TCP checksum is invalid\n"); - return; - } - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_TCP; - fl6.saddr = oip6h->daddr; - fl6.daddr = oip6h->saddr; - fl6.fl6_sport = otcph.dest; - fl6.fl6_dport = otcph.source; - security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); - dst = ip6_route_output(net, NULL, &fl6); - if (dst == NULL || dst->error) { - dst_release(dst); - return; - } - dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); - if (IS_ERR(dst)) - return; - - hh_len = (dst->dev->hard_header_len + 15)&~15; - nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr) - + sizeof(struct tcphdr) + dst->trailer_len, - GFP_ATOMIC); - - if (!nskb) { - net_dbg_ratelimited("cannot alloc skb\n"); - dst_release(dst); - return; - } - - skb_dst_set(nskb, dst); - - skb_reserve(nskb, hh_len + dst->header_len); - - skb_put(nskb, sizeof(struct ipv6hdr)); - skb_reset_network_header(nskb); - ip6h = ipv6_hdr(nskb); - ip6_flow_hdr(ip6h, tclass, 0); - ip6h->hop_limit = ip6_dst_hoplimit(dst); - ip6h->nexthdr = IPPROTO_TCP; - ip6h->saddr = oip6h->daddr; - ip6h->daddr = oip6h->saddr; - - skb_reset_transport_header(nskb); - tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); - /* Truncate to length (no data) */ - tcph->doff = sizeof(struct tcphdr)/4; - tcph->source = otcph.dest; - tcph->dest = otcph.source; - - if (otcph.ack) { - needs_ack = 0; - tcph->seq = otcph.ack_seq; - tcph->ack_seq = 0; - } else { - needs_ack = 1; - tcph->ack_seq = htonl(ntohl(otcph.seq) + otcph.syn + otcph.fin - + otcplen - (otcph.doff<<2)); - tcph->seq = 0; - } - - /* Reset flags */ - ((u_int8_t *)tcph)[13] = 0; - tcph->rst = 1; - tcph->ack = needs_ack; - tcph->window = 0; - tcph->urg_ptr = 0; - tcph->check = 0; - - /* Adjust TCP checksum */ - tcph->check = csum_ipv6_magic(&ipv6_hdr(nskb)->saddr, - &ipv6_hdr(nskb)->daddr, - sizeof(struct tcphdr), IPPROTO_TCP, - csum_partial(tcph, - sizeof(struct tcphdr), 0)); - - nf_ct_attach(nskb, oldskb); - -#ifdef 
CONFIG_BRIDGE_NETFILTER - /* If we use ip6_local_out for bridged traffic, the MAC source on - * the RST will be ours, instead of the destination's. This confuses - * some routers/firewalls, and they drop the packet. So we need to - * build the eth header using the original destination's MAC as the - * source, and send the RST packet directly. - */ - if (oldskb->nf_bridge) { - struct ethhdr *oeth = eth_hdr(oldskb); - nskb->dev = oldskb->nf_bridge->physindev; - nskb->protocol = htons(ETH_P_IPV6); - ip6h->payload_len = htons(sizeof(struct tcphdr)); - if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), - oeth->h_source, oeth->h_dest, nskb->len) < 0) - return; - dev_queue_xmit(nskb); - } else -#endif - ip6_local_out(nskb); -} - -static inline void -send_unreach(struct net *net, struct sk_buff *skb_in, unsigned char code, - unsigned int hooknum) -{ - if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL) - skb_in->dev = net->loopback_dev; - - icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0); -} static unsigned int reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) @@ -208,25 +45,25 @@ reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) pr_debug("%s: medium point\n", __func__); switch (reject->with) { case IP6T_ICMP6_NO_ROUTE: - send_unreach(net, skb, ICMPV6_NOROUTE, par->hooknum); + nf_send_unreach6(net, skb, ICMPV6_NOROUTE, par->hooknum); break; case IP6T_ICMP6_ADM_PROHIBITED: - send_unreach(net, skb, ICMPV6_ADM_PROHIBITED, par->hooknum); + nf_send_unreach6(net, skb, ICMPV6_ADM_PROHIBITED, par->hooknum); break; case IP6T_ICMP6_NOT_NEIGHBOUR: - send_unreach(net, skb, ICMPV6_NOT_NEIGHBOUR, par->hooknum); + nf_send_unreach6(net, skb, ICMPV6_NOT_NEIGHBOUR, par->hooknum); break; case IP6T_ICMP6_ADDR_UNREACH: - send_unreach(net, skb, ICMPV6_ADDR_UNREACH, par->hooknum); + nf_send_unreach6(net, skb, ICMPV6_ADDR_UNREACH, par->hooknum); break; case IP6T_ICMP6_PORT_UNREACH: - send_unreach(net, skb, ICMPV6_PORT_UNREACH, par->hooknum); + nf_send_unreach6(net, skb, ICMPV6_PORT_UNREACH, par->hooknum); break; case IP6T_ICMP6_ECHOREPLY: /* Do nothing */ break; case IP6T_TCP_RESET: - send_reset(net, skb, par->hooknum); + nf_send_reset6(net, skb, par->hooknum); break; default: net_info_ratelimited("case %u not handled yet\n", reject->with); diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index f78f41aca8e..a0d17270117 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -446,6 +446,7 @@ static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par) static struct xt_target synproxy_tg6_reg __read_mostly = { .name = "SYNPROXY", .family = NFPROTO_IPV6, + .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD), .target = synproxy_tg6, .targetsize = sizeof(struct xt_synproxy_info), .checkentry = synproxy_tg6_check, diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c index e0983f3648a..790e0c6b19e 100644 --- a/net/ipv6/netfilter/ip6t_rpfilter.c +++ b/net/ipv6/netfilter/ip6t_rpfilter.c @@ -33,6 +33,7 @@ static bool rpfilter_lookup_reverse6(const struct sk_buff *skb, struct ipv6hdr *iph = ipv6_hdr(skb); bool ret = false; struct flowi6 fl6 = { + .flowi6_iif = LOOPBACK_IFINDEX, .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK, .flowi6_proto = iph->nexthdr, .daddr = iph->saddr, diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 84c7f33d0cf..387d8b8fc18 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ 
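Editorial note, not part of the patch: with the open-coded send_reset()/send_unreach() removed above, IPv6 reject-style targets call the shared helpers declared in <net/netfilter/ipv6/nf_reject.h>, as the rewritten reject_tg6() and the new nft_reject_ipv6 module below do. A hedged sketch of a caller (hypothetical function name):

#include <linux/types.h>
#include <linux/icmpv6.h>
#include <net/netfilter/ipv6/nf_reject.h>

/* Sketch: reject a packet with an ICMPv6 error or a TCP RST. */
static void example_reject(struct net *net, struct sk_buff *skb,
                           unsigned int hooknum, bool tcp_reset)
{
        if (tcp_reset)
                nf_send_reset6(net, skb, hooknum);
        else
                nf_send_unreach6(net, skb, ICMPV6_PORT_UNREACH, hooknum);
}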
b/net/ipv6/netfilter/ip6table_nat.c @@ -90,17 +90,9 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, if (nf_ct_is_untracked(ct)) return NF_ACCEPT; - nat = nfct_nat(ct); - if (!nat) { - /* NAT module was loaded late. */ - if (nf_ct_is_confirmed(ct)) - return NF_ACCEPT; - nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); - if (nat == NULL) { - pr_debug("failed to add NAT extension\n"); - return NF_ACCEPT; - } - } + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; switch (ctinfo) { case IP_CT_RELATED: diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 767ab8da821..0d5279fd852 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -451,7 +451,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) } sub_frag_mem_limit(&fq->q, head->truesize); - head->local_df = 1; + head->ignore_df = 1; head->next = NULL; head->dev = dev; head->tstamp = fq->q.stamp; diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c index d77db8a1350..0d812b31277 100644 --- a/net/ipv6/netfilter/nf_tables_ipv6.c +++ b/net/ipv6/netfilter/nf_tables_ipv6.c @@ -16,34 +16,51 @@ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_ipv6.h> +static unsigned int nft_do_chain_ipv6(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nft_pktinfo pkt; + + /* malformed packet, drop it */ + if (nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out) < 0) + return NF_DROP; + + return nft_do_chain(&pkt, ops); +} + static unsigned int nft_ipv6_output(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - struct nft_pktinfo pkt; - if (unlikely(skb->len < sizeof(struct ipv6hdr))) { if (net_ratelimit()) pr_info("nf_tables_ipv6: ignoring short SOCK_RAW " "packet\n"); return NF_ACCEPT; } - if (nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out) < 0) - return NF_DROP; - return nft_do_chain_pktinfo(&pkt, ops); + return nft_do_chain_ipv6(ops, skb, in, out, okfn); } -static struct nft_af_info nft_af_ipv6 __read_mostly = { +struct nft_af_info nft_af_ipv6 __read_mostly = { .family = NFPROTO_IPV6, .nhooks = NF_INET_NUMHOOKS, .owner = THIS_MODULE, + .nops = 1, .hooks = { + [NF_INET_LOCAL_IN] = nft_do_chain_ipv6, [NF_INET_LOCAL_OUT] = nft_ipv6_output, + [NF_INET_FORWARD] = nft_do_chain_ipv6, + [NF_INET_PRE_ROUTING] = nft_do_chain_ipv6, + [NF_INET_POST_ROUTING] = nft_do_chain_ipv6, }, }; +EXPORT_SYMBOL_GPL(nft_af_ipv6); static int nf_tables_ipv6_init_net(struct net *net) { @@ -73,44 +90,28 @@ static struct pernet_operations nf_tables_ipv6_net_ops = { .exit = nf_tables_ipv6_exit_net, }; -static unsigned int -nft_do_chain_ipv6(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct nft_pktinfo pkt; - - /* malformed packet, drop it */ - if (nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out) < 0) - return NF_DROP; - - return nft_do_chain_pktinfo(&pkt, ops); -} - -static struct nf_chain_type filter_ipv6 = { - .family = NFPROTO_IPV6, +static const struct nf_chain_type filter_ipv6 = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_IPV6, + .owner = THIS_MODULE, .hook_mask = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | (1 << NF_INET_PRE_ROUTING) | (1 << 
NF_INET_POST_ROUTING), - .fn = { - [NF_INET_LOCAL_IN] = nft_do_chain_ipv6, - [NF_INET_LOCAL_OUT] = nft_ipv6_output, - [NF_INET_FORWARD] = nft_do_chain_ipv6, - [NF_INET_PRE_ROUTING] = nft_do_chain_ipv6, - [NF_INET_POST_ROUTING] = nft_do_chain_ipv6, - }, }; static int __init nf_tables_ipv6_init(void) { + int ret; + nft_register_chain_type(&filter_ipv6); - return register_pernet_subsys(&nf_tables_ipv6_net_ops); + ret = register_pernet_subsys(&nf_tables_ipv6_net_ops); + if (ret < 0) + nft_unregister_chain_type(&filter_ipv6); + + return ret; } static void __exit nf_tables_ipv6_exit(void) diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c index e86dcd70dc7..d189fcb437f 100644 --- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c @@ -47,15 +47,9 @@ static unsigned int nf_nat_ipv6_fn(const struct nf_hook_ops *ops, if (ct == NULL || nf_ct_is_untracked(ct)) return NF_ACCEPT; - nat = nfct_nat(ct); - if (nat == NULL) { - /* Conntrack module was loaded late, can't add extension. */ - if (nf_ct_is_confirmed(ct)) - return NF_ACCEPT; - nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); - if (nat == NULL) - return NF_ACCEPT; - } + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; switch (ctinfo) { case IP_CT_RELATED: @@ -79,7 +73,7 @@ static unsigned int nf_nat_ipv6_fn(const struct nf_hook_ops *ops, nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out); - ret = nft_do_chain_pktinfo(&pkt, ops); + ret = nft_do_chain(&pkt, ops); if (ret != NF_ACCEPT) return ret; if (!nf_nat_initialized(ct, maniptype)) { @@ -170,21 +164,21 @@ static unsigned int nf_nat_ipv6_output(const struct nf_hook_ops *ops, return ret; } -static struct nf_chain_type nft_chain_nat_ipv6 = { - .family = NFPROTO_IPV6, +static const struct nf_chain_type nft_chain_nat_ipv6 = { .name = "nat", .type = NFT_CHAIN_T_NAT, + .family = NFPROTO_IPV6, + .owner = THIS_MODULE, .hook_mask = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), - .fn = { + .hooks = { [NF_INET_PRE_ROUTING] = nf_nat_ipv6_prerouting, [NF_INET_POST_ROUTING] = nf_nat_ipv6_postrouting, [NF_INET_LOCAL_OUT] = nf_nat_ipv6_output, [NF_INET_LOCAL_IN] = nf_nat_ipv6_fn, }, - .me = THIS_MODULE, }; static int __init nft_chain_nat_ipv6_init(void) diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c index 3fe40f0456a..42031299585 100644 --- a/net/ipv6/netfilter/nft_chain_route_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c @@ -47,7 +47,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, /* flowlabel and prio (includes version, which shouldn't change either */ flowlabel = *((u32 *)ipv6_hdr(skb)); - ret = nft_do_chain_pktinfo(&pkt, ops); + ret = nft_do_chain(&pkt, ops); if (ret != NF_DROP && ret != NF_QUEUE && (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) || memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) || @@ -59,15 +59,15 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, return ret; } -static struct nf_chain_type nft_chain_route_ipv6 = { - .family = NFPROTO_IPV6, +static const struct nf_chain_type nft_chain_route_ipv6 = { .name = "route", .type = NFT_CHAIN_T_ROUTE, + .family = NFPROTO_IPV6, + .owner = THIS_MODULE, .hook_mask = (1 << NF_INET_LOCAL_OUT), - .fn = { + .hooks = { [NF_INET_LOCAL_OUT] = nf_route_table_hook, }, - .me = THIS_MODULE, }; static int __init nft_chain_route_init(void) diff --git 
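Editorial note, not part of the patch: the nf_tables conversions above make the chain types const, move the per-hook functions into the address family's .hooks table, and unwind the chain-type registration if the pernet registration fails. A condensed sketch of that init shape with a hypothetical chain type, reusing nf_tables_ipv6_net_ops from the patch:

#include <net/net_namespace.h>
#include <net/netfilter/nf_tables.h>

static const struct nf_chain_type example_chain_type = {
        .name      = "filter",
        .type      = NFT_CHAIN_T_DEFAULT,
        .family    = NFPROTO_IPV6,
        .owner     = THIS_MODULE,
        .hook_mask = (1 << NF_INET_LOCAL_IN) |
                     (1 << NF_INET_LOCAL_OUT),
};

static int __init example_module_init(void)
{
        int ret;

        nft_register_chain_type(&example_chain_type);
        ret = register_pernet_subsys(&nf_tables_ipv6_net_ops);
        if (ret < 0)
                nft_unregister_chain_type(&example_chain_type);

        return ret;
}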
a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c new file mode 100644 index 00000000000..0bc19fa8782 --- /dev/null +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2013 Eric Leblond <eric@regit.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_reject.h> +#include <net/netfilter/ipv6/nf_reject.h> + +void nft_reject_ipv6_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_reject *priv = nft_expr_priv(expr); + struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach6(net, pkt->skb, priv->icmp_code, + pkt->ops->hooknum); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); + break; + } + + data[NFT_REG_VERDICT].verdict = NF_DROP; +} +EXPORT_SYMBOL_GPL(nft_reject_ipv6_eval); + +static struct nft_expr_type nft_reject_ipv6_type; +static const struct nft_expr_ops nft_reject_ipv6_ops = { + .type = &nft_reject_ipv6_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), + .eval = nft_reject_ipv6_eval, + .init = nft_reject_init, + .dump = nft_reject_dump, +}; + +static struct nft_expr_type nft_reject_ipv6_type __read_mostly = { + .family = NFPROTO_IPV6, + .name = "reject", + .ops = &nft_reject_ipv6_ops, + .policy = nft_reject_policy, + .maxattr = NFTA_REJECT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_reject_ipv6_module_init(void) +{ + return nft_register_expr(&nft_reject_ipv6_type); +} + +static void __exit nft_reject_ipv6_module_exit(void) +{ + nft_unregister_expr(&nft_reject_ipv6_type); +} + +module_init(nft_reject_ipv6_module_init); +module_exit(nft_reject_ipv6_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "reject"); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 827f795209c..5ec867e4a8b 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -6,35 +6,7 @@ #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/addrconf.h> - -void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) -{ - static atomic_t ipv6_fragmentation_id; - int old, new; - -#if IS_ENABLED(CONFIG_IPV6) - if (rt && !(rt->dst.flags & DST_NOPEER)) { - struct inet_peer *peer; - struct net *net; - - net = dev_net(rt->dst.dev); - peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); - if (peer) { - fhdr->identification = htonl(inet_getid(peer, 0)); - inet_putpeer(peer); - return; - } - } -#endif - do { - old = atomic_read(&ipv6_fragmentation_id); - new = old + 1; - if (!new) - new = 1; - } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old); - fhdr->identification = htonl(new); -} -EXPORT_SYMBOL(ipv6_select_ident); +#include <net/secure_seq.h> int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) { @@ -106,6 +78,7 @@ int __ip6_local_out(struct sk_buff *skb) if (len > IPV6_MAXPLEN) len 
= 0; ipv6_hdr(skb)->payload_len = htons(len); + IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, dst_output); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index a83243c3d65..5b7a1ed2aba 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -31,7 +31,7 @@ struct proto pingv6_prot = { .owner = THIS_MODULE, .init = ping_init_sock, .close = ping_close, - .connect = ip6_datagram_connect, + .connect = ip6_datagram_connect_v6_only, .disconnect = udp_disconnect, .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, @@ -51,7 +51,6 @@ static struct inet_protosw pingv6_protosw = { .protocol = IPPROTO_ICMPV6, .prot = &pingv6_prot, .ops = &inet6_dgram_ops, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_REUSE, }; @@ -62,10 +61,9 @@ static int dummy_ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, { return -EAFNOSUPPORT; } -static int dummy_ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, +static void dummy_ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { - return -EAFNOSUPPORT; } static int dummy_icmpv6_err_convert(u8 type, u8 code, int *err) { @@ -103,7 +101,7 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, return err; if (msg->msg_name) { - struct sockaddr_in6 *u = (struct sockaddr_in6 *) msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in6 *, u, msg->msg_name); if (msg->msg_namelen < sizeof(struct sockaddr_in6) || u->sin6_family != AF_INET6) { return -EINVAL; @@ -136,6 +134,7 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.saddr = np->saddr; fl6.daddr = *daddr; + fl6.flowi6_mark = sk->sk_mark; fl6.fl6_icmp_type = user_icmph.icmp6_type; fl6.fl6_icmp_code = user_icmph.icmp6_code; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); @@ -145,7 +144,7 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, 1); + dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr); if (IS_ERR(dst)) return PTR_ERR(dst); rt = (struct rt6_info *) dst; @@ -168,12 +167,7 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, pfh.wcheck = 0; pfh.family = AF_INET6; - if (ipv6_addr_is_multicast(&fl6.daddr)) - hlimit = np->mcast_hops; - else - hlimit = np->hop_limit; - if (hlimit < 0) - hlimit = ip6_dst_hoplimit(dst); + hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); lock_sock(sk); err = ip6_append_data(sk, ping_getfrag, &pfh, len, @@ -182,8 +176,8 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, MSG_DONTWAIT, np->dontfrag); if (err) { - ICMP6_INC_STATS_BH(sock_net(sk), rt->rt6i_idev, - ICMP6_MIB_OUTERRORS); + ICMP6_INC_STATS(sock_net(sk), rt->rt6i_idev, + ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { err = icmpv6_push_pending_frames(sk, &fl6, @@ -254,7 +248,9 @@ int __init pingv6_init(void) return ret; #endif pingv6_ops.ipv6_recv_error = ipv6_recv_error; - pingv6_ops.ip6_datagram_recv_ctl = ip6_datagram_recv_ctl; + pingv6_ops.ip6_datagram_recv_common_ctl = ip6_datagram_recv_common_ctl; + pingv6_ops.ip6_datagram_recv_specific_ctl = + ip6_datagram_recv_specific_ctl; pingv6_ops.icmpv6_err_convert = icmpv6_err_convert; pingv6_ops.ipv6_icmp_error = ipv6_icmp_error; pingv6_ops.ipv6_chk_addr = ipv6_chk_addr; @@ -267,7 +263,8 @@ int __init pingv6_init(void) void pingv6_exit(void) { 
pingv6_ops.ipv6_recv_error = dummy_ipv6_recv_error; - pingv6_ops.ip6_datagram_recv_ctl = dummy_ip6_datagram_recv_ctl; + pingv6_ops.ip6_datagram_recv_common_ctl = dummy_ip6_datagram_recv_ctl; + pingv6_ops.ip6_datagram_recv_specific_ctl = dummy_ip6_datagram_recv_ctl; pingv6_ops.icmpv6_err_convert = dummy_icmpv6_err_convert; pingv6_ops.ipv6_icmp_error = dummy_ipv6_icmp_error; pingv6_ops.ipv6_chk_addr = dummy_ipv6_chk_addr; diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 091d066a57b..3317440ea34 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -186,7 +186,7 @@ static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) /* can be called either with percpu mib (pcpumib != NULL), * or shared one (smib != NULL) */ -static void snmp6_seq_show_item(struct seq_file *seq, void __percpu **pcpumib, +static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib, atomic_long_t *smib, const struct snmp_mib *itemlist) { @@ -201,7 +201,7 @@ static void snmp6_seq_show_item(struct seq_file *seq, void __percpu **pcpumib, } } -static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu **mib, +static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib, const struct snmp_mib *itemlist, size_t syncpoff) { int i; @@ -215,14 +215,14 @@ static int snmp6_seq_show(struct seq_file *seq, void *v) { struct net *net = (struct net *)seq->private; - snmp6_seq_show_item64(seq, (void __percpu **)net->mib.ipv6_statistics, + snmp6_seq_show_item64(seq, net->mib.ipv6_statistics, snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); - snmp6_seq_show_item(seq, (void __percpu **)net->mib.icmpv6_statistics, + snmp6_seq_show_item(seq, net->mib.icmpv6_statistics, NULL, snmp6_icmp6_list); snmp6_seq_show_icmpv6msg(seq, net->mib.icmpv6msg_statistics->mibs); - snmp6_seq_show_item(seq, (void __percpu **)net->mib.udp_stats_in6, + snmp6_seq_show_item(seq, net->mib.udp_stats_in6, NULL, snmp6_udp6_list); - snmp6_seq_show_item(seq, (void __percpu **)net->mib.udplite_stats_in6, + snmp6_seq_show_item(seq, net->mib.udplite_stats_in6, NULL, snmp6_udplite6_list); return 0; } @@ -245,7 +245,7 @@ static int snmp6_dev_seq_show(struct seq_file *seq, void *v) struct inet6_dev *idev = (struct inet6_dev *)seq->private; seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex); - snmp6_seq_show_item64(seq, (void __percpu **)idev->stats.ipv6, + snmp6_seq_show_item64(seq, idev->stats.ipv6, snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs, snmp6_icmp6_list); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 7fb4e14c467..b2dc60b0c76 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -250,6 +250,10 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; + + if (addr->sin6_family != AF_INET6) + return -EINVAL; + addr_type = ipv6_addr_type(&addr->sin6_addr); /* Raw sockets are IPv6 only */ @@ -457,7 +461,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, int noblock, int flags, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct sk_buff *skb; size_t copied; int err; @@ -734,7 +738,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { struct ipv6_txoptions opt_space; - struct sockaddr_in6 * sin6 = (struct sockaddr_in6 *) msg->msg_name; + 
DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); @@ -792,7 +796,6 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - daddr = &flowlabel->dst; } } @@ -865,19 +868,13 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, fl6.flowi6_oif = np->ucast_oif; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true); + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; } - if (hlimit < 0) { - if (ipv6_addr_is_multicast(&fl6.daddr)) - hlimit = np->mcast_hops; - else - hlimit = np->hop_limit; - if (hlimit < 0) - hlimit = ip6_dst_hoplimit(dst); - } + if (hlimit < 0) + hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); if (tclass < 0) tclass = np->tclass; @@ -1210,7 +1207,7 @@ struct proto rawv6_prot = { .owner = THIS_MODULE, .close = rawv6_close, .destroy = raw6_destroy, - .connect = ip6_datagram_connect, + .connect = ip6_datagram_connect_v6_only, .disconnect = udp_disconnect, .ioctl = rawv6_ioctl, .init = rawv6_init_sk, @@ -1325,7 +1322,6 @@ static struct inet_protosw rawv6_protosw = { .protocol = IPPROTO_IP, /* wild card */ .prot = &rawv6_prot, .ops = &inet6_sockraw_ops, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_REUSE, }; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ddb9d41c8ee..f23fbd28a50 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -66,8 +66,9 @@ #endif enum rt6_nud_state { - RT6_NUD_FAIL_HARD = -2, - RT6_NUD_FAIL_SOFT = -1, + RT6_NUD_FAIL_HARD = -3, + RT6_NUD_FAIL_PROBE = -2, + RT6_NUD_FAIL_DO_RR = -1, RT6_NUD_SUCCEED = 1 }; @@ -83,9 +84,9 @@ static void ip6_dst_ifdown(struct dst_entry *, static int ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); -static int ip6_pkt_discard_out(struct sk_buff *skb); +static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb); static int ip6_pkt_prohibit(struct sk_buff *skb); -static int ip6_pkt_prohibit_out(struct sk_buff *skb); +static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb); static void ip6_link_failure(struct sk_buff *skb); static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); @@ -103,6 +104,36 @@ static struct rt6_info *rt6_get_route_info(struct net *net, const struct in6_addr *gwaddr, int ifindex); #endif +static void rt6_bind_peer(struct rt6_info *rt, int create) +{ + struct inet_peer_base *base; + struct inet_peer *peer; + + base = inetpeer_base_ptr(rt->_rt6i_peer); + if (!base) + return; + + peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create); + if (peer) { + if (!rt6_set_peer(rt, peer)) + inet_putpeer(peer); + } +} + +static struct inet_peer *__rt6_get_peer(struct rt6_info *rt, int create) +{ + if (rt6_has_peer(rt)) + return rt6_peer_ptr(rt); + + rt6_bind_peer(rt, create); + return (rt6_has_peer(rt) ? 
rt6_peer_ptr(rt) : NULL); +} + +static struct inet_peer *rt6_get_peer_create(struct rt6_info *rt) +{ + return __rt6_get_peer(rt, 1); +} + static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) { struct rt6_info *rt = (struct rt6_info *) dst; @@ -118,7 +149,8 @@ static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) unsigned long prev, new; p = peer->metrics; - if (inet_metrics_new(peer)) + if (inet_metrics_new(peer) || + (old & DST_METRICS_FORCE_OVERWRITE)) memcpy(p, old_p, sizeof(u32) * RTAX_MAX); new = (unsigned long) p; @@ -258,7 +290,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = { .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EINVAL, .input = dst_discard, - .output = dst_discard, + .output = dst_discard_sk, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), .rt6i_protocol = RTPROT_KERNEL, @@ -311,22 +343,6 @@ static void ip6_dst_destroy(struct dst_entry *dst) } } -void rt6_bind_peer(struct rt6_info *rt, int create) -{ - struct inet_peer_base *base; - struct inet_peer *peer; - - base = inetpeer_base_ptr(rt->_rt6i_peer); - if (!base) - return; - - peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create); - if (peer) { - if (!rt6_set_peer(rt, peer)) - inet_putpeer(peer); - } -} - static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, int how) { @@ -358,12 +374,6 @@ static bool rt6_check_expired(const struct rt6_info *rt) return false; } -static bool rt6_need_strict(const struct in6_addr *daddr) -{ - return ipv6_addr_type(daddr) & - (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); -} - /* Multipath route selection: * Hash based function using packet header and flowlabel. * Adapted from fib_info_hashfn() @@ -521,7 +531,7 @@ static void rt6_probe(struct rt6_info *rt) work = kmalloc(sizeof(*work), GFP_ATOMIC); if (neigh && work) - neigh->updated = jiffies; + __neigh_set_probe_once(neigh); if (neigh) write_unlock(&neigh->lock); @@ -577,11 +587,13 @@ static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt) #ifdef CONFIG_IPV6_ROUTER_PREF else if (!(neigh->nud_state & NUD_FAILED)) ret = RT6_NUD_SUCCEED; + else + ret = RT6_NUD_FAIL_PROBE; #endif read_unlock(&neigh->lock); } else { ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? - RT6_NUD_SUCCEED : RT6_NUD_FAIL_SOFT; + RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR; } rcu_read_unlock_bh(); @@ -618,16 +630,17 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, goto out; m = rt6_score_route(rt, oif, strict); - if (m == RT6_NUD_FAIL_SOFT) { + if (m == RT6_NUD_FAIL_DO_RR) { match_do_rr = true; m = 0; /* lowest valid score */ - } else if (m < 0) { + } else if (m == RT6_NUD_FAIL_HARD) { goto out; } if (strict & RT6_LOOKUP_F_REACHABLE) rt6_probe(rt); + /* note that m can be RT6_NUD_FAIL_PROBE at this point */ if (m > *mpri) { *do_rr = match_do_rr; *mpri = m; @@ -839,14 +852,15 @@ EXPORT_SYMBOL(rt6_lookup); be destroyed. 
*/ -static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info) +static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, + struct nlattr *mx, int mx_len) { int err; struct fib6_table *table; table = rt->rt6i_table; write_lock_bh(&table->tb6_lock); - err = fib6_add(&table->tb6_root, rt, info); + err = fib6_add(&table->tb6_root, rt, info, mx, mx_len); write_unlock_bh(&table->tb6_lock); return err; @@ -857,7 +871,7 @@ int ip6_ins_rt(struct rt6_info *rt) struct nl_info info = { .nl_net = dev_net(rt->dst.dev), }; - return __ip6_ins_rt(rt, &info); + return __ip6_ins_rt(rt, &info, NULL, 0); } static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, @@ -1044,7 +1058,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori new->__use = 1; new->input = dst_discard; - new->output = dst_discard; + new->output = dst_discard_sk; if (dst_metrics_read_only(&ort->dst)) new->_metrics = ort->dst._metrics; @@ -1162,7 +1176,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = oif; - fl6.flowi6_mark = mark; + fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; fl6.flowlabel = ip6_flowinfo(iph); @@ -1259,6 +1273,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark) struct flowi6 fl6; memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_iif = LOOPBACK_IFINDEX; fl6.flowi6_oif = oif; fl6.flowi6_mark = mark; fl6.daddr = iph->daddr; @@ -1280,6 +1295,7 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, struct flowi6 fl6; memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_iif = LOOPBACK_IFINDEX; fl6.flowi6_oif = oif; fl6.flowi6_mark = mark; fl6.daddr = msg->dest; @@ -1324,7 +1340,7 @@ static unsigned int ip6_mtu(const struct dst_entry *dst) unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) - return mtu; + goto out; mtu = IPV6_MIN_MTU; @@ -1334,7 +1350,8 @@ static unsigned int ip6_mtu(const struct dst_entry *dst) mtu = idev->cnf.mtu6; rcu_read_unlock(); - return mtu; +out: + return min_t(unsigned int, mtu, IP6_MAX_MTU); } static struct dst_entry *icmp6_dst_gc_list; @@ -1438,7 +1455,7 @@ static int ip6_dst_gc(struct dst_ops *ops) goto out; net->ipv6.ip6_rt_gc_expire++; - fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, entries > rt_max_size); + fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); entries = dst_entries_get_slow(ops); if (entries < ops->gc_thresh) net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; @@ -1495,7 +1512,7 @@ int ip6_route_add(struct fib6_config *cfg) if (!table) goto out; - rt = ip6_dst_alloc(net, NULL, DST_NOCOUNT, table); + rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 
0 : DST_NOCOUNT, table); if (!rt) { err = -ENOMEM; @@ -1525,17 +1542,11 @@ int ip6_route_add(struct fib6_config *cfg) ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); rt->rt6i_dst.plen = cfg->fc_dst_len; - if (rt->rt6i_dst.plen == 128) - rt->dst.flags |= DST_HOST; - - if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) { - u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); - if (!metrics) { - err = -ENOMEM; - goto out; - } - dst_init_metrics(&rt->dst, metrics, 0); + if (rt->rt6i_dst.plen == 128) { + rt->dst.flags |= DST_HOST; + dst_metrics_set_force_overwrite(&rt->dst); } + #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); rt->rt6i_src.plen = cfg->fc_src_len; @@ -1568,7 +1579,7 @@ int ip6_route_add(struct fib6_config *cfg) switch (cfg->fc_type) { case RTN_BLACKHOLE: rt->dst.error = -EINVAL; - rt->dst.output = dst_discard; + rt->dst.output = dst_discard_sk; rt->dst.input = dst_discard; break; case RTN_PROHIBIT: @@ -1654,31 +1665,13 @@ int ip6_route_add(struct fib6_config *cfg) rt->rt6i_flags = cfg->fc_flags; install_route: - if (cfg->fc_mx) { - struct nlattr *nla; - int remaining; - - nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { - int type = nla_type(nla); - - if (type) { - if (type > RTAX_MAX) { - err = -EINVAL; - goto out; - } - - dst_metric_set(&rt->dst, type, nla_get_u32(nla)); - } - } - } - rt->dst.dev = dev; rt->rt6i_idev = idev; rt->rt6i_table = table; cfg->fc_nlinfo.nl_net = dev_net(dev); - return __ip6_ins_rt(rt, &cfg->fc_nlinfo); + return __ip6_ins_rt(rt, &cfg->fc_nlinfo, cfg->fc_mx, cfg->fc_mx_len); out: if (dev) @@ -1905,9 +1898,7 @@ static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, else rt->rt6i_gateway = *dest; rt->rt6i_flags = ort->rt6i_flags; - if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) == - (RTF_DEFAULT | RTF_ADDRCONF)) - rt6_set_from(rt, ort); + rt6_set_from(rt, ort); rt->rt6i_metric = 0; #ifdef CONFIG_IPV6_SUBTREES @@ -2140,7 +2131,7 @@ static int ip6_pkt_discard(struct sk_buff *skb) return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); } -static int ip6_pkt_discard_out(struct sk_buff *skb) +static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); @@ -2151,7 +2142,7 @@ static int ip6_pkt_prohibit(struct sk_buff *skb) return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); } -static int ip6_pkt_prohibit_out(struct sk_buff *skb) +static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); @@ -2166,12 +2157,10 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, bool anycast) { struct net *net = dev_net(idev->dev); - struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0, NULL); - - if (!rt) { - net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n"); + struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, + DST_NOCOUNT, NULL); + if (!rt) return ERR_PTR(-ENOMEM); - } in6_dev_hold(idev); @@ -2242,7 +2231,28 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) .net = net, .addr = &ifp->addr, }; - fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni); + fib6_clean_all(net, fib6_remove_prefsrc, &adni); +} + +#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) +#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) + +/* Remove routers and update dst 
entries when gateway turn into host. */ +static int fib6_clean_tohost(struct rt6_info *rt, void *arg) +{ + struct in6_addr *gateway = (struct in6_addr *)arg; + + if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) || + ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) && + ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { + return -1; + } + return 0; +} + +void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) +{ + fib6_clean_all(net, fib6_clean_tohost, gateway); } struct arg_dev_net { @@ -2269,7 +2279,7 @@ void rt6_ifdown(struct net *net, struct net_device *dev) .net = net, }; - fib6_clean_all(net, fib6_ifdown, 0, &adn); + fib6_clean_all(net, fib6_ifdown, &adn); icmp6_clean_all(fib6_ifdown, &adn); } @@ -2324,7 +2334,7 @@ void rt6_mtu_change(struct net_device *dev, unsigned int mtu) .mtu = mtu, }; - fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg); + fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); } static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { @@ -2720,6 +2730,9 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh) if (tb[RTA_OIF]) oif = nla_get_u32(tb[RTA_OIF]); + if (tb[RTA_MARK]) + fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); + if (iif) { struct net_device *dev; int flags = 0; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 366fbba3359..4f408176dc6 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -475,6 +475,7 @@ static void ipip6_tunnel_uninit(struct net_device *dev) ipip6_tunnel_unlink(sitn, tunnel); ipip6_tunnel_del_prl(tunnel, NULL); } + ip_tunnel_dst_reset_all(tunnel); dev_put(dev); } @@ -559,12 +560,12 @@ static int ipip6_err(struct sk_buff *skb, u32 info) if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, dev_net(skb->dev), info, - t->dev->ifindex, 0, IPPROTO_IPV6, 0); + t->parms.link, 0, IPPROTO_IPV6, 0); err = 0; goto out; } if (type == ICMP_REDIRECT) { - ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0, + ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, IPPROTO_IPV6, 0); err = 0; goto out; @@ -671,7 +672,7 @@ static int ipip6_rcv(struct sk_buff *skb) tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->saddr, iph->daddr); if (tunnel != NULL) { - struct pcpu_tstats *tstats; + struct pcpu_sw_netstats *tstats; if (tunnel->parms.iph.protocol != IPPROTO_IPV6 && tunnel->parms.iph.protocol != 0) @@ -702,8 +703,10 @@ static int ipip6_rcv(struct sk_buff *skb) } tstats = this_cpu_ptr(tunnel->dev->tstats); + u64_stats_update_begin(&tstats->syncp); tstats->rx_packets++; tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); netif_rx(skb); @@ -924,7 +927,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, if (tunnel->parms.iph.daddr && skb_dst(skb)) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - if (skb->len > mtu) { + if (skb->len > mtu && !skb_is_gso(skb)) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); ip_rt_put(rt); goto tx_error; @@ -966,11 +969,14 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); skb = iptunnel_handle_offloads(skb, false, SKB_GSO_SIT); - if (IS_ERR(skb)) + if (IS_ERR(skb)) { + ip_rt_put(rt); goto out; + } - err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, IPPROTO_IPV6, tos, - ttl, df, !net_eq(tunnel->net, dev_net(dev))); + err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, + IPPROTO_IPV6, tos, ttl, df, + !net_eq(tunnel->net, dev_net(dev))); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); return 
NETDEV_TX_OK; @@ -1078,6 +1084,7 @@ static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p) t->parms.link = p->link; ipip6_tunnel_bind_dev(t->dev); } + ip_tunnel_dst_reset_all(t); netdev_state_change(t->dev); } @@ -1108,6 +1115,7 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, t->ip6rd.relay_prefix = relay_prefix; t->ip6rd.prefixlen = ip6rd->prefixlen; t->ip6rd.relay_prefixlen = ip6rd->relay_prefixlen; + ip_tunnel_dst_reset_all(t); netdev_state_change(t->dev); return 0; } @@ -1119,8 +1127,8 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) int err = 0; struct ip_tunnel_parm p; struct ip_tunnel_prl prl; - struct ip_tunnel *t; - struct net *net = dev_net(dev); + struct ip_tunnel *t = netdev_priv(dev); + struct net *net = t->net; struct sit_net *sitn = net_generic(net, sit_net_id); #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd ip6rd; @@ -1131,16 +1139,15 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) #ifdef CONFIG_IPV6_SIT_6RD case SIOCGET6RD: #endif - t = NULL; if (dev == sitn->fb_tunnel_dev) { if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { err = -EFAULT; break; } t = ipip6_tunnel_locate(net, &p, 0); + if (t == NULL) + t = netdev_priv(dev); } - if (t == NULL) - t = netdev_priv(dev); err = -EFAULT; if (cmd == SIOCGETTUNNEL) { @@ -1236,9 +1243,6 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) err = -EINVAL; if (dev == sitn->fb_tunnel_dev) goto done; - err = -ENOENT; - if (!(t = netdev_priv(dev))) - goto done; err = ipip6_tunnel_get_prl(t, ifr->ifr_ifru.ifru_data); break; @@ -1254,9 +1258,6 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) err = -EFAULT; if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl))) goto done; - err = -ENOENT; - if (!(t = netdev_priv(dev))) - goto done; switch (cmd) { case SIOCDELPRL: @@ -1267,6 +1268,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL); break; } + ip_tunnel_dst_reset_all(t); netdev_state_change(dev); break; @@ -1283,8 +1285,6 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) sizeof(ip6rd))) goto done; - t = netdev_priv(dev); - if (cmd != SIOCDEL6RD) { err = ipip6_tunnel_update_6rd(t, &ip6rd); if (err < 0) @@ -1322,6 +1322,9 @@ static const struct net_device_ops ipip6_netdev_ops = { static void ipip6_dev_free(struct net_device *dev) { + struct ip_tunnel *tunnel = netdev_priv(dev); + + free_percpu(tunnel->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -1352,7 +1355,6 @@ static void ipip6_tunnel_setup(struct net_device *dev) static int ipip6_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - int i; tunnel->dev = dev; tunnel->net = dev_net(dev); @@ -1361,14 +1363,14 @@ static int ipip6_tunnel_init(struct net_device *dev) memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); ipip6_tunnel_bind_dev(dev); - dev->tstats = alloc_percpu(struct pcpu_tstats); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; - for_each_possible_cpu(i) { - struct pcpu_tstats *ipip6_tunnel_stats; - ipip6_tunnel_stats = per_cpu_ptr(dev->tstats, i); - u64_stats_init(&ipip6_tunnel_stats->syncp); + tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); + if (!tunnel->dst_cache) { + free_percpu(dev->tstats); + return -ENOMEM; } return 0; @@ -1380,7 +1382,6 @@ static int __net_init ipip6_fb_tunnel_init(struct net_device *dev) 
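The sit.c conversion above moves the tunnel statistics onto the generic pcpu_sw_netstats helpers: counters are allocated per CPU with netdev_alloc_pcpu_stats(), which also initialises each CPU's u64_stats syncp (hence the removal of the explicit for_each_possible_cpu()/u64_stats_init() loops), and every update is bracketed by u64_stats_update_begin()/_end() so 64-bit counters stay coherent on 32-bit machines. A condensed, illustrative sketch of the pattern (not code from the patch; the example_* names are made up):

#include <linux/netdevice.h>
#include <linux/u64_stats_sync.h>

static int example_stats_alloc(struct net_device *dev)
{
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	return dev->tstats ? 0 : -ENOMEM;
}

static void example_count_rx(struct net_device *dev, unsigned int len)
{
	struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);

	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += len;
	u64_stats_update_end(&tstats->syncp);
}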
struct iphdr *iph = &tunnel->parms.iph; struct net *net = dev_net(dev); struct sit_net *sitn = net_generic(net, sit_net_id); - int i; tunnel->dev = dev; tunnel->net = dev_net(dev); @@ -1391,14 +1392,14 @@ static int __net_init ipip6_fb_tunnel_init(struct net_device *dev) iph->ihl = 5; iph->ttl = 64; - dev->tstats = alloc_percpu(struct pcpu_tstats); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; - for_each_possible_cpu(i) { - struct pcpu_tstats *ipip6_fb_stats; - ipip6_fb_stats = per_cpu_ptr(dev->tstats, i); - u64_stats_init(&ipip6_fb_stats->syncp); + tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); + if (!tunnel->dst_cache) { + free_percpu(dev->tstats); + return -ENOMEM; } dev_hold(dev); @@ -1827,4 +1828,5 @@ xfrm_tunnel_failed: module_init(sit_init); module_exit(sit_cleanup); MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("sit"); MODULE_ALIAS_NETDEV("sit0"); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 535a3ad262f..a822b880689 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -216,6 +216,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = inet6_iif(skb); + ireq->ir_mark = inet_request_mark(sk, skb); + req->expires = 0UL; req->num_retrans = 0; ireq->ecn_ok = ecn_ok; @@ -242,12 +244,12 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) final_p = fl6_update_dst(&fl6, np->opt, &final); fl6.saddr = ireq->ir_v6_loc_addr; fl6.flowi6_oif = sk->sk_bound_dev_if; - fl6.flowi6_mark = sk->sk_mark; + fl6.flowi6_mark = ireq->ir_mark; fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = inet_sk(sk)->inet_sport; security_req_classify_flow(req, flowi6_to_flowi(&fl6)); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false); + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) goto out_free; } diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 107b2f1d90a..058f3eca2e5 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -24,6 +24,27 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "anycast_src_echo_reply", + .data = &init_net.ipv6.sysctl.anycast_src_echo_reply, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "flowlabel_consistency", + .data = &init_net.ipv6.sysctl.flowlabel_consistency, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "fwmark_reflect", + .data = &init_net.ipv6.sysctl.fwmark_reflect, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; @@ -51,6 +72,8 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) if (!ipv6_table) goto out; ipv6_table[0].data = &net->ipv6.sysctl.bindv6only; + ipv6_table[1].data = &net->ipv6.sysctl.anycast_src_echo_reply; + ipv6_table[2].data = &net->ipv6.sysctl.flowlabel_consistency; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0740f93a114..229239ad96b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -39,7 +39,7 @@ #include <linux/ipsec.h> #include <linux/times.h> #include <linux/slab.h> - +#include <linux/uaccess.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/random.h> @@ -65,8 +65,6 @@ #include <net/tcp_memcontrol.h> #include <net/busy_poll.h> -#include <asm/uaccess.h> - #include 
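The sysctl_net_ipv6.c hunk above adds three per-namespace knobs. fwmark_reflect in particular pairs with the IP6_REPLY_MARK() uses added to route.c and tcp_ipv6.c in this section: when enabled, replies the stack generates on its own (TCP resets, some ICMPv6 errors) inherit the mark of the packet that triggered them instead of mark 0. A trivial sketch of enabling it from user space (the /proc path is inferred from the sysctl name; equivalent to running "sysctl -w net.ipv6.fwmark_reflect=1"):

#include <stdio.h>

int main(void)
{
	/* Path inferred from the sysctl table entry above. */
	FILE *f = fopen("/proc/sys/net/ipv6/fwmark_reflect", "w");

	if (!f) {
		perror("fwmark_reflect");
		return 1;
	}
	if (fputs("1\n", f) == EOF)
		perror("write");
	return fclose(f) ? 1 : 0;
}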
<linux/proc_fs.h> #include <linux/seq_file.h> @@ -156,7 +154,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - usin->sin6_addr = flowlabel->dst; fl6_sock_release(flowlabel); } } @@ -165,12 +162,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, * connect() to INADDR_ANY means loopback (BSD'ism). */ - if(ipv6_addr_any(&usin->sin6_addr)) + if (ipv6_addr_any(&usin->sin6_addr)) usin->sin6_addr.s6_addr[15] = 0x1; addr_type = ipv6_addr_type(&usin->sin6_addr); - if(addr_type & IPV6_ADDR_MULTICAST) + if (addr_type & IPV6_ADDR_MULTICAST) return -ENETUNREACH; if (addr_type&IPV6_ADDR_LINKLOCAL) { @@ -258,7 +255,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true); + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto failure; @@ -337,13 +334,14 @@ static void tcp_v6_mtu_reduced(struct sock *sk) static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { - const struct ipv6hdr *hdr = (const struct ipv6hdr*)skb->data; + const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); struct ipv6_pinfo *np; struct sock *sk; int err; struct tcp_sock *tp; - __u32 seq; + struct request_sock *fastopen; + __u32 seq, snd_una; struct net *net = dev_net(skb->dev); sk = inet6_lookup(net, &tcp_hashinfo, &hdr->daddr, @@ -374,8 +372,11 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, tp = tcp_sk(sk); seq = ntohl(th->seq); + /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ + fastopen = tp->fastopen_rsk; + snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && - !between(seq, tp->snd_una, tp->snd_nxt)) { + !between(seq, snd_una, tp->snd_nxt)) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } @@ -398,6 +399,9 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (sk->sk_state == TCP_LISTEN) goto out; + if (!ip6_sk_accept_pmtu(sk)) + goto out; + tp->mtu_info = ntohl(info); if (!sock_owned_by_user(sk)) tcp_v6_mtu_reduced(sk); @@ -436,8 +440,13 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; case TCP_SYN_SENT: - case TCP_SYN_RECV: /* Cannot happen. - It can, it SYNs are crossed. --ANK */ + case TCP_SYN_RECV: + /* Only in fast or simultaneous open. If a fast open socket is + * is already accepted it is treated as a connected one below. + */ + if (fastopen && fastopen->sk == NULL) + break; + if (!sock_owned_by_user(sk)) { sk->sk_err = err; sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ @@ -463,24 +472,28 @@ out: static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, struct flowi6 *fl6, struct request_sock *req, - u16 queue_mapping) + u16 queue_mapping, + struct tcp_fastopen_cookie *foc) { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); - struct sk_buff * skb; + struct sk_buff *skb; int err = -ENOMEM; /* First, grab a route. 
*/ if (!dst && (dst = inet6_csk_route_req(sk, fl6, req)) == NULL) goto done; - skb = tcp_make_synack(sk, dst, req, NULL); + skb = tcp_make_synack(sk, dst, req, foc); if (skb) { __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); fl6->daddr = ireq->ir_v6_rmt_addr; + if (np->repflow && (ireq->pktopts != NULL)) + fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); + skb_set_queue_mapping(skb, queue_mapping); err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass); err = net_xmit_eval(err); @@ -495,9 +508,11 @@ static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req) struct flowi6 fl6; int res; - res = tcp_v6_send_synack(sk, NULL, &fl6, req, 0); - if (!res) + res = tcp_v6_send_synack(sk, NULL, &fl6, req, 0, NULL); + if (!res) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + } return res; } @@ -525,8 +540,8 @@ static struct tcp_md5sig_key *tcp_v6_reqsk_md5_lookup(struct sock *sk, return tcp_v6_md5_do_lookup(sk, &inet_rsk(req)->ir_v6_rmt_addr); } -static int tcp_v6_parse_md5_keys (struct sock *sk, char __user *optval, - int optlen) +static int tcp_v6_parse_md5_keys(struct sock *sk, char __user *optval, + int optlen) { struct tcp_md5sig cmd; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr; @@ -710,7 +725,7 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .send_ack = tcp_v6_reqsk_send_ack, .destructor = tcp_v6_reqsk_destructor, .send_reset = tcp_v6_send_reset, - .syn_ack_timeout = tcp_syn_ack_timeout, + .syn_ack_timeout = tcp_syn_ack_timeout, }; #ifdef CONFIG_TCP_MD5SIG @@ -721,8 +736,9 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { #endif static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, - u32 tsval, u32 tsecr, - struct tcp_md5sig_key *key, int rst, u8 tclass) + u32 tsval, u32 tsecr, int oif, + struct tcp_md5sig_key *key, int rst, u8 tclass, + u32 label) { const struct tcphdr *th = tcp_hdr(skb); struct tcphdr *t1; @@ -784,6 +800,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, memset(&fl6, 0, sizeof(fl6)); fl6.daddr = ipv6_hdr(skb)->saddr; fl6.saddr = ipv6_hdr(skb)->daddr; + fl6.flowlabel = label; buff->ip_summed = CHECKSUM_PARTIAL; buff->csum = 0; @@ -791,8 +808,11 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr); fl6.flowi6_proto = IPPROTO_TCP; - if (ipv6_addr_type(&fl6.daddr) & IPV6_ADDR_LINKLOCAL) + if (rt6_need_strict(&fl6.daddr) && !oif) fl6.flowi6_oif = inet6_iif(skb); + else + fl6.flowi6_oif = oif; + fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); @@ -801,7 +821,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, * Underlying function will use this to retrieve the network * namespace */ - dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL, false); + dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(buff, dst); ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass); @@ -826,6 +846,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) int genhash; struct sock *sk1 = NULL; #endif + int oif; if (th->rst) return; @@ -869,7 +890,8 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len - (th->doff << 2); - tcp_v6_send_response(skb, seq, 
ack_seq, 0, 0, 0, key, 1, 0); + oif = sk ? sk->sk_bound_dev_if : 0; + tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0); #ifdef CONFIG_TCP_MD5SIG release_sk1: @@ -881,10 +903,12 @@ release_sk1: } static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, - u32 win, u32 tsval, u32 tsecr, - struct tcp_md5sig_key *key, u8 tclass) + u32 win, u32 tsval, u32 tsecr, int oif, + struct tcp_md5sig_key *key, u8 tclass, + u32 label) { - tcp_v6_send_response(skb, seq, ack, win, tsval, tsecr, key, 0, tclass); + tcp_v6_send_response(skb, seq, ack, win, tsval, tsecr, oif, key, 0, tclass, + label); } static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) @@ -895,8 +919,8 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp + tcptw->tw_ts_offset, - tcptw->tw_ts_recent, tcp_twsk_md5_key(tcptw), - tw->tw_tclass); + tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), + tw->tw_tclass, (tw->tw_flowlabel << 12)); inet_twsk_put(tw); } @@ -904,13 +928,19 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, struct request_sock *req) { - tcp_v6_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, - req->rcv_wnd, tcp_time_stamp, req->ts_recent, - tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), 0); + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV + * sk->sk_state == TCP_SYN_RECV -> for Fast Open. + */ + tcp_v6_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? + tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, + tcp_rsk(req)->rcv_nxt, + req->rcv_wnd, tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if, + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), + 0, 0); } -static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) +static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) { struct request_sock *req, **prev; const struct tcphdr *th = tcp_hdr(skb); @@ -955,8 +985,10 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); __u32 isn = TCP_SKB_CB(skb)->when; struct dst_entry *dst = NULL; + struct tcp_fastopen_cookie foc = { .len = -1 }; + bool want_cookie = false, fastopen; struct flowi6 fl6; - bool want_cookie = false; + int err; if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_conn_request(sk, skb); @@ -987,7 +1019,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, 0, NULL); + tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? 
NULL : &foc); if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); @@ -1002,6 +1034,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) TCP_ECN_create_request(req, skb, sock_net(sk)); ireq->ir_iif = sk->sk_bound_dev_if; + ireq->ir_mark = inet_request_mark(sk, skb); /* So that link locals have meaning */ if (!sk->sk_bound_dev_if && @@ -1011,7 +1044,8 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (!isn) { if (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || - np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { + np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || + np->repflow) { atomic_inc(&skb->users); ireq->pktopts = skb; } @@ -1059,19 +1093,27 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) isn = tcp_v6_init_sequence(skb); } have_isn: - tcp_rsk(req)->snt_isn = isn; if (security_inet_conn_request(sk, skb, req)) goto drop_and_release; - if (tcp_v6_send_synack(sk, dst, &fl6, req, - skb_get_queue_mapping(skb)) || - want_cookie) + if (!dst && (dst = inet6_csk_route_req(sk, &fl6, req)) == NULL) goto drop_and_free; + tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->snt_synack = tcp_time_stamp; - tcp_rsk(req)->listener = NULL; - inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + tcp_openreq_init_rwin(req, sk, dst); + fastopen = !want_cookie && + tcp_try_fastopen(sk, skb, req, &foc, dst); + err = tcp_v6_send_synack(sk, dst, &fl6, req, + skb_get_queue_mapping(skb), &foc); + if (!fastopen) { + if (err || want_cookie) + goto drop_and_free; + + tcp_rsk(req)->listener = NULL; + inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + } return 0; drop_and_release: @@ -1083,9 +1125,9 @@ drop: return 0; /* don't send reset */ } -static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst) +static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) { struct inet_request_sock *ireq; struct ipv6_pinfo *newnp, *np = inet6_sk(sk); @@ -1135,7 +1177,9 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newnp->opt = NULL; newnp->mcast_oif = inet6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; - newnp->rcv_tclass = ipv6_get_dsfield(ipv6_hdr(skb)); + newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb)); + if (np->repflow) + newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb)); /* * No need to charge this sock to the relevant IPv6 refcnt debug socks count @@ -1215,7 +1259,9 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newnp->opt = NULL; newnp->mcast_oif = inet6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; - newnp->rcv_tclass = ipv6_get_dsfield(ipv6_hdr(skb)); + newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb)); + if (np->repflow) + newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb)); /* Clone native IPv6 options from listening socket (if any) @@ -1231,7 +1277,6 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + newnp->opt->opt_flen); - tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric_advmss(dst); if (tcp_sk(sk)->rx_opt.user_mss && @@ -1245,7 +1290,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, #ifdef CONFIG_TCP_MD5SIG /* Copy over the MD5 key from the original socket */ - if ((key = 
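The tcp_v6_conn_request() rework above brings IPv6 up to parity with IPv4 for server-side TCP Fast Open: the SYN's options are parsed into a tcp_fastopen_cookie, tcp_try_fastopen() may create the child socket before the three-way handshake finishes, and only the non-fastopen case is queued as an ordinary SYN_RECV request. From user space this is driven by the existing TCP_FASTOPEN listener option; a hedged sketch of an IPv6 server enabling it (port and queue length are arbitrary, and net.ipv4.tcp_fastopen must have the server bit set):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in6 addr;
	int qlen = 16;	/* max outstanding Fast Open requests; arbitrary */
	int fd = socket(AF_INET6, SOCK_STREAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&addr, 0, sizeof(addr));
	addr.sin6_family = AF_INET6;
	addr.sin6_addr = in6addr_any;
	addr.sin6_port = htons(8080);

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)) < 0 ||
	    listen(fd, 128) < 0) {
		perror("TCP_FASTOPEN listener");
		return 1;
	}

	/* accept() proceeds as usual; data carried in the SYN may already
	 * be readable on the accepted socket.
	 */
	return 0;
}

A client triggers the cookie exchange with sendto(fd, buf, len, MSG_FASTOPEN, ...) in place of a separate connect().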
tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr)) != NULL) { + key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr); + if (key != NULL) { /* We're using one, so create a matching key * on the newsk structure. If we fail to get * memory, then we end up not copying the key @@ -1275,26 +1321,6 @@ out: return NULL; } -static __sum16 tcp_v6_checksum_init(struct sk_buff *skb) -{ - if (skb->ip_summed == CHECKSUM_COMPLETE) { - if (!tcp_v6_check(skb->len, &ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, skb->csum)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - return 0; - } - } - - skb->csum = ~csum_unfold(tcp_v6_check(skb->len, - &ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, 0)); - - if (skb->len <= 76) { - return __skb_checksum_complete(skb); - } - return 0; -} - /* The socket must have it's spinlock held when we get * here. * @@ -1321,7 +1347,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) return tcp_v4_do_rcv(sk, skb); #ifdef CONFIG_TCP_MD5SIG - if (tcp_v6_inbound_md5_hash (sk, skb)) + if (tcp_v6_inbound_md5_hash(sk, skb)) goto discard; #endif @@ -1380,7 +1406,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) * otherwise we just shortcircuit this and continue with * the new socket.. */ - if(nsk != sk) { + if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); if (tcp_child_process(sk, nsk, skb)) goto reset; @@ -1425,8 +1451,10 @@ ipv6_pktoptions: np->mcast_oif = inet6_iif(opt_skb); if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit; - if (np->rxopt.bits.rxtclass) - np->rcv_tclass = ipv6_get_dsfield(ipv6_hdr(opt_skb)); + if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) + np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); + if (np->repflow) + np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb)) { skb_set_owner_r(opt_skb, sk); opt_skb = xchg(&np->pktoptions, opt_skb); @@ -1466,7 +1494,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, th->doff*4)) goto discard_it; - if (!skb_csum_unnecessary(skb) && tcp_v6_checksum_init(skb)) + if (skb_checksum_init(skb, IPPROTO_TCP, ip6_compute_pseudo)) goto csum_error; th = tcp_hdr(skb); @@ -1586,7 +1614,8 @@ do_time_wait: break; case TCP_TW_RST: goto no_tcp_socket; - case TCP_TW_SUCCESS:; + case TCP_TW_SUCCESS: + ; } goto discard_it; } @@ -1631,7 +1660,7 @@ static void tcp_v6_early_demux(struct sk_buff *skb) static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), .twsk_unique = tcp_twsk_unique, - .twsk_destructor= tcp_twsk_destructor, + .twsk_destructor = tcp_twsk_destructor, }; static const struct inet_connection_sock_af_ops ipv6_specific = { @@ -1665,7 +1694,6 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = { /* * TCP over IPv4 via INET6 API */ - static const struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, @@ -1740,7 +1768,7 @@ static void get_openreq6(struct seq_file *seq, dest->s6_addr32[2], dest->s6_addr32[3], ntohs(inet_rsk(req)->ir_rmt_port), TCP_SYN_RECV, - 0,0, /* could print option size, but that is af dependent. */ + 0, 0, /* could print option size, but that is af dependent. 
*/ 1, /* timers active (only the expire timer) */ jiffies_to_clock_t(ttd), req->num_timeout, @@ -1759,6 +1787,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) const struct inet_sock *inet = inet_sk(sp); const struct tcp_sock *tp = tcp_sk(sp); const struct inet_connection_sock *icsk = inet_csk(sp); + struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq; dest = &sp->sk_v6_daddr; src = &sp->sk_v6_rcv_saddr; @@ -1799,9 +1828,11 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) atomic_read(&sp->sk_refcnt), sp, jiffies_to_clock_t(icsk->icsk_rto), jiffies_to_clock_t(icsk->icsk_ack.ato), - (icsk->icsk_ack.quick << 1 ) | icsk->icsk_ack.pingpong, + (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, - tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh + sp->sk_state == TCP_LISTEN ? + (fastopenq ? fastopenq->max_qlen : 0) : + (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh) ); } @@ -1961,7 +1992,6 @@ static struct inet_protosw tcpv6_protosw = { .protocol = IPPROTO_TCP, .prot = &tcpv6_prot, .ops = &inet6_stream_ops, - .no_check = 0, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }; diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 6d18157dc32..01b0ff9a0c2 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -42,7 +42,7 @@ static struct sk_buff **tcp6_gro_receive(struct sk_buff **head, if (NAPI_GRO_CB(skb)->flush) goto skip_csum; - wsum = skb->csum; + wsum = NAPI_GRO_CB(skb)->csum; switch (skb->ip_summed) { case CHECKSUM_NONE: @@ -66,14 +66,14 @@ skip_csum: return tcp_gro_receive(head, skb); } -static int tcp6_gro_complete(struct sk_buff *skb) +static int tcp6_gro_complete(struct sk_buff *skb, int thoff) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct tcphdr *th = tcp_hdr(skb); - th->check = ~tcp_v6_check(skb->len - skb_transport_offset(skb), - &iph->saddr, &iph->daddr, 0); - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; + th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr, + &iph->daddr, 0); + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; return tcp_gro_complete(skb); } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index bcd5699313c..7092ff78fd8 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -460,9 +460,7 @@ try_again: /* Copy the address. 
*/ if (msg->msg_name) { - struct sockaddr_in6 *sin6; - - sin6 = (struct sockaddr_in6 *) msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); sin6->sin6_family = AF_INET6; sin6->sin6_port = udp_hdr(skb)->source; sin6->sin6_flowinfo = 0; @@ -479,12 +477,16 @@ try_again: } *addr_len = sizeof(*sin6); } + + if (np->rxopt.all) + ip6_datagram_recv_common_ctl(sk, msg, skb); + if (is_udp4) { if (inet->cmsg_flags) ip_cmsg_recv(msg, skb); } else { if (np->rxopt.all) - ip6_datagram_recv_ctl(sk, msg, skb); + ip6_datagram_recv_specific_ctl(sk, msg, skb); } err = copied; @@ -538,8 +540,11 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (sk == NULL) return; - if (type == ICMPV6_PKT_TOOBIG) + if (type == ICMPV6_PKT_TOOBIG) { + if (!ip6_sk_accept_pmtu(sk)) + goto out; ip6_sk_update_pmtu(skb, sk, info); + } if (type == NDISC_REDIRECT) { ip6_sk_redirect(skb, sk); goto out; @@ -629,6 +634,10 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) { int ret; + /* Verify checksum before giving to encap */ + if (udp_lib_checksum_complete(skb)) + goto csum_error; + ret = encap_rcv(sk, skb); if (ret <= 0) { UDP_INC_STATS_BH(sock_net(sk), @@ -665,8 +674,11 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) goto csum_error; } - if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) + if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { + UDP6_INC_STATS_BH(sock_net(sk), + UDP_MIB_RCVBUFERRORS, is_udplite); goto drop; + } skb_dst_drop(skb); @@ -681,6 +693,7 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) bh_unlock_sock(sk); return rc; + csum_error: UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: @@ -696,17 +709,16 @@ static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk, int dif) { struct hlist_nulls_node *node; - struct sock *s = sk; unsigned short num = ntohs(loc_port); - sk_nulls_for_each_from(s, node) { - struct inet_sock *inet = inet_sk(s); + sk_nulls_for_each_from(sk, node) { + struct inet_sock *inet = inet_sk(sk); - if (!net_eq(sock_net(s), net)) + if (!net_eq(sock_net(sk), net)) continue; - if (udp_sk(s)->udp_port_hash == num && - s->sk_family == PF_INET6) { + if (udp_sk(sk)->udp_port_hash == num && + sk->sk_family == PF_INET6) { if (inet->inet_dport) { if (inet->inet_dport != rmt_port) continue; @@ -715,16 +727,16 @@ static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk, !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) continue; - if (s->sk_bound_dev_if && s->sk_bound_dev_if != dif) + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) continue; if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr)) continue; } - if (!inet6_mc_check(s, loc_addr, rmt_addr)) + if (!inet6_mc_check(sk, loc_addr, rmt_addr)) continue; - return s; + return sk; } } return NULL; @@ -755,6 +767,17 @@ static void flush_stack(struct sock **stack, unsigned int count, if (unlikely(skb1)) kfree_skb(skb1); } + +static void udp6_csum_zero_error(struct sk_buff *skb) +{ + /* RFC 2460 section 8.1 says that we SHOULD log + * this error. Well, it is reasonable. + */ + LIMIT_NETDEBUG(KERN_INFO "IPv6: udp checksum is 0 for [%pI6c]:%u->[%pI6c]:%u\n", + &ipv6_hdr(skb)->saddr, ntohs(udp_hdr(skb)->source), + &ipv6_hdr(skb)->daddr, ntohs(udp_hdr(skb)->dest)); +} + /* * Note: called only from the BH handler context, * so we don't need to lock the hashes. 
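udp6_csum_zero_error() above, together with the no_check6_rx/no_check6_tx tests in the following hunks, relaxes the RFC 2460 rule that UDP over IPv6 must always carry a checksum: a zero-checksum datagram is only delivered to sockets that explicitly opted in, and a sending socket may ask for the checksum to be omitted (mainly for tunnel encapsulations). The matching socket options come from the same patch series but are not part of the hunks shown here, so treat the names and values below as assumptions; a user-space sketch:

#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

/* Assumed from include/uapi/linux/udp.h of this kernel series. */
#ifndef UDP_NO_CHECK6_TX
#define UDP_NO_CHECK6_TX 101
#endif
#ifndef UDP_NO_CHECK6_RX
#define UDP_NO_CHECK6_RX 102
#endif

int main(void)
{
	int one = 1;
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* Send datagrams with a zero UDP checksum (IPPROTO_UDP == SOL_UDP). */
	if (setsockopt(fd, IPPROTO_UDP, UDP_NO_CHECK6_TX, &one, sizeof(one)) < 0)
		perror("UDP_NO_CHECK6_TX");

	/* Accept zero-checksum datagrams from peers. */
	if (setsockopt(fd, IPPROTO_UDP, UDP_NO_CHECK6_RX, &one, sizeof(one)) < 0)
		perror("UDP_NO_CHECK6_RX");

	return 0;
}

__udp6_lib_rcv() below then drops zero-checksum packets for sockets that did not set the RX option, mirroring the check added to the multicast delivery loop.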
@@ -774,7 +797,12 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, dif = inet6_iif(skb); sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); while (sk) { - stack[count++] = sk; + /* If zero checksum and no_check is not on for + * the socket then skip it. + */ + if (uh->check || udp_sk(sk)->no_check6_rx) + stack[count++] = sk; + sk = udp_v6_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr, uh->source, saddr, dif); if (unlikely(count == ARRAY_SIZE(stack))) { @@ -862,6 +890,12 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (sk != NULL) { int ret; + if (!uh->check && !udp_sk(sk)->no_check6_rx) { + sock_put(sk); + udp6_csum_zero_error(skb); + goto csum_error; + } + ret = udpv6_queue_rcv_skb(sk, skb); sock_put(sk); @@ -874,6 +908,11 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, return 0; } + if (!uh->check) { + udp6_csum_zero_error(skb); + goto csum_error; + } + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard; @@ -1001,7 +1040,10 @@ static int udp_v6_push_pending_frames(struct sock *sk) if (is_udplite) csum = udplite_csum_outgoing(sk, skb); - else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ + else if (up->no_check6_tx) { /* UDP csum disabled */ + skb->ip_summed = CHECKSUM_NONE; + goto send; + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ udp6_hwcsum_outgoing(sk, skb, &fl6->saddr, &fl6->daddr, up->len); goto send; @@ -1038,7 +1080,7 @@ int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, struct udp_sock *up = udp_sk(sk); struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; @@ -1140,7 +1182,6 @@ do_udp_sendmsg: flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - daddr = &flowlabel->dst; } } @@ -1221,21 +1262,15 @@ do_udp_sendmsg: security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p, true); + dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto out; } - if (hlimit < 0) { - if (ipv6_addr_is_multicast(&fl6.daddr)) - hlimit = np->mcast_hops; - else - hlimit = np->hop_limit; - if (hlimit < 0) - hlimit = ip6_dst_hoplimit(dst); - } + if (hlimit < 0) + hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); if (tclass < 0) tclass = np->tclass; @@ -1475,7 +1510,6 @@ static struct inet_protosw udpv6_protosw = { .protocol = IPPROTO_UDP, .prot = &udpv6_prot, .ops = &inet6_dgram_ops, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_PERMANENT, }; diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index e7359f9eaa8..0ae3d98f83e 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -63,7 +63,9 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_SIT | SKB_GSO_MPLS) || @@ -76,7 +78,8 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, goto out; } - if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) + if (skb->encapsulation && skb_shinfo(skb)->gso_type & + (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)) segs = 
skb_udp_tunnel_segment(skb, features); else { /* Do software UFO. Complete and fill in the UDP checksum as HW cannot @@ -113,7 +116,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); fptr->nexthdr = nexthdr; fptr->reserved = 0; - ipv6_select_ident(fptr, (struct rt6_info *)skb_dst(skb)); + fptr->identification = skb_shinfo(skb)->ip6_frag_id; /* Fragment the skb. ipv6 header and the remaining fields of the * fragment header are updated in ipv6_gso_segment() diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index dfcc4be4689..9cf097e206e 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -64,7 +64,6 @@ static struct inet_protosw udplite6_protosw = { .protocol = IPPROTO_UDPLITE, .prot = &udplitev6_prot, .ops = &inet6_dgram_ops, - .no_check = 0, .flags = INET_PROTOSW_PERMANENT, }; diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index cb04f7a16b5..901ef6f8add 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -18,65 +18,6 @@ #include <net/ipv6.h> #include <net/xfrm.h> -/* Informational hook. The decap is still done here. */ -static struct xfrm_tunnel_notifier __rcu *rcv_notify_handlers __read_mostly; -static DEFINE_MUTEX(xfrm6_mode_tunnel_input_mutex); - -int xfrm6_mode_tunnel_input_register(struct xfrm_tunnel_notifier *handler) -{ - struct xfrm_tunnel_notifier __rcu **pprev; - struct xfrm_tunnel_notifier *t; - int ret = -EEXIST; - int priority = handler->priority; - - mutex_lock(&xfrm6_mode_tunnel_input_mutex); - - for (pprev = &rcv_notify_handlers; - (t = rcu_dereference_protected(*pprev, - lockdep_is_held(&xfrm6_mode_tunnel_input_mutex))) != NULL; - pprev = &t->next) { - if (t->priority > priority) - break; - if (t->priority == priority) - goto err; - - } - - handler->next = *pprev; - rcu_assign_pointer(*pprev, handler); - - ret = 0; - -err: - mutex_unlock(&xfrm6_mode_tunnel_input_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(xfrm6_mode_tunnel_input_register); - -int xfrm6_mode_tunnel_input_deregister(struct xfrm_tunnel_notifier *handler) -{ - struct xfrm_tunnel_notifier __rcu **pprev; - struct xfrm_tunnel_notifier *t; - int ret = -ENOENT; - - mutex_lock(&xfrm6_mode_tunnel_input_mutex); - for (pprev = &rcv_notify_handlers; - (t = rcu_dereference_protected(*pprev, - lockdep_is_held(&xfrm6_mode_tunnel_input_mutex))) != NULL; - pprev = &t->next) { - if (t == handler) { - *pprev = handler->next; - ret = 0; - break; - } - } - mutex_unlock(&xfrm6_mode_tunnel_input_mutex); - synchronize_net(); - - return ret; -} -EXPORT_SYMBOL_GPL(xfrm6_mode_tunnel_input_deregister); - static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) { const struct ipv6hdr *outer_iph = ipv6_hdr(skb); @@ -130,7 +71,6 @@ static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { - struct xfrm_tunnel_notifier *handler; int err = -EINVAL; if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPV6) @@ -138,9 +78,6 @@ static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto out; - for_each_input_rcu(rcv_notify_handlers, handler) - handler->handler(skb); - err = skb_unclone(skb, GFP_ATOMIC); if (err) goto out; diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 6cd625e3770..433672d07d0 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -78,7 +78,7 @@ static int 
xfrm6_tunnel_check_size(struct sk_buff *skb) if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if (!skb->local_df && skb->len > mtu) { + if (!skb->ignore_df && skb->len > mtu) { skb->dev = dst->dev; if (xfrm6_local_dontfrag(skb)) @@ -114,13 +114,7 @@ int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb) if (err) return err; - memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); -#ifdef CONFIG_NETFILTER - IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; -#endif - - skb->protocol = htons(ETH_P_IPV6); - skb->local_df = 1; + skb->ignore_df = 1; return x->outer_mode->output2(x, skb); } @@ -128,11 +122,13 @@ EXPORT_SYMBOL(xfrm6_prepare_output); int xfrm6_output_finish(struct sk_buff *skb) { + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + skb->protocol = htons(ETH_P_IPV6); + #ifdef CONFIG_NETFILTER IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; #endif - skb->protocol = htons(ETH_P_IPV6); return xfrm_output(skb); } @@ -142,6 +138,13 @@ static int __xfrm6_output(struct sk_buff *skb) struct xfrm_state *x = dst->xfrm; int mtu; +#ifdef CONFIG_NETFILTER + if (!x) { + IP6CB(skb)->flags |= IP6SKB_REROUTED; + return dst_output(skb); + } +#endif + if (skb->protocol == htons(ETH_P_IPV6)) mtu = ip6_skb_dst_mtu(skb); else @@ -150,7 +153,7 @@ static int __xfrm6_output(struct sk_buff *skb) if (skb->len > mtu && xfrm6_local_dontfrag(skb)) { xfrm6_local_rxpmtu(skb, mtu); return -EMSGSIZE; - } else if (!skb->local_df && skb->len > mtu && skb->sk) { + } else if (!skb->ignore_df && skb->len > mtu && skb->sk) { xfrm_local_error(skb, mtu); return -EMSGSIZE; } @@ -163,8 +166,9 @@ static int __xfrm6_output(struct sk_buff *skb) return x->outer_mode->afinfo->output_finish(skb); } -int xfrm6_output(struct sk_buff *skb) +int xfrm6_output(struct sock *sk, struct sk_buff *skb) { - return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, - skb_dst(skb)->dev, __xfrm6_output); + return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, + NULL, skb_dst(skb)->dev, __xfrm6_output, + !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 5f8e128c512..2a0bbda2c76 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -389,11 +389,17 @@ int __init xfrm6_init(void) if (ret) goto out_policy; + ret = xfrm6_protocol_init(); + if (ret) + goto out_state; + #ifdef CONFIG_SYSCTL register_pernet_subsys(&xfrm6_net_ops); #endif out: return ret; +out_state: + xfrm6_state_fini(); out_policy: xfrm6_policy_fini(); goto out; @@ -404,6 +410,7 @@ void xfrm6_fini(void) #ifdef CONFIG_SYSCTL unregister_pernet_subsys(&xfrm6_net_ops); #endif + xfrm6_protocol_fini(); xfrm6_policy_fini(); xfrm6_state_fini(); dst_entries_destroy(&xfrm6_dst_ops); diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c new file mode 100644 index 00000000000..54d13f8dbba --- /dev/null +++ b/net/ipv6/xfrm6_protocol.c @@ -0,0 +1,279 @@ +/* xfrm6_protocol.c - Generic xfrm protocol multiplexer for ipv6. + * + * Copyright (C) 2013 secunet Security Networks AG + * + * Author: + * Steffen Klassert <steffen.klassert@secunet.com> + * + * Based on: + * net/ipv4/xfrm4_protocol.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/skbuff.h> +#include <linux/icmpv6.h> +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/xfrm.h> + +static struct xfrm6_protocol __rcu *esp6_handlers __read_mostly; +static struct xfrm6_protocol __rcu *ah6_handlers __read_mostly; +static struct xfrm6_protocol __rcu *ipcomp6_handlers __read_mostly; +static DEFINE_MUTEX(xfrm6_protocol_mutex); + +static inline struct xfrm6_protocol __rcu **proto_handlers(u8 protocol) +{ + switch (protocol) { + case IPPROTO_ESP: + return &esp6_handlers; + case IPPROTO_AH: + return &ah6_handlers; + case IPPROTO_COMP: + return &ipcomp6_handlers; + } + + return NULL; +} + +#define for_each_protocol_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) \ + +int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err) +{ + int ret; + struct xfrm6_protocol *handler; + struct xfrm6_protocol __rcu **head = proto_handlers(protocol); + + if (!head) + return 0; + + for_each_protocol_rcu(*proto_handlers(protocol), handler) + if ((ret = handler->cb_handler(skb, err)) <= 0) + return ret; + + return 0; +} +EXPORT_SYMBOL(xfrm6_rcv_cb); + +static int xfrm6_esp_rcv(struct sk_buff *skb) +{ + int ret; + struct xfrm6_protocol *handler; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + + for_each_protocol_rcu(esp6_handlers, handler) + if ((ret = handler->handler(skb)) != -EINVAL) + return ret; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} + +static void xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct xfrm6_protocol *handler; + + for_each_protocol_rcu(esp6_handlers, handler) + if (!handler->err_handler(skb, opt, type, code, offset, info)) + break; +} + +static int xfrm6_ah_rcv(struct sk_buff *skb) +{ + int ret; + struct xfrm6_protocol *handler; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + + for_each_protocol_rcu(ah6_handlers, handler) + if ((ret = handler->handler(skb)) != -EINVAL) + return ret; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} + +static void xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct xfrm6_protocol *handler; + + for_each_protocol_rcu(ah6_handlers, handler) + if (!handler->err_handler(skb, opt, type, code, offset, info)) + break; +} + +static int xfrm6_ipcomp_rcv(struct sk_buff *skb) +{ + int ret; + struct xfrm6_protocol *handler; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + + for_each_protocol_rcu(ipcomp6_handlers, handler) + if ((ret = handler->handler(skb)) != -EINVAL) + return ret; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} + +static void xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct xfrm6_protocol *handler; + + for_each_protocol_rcu(ipcomp6_handlers, handler) + if (!handler->err_handler(skb, opt, type, code, offset, info)) + break; +} + +static const struct inet6_protocol esp6_protocol = { + .handler = xfrm6_esp_rcv, + .err_handler = xfrm6_esp_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +static const struct inet6_protocol ah6_protocol = { + .handler = xfrm6_ah_rcv, + .err_handler = xfrm6_ah_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +static const struct inet6_protocol ipcomp6_protocol = { + .handler = 
xfrm6_ipcomp_rcv, + .err_handler = xfrm6_ipcomp_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +static struct xfrm_input_afinfo xfrm6_input_afinfo = { + .family = AF_INET6, + .owner = THIS_MODULE, + .callback = xfrm6_rcv_cb, +}; + +static inline const struct inet6_protocol *netproto(unsigned char protocol) +{ + switch (protocol) { + case IPPROTO_ESP: + return &esp6_protocol; + case IPPROTO_AH: + return &ah6_protocol; + case IPPROTO_COMP: + return &ipcomp6_protocol; + } + + return NULL; +} + +int xfrm6_protocol_register(struct xfrm6_protocol *handler, + unsigned char protocol) +{ + struct xfrm6_protocol __rcu **pprev; + struct xfrm6_protocol *t; + bool add_netproto = false; + int ret = -EEXIST; + int priority = handler->priority; + + if (!proto_handlers(protocol) || !netproto(protocol)) + return -EINVAL; + + mutex_lock(&xfrm6_protocol_mutex); + + if (!rcu_dereference_protected(*proto_handlers(protocol), + lockdep_is_held(&xfrm6_protocol_mutex))) + add_netproto = true; + + for (pprev = proto_handlers(protocol); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&xfrm6_protocol_mutex))) != NULL; + pprev = &t->next) { + if (t->priority < priority) + break; + if (t->priority == priority) + goto err; + } + + handler->next = *pprev; + rcu_assign_pointer(*pprev, handler); + + ret = 0; + +err: + mutex_unlock(&xfrm6_protocol_mutex); + + if (add_netproto) { + if (inet6_add_protocol(netproto(protocol), protocol)) { + pr_err("%s: can't add protocol\n", __func__); + ret = -EAGAIN; + } + } + + return ret; +} +EXPORT_SYMBOL(xfrm6_protocol_register); + +int xfrm6_protocol_deregister(struct xfrm6_protocol *handler, + unsigned char protocol) +{ + struct xfrm6_protocol __rcu **pprev; + struct xfrm6_protocol *t; + int ret = -ENOENT; + + if (!proto_handlers(protocol) || !netproto(protocol)) + return -EINVAL; + + mutex_lock(&xfrm6_protocol_mutex); + + for (pprev = proto_handlers(protocol); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&xfrm6_protocol_mutex))) != NULL; + pprev = &t->next) { + if (t == handler) { + *pprev = handler->next; + ret = 0; + break; + } + } + + if (!rcu_dereference_protected(*proto_handlers(protocol), + lockdep_is_held(&xfrm6_protocol_mutex))) { + if (inet6_del_protocol(netproto(protocol), protocol) < 0) { + pr_err("%s: can't remove protocol\n", __func__); + ret = -EAGAIN; + } + } + + mutex_unlock(&xfrm6_protocol_mutex); + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(xfrm6_protocol_deregister); + +int __init xfrm6_protocol_init(void) +{ + return xfrm_input_register_afinfo(&xfrm6_input_afinfo); +} + +void xfrm6_protocol_fini(void) +{ + xfrm_input_unregister_afinfo(&xfrm6_input_afinfo); +} diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index e096025b477..91729b807c7 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -52,18 +52,12 @@ #include <net/p8022.h> #include <net/psnap.h> #include <net/sock.h> +#include <net/datalink.h> #include <net/tcp_states.h> +#include <net/net_namespace.h> #include <asm/uaccess.h> -#ifdef CONFIG_SYSCTL -extern void ipx_register_sysctl(void); -extern void ipx_unregister_sysctl(void); -#else -#define ipx_register_sysctl() -#define ipx_unregister_sysctl() -#endif - /* Configuration Variables */ static unsigned char ipxcfg_max_hops = 16; static char ipxcfg_auto_select_primary; @@ -84,15 +78,6 @@ DEFINE_SPINLOCK(ipx_interfaces_lock); struct ipx_interface *ipx_primary_net; struct ipx_interface *ipx_internal_net; -extern int ipxrtr_add_route(__be32 network, struct ipx_interface *intrfc, - unsigned char *node); -extern void 
ipxrtr_del_routes(struct ipx_interface *intrfc); -extern int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx, - struct iovec *iov, size_t len, int noblock); -extern int ipxrtr_route_skb(struct sk_buff *skb); -extern struct ipx_route *ipxrtr_lookup(__be32 net); -extern int ipxrtr_ioctl(unsigned int cmd, void __user *arg); - struct ipx_interface *ipx_interfaces_head(void) { struct ipx_interface *rc = NULL; @@ -1368,7 +1353,7 @@ static int ipx_create(struct net *net, struct socket *sock, int protocol, sk_refcnt_debug_inc(sk); sock_init_data(sock, sk); - sk->sk_no_check = 1; /* Checksum off by default */ + sk->sk_no_check_tx = 1; /* Checksum off by default */ sock->ops = &ipx_dgram_ops; rc = 0; out: @@ -1383,6 +1368,7 @@ static int ipx_release(struct socket *sock) goto out; lock_sock(sk); + sk->sk_shutdown = SHUTDOWN_MASK; if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); @@ -1707,7 +1693,7 @@ static int ipx_sendmsg(struct kiocb *iocb, struct socket *sock, { struct sock *sk = sock->sk; struct ipx_sock *ipxs = ipx_sk(sk); - struct sockaddr_ipx *usipx = (struct sockaddr_ipx *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_ipx *, usipx, msg->msg_name); struct sockaddr_ipx local_sipx; int rc = -EINVAL; int flags = msg->msg_flags; @@ -1774,7 +1760,7 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock, { struct sock *sk = sock->sk; struct ipx_sock *ipxs = ipx_sk(sk); - struct sockaddr_ipx *sipx = (struct sockaddr_ipx *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_ipx *, sipx, msg->msg_name); struct ipxhdr *ipx = NULL; struct sk_buff *skb; int copied, rc; @@ -1806,8 +1792,11 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock, skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, flags & MSG_DONTWAIT, &rc); - if (!skb) + if (!skb) { + if (rc == -EAGAIN && (sk->sk_shutdown & RCV_SHUTDOWN)) + rc = 0; goto out; + } ipx = ipx_hdr(skb); copied = ntohs(ipx->ipx_pktsize) - sizeof(struct ipxhdr); @@ -1937,6 +1926,26 @@ static int ipx_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long } #endif +static int ipx_shutdown(struct socket *sock, int mode) +{ + struct sock *sk = sock->sk; + + if (mode < SHUT_RD || mode > SHUT_RDWR) + return -EINVAL; + /* This maps: + * SHUT_RD (0) -> RCV_SHUTDOWN (1) + * SHUT_WR (1) -> SEND_SHUTDOWN (2) + * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) + */ + ++mode; + + lock_sock(sk); + sk->sk_shutdown |= mode; + release_sock(sk); + sk->sk_state_change(sk); + + return 0; +} /* * Socket family declarations @@ -1963,7 +1972,7 @@ static const struct proto_ops ipx_dgram_ops = { .compat_ioctl = ipx_compat_ioctl, #endif .listen = sock_no_listen, - .shutdown = sock_no_shutdown, /* FIXME: support shutdown */ + .shutdown = ipx_shutdown, .setsockopt = ipx_setsockopt, .getsockopt = ipx_getsockopt, .sendmsg = ipx_sendmsg, @@ -1986,9 +1995,6 @@ static struct notifier_block ipx_dev_notifier = { .notifier_call = ipxitf_device_event, }; -extern struct datalink_proto *make_EII_client(void); -extern void destroy_EII_client(struct datalink_proto *); - static const unsigned char ipx_8022_type = 0xE0; static const unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 }; static const char ipx_EII_err_msg[] __initconst = diff --git a/net/ipx/ipx_route.c b/net/ipx/ipx_route.c index 30f4519b092..67e7ad3d46b 100644 --- a/net/ipx/ipx_route.c +++ b/net/ipx/ipx_route.c @@ -20,15 +20,11 @@ DEFINE_RWLOCK(ipx_routes_lock); extern struct ipx_interface *ipx_internal_net; -extern __be16 ipx_cksum(struct ipxhdr *packet, int length); extern struct 
ipx_interface *ipxitf_find_using_net(__be32 net); extern int ipxitf_demux_socket(struct ipx_interface *intrfc, struct sk_buff *skb, int copy); extern int ipxitf_demux_socket(struct ipx_interface *intrfc, struct sk_buff *skb, int copy); -extern int ipxitf_send(struct ipx_interface *intrfc, struct sk_buff *skb, - char *node); -extern struct ipx_interface *ipxitf_find_using_net(__be32 net); struct ipx_route *ipxrtr_lookup(__be32 net) { @@ -240,7 +236,8 @@ int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx, } /* Apply checksum. Not allowed on 802.3 links. */ - if (sk->sk_no_check || intrfc->if_dlink_type == htons(IPX_FRAME_8023)) + if (sk->sk_no_check_tx || + intrfc->if_dlink_type == htons(IPX_FRAME_8023)) ipx->ipx_checksum = htons(0xFFFF); else ipx->ipx_checksum = ipx_cksum(ipx, len + sizeof(struct ipxhdr)); diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 73baf9b346b..54747c25c86 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1652,7 +1652,7 @@ static int irda_sendmsg_ultra(struct kiocb *iocb, struct socket *sock, /* Check if an address was specified with sendto. Jean II */ if (msg->msg_name) { - struct sockaddr_irda *addr = (struct sockaddr_irda *) msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_irda *, addr, msg->msg_name); err = -EINVAL; /* Check address, extract pid. Jean II */ if (msg->msg_namelen < sizeof(*addr)) diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index c4b7218058b..7a95fa4a3de 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -682,6 +682,18 @@ struct sock *iucv_accept_dequeue(struct sock *parent, struct socket *newsock) return NULL; } +static void __iucv_auto_name(struct iucv_sock *iucv) +{ + char name[12]; + + sprintf(name, "%08x", atomic_inc_return(&iucv_sk_list.autobind_name)); + while (__iucv_get_sock_by_name(name)) { + sprintf(name, "%08x", + atomic_inc_return(&iucv_sk_list.autobind_name)); + } + memcpy(iucv->src_name, name, 8); +} + /* Bind an unbound socket */ static int iucv_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) @@ -724,8 +736,12 @@ static int iucv_sock_bind(struct socket *sock, struct sockaddr *addr, rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) { if (!memcmp(dev->perm_addr, uid, 8)) { - memcpy(iucv->src_name, sa->siucv_name, 8); memcpy(iucv->src_user_id, sa->siucv_user_id, 8); + /* Check for unitialized siucv_name */ + if (strncmp(sa->siucv_name, " ", 8) == 0) + __iucv_auto_name(iucv); + else + memcpy(iucv->src_name, sa->siucv_name, 8); sk->sk_bound_dev_if = dev->ifindex; iucv->hs_dev = dev; dev_hold(dev); @@ -763,7 +779,6 @@ done: static int iucv_sock_autobind(struct sock *sk) { struct iucv_sock *iucv = iucv_sk(sk); - char name[12]; int err = 0; if (unlikely(!pr_iucv)) @@ -772,17 +787,9 @@ static int iucv_sock_autobind(struct sock *sk) memcpy(iucv->src_user_id, iucv_userid, 8); write_lock_bh(&iucv_sk_list.lock); - - sprintf(name, "%08x", atomic_inc_return(&iucv_sk_list.autobind_name)); - while (__iucv_get_sock_by_name(name)) { - sprintf(name, "%08x", - atomic_inc_return(&iucv_sk_list.autobind_name)); - } - + __iucv_auto_name(iucv); write_unlock_bh(&iucv_sk_list.lock); - memcpy(&iucv->src_name, name, 8); - if (!iucv->msglimit) iucv->msglimit = IUCV_QUEUELEN_DEFAULT; @@ -1382,6 +1389,7 @@ static int iucv_sock_recvmsg(struct kiocb *iocb, struct socket *sock, if (sk->sk_type == SOCK_STREAM) { if (copied < rlen) { IUCV_SKB_CB(skb)->offset = offset + copied; + skb_queue_head(&sk->sk_receive_queue, skb); goto done; } } @@ -1756,7 +1764,7 @@ static int 
iucv_callback_connreq(struct iucv_path *path, /* Wake up accept */ nsk->sk_state = IUCV_CONNECTED; - sk->sk_data_ready(sk, 1); + sk->sk_data_ready(sk); err = 0; fail: bh_unlock_sock(sk); @@ -1829,7 +1837,7 @@ static void iucv_callback_txdone(struct iucv_path *path, spin_lock_irqsave(&list->lock, flags); while (list_skb != (struct sk_buff *)list) { - if (msg->tag != IUCV_SKB_CB(list_skb)->tag) { + if (msg->tag == IUCV_SKB_CB(list_skb)->tag) { this = list_skb; break; } @@ -1935,11 +1943,10 @@ static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb) sk_acceptq_is_full(sk) || !nsk) { /* error on server socket - connection refused */ - if (nsk) - sk_free(nsk); afiucv_swap_src_dest(skb); trans_hdr->flags = AF_IUCV_FLAG_SYN | AF_IUCV_FLAG_FIN; err = dev_queue_xmit(skb); + iucv_sock_kill(nsk); bh_unlock_sock(sk); goto out; } @@ -1967,7 +1974,7 @@ static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb) if (!err) { iucv_accept_enqueue(sk, nsk); nsk->sk_state = IUCV_CONNECTED; - sk->sk_data_ready(sk, 1); + sk->sk_data_ready(sk); } else iucv_sock_kill(nsk); bh_unlock_sock(sk); diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index cd5b8ec9be0..da787930df0 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -621,6 +621,42 @@ static void iucv_disable(void) put_online_cpus(); } +static void free_iucv_data(int cpu) +{ + kfree(iucv_param_irq[cpu]); + iucv_param_irq[cpu] = NULL; + kfree(iucv_param[cpu]); + iucv_param[cpu] = NULL; + kfree(iucv_irq_data[cpu]); + iucv_irq_data[cpu] = NULL; +} + +static int alloc_iucv_data(int cpu) +{ + /* Note: GFP_DMA used to get memory below 2G */ + iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data), + GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); + if (!iucv_irq_data[cpu]) + goto out_free; + + /* Allocate parameter blocks. 
*/ + iucv_param[cpu] = kmalloc_node(sizeof(union iucv_param), + GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); + if (!iucv_param[cpu]) + goto out_free; + + iucv_param_irq[cpu] = kmalloc_node(sizeof(union iucv_param), + GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); + if (!iucv_param_irq[cpu]) + goto out_free; + + return 0; + +out_free: + free_iucv_data(cpu); + return -ENOMEM; +} + static int iucv_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -630,38 +666,14 @@ static int iucv_cpu_notify(struct notifier_block *self, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data), - GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); - if (!iucv_irq_data[cpu]) - return notifier_from_errno(-ENOMEM); - - iucv_param[cpu] = kmalloc_node(sizeof(union iucv_param), - GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); - if (!iucv_param[cpu]) { - kfree(iucv_irq_data[cpu]); - iucv_irq_data[cpu] = NULL; + if (alloc_iucv_data(cpu)) return notifier_from_errno(-ENOMEM); - } - iucv_param_irq[cpu] = kmalloc_node(sizeof(union iucv_param), - GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); - if (!iucv_param_irq[cpu]) { - kfree(iucv_param[cpu]); - iucv_param[cpu] = NULL; - kfree(iucv_irq_data[cpu]); - iucv_irq_data[cpu] = NULL; - return notifier_from_errno(-ENOMEM); - } break; case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - kfree(iucv_param_irq[cpu]); - iucv_param_irq[cpu] = NULL; - kfree(iucv_param[cpu]); - iucv_param[cpu] = NULL; - kfree(iucv_irq_data[cpu]); - iucv_irq_data[cpu] = NULL; + free_iucv_data(cpu); break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: @@ -2016,7 +2028,7 @@ static int __init iucv_init(void) rc = iucv_query_maxconn(); if (rc) goto out_ctl; - rc = register_external_interrupt(0x4000, iucv_external_interrupt); + rc = register_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt); if (rc) goto out_ctl; iucv_root = root_device_register("iucv"); @@ -2025,33 +2037,20 @@ static int __init iucv_init(void) goto out_int; } - for_each_online_cpu(cpu) { - /* Note: GFP_DMA used to get memory below 2G */ - iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data), - GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); - if (!iucv_irq_data[cpu]) { - rc = -ENOMEM; - goto out_free; - } + cpu_notifier_register_begin(); - /* Allocate parameter blocks. 
*/ - iucv_param[cpu] = kmalloc_node(sizeof(union iucv_param), - GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); - if (!iucv_param[cpu]) { - rc = -ENOMEM; - goto out_free; - } - iucv_param_irq[cpu] = kmalloc_node(sizeof(union iucv_param), - GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); - if (!iucv_param_irq[cpu]) { + for_each_online_cpu(cpu) { + if (alloc_iucv_data(cpu)) { rc = -ENOMEM; goto out_free; } - } - rc = register_hotcpu_notifier(&iucv_cpu_notifier); + rc = __register_hotcpu_notifier(&iucv_cpu_notifier); if (rc) goto out_free; + + cpu_notifier_register_done(); + rc = register_reboot_notifier(&iucv_reboot_notifier); if (rc) goto out_cpu; @@ -2069,19 +2068,17 @@ static int __init iucv_init(void) out_reboot: unregister_reboot_notifier(&iucv_reboot_notifier); out_cpu: - unregister_hotcpu_notifier(&iucv_cpu_notifier); + cpu_notifier_register_begin(); + __unregister_hotcpu_notifier(&iucv_cpu_notifier); out_free: - for_each_possible_cpu(cpu) { - kfree(iucv_param_irq[cpu]); - iucv_param_irq[cpu] = NULL; - kfree(iucv_param[cpu]); - iucv_param[cpu] = NULL; - kfree(iucv_irq_data[cpu]); - iucv_irq_data[cpu] = NULL; - } + for_each_possible_cpu(cpu) + free_iucv_data(cpu); + + cpu_notifier_register_done(); + root_device_unregister(iucv_root); out_int: - unregister_external_interrupt(0x4000, iucv_external_interrupt); + unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt); out_ctl: ctl_clear_bit(0, 1); out: @@ -2105,18 +2102,14 @@ static void __exit iucv_exit(void) kfree(p); spin_unlock_irq(&iucv_queue_lock); unregister_reboot_notifier(&iucv_reboot_notifier); - unregister_hotcpu_notifier(&iucv_cpu_notifier); - for_each_possible_cpu(cpu) { - kfree(iucv_param_irq[cpu]); - iucv_param_irq[cpu] = NULL; - kfree(iucv_param[cpu]); - iucv_param[cpu] = NULL; - kfree(iucv_irq_data[cpu]); - iucv_irq_data[cpu] = NULL; - } + cpu_notifier_register_begin(); + __unregister_hotcpu_notifier(&iucv_cpu_notifier); + for_each_possible_cpu(cpu) + free_iucv_data(cpu); + cpu_notifier_register_done(); root_device_unregister(iucv_root); bus_unregister(&iucv_bus); - unregister_external_interrupt(0x4000, iucv_external_interrupt); + unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt); } subsys_initcall(iucv_init); diff --git a/net/key/af_key.c b/net/key/af_key.c index 545f047868a..ba2a2f95911 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -205,7 +205,7 @@ static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2, if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { skb_set_owner_r(*skb2, sk); skb_queue_tail(&sk->sk_receive_queue, *skb2); - sk->sk_data_ready(sk, (*skb2)->len); + sk->sk_data_ready(sk); *skb2 = NULL; err = 0; } @@ -365,6 +365,7 @@ static const u8 sadb_ext_min_len[] = { [SADB_X_EXT_NAT_T_OA] = (u8) sizeof(struct sadb_address), [SADB_X_EXT_SEC_CTX] = (u8) sizeof(struct sadb_x_sec_ctx), [SADB_X_EXT_KMADDRESS] = (u8) sizeof(struct sadb_x_kmaddress), + [SADB_X_EXT_FILTER] = (u8) sizeof(struct sadb_x_filter), }; /* Verify sadb_address_{len,prefixlen} against sa_family. 
*/ @@ -433,12 +434,13 @@ static inline int verify_sec_ctx_len(const void *p) return 0; } -static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(const struct sadb_x_sec_ctx *sec_ctx) +static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(const struct sadb_x_sec_ctx *sec_ctx, + gfp_t gfp) { struct xfrm_user_sec_ctx *uctx = NULL; int ctx_size = sec_ctx->sadb_x_ctx_len; - uctx = kmalloc((sizeof(*uctx)+ctx_size), GFP_KERNEL); + uctx = kmalloc((sizeof(*uctx)+ctx_size), gfp); if (!uctx) return NULL; @@ -1124,7 +1126,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, sec_ctx = ext_hdrs[SADB_X_EXT_SEC_CTX - 1]; if (sec_ctx != NULL) { - struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_KERNEL); if (!uctx) goto out; @@ -1340,6 +1342,12 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_ max_spi = range->sadb_spirange_max; } + err = verify_spi_info(x->id.proto, min_spi, max_spi); + if (err) { + xfrm_state_put(x); + return err; + } + err = xfrm_alloc_spi(x, min_spi, max_spi); resp_skb = err ? ERR_PTR(err) : pfkey_xfrm_state2msg(x); @@ -1380,10 +1388,9 @@ static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, const struct sadb return 0; spin_lock_bh(&x->lock); - if (x->km.state == XFRM_STATE_ACQ) { + if (x->km.state == XFRM_STATE_ACQ) x->km.state = XFRM_STATE_ERROR; - wake_up(&net->xfrm.km_waitq); - } + spin_unlock_bh(&x->lock); xfrm_state_put(x); return 0; @@ -1469,9 +1476,7 @@ static int pfkey_add(struct sock *sk, struct sk_buff *skb, const struct sadb_msg else err = xfrm_state_update(x); - xfrm_audit_state_add(x, err ? 0 : 1, - audit_get_loginuid(current), - audit_get_sessionid(current), 0); + xfrm_audit_state_add(x, err ? 0 : 1, true); if (err < 0) { x->km.state = XFRM_STATE_DEAD; @@ -1525,9 +1530,7 @@ static int pfkey_delete(struct sock *sk, struct sk_buff *skb, const struct sadb_ c.event = XFRM_MSG_DELSA; km_state_notify(x, &c); out: - xfrm_audit_state_delete(x, err ? 0 : 1, - audit_get_loginuid(current), - audit_get_sessionid(current), 0); + xfrm_audit_state_delete(x, err ? 
0 : 1, true); xfrm_state_put(x); return err; @@ -1719,17 +1722,13 @@ static int pfkey_flush(struct sock *sk, struct sk_buff *skb, const struct sadb_m struct net *net = sock_net(sk); unsigned int proto; struct km_event c; - struct xfrm_audit audit_info; int err, err2; proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) return -EINVAL; - audit_info.loginuid = audit_get_loginuid(current); - audit_info.sessionid = audit_get_sessionid(current); - audit_info.secid = 0; - err = xfrm_state_flush(net, proto, &audit_info); + err = xfrm_state_flush(net, proto, true); err2 = unicast_flush_resp(sk, hdr); if (err || err2) { if (err == -ESRCH) /* empty table - go quietly */ @@ -1785,12 +1784,15 @@ static int pfkey_dump_sa(struct pfkey_sock *pfk) static void pfkey_dump_sa_done(struct pfkey_sock *pfk) { - xfrm_state_walk_done(&pfk->dump.u.state); + struct net *net = sock_net(&pfk->sk); + + xfrm_state_walk_done(&pfk->dump.u.state, net); } static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { u8 proto; + struct xfrm_address_filter *filter = NULL; struct pfkey_sock *pfk = pfkey_sk(sk); if (pfk->dump.dump != NULL) @@ -1800,11 +1802,27 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_ms if (proto == 0) return -EINVAL; + if (ext_hdrs[SADB_X_EXT_FILTER - 1]) { + struct sadb_x_filter *xfilter = ext_hdrs[SADB_X_EXT_FILTER - 1]; + + filter = kmalloc(sizeof(*filter), GFP_KERNEL); + if (filter == NULL) + return -ENOMEM; + + memcpy(&filter->saddr, &xfilter->sadb_x_filter_saddr, + sizeof(xfrm_address_t)); + memcpy(&filter->daddr, &xfilter->sadb_x_filter_daddr, + sizeof(xfrm_address_t)); + filter->family = xfilter->sadb_x_filter_family; + filter->splen = xfilter->sadb_x_filter_splen; + filter->dplen = xfilter->sadb_x_filter_dplen; + } + pfk->dump.msg_version = hdr->sadb_msg_version; pfk->dump.msg_portid = hdr->sadb_msg_pid; pfk->dump.dump = pfkey_dump_sa; pfk->dump.done = pfkey_dump_sa_done; - xfrm_state_walk_init(&pfk->dump.u.state, proto); + xfrm_state_walk_init(&pfk->dump.u.state, proto, filter); return pfkey_do_dump(pfk); } @@ -1861,7 +1879,7 @@ static u32 gen_reqid(struct net *net) reqid = IPSEC_MANUAL_REQID_MAX+1; xfrm_policy_walk_init(&walk, XFRM_POLICY_TYPE_MAIN); rc = xfrm_policy_walk(net, &walk, check_reqid, (void*)&reqid); - xfrm_policy_walk_done(&walk); + xfrm_policy_walk_done(&walk, net); if (rc != -EEXIST) return reqid; } while (reqid != start); @@ -2224,14 +2242,14 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, const struct sadb_ sec_ctx = ext_hdrs[SADB_X_EXT_SEC_CTX - 1]; if (sec_ctx != NULL) { - struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_KERNEL); if (!uctx) { err = -ENOBUFS; goto out; } - err = security_xfrm_policy_alloc(&xp->security, uctx); + err = security_xfrm_policy_alloc(&xp->security, uctx, GFP_KERNEL); kfree(uctx); if (err) @@ -2262,9 +2280,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, const struct sadb_ err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp, hdr->sadb_msg_type != SADB_X_SPDUPDATE); - xfrm_audit_policy_add(xp, err ? 0 : 1, - audit_get_loginuid(current), - audit_get_sessionid(current), 0); + xfrm_audit_policy_add(xp, err ? 
0 : 1, true); if (err) goto out; @@ -2328,12 +2344,12 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sa sec_ctx = ext_hdrs[SADB_X_EXT_SEC_CTX - 1]; if (sec_ctx != NULL) { - struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_KERNEL); if (!uctx) return -ENOMEM; - err = security_xfrm_policy_alloc(&pol_ctx, uctx); + err = security_xfrm_policy_alloc(&pol_ctx, uctx, GFP_KERNEL); kfree(uctx); if (err) return err; @@ -2346,9 +2362,7 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sa if (xp == NULL) return -ENOENT; - xfrm_audit_policy_delete(xp, err ? 0 : 1, - audit_get_loginuid(current), - audit_get_sessionid(current), 0); + xfrm_audit_policy_delete(xp, err ? 0 : 1, true); if (err) goto out; @@ -2485,6 +2499,7 @@ static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, struct xfrm_selector sel; struct xfrm_migrate m[XFRM_MAX_DEPTH]; struct xfrm_kmaddress k; + struct net *net = sock_net(sk); if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC - 1], ext_hdrs[SADB_EXT_ADDRESS_DST - 1]) || @@ -2526,7 +2541,7 @@ static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, sel.sport_mask = htons(0xffff); /* set destination address info of selector */ - sa = ext_hdrs[SADB_EXT_ADDRESS_DST - 1], + sa = ext_hdrs[SADB_EXT_ADDRESS_DST - 1]; pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr); sel.prefixlen_d = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); @@ -2558,7 +2573,7 @@ static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, } return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i, - kma ? &k : NULL); + kma ? &k : NULL, net); out: return err; @@ -2595,9 +2610,7 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_ return -ENOENT; if (delete) { - xfrm_audit_policy_delete(xp, err ? 0 : 1, - audit_get_loginuid(current), - audit_get_sessionid(current), 0); + xfrm_audit_policy_delete(xp, err ? 
0 : 1, true); if (err) goto out; @@ -2659,7 +2672,9 @@ static int pfkey_dump_sp(struct pfkey_sock *pfk) static void pfkey_dump_sp_done(struct pfkey_sock *pfk) { - xfrm_policy_walk_done(&pfk->dump.u.policy); + struct net *net = sock_net((struct sock *)pfk); + + xfrm_policy_walk_done(&pfk->dump.u.policy, net); } static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) @@ -2704,13 +2719,9 @@ static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, const struct sad { struct net *net = sock_net(sk); struct km_event c; - struct xfrm_audit audit_info; int err, err2; - audit_info.loginuid = audit_get_loginuid(current); - audit_info.sessionid = audit_get_sessionid(current); - audit_info.secid = 0; - err = xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, &audit_info); + err = xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, true); err2 = unicast_flush_resp(sk, hdr); if (err || err2) { if (err == -ESRCH) /* empty table - old silent behavior */ @@ -3049,6 +3060,24 @@ static u32 get_acqseq(void) return res; } +static bool pfkey_is_alive(const struct km_event *c) +{ + struct netns_pfkey *net_pfkey = net_generic(c->net, pfkey_net_id); + struct sock *sk; + bool is_alive = false; + + rcu_read_lock(); + sk_for_each_rcu(sk, &net_pfkey->table) { + if (pfkey_sk(sk)->registered) { + is_alive = true; + break; + } + } + rcu_read_unlock(); + + return is_alive; +} + static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *xp) { struct sk_buff *skb; @@ -3229,8 +3258,8 @@ static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt, } if ((*dir = verify_sec_ctx_len(p))) goto out; - uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); - *dir = security_xfrm_policy_alloc(&xp->security, uctx); + uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_ATOMIC); + *dir = security_xfrm_policy_alloc(&xp->security, uctx, GFP_ATOMIC); kfree(uctx); if (*dir) @@ -3569,6 +3598,7 @@ static int pfkey_sendmsg(struct kiocb *kiocb, struct sk_buff *skb = NULL; struct sadb_msg *hdr = NULL; int err; + struct net *net = sock_net(sk); err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) @@ -3591,9 +3621,9 @@ static int pfkey_sendmsg(struct kiocb *kiocb, if (!hdr) goto out; - mutex_lock(&xfrm_cfg_mutex); + mutex_lock(&net->xfrm.xfrm_cfg_mutex); err = pfkey_process(sk, skb, hdr); - mutex_unlock(&xfrm_cfg_mutex); + mutex_unlock(&net->xfrm.xfrm_cfg_mutex); out: if (err && hdr && pfkey_error(hdr, err, sk) == 0) @@ -3773,6 +3803,7 @@ static struct xfrm_mgr pfkeyv2_mgr = .new_mapping = pfkey_send_new_mapping, .notify_policy = pfkey_send_policy_notify, .migrate = pfkey_send_migrate, + .is_alive = pfkey_is_alive, }; static int __net_init pfkey_net_init(struct net *net) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 9af77d9c0ec..bea25904320 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -112,7 +112,6 @@ struct l2tp_net { spinlock_t l2tp_session_hlist_lock; }; -static void l2tp_session_set_header_len(struct l2tp_session *session, int version); static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel); static inline struct l2tp_tunnel *l2tp_tunnel(struct sock *sk) @@ -176,7 +175,7 @@ l2tp_session_id_hash_2(struct l2tp_net *pn, u32 session_id) * owned by userspace. A struct sock returned from this function must be * released using l2tp_tunnel_sock_put once you're done with it. 
*/ -struct sock *l2tp_tunnel_sock_lookup(struct l2tp_tunnel *tunnel) +static struct sock *l2tp_tunnel_sock_lookup(struct l2tp_tunnel *tunnel) { int err = 0; struct socket *sock = NULL; @@ -202,10 +201,9 @@ struct sock *l2tp_tunnel_sock_lookup(struct l2tp_tunnel *tunnel) out: return sk; } -EXPORT_SYMBOL_GPL(l2tp_tunnel_sock_lookup); /* Drop a reference to a tunnel socket obtained via. l2tp_tunnel_sock_put */ -void l2tp_tunnel_sock_put(struct sock *sk) +static void l2tp_tunnel_sock_put(struct sock *sk) { struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk); if (tunnel) { @@ -217,7 +215,6 @@ void l2tp_tunnel_sock_put(struct sock *sk) } sock_put(sk); } -EXPORT_SYMBOL_GPL(l2tp_tunnel_sock_put); /* Lookup a session by id in the global session list */ @@ -498,52 +495,6 @@ out: spin_unlock_bh(&session->reorder_q.lock); } -static inline int l2tp_verify_udp_checksum(struct sock *sk, - struct sk_buff *skb) -{ - struct udphdr *uh = udp_hdr(skb); - u16 ulen = ntohs(uh->len); - __wsum psum; - - if (sk->sk_no_check || skb_csum_unnecessary(skb)) - return 0; - -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == PF_INET6 && !l2tp_tunnel(sk)->v4mapped) { - if (!uh->check) { - LIMIT_NETDEBUG(KERN_INFO "L2TP: IPv6: checksum is 0\n"); - return 1; - } - if ((skb->ip_summed == CHECKSUM_COMPLETE) && - !csum_ipv6_magic(&ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, ulen, - IPPROTO_UDP, skb->csum)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - return 0; - } - skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, - skb->len, IPPROTO_UDP, - 0)); - } else -#endif - { - struct inet_sock *inet; - if (!uh->check) - return 0; - inet = inet_sk(sk); - psum = csum_tcpudp_nofold(inet->inet_saddr, inet->inet_daddr, - ulen, IPPROTO_UDP, 0); - - if ((skb->ip_summed == CHECKSUM_COMPLETE) && - !csum_fold(csum_add(psum, skb->csum))) - return 0; - skb->csum = psum; - } - - return __skb_checksum_complete(skb); -} - static int l2tp_seq_check_rx_window(struct l2tp_session *session, u32 nr) { u32 nws; @@ -898,8 +849,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, u16 version; int length; - if (tunnel->sock && l2tp_verify_udp_checksum(tunnel->sock, skb)) - goto discard_bad_csum; + /* UDP has verifed checksum */ /* UDP always verifies the packet length. */ __skb_pull(skb, sizeof(struct udphdr)); @@ -982,14 +932,6 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, return 0; -discard_bad_csum: - LIMIT_NETDEBUG("%s: UDP: bad checksum\n", tunnel->name); - UDP_INC_STATS_USER(tunnel->l2tp_net, UDP_MIB_INERRORS, 0); - atomic_long_inc(&tunnel->stats.rx_errors); - kfree_skb(skb); - - return 0; - error: /* Put UDP header back */ __skb_push(skb, sizeof(struct udphdr)); @@ -1131,13 +1073,13 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, } /* Queue the packet to IP for output */ - skb->local_df = 1; + skb->ignore_df = 1; #if IS_ENABLED(CONFIG_IPV6) - if (skb->sk->sk_family == PF_INET6 && !tunnel->v4mapped) - error = inet6_csk_xmit(skb, NULL); + if (tunnel->sock->sk_family == PF_INET6 && !tunnel->v4mapped) + error = inet6_csk_xmit(tunnel->sock, skb, NULL); else #endif - error = ip_queue_xmit(skb, fl); + error = ip_queue_xmit(tunnel->sock, skb, fl); /* Update stats */ if (error >= 0) { @@ -1153,48 +1095,6 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, return 0; } -/* Automatically called when the skb is freed. 
- */ -static void l2tp_sock_wfree(struct sk_buff *skb) -{ - sock_put(skb->sk); -} - -/* For data skbs that we transmit, we associate with the tunnel socket - * but don't do accounting. - */ -static inline void l2tp_skb_set_owner_w(struct sk_buff *skb, struct sock *sk) -{ - sock_hold(sk); - skb->sk = sk; - skb->destructor = l2tp_sock_wfree; -} - -#if IS_ENABLED(CONFIG_IPV6) -static void l2tp_xmit_ipv6_csum(struct sock *sk, struct sk_buff *skb, - int udp_len) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct udphdr *uh = udp_hdr(skb); - - if (!skb_dst(skb) || !skb_dst(skb)->dev || - !(skb_dst(skb)->dev->features & NETIF_F_IPV6_CSUM)) { - __wsum csum = skb_checksum(skb, 0, udp_len, 0); - skb->ip_summed = CHECKSUM_UNNECESSARY; - uh->check = csum_ipv6_magic(&np->saddr, &sk->sk_v6_daddr, udp_len, - IPPROTO_UDP, csum); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - } else { - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct udphdr, check); - uh->check = ~csum_ipv6_magic(&np->saddr, &sk->sk_v6_daddr, - udp_len, IPPROTO_UDP, 0); - } -} -#endif - /* If caller requires the skb to have a ppp header, the header must be * inserted in the skb data before calling this function. */ @@ -1206,7 +1106,6 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len struct flowi *fl; struct udphdr *uh; struct inet_sock *inet; - __wsum csum; int headroom; int uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0; int udp_len; @@ -1223,7 +1122,6 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len return NET_XMIT_DROP; } - skb_orphan(skb); /* Setup L2TP header */ session->build_header(session, __skb_push(skb, hdr_len)); @@ -1256,41 +1154,23 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len uh->dest = inet->inet_dport; udp_len = uhlen + hdr_len + data_len; uh->len = htons(udp_len); - uh->check = 0; /* Calculate UDP checksum if configured to do so */ #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == PF_INET6 && !tunnel->v4mapped) - l2tp_xmit_ipv6_csum(sk, skb, udp_len); + udp6_set_csum(udp_get_no_check6_tx(sk), + skb, &inet6_sk(sk)->saddr, + &sk->sk_v6_daddr, udp_len); else #endif - if (sk->sk_no_check == UDP_CSUM_NOXMIT) - skb->ip_summed = CHECKSUM_NONE; - else if ((skb_dst(skb) && skb_dst(skb)->dev) && - (!(skb_dst(skb)->dev->features & NETIF_F_V4_CSUM))) { - skb->ip_summed = CHECKSUM_COMPLETE; - csum = skb_checksum(skb, 0, udp_len, 0); - uh->check = csum_tcpudp_magic(inet->inet_saddr, - inet->inet_daddr, - udp_len, IPPROTO_UDP, csum); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - } else { - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct udphdr, check); - uh->check = ~csum_tcpudp_magic(inet->inet_saddr, - inet->inet_daddr, - udp_len, IPPROTO_UDP, 0); - } + udp_set_csum(sk->sk_no_check_tx, skb, inet->inet_saddr, + inet->inet_daddr, udp_len); break; case L2TP_ENCAPTYPE_IP: break; } - l2tp_skb_set_owner_w(skb, sk); - l2tp_xmit_core(session, skb, fl, data_len); out_unlock: bh_unlock_sock(sk); @@ -1513,6 +1393,11 @@ static int l2tp_tunnel_sock_create(struct net *net, sizeof(udp6_addr), 0); if (err < 0) goto out; + + if (cfg->udp6_zero_tx_checksums) + udp_set_no_check6_tx(sock->sk, true); + if (cfg->udp6_zero_rx_checksums) + udp_set_no_check6_rx(sock->sk, true); } else #endif { @@ -1541,7 +1426,7 @@ static int 
l2tp_tunnel_sock_create(struct net *net, } if (!cfg->use_udp_checksums) - sock->sk->sk_no_check = UDP_CSUM_NOXMIT; + sock->sk->sk_no_check_tx = 1; break; @@ -1811,8 +1696,6 @@ void l2tp_session_free(struct l2tp_session *session) } kfree(session); - - return; } EXPORT_SYMBOL_GPL(l2tp_session_free); @@ -1865,7 +1748,7 @@ EXPORT_SYMBOL_GPL(l2tp_session_delete); /* We come here whenever a session's send_seq, cookie_len or * l2specific_len parameters are set. */ -static void l2tp_session_set_header_len(struct l2tp_session *session, int version) +void l2tp_session_set_header_len(struct l2tp_session *session, int version) { if (version == L2TP_HDR_VER_2) { session->hdr_len = 6; @@ -1878,6 +1761,7 @@ static void l2tp_session_set_header_len(struct l2tp_session *session, int versio } } +EXPORT_SYMBOL_GPL(l2tp_session_set_header_len); struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg) { @@ -2018,7 +1902,7 @@ static int __init l2tp_init(void) if (rc) goto out; - l2tp_wq = alloc_workqueue("l2tp", WQ_NON_REENTRANT | WQ_UNBOUND, 0); + l2tp_wq = alloc_workqueue("l2tp", WQ_UNBOUND, 0); if (!l2tp_wq) { pr_err("alloc_workqueue failed\n"); rc = -ENOMEM; diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 1ee9f6965d6..68aa9ffd4ae 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -162,7 +162,9 @@ struct l2tp_tunnel_cfg { #endif u16 local_udp_port; u16 peer_udp_port; - unsigned int use_udp_checksums:1; + unsigned int use_udp_checksums:1, + udp6_zero_tx_checksums:1, + udp6_zero_rx_checksums:1; }; struct l2tp_tunnel { @@ -238,8 +240,6 @@ out: return tunnel; } -struct sock *l2tp_tunnel_sock_lookup(struct l2tp_tunnel *tunnel); -void l2tp_tunnel_sock_put(struct sock *sk); struct l2tp_session *l2tp_session_find(struct net *net, struct l2tp_tunnel *tunnel, u32 session_id); @@ -265,6 +265,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, int length, int (*payload_hook)(struct sk_buff *skb)); int l2tp_session_queue_purge(struct l2tp_session *session); int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb); +void l2tp_session_set_header_len(struct l2tp_session *session, int version); int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len); diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index da1a1cee1a0..369a9822488 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -403,7 +403,7 @@ static int l2tp_ip_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m /* Get and verify the address. 
*/ if (msg->msg_name) { - struct sockaddr_l2tpip *lip = (struct sockaddr_l2tpip *) msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_l2tpip *, lip, msg->msg_name); rc = -EINVAL; if (msg->msg_namelen < sizeof(*lip)) goto out; @@ -487,7 +487,7 @@ static int l2tp_ip_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m xmit: /* Queue the packet to IP for output */ - rc = ip_queue_xmit(skb, &inet->cork.fl); + rc = ip_queue_xmit(sk, skb, &inet->cork.fl); rcu_read_unlock(); error: @@ -512,7 +512,7 @@ static int l2tp_ip_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m struct inet_sock *inet = inet_sk(sk); size_t copied = 0; int err = -EOPNOTSUPP; - struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; if (flags & MSG_OOB) @@ -606,7 +606,6 @@ static struct inet_protosw l2tp_ip_protosw = { .protocol = IPPROTO_L2TP, .prot = &l2tp_ip_prot, .ops = &l2tp_ip_ops, - .no_check = 0, }; static struct net_protocol l2tp_ip_protocol __read_mostly = { diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index d9b437e5500..f3f98a156ce 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -371,6 +371,9 @@ static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr, if (addr_len < sizeof(*lsa)) return -EINVAL; + if (usin->sin6_family != AF_INET6) + return -EINVAL; + addr_type = ipv6_addr_type(&usin->sin6_addr); if (addr_type & IPV6_ADDR_MULTICAST) return -EINVAL; @@ -481,8 +484,7 @@ static int l2tp_ip6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { struct ipv6_txoptions opt_space; - struct sockaddr_l2tpip6 *lsa = - (struct sockaddr_l2tpip6 *) msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_txoptions *opt = NULL; @@ -528,7 +530,6 @@ static int l2tp_ip6_sendmsg(struct kiocb *iocb, struct sock *sk, flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - daddr = &flowlabel->dst; } } @@ -598,20 +599,14 @@ static int l2tp_ip6_sendmsg(struct kiocb *iocb, struct sock *sk, security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true); + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; } - if (hlimit < 0) { - if (ipv6_addr_is_multicast(&fl6.daddr)) - hlimit = np->mcast_hops; - else - hlimit = np->hop_limit; - if (hlimit < 0) - hlimit = ip6_dst_hoplimit(dst); - } + if (hlimit < 0) + hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); if (tclass < 0) tclass = np->tclass; @@ -653,7 +648,7 @@ static int l2tp_ip6_recvmsg(struct kiocb *iocb, struct sock *sk, int flags, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); - struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name); size_t copied = 0; int err = -EOPNOTSUPP; struct sk_buff *skb; @@ -760,7 +755,6 @@ static struct inet_protosw l2tp_ip6_protosw = { .protocol = IPPROTO_L2TP, .prot = &l2tp_ip6_prot, .ops = &l2tp_ip6_ops, - .no_check = 0, }; static struct inet6_protocol l2tp_ip6_protocol __read_mostly = { diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 4cfd722e915..0ac907adb2f 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -161,6 +161,13 @@ static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info cfg.peer_udp_port = 
nla_get_u16(info->attrs[L2TP_ATTR_UDP_DPORT]); if (info->attrs[L2TP_ATTR_UDP_CSUM]) cfg.use_udp_checksums = nla_get_flag(info->attrs[L2TP_ATTR_UDP_CSUM]); + +#if IS_ENABLED(CONFIG_IPV6) + if (info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX]) + cfg.udp6_zero_tx_checksums = nla_get_flag(info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX]); + if (info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX]) + cfg.udp6_zero_rx_checksums = nla_get_flag(info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX]); +#endif } if (info->attrs[L2TP_ATTR_DEBUG]) @@ -297,8 +304,7 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla case L2TP_ENCAPTYPE_UDP: if (nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) || nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport)) || - nla_put_u8(skb, L2TP_ATTR_UDP_CSUM, - (sk->sk_no_check != UDP_CSUM_NOXMIT))) + nla_put_u8(skb, L2TP_ATTR_UDP_CSUM, !sk->sk_no_check_tx)) goto nla_put_failure; /* NOBREAK */ case L2TP_ENCAPTYPE_IP: @@ -578,8 +584,10 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf if (info->attrs[L2TP_ATTR_RECV_SEQ]) session->recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]); - if (info->attrs[L2TP_ATTR_SEND_SEQ]) + if (info->attrs[L2TP_ATTR_SEND_SEQ]) { session->send_seq = nla_get_u8(info->attrs[L2TP_ATTR_SEND_SEQ]); + l2tp_session_set_header_len(session, session->tunnel->version); + } if (info->attrs[L2TP_ATTR_LNS_MODE]) session->lns_mode = nla_get_u8(info->attrs[L2TP_ATTR_LNS_MODE]); diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index be5fadf3473..13752d96275 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -254,12 +254,14 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int po = pppox_sk(sk); ppp_input(&po->chan, skb); } else { - l2tp_info(session, PPPOL2TP_MSG_DATA, "%s: socket not bound\n", - session->name); + l2tp_dbg(session, PPPOL2TP_MSG_DATA, + "%s: recv %d byte data frame, passing to L2TP socket\n", + session->name, data_len); - /* Not bound. Nothing we can do, so discard. */ - atomic_long_inc(&session->stats.rx_errors); - kfree_skb(skb); + if (sock_queue_rcv_skb(sk, skb) < 0) { + atomic_long_inc(&session->stats.rx_errors); + kfree_skb(skb); + } } return; @@ -454,13 +456,11 @@ static void pppol2tp_session_close(struct l2tp_session *session) BUG_ON(session->magic != L2TP_SESSION_MAGIC); - if (sock) { inet_shutdown(sock, 2); /* Don't let the session go away before our socket does */ l2tp_session_inc_refcount(session); } - return; } /* Really kill the session socket. (Called from sock_put() if @@ -474,7 +474,6 @@ static void pppol2tp_session_destruct(struct sock *sk) BUG_ON(session->magic != L2TP_SESSION_MAGIC); l2tp_session_dec_refcount(session); } - return; } /* Called when the PPPoX socket (session) is closed. @@ -754,9 +753,9 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, session->deref = pppol2tp_session_sock_put; /* If PMTU discovery was enabled, use the MTU that was discovered */ - dst = sk_dst_get(sk); + dst = sk_dst_get(tunnel->sock); if (dst != NULL) { - u32 pmtu = dst_mtu(__sk_dst_get(sk)); + u32 pmtu = dst_mtu(__sk_dst_get(tunnel->sock)); if (pmtu != 0) session->mtu = session->mru = pmtu - PPPOL2TP_HEADER_OVERHEAD; @@ -1312,6 +1311,7 @@ static int pppol2tp_session_setsockopt(struct sock *sk, po->chan.hdrlen = val ? 
PPPOL2TP_L2TP_HDR_SIZE_SEQ : PPPOL2TP_L2TP_HDR_SIZE_NOSEQ; } + l2tp_session_set_header_len(session, session->tunnel->version); l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: set send_seq=%d\n", session->name, session->send_seq); @@ -1365,7 +1365,7 @@ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname, int err; if (level != SOL_PPPOL2TP) - return udp_prot.setsockopt(sk, level, optname, optval, optlen); + return -EINVAL; if (optlen < sizeof(int)) return -EINVAL; @@ -1491,7 +1491,7 @@ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname, struct pppol2tp_session *ps; if (level != SOL_PPPOL2TP) - return udp_prot.getsockopt(sk, level, optname, optval, optlen); + return -EINVAL; if (get_user(len, optlen)) return -EFAULT; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 7b01b9f5846..0080d2b0a8a 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -707,7 +707,7 @@ out: static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len, int flags) { - struct sockaddr_llc *uaddr = (struct sockaddr_llc *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_llc *, uaddr, msg->msg_name); const int nonblock = flags & MSG_DONTWAIT; struct sk_buff *skb = NULL; struct sock *sk = sock->sk; @@ -715,7 +715,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, unsigned long cpu_flags; size_t copied = 0; u32 peek_seq = 0; - u32 *seq; + u32 *seq, skb_len; unsigned long used; int target; /* Read at least this many bytes */ long timeo; @@ -812,6 +812,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, } continue; found_ok_skb: + skb_len = skb->len; /* Ok so how much can we use? */ used = skb->len - offset; if (len < used) @@ -844,7 +845,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, } /* Partial read */ - if (used + offset < skb->len) + if (used + offset < skb_len) continue; } while (len > 0); @@ -883,7 +884,7 @@ static int llc_ui_sendmsg(struct kiocb *iocb, struct socket *sock, { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); - struct sockaddr_llc *addr = (struct sockaddr_llc *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name); int flags = msg->msg_flags; int noblock = flags & MSG_DONTWAIT; struct sk_buff *skb; diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index cd872417796..42dc2e45c92 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -753,7 +753,7 @@ void llc_sap_remove_socket(struct llc_sap *sap, struct sock *sk) * * Sends received pdus to the connection state machine. */ -static int llc_conn_rcv(struct sock* sk, struct sk_buff *skb) +static int llc_conn_rcv(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); @@ -891,7 +891,7 @@ out_kfree_skb: * * Initializes a socket with default llc values. */ -static void llc_sk_init(struct sock* sk) +static void llc_sk_init(struct sock *sk) { struct llc_sock *llc = llc_sk(sk); diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c index 2bb0ddff8c0..842851cef69 100644 --- a/net/llc/llc_core.c +++ b/net/llc/llc_core.c @@ -23,7 +23,7 @@ #include <net/llc.h> LIST_HEAD(llc_sap_list); -DEFINE_SPINLOCK(llc_sap_list_lock); +static DEFINE_SPINLOCK(llc_sap_list_lock); /** * llc_sap_alloc - allocates and initializes sap. 
@@ -48,7 +48,7 @@ static struct llc_sap *llc_sap_alloc(void) static struct llc_sap *__llc_sap_find(unsigned char sap_value) { - struct llc_sap* sap; + struct llc_sap *sap; list_for_each_entry(sap, &llc_sap_list, node) if (sap->laddr.lsap == sap_value) @@ -159,7 +159,6 @@ module_init(llc_init); module_exit(llc_exit); EXPORT_SYMBOL(llc_sap_list); -EXPORT_SYMBOL(llc_sap_list_lock); EXPORT_SYMBOL(llc_sap_find); EXPORT_SYMBOL(llc_sap_open); EXPORT_SYMBOL(llc_sap_close); diff --git a/net/llc/llc_output.c b/net/llc/llc_output.c index 2dae8a5df23..94425e42121 100644 --- a/net/llc/llc_output.c +++ b/net/llc/llc_output.c @@ -43,7 +43,7 @@ int llc_mac_hdr_init(struct sk_buff *skb, rc = 0; break; default: - WARN(1, "device type not supported: %d\n", skb->dev->type); + break; } return rc; } diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index e5850699098..06033f6c845 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -66,7 +66,7 @@ struct sk_buff *llc_alloc_frame(struct sock *sk, struct net_device *dev, return skb; } -void llc_save_primitive(struct sock *sk, struct sk_buff* skb, u8 prim) +void llc_save_primitive(struct sock *sk, struct sk_buff *skb, u8 prim) { struct sockaddr_llc *addr; @@ -114,7 +114,7 @@ void llc_sap_rtn_pdu(struct llc_sap *sap, struct sk_buff *skb) * failure. */ static struct llc_sap_state_trans *llc_find_sap_trans(struct llc_sap *sap, - struct sk_buff* skb) + struct sk_buff *skb) { int i = 0; struct llc_sap_state_trans *rc = NULL; diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index 9d7d840aac6..1e46ffa6916 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -25,7 +25,8 @@ mac80211-y := \ wme.o \ event.o \ chan.o \ - trace.o mlme.o + trace.o mlme.o \ + tdls.o mac80211-$(CONFIG_MAC80211_LEDS) += led.o mac80211-$(CONFIG_MAC80211_DEBUGFS) += \ diff --git a/net/mac80211/aes_ccm.c b/net/mac80211/aes_ccm.c index 7c7df475a40..ec24378caaa 100644 --- a/net/mac80211/aes_ccm.c +++ b/net/mac80211/aes_ccm.c @@ -23,12 +23,13 @@ void ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, u8 *data, size_t data_len, u8 *mic) { struct scatterlist assoc, pt, ct[2]; - struct { - struct aead_request req; - u8 priv[crypto_aead_reqsize(tfm)]; - } aead_req; - memset(&aead_req, 0, sizeof(aead_req)); + char aead_req_data[sizeof(struct aead_request) + + crypto_aead_reqsize(tfm)] + __aligned(__alignof__(struct aead_request)); + struct aead_request *aead_req = (void *) aead_req_data; + + memset(aead_req, 0, sizeof(aead_req_data)); sg_init_one(&pt, data, data_len); sg_init_one(&assoc, &aad[2], be16_to_cpup((__be16 *)aad)); @@ -36,23 +37,23 @@ void ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, sg_set_buf(&ct[0], data, data_len); sg_set_buf(&ct[1], mic, IEEE80211_CCMP_MIC_LEN); - aead_request_set_tfm(&aead_req.req, tfm); - aead_request_set_assoc(&aead_req.req, &assoc, assoc.length); - aead_request_set_crypt(&aead_req.req, &pt, ct, data_len, b_0); + aead_request_set_tfm(aead_req, tfm); + aead_request_set_assoc(aead_req, &assoc, assoc.length); + aead_request_set_crypt(aead_req, &pt, ct, data_len, b_0); - crypto_aead_encrypt(&aead_req.req); + crypto_aead_encrypt(aead_req); } int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, u8 *data, size_t data_len, u8 *mic) { struct scatterlist assoc, pt, ct[2]; - struct { - struct aead_request req; - u8 priv[crypto_aead_reqsize(tfm)]; - } aead_req; + char aead_req_data[sizeof(struct aead_request) + + crypto_aead_reqsize(tfm)] + __aligned(__alignof__(struct aead_request)); + struct 
aead_request *aead_req = (void *) aead_req_data; - memset(&aead_req, 0, sizeof(aead_req)); + memset(aead_req, 0, sizeof(aead_req_data)); sg_init_one(&pt, data, data_len); sg_init_one(&assoc, &aad[2], be16_to_cpup((__be16 *)aad)); @@ -60,12 +61,12 @@ int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, sg_set_buf(&ct[0], data, data_len); sg_set_buf(&ct[1], mic, IEEE80211_CCMP_MIC_LEN); - aead_request_set_tfm(&aead_req.req, tfm); - aead_request_set_assoc(&aead_req.req, &assoc, assoc.length); - aead_request_set_crypt(&aead_req.req, ct, &pt, + aead_request_set_tfm(aead_req, tfm); + aead_request_set_assoc(aead_req, &assoc, assoc.length); + aead_request_set_crypt(aead_req, ct, &pt, data_len + IEEE80211_CCMP_MIC_LEN, b_0); - return crypto_aead_decrypt(&aead_req.req); + return crypto_aead_decrypt(aead_req); } struct crypto_aead *ieee80211_aes_key_setup_encrypt(const u8 key[]) diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c index 537488cbf94..9b9009f9955 100644 --- a/net/mac80211/aes_cmac.c +++ b/net/mac80211/aes_cmac.c @@ -111,7 +111,7 @@ void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad, } -struct crypto_cipher * ieee80211_aes_cmac_key_setup(const u8 key[]) +struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[]) { struct crypto_cipher *tfm; diff --git a/net/mac80211/aes_cmac.h b/net/mac80211/aes_cmac.h index 20785a64725..0ce6487af79 100644 --- a/net/mac80211/aes_cmac.h +++ b/net/mac80211/aes_cmac.h @@ -11,7 +11,7 @@ #include <linux/crypto.h> -struct crypto_cipher * ieee80211_aes_cmac_key_setup(const u8 key[]); +struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[]); void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad, const u8 *data, size_t data_len, u8 *mic); void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm); diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 13b7683de5a..ce9633a3cfb 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -107,7 +107,7 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, mgmt->u.action.u.addba_req.start_seq_num = cpu_to_le16(start_seq_num << 4); - ieee80211_tx_skb_tid(sdata, skb, tid); + ieee80211_tx_skb(sdata, skb); } void ieee80211_send_bar(struct ieee80211_vif *vif, u8 *ra, u16 tid, u16 ssn) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index f80e8c4c6bc..592f4b152ba 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -109,6 +109,15 @@ static int ieee80211_change_iface(struct wiphy *wiphy, static int ieee80211_start_p2p_device(struct wiphy *wiphy, struct wireless_dev *wdev) { + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + int ret; + + mutex_lock(&sdata->local->chanctx_mtx); + ret = ieee80211_check_combinations(sdata, NULL, 0, 0); + mutex_unlock(&sdata->local->chanctx_mtx); + if (ret < 0) + return ret; + return ieee80211_do_open(wdev, true); } @@ -301,9 +310,10 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, if (!sta) goto out; - if (pairwise) + if (pairwise && key_idx < NUM_DEFAULT_KEYS) key = rcu_dereference(sta->ptk[key_idx]); - else if (key_idx < NUM_DEFAULT_KEYS) + else if (!pairwise && + key_idx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS) key = rcu_dereference(sta->gtk[key_idx]); } else key = rcu_dereference(sdata->keys[key_idx]); @@ -450,11 +460,11 @@ void sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo) rinfo->flags |= RATE_INFO_FLAGS_40_MHZ_WIDTH; if (sta->last_rx_rate_flag & RX_FLAG_SHORT_GI) rinfo->flags |= 
RATE_INFO_FLAGS_SHORT_GI; - if (sta->last_rx_rate_flag & RX_FLAG_80MHZ) + if (sta->last_rx_rate_vht_flag & RX_VHT_FLAG_80MHZ) rinfo->flags |= RATE_INFO_FLAGS_80_MHZ_WIDTH; - if (sta->last_rx_rate_flag & RX_FLAG_80P80MHZ) + if (sta->last_rx_rate_vht_flag & RX_VHT_FLAG_80P80MHZ) rinfo->flags |= RATE_INFO_FLAGS_80P80_MHZ_WIDTH; - if (sta->last_rx_rate_flag & RX_FLAG_160MHZ) + if (sta->last_rx_rate_vht_flag & RX_VHT_FLAG_160MHZ) rinfo->flags |= RATE_INFO_FLAGS_160_MHZ_WIDTH; } @@ -462,10 +472,15 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; + struct rate_control_ref *ref = NULL; struct timespec uptime; u64 packets = 0; + u32 thr = 0; int i, ac; + if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) + ref = local->rate_ctrl; + sinfo->generation = sdata->local->sta_generation; sinfo->filled = STATION_INFO_INACTIVE_TIME | @@ -577,6 +592,17 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_ASSOCIATED); if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_TDLS_PEER); + + /* check if the driver has a SW RC implementation */ + if (ref && ref->ops->get_expected_throughput) + thr = ref->ops->get_expected_throughput(sta->rate_ctrl_priv); + else + thr = drv_get_expected_throughput(local, &sta->sta); + + if (thr != 0) { + sinfo->filled |= STATION_INFO_EXPECTED_THROUGHPUT; + sinfo->expected_throughput = thr; + } } static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = { @@ -767,7 +793,7 @@ static void ieee80211_get_et_strings(struct wiphy *wiphy, } static int ieee80211_dump_station(struct wiphy *wiphy, struct net_device *dev, - int idx, u8 *mac, struct station_info *sinfo) + int idx, u8 *mac, struct station_info *sinfo) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; @@ -797,7 +823,7 @@ static int ieee80211_dump_survey(struct wiphy *wiphy, struct net_device *dev, } static int ieee80211_get_station(struct wiphy *wiphy, struct net_device *dev, - u8 *mac, struct station_info *sinfo) + const u8 *mac, struct station_info *sinfo) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; @@ -827,6 +853,7 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy, if (cfg80211_chandef_identical(&local->monitor_chandef, chandef)) return 0; + mutex_lock(&local->mtx); mutex_lock(&local->iflist_mtx); if (local->use_chanctx) { sdata = rcu_dereference_protected( @@ -845,6 +872,7 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy, if (ret == 0) local->monitor_chandef = *chandef; mutex_unlock(&local->iflist_mtx); + mutex_unlock(&local->mtx); return ret; } @@ -873,8 +901,8 @@ static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata, return 0; } -int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, - struct cfg80211_beacon_data *params) +static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, + struct cfg80211_beacon_data *params) { struct beacon_data *new, *old; int new_head_len, new_tail_len; @@ -950,6 +978,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ap_settings *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; struct beacon_data *old; struct ieee80211_sub_if_data 
*vlan; u32 changed = BSS_CHANGED_BEACON_INT | @@ -966,13 +995,15 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, /* TODO: make hostapd tell us what it wants */ sdata->smps_mode = IEEE80211_SMPS_OFF; sdata->needed_rx_chains = sdata->local->rx_chains; - sdata->radar_required = params->radar_required; + mutex_lock(&local->mtx); err = ieee80211_vif_use_channel(sdata, ¶ms->chandef, IEEE80211_CHANCTX_SHARED); + if (!err) + ieee80211_vif_copy_chanctx_to_vlans(sdata, false); + mutex_unlock(&local->mtx); if (err) return err; - ieee80211_vif_copy_chanctx_to_vlans(sdata, false); /* * Apply control port protocol, this allows us to @@ -1015,8 +1046,10 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, IEEE80211_P2P_OPPPS_ENABLE_BIT; err = ieee80211_assign_beacon(sdata, ¶ms->beacon); - if (err < 0) + if (err < 0) { + ieee80211_vif_release_channel(sdata); return err; + } changed |= err; err = drv_start_ap(sdata->local, sdata); @@ -1026,9 +1059,11 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, if (old) kfree_rcu(old, rcu_head); RCU_INIT_POINTER(sdata->u.ap.beacon, NULL); + ieee80211_vif_release_channel(sdata); return err; } + ieee80211_recalc_dtim(local, sdata); ieee80211_bss_info_change_notify(sdata, changed); netif_carrier_on(dev); @@ -1046,6 +1081,7 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev, int err; sdata = IEEE80211_DEV_TO_SUB_IF(dev); + sdata_assert_lock(sdata); /* don't allow changing the beacon while CSA is in place - offset * of channel switch counter may change @@ -1064,6 +1100,31 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev, return 0; } +bool ieee80211_csa_needs_block_tx(struct ieee80211_local *local) +{ + struct ieee80211_sub_if_data *sdata; + + lockdep_assert_held(&local->mtx); + + rcu_read_lock(); + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + if (!ieee80211_sdata_running(sdata)) + continue; + + if (!sdata->vif.csa_active) + continue; + + if (!sdata->csa_block_tx) + continue; + + rcu_read_unlock(); + return true; + } + rcu_read_unlock(); + + return false; +} + static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -1073,18 +1134,25 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) struct probe_resp *old_probe_resp; struct cfg80211_chan_def chandef; + sdata_assert_lock(sdata); + old_beacon = sdata_dereference(sdata->u.ap.beacon, sdata); if (!old_beacon) return -ENOENT; old_probe_resp = sdata_dereference(sdata->u.ap.probe_resp, sdata); /* abort any running channel switch */ + mutex_lock(&local->mtx); sdata->vif.csa_active = false; + if (!ieee80211_csa_needs_block_tx(local)) + ieee80211_wake_queues_by_reason(&local->hw, + IEEE80211_MAX_QUEUE_MAP, + IEEE80211_QUEUE_STOP_REASON_CSA); + mutex_unlock(&local->mtx); + kfree(sdata->u.ap.next_beacon); sdata->u.ap.next_beacon = NULL; - cancel_work_sync(&sdata->u.ap.request_smps_work); - /* turn off carrier for this interface and dependent VLANs */ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) netif_carrier_off(vlan->dev); @@ -1096,18 +1164,10 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) kfree_rcu(old_beacon, rcu_head); if (old_probe_resp) kfree_rcu(old_probe_resp, rcu_head); + sdata->u.ap.driver_smps_mode = IEEE80211_SMPS_OFF; - list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) - 
sta_info_flush_defer(vlan); - sta_info_flush_defer(sdata); - synchronize_net(); - rcu_barrier(); - list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) { - sta_info_flush_cleanup(vlan); - ieee80211_free_keys(vlan); - } - sta_info_flush_cleanup(sdata); - ieee80211_free_keys(sdata); + __sta_info_flush(sdata, true); + ieee80211_free_keys(sdata, true); sdata->vif.bss_conf.enable_beacon = false; sdata->vif.bss_conf.ssid_len = 0; @@ -1128,8 +1188,10 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) local->total_ps_buffered -= skb_queue_len(&sdata->u.ap.ps.bc_buf); skb_queue_purge(&sdata->u.ap.ps.bc_buf); + mutex_lock(&local->mtx); ieee80211_vif_copy_chanctx_to_vlans(sdata, true); ieee80211_vif_release_channel(sdata); + mutex_unlock(&local->mtx); return 0; } @@ -1341,6 +1403,15 @@ static int sta_apply_parameters(struct ieee80211_local *local, ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, params->vht_capa, sta); + if (params->opmode_notif_used) { + /* returned value is only needed for rc update, but the + * rc isn't initialized here yet, so ignore it + */ + __ieee80211_vht_handle_opmode(sdata, sta, + params->opmode_notif, + band, false); + } + if (ieee80211_vif_is_mesh(&sdata->vif)) { #ifdef CONFIG_MAC80211_MESH u32 changed = 0; @@ -1402,7 +1473,8 @@ static int sta_apply_parameters(struct ieee80211_local *local, } static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, - u8 *mac, struct station_parameters *params) + const u8 *mac, + struct station_parameters *params) { struct ieee80211_local *local = wiphy_priv(wiphy); struct sta_info *sta; @@ -1436,6 +1508,8 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, if (!(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER))) { sta_info_pre_move_state(sta, IEEE80211_STA_AUTH); sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC); + } else { + sta->sta.tdls = true; } err = sta_apply_parameters(local, sta, params); @@ -1469,7 +1543,7 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, } static int ieee80211_del_station(struct wiphy *wiphy, struct net_device *dev, - u8 *mac) + const u8 *mac) { struct ieee80211_sub_if_data *sdata; @@ -1483,7 +1557,7 @@ static int ieee80211_del_station(struct wiphy *wiphy, struct net_device *dev, } static int ieee80211_change_station(struct wiphy *wiphy, - struct net_device *dev, u8 *mac, + struct net_device *dev, const u8 *mac, struct station_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -1552,7 +1626,7 @@ static int ieee80211_change_station(struct wiphy *wiphy, if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN && sta->sdata->u.vlan.sta) { - rcu_assign_pointer(sta->sdata->u.vlan.sta, NULL); + RCU_INIT_POINTER(sta->sdata->u.vlan.sta, NULL); prev_4addr = true; } @@ -1608,7 +1682,7 @@ out_err: #ifdef CONFIG_MAC80211_MESH static int ieee80211_add_mpath(struct wiphy *wiphy, struct net_device *dev, - u8 *dst, u8 *next_hop) + const u8 *dst, const u8 *next_hop) { struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; @@ -1636,7 +1710,7 @@ static int ieee80211_add_mpath(struct wiphy *wiphy, struct net_device *dev, } static int ieee80211_del_mpath(struct wiphy *wiphy, struct net_device *dev, - u8 *dst) + const u8 *dst) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -1647,9 +1721,8 @@ static int ieee80211_del_mpath(struct wiphy *wiphy, struct net_device *dev, return 0; } -static int ieee80211_change_mpath(struct wiphy *wiphy, - struct 
net_device *dev, - u8 *dst, u8 *next_hop) +static int ieee80211_change_mpath(struct wiphy *wiphy, struct net_device *dev, + const u8 *dst, const u8 *next_hop) { struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; @@ -1741,8 +1814,8 @@ static int ieee80211_get_mpath(struct wiphy *wiphy, struct net_device *dev, } static int ieee80211_dump_mpath(struct wiphy *wiphy, struct net_device *dev, - int idx, u8 *dst, u8 *next_hop, - struct mpath_info *pinfo) + int idx, u8 *dst, u8 *next_hop, + struct mpath_info *pinfo) { struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; @@ -1952,8 +2025,10 @@ static int ieee80211_join_mesh(struct wiphy *wiphy, struct net_device *dev, sdata->smps_mode = IEEE80211_SMPS_OFF; sdata->needed_rx_chains = sdata->local->rx_chains; + mutex_lock(&sdata->local->mtx); err = ieee80211_vif_use_channel(sdata, &setup->chandef, IEEE80211_CHANCTX_SHARED); + mutex_unlock(&sdata->local->mtx); if (err) return err; @@ -1965,7 +2040,9 @@ static int ieee80211_leave_mesh(struct wiphy *wiphy, struct net_device *dev) struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); ieee80211_stop_mesh(sdata); + mutex_lock(&sdata->local->mtx); ieee80211_vif_release_channel(sdata); + mutex_unlock(&sdata->local->mtx); return 0; } @@ -2587,8 +2664,8 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, int j; sdata->rc_rateidx_mask[i] = mask->control[i].legacy; - memcpy(sdata->rc_rateidx_mcs_mask[i], mask->control[i].mcs, - sizeof(mask->control[i].mcs)); + memcpy(sdata->rc_rateidx_mcs_mask[i], mask->control[i].ht_mcs, + sizeof(mask->control[i].ht_mcs)); sdata->rc_has_mcs_mask[i] = false; if (!sband) @@ -2624,6 +2701,18 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, if (!roc) return -ENOMEM; + /* + * If the duration is zero, then the driver + * wouldn't actually do anything. Set it to + * 10 for now. + * + * TODO: cancel the off-channel operation + * when we get the SKB's TX status and + * the wait time was zero before. + */ + if (!duration) + duration = 10; + roc->chan = channel; roc->duration = duration; roc->req_duration = duration; @@ -2634,6 +2723,24 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, INIT_DELAYED_WORK(&roc->work, ieee80211_sw_roc_work); INIT_LIST_HEAD(&roc->dependents); + /* + * cookie is either the roc cookie (for normal roc) + * or the SKB (for mgmt TX) + */ + if (!txskb) { + /* local->mtx protects this */ + local->roc_cookie_counter++; + roc->cookie = local->roc_cookie_counter; + /* wow, you wrapped 64 bits ... more likely a bug */ + if (WARN_ON(roc->cookie == 0)) { + roc->cookie = 1; + local->roc_cookie_counter++; + } + *cookie = roc->cookie; + } else { + *cookie = (unsigned long)txskb; + } + /* if there's one pending or we're scanning, queue this one */ if (!list_empty(&local->roc_list) || local->scanning || local->radar_detect_enabled) @@ -2647,18 +2754,6 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, /* otherwise actually kick it off here (for error handling) */ - /* - * If the duration is zero, then the driver - * wouldn't actually do anything. Set it to - * 10 for now. - * - * TODO: cancel the off-channel operation - * when we get the SKB's TX status and - * the wait time was zero before. 
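In the hunk above, the cookie assignment is moved ahead of the queueing decision: a plain remain-on-channel request gets a value from a 64-bit counter that must never hand out zero, while an off-channel mgmt TX uses the SKB pointer itself as the cookie. A minimal standalone sketch of that scheme (the types and the helper name are stand-ins, not mac80211 symbols):

    #include <stdint.h>

    static uint64_t roc_cookie_counter;     /* stand-in for local->roc_cookie_counter */

    /* Return a request cookie: the SKB pointer for mgmt TX, otherwise the
     * next non-zero value of the running counter (zero is reserved). */
    static uint64_t roc_assign_cookie(const void *txskb)
    {
            if (txskb)
                    return (uint64_t)(uintptr_t)txskb;

            if (++roc_cookie_counter == 0)  /* wrapped 64 bits; skip zero */
                    roc_cookie_counter = 1;

            return roc_cookie_counter;
    }
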
- */ - if (!duration) - duration = 10; - ret = drv_remain_on_channel(local, sdata, channel, duration, type); if (ret) { kfree(roc); @@ -2768,24 +2863,6 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, if (!queued) list_add_tail(&roc->list, &local->roc_list); - /* - * cookie is either the roc cookie (for normal roc) - * or the SKB (for mgmt TX) - */ - if (!txskb) { - /* local->mtx protects this */ - local->roc_cookie_counter++; - roc->cookie = local->roc_cookie_counter; - /* wow, you wrapped 64 bits ... more likely a bug */ - if (WARN_ON(roc->cookie == 0)) { - roc->cookie = 1; - local->roc_cookie_counter++; - } - *cookie = roc->cookie; - } else { - *cookie = (unsigned long)txskb; - } - return 0; } @@ -2896,33 +2973,35 @@ static int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy, static int ieee80211_start_radar_detection(struct wiphy *wiphy, struct net_device *dev, - struct cfg80211_chan_def *chandef) + struct cfg80211_chan_def *chandef, + u32 cac_time_ms) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; - unsigned long timeout; int err; - if (!list_empty(&local->roc_list) || local->scanning) - return -EBUSY; + mutex_lock(&local->mtx); + if (!list_empty(&local->roc_list) || local->scanning) { + err = -EBUSY; + goto out_unlock; + } /* whatever, but channel contexts should not complain about that one */ sdata->smps_mode = IEEE80211_SMPS_OFF; sdata->needed_rx_chains = local->rx_chains; - sdata->radar_required = true; - mutex_lock(&local->iflist_mtx); err = ieee80211_vif_use_channel(sdata, chandef, IEEE80211_CHANCTX_SHARED); - mutex_unlock(&local->iflist_mtx); if (err) - return err; + goto out_unlock; - timeout = msecs_to_jiffies(IEEE80211_DFS_MIN_CAC_TIME_MS); ieee80211_queue_delayed_work(&sdata->local->hw, - &sdata->dfs_cac_timer_work, timeout); + &sdata->dfs_cac_timer_work, + msecs_to_jiffies(cac_time_ms)); - return 0; + out_unlock: + mutex_unlock(&local->mtx); + return err; } static struct cfg80211_beacon_data * @@ -2981,134 +3060,179 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) return new_beacon; } -void ieee80211_csa_finalize_work(struct work_struct *work) +void ieee80211_csa_finish(struct ieee80211_vif *vif) { - struct ieee80211_sub_if_data *sdata = - container_of(work, struct ieee80211_sub_if_data, - csa_finalize_work); - struct ieee80211_local *local = sdata->local; - int err, changed = 0; - - sdata_lock(sdata); - /* AP might have been stopped while waiting for the lock. 
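Both the old worker body being removed here and the reworked one further below follow the same defensive pattern: take the locks first, then re-check that the CSA is still active and the interface still running, because either can change while the work item waits on the queue. A compact pthreads sketch of that re-validate-after-locking pattern, with hypothetical field names:

    #include <pthread.h>
    #include <stdbool.h>

    struct iface {
            pthread_mutex_t lock;
            bool running;           /* stand-in for ieee80211_sdata_running() */
            bool csa_active;        /* stand-in for sdata->vif.csa_active */
    };

    static void csa_do_finalize(struct iface *sdata)
    {
            /* ... actually switch the channel ... */
    }

    /* Deferred work: the state that justified queueing it may be gone by
     * the time it runs, so re-check it only once the lock is held. */
    static void csa_finalize_work(struct iface *sdata)
    {
            pthread_mutex_lock(&sdata->lock);

            if (!sdata->csa_active || !sdata->running)
                    goto unlock;    /* CSA aborted or interface went down */

            csa_do_finalize(sdata);
    unlock:
            pthread_mutex_unlock(&sdata->lock);
    }
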
*/ - if (!sdata->vif.csa_active) - goto unlock; - - if (!ieee80211_sdata_running(sdata)) - goto unlock; + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); - sdata->radar_required = sdata->csa_radar_required; - err = ieee80211_vif_change_channel(sdata, &changed); - if (WARN_ON(err < 0)) - goto unlock; - - if (!local->use_chanctx) { - local->_oper_chandef = sdata->csa_chandef; - ieee80211_hw_config(local, 0); - } + ieee80211_queue_work(&sdata->local->hw, + &sdata->csa_finalize_work); +} +EXPORT_SYMBOL(ieee80211_csa_finish); - ieee80211_bss_info_change_notify(sdata, changed); +static int ieee80211_set_after_csa_beacon(struct ieee80211_sub_if_data *sdata, + u32 *changed) +{ + int err; - sdata->vif.csa_active = false; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: err = ieee80211_assign_beacon(sdata, sdata->u.ap.next_beacon); - if (err < 0) - goto unlock; - - changed |= err; kfree(sdata->u.ap.next_beacon); sdata->u.ap.next_beacon = NULL; - ieee80211_bss_info_change_notify(sdata, err); + if (err < 0) + return err; + *changed |= err; break; case NL80211_IFTYPE_ADHOC: - ieee80211_ibss_finish_csa(sdata); + err = ieee80211_ibss_finish_csa(sdata); + if (err < 0) + return err; + *changed |= err; break; #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: err = ieee80211_mesh_finish_csa(sdata); if (err < 0) - goto unlock; + return err; + *changed |= err; break; #endif default: WARN_ON(1); - goto unlock; + return -EINVAL; } - ieee80211_wake_queues_by_reason(&sdata->local->hw, + return 0; +} + +static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + u32 changed = 0; + int err; + + sdata_assert_lock(sdata); + lockdep_assert_held(&local->mtx); + + sdata->radar_required = sdata->csa_radar_required; + err = ieee80211_vif_change_channel(sdata, &changed); + if (err < 0) + return err; + + if (!local->use_chanctx) { + local->_oper_chandef = sdata->csa_chandef; + ieee80211_hw_config(local, 0); + } + + sdata->vif.csa_active = false; + + err = ieee80211_set_after_csa_beacon(sdata, &changed); + if (err) + return err; + + ieee80211_bss_info_change_notify(sdata, changed); + cfg80211_ch_switch_notify(sdata->dev, &sdata->csa_chandef); + + if (!ieee80211_csa_needs_block_tx(local)) + ieee80211_wake_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_CSA); - cfg80211_ch_switch_notify(sdata->dev, &sdata->csa_chandef); + return 0; +} -unlock: - sdata_unlock(sdata); +static void ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata) +{ + if (__ieee80211_csa_finalize(sdata)) { + sdata_info(sdata, "failed to finalize CSA, disconnecting\n"); + cfg80211_stop_iface(sdata->local->hw.wiphy, &sdata->wdev, + GFP_KERNEL); + } } -static int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, - struct cfg80211_csa_settings *params) +void ieee80211_csa_finalize_work(struct work_struct *work) { - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_sub_if_data *sdata = + container_of(work, struct ieee80211_sub_if_data, + csa_finalize_work); struct ieee80211_local *local = sdata->local; - struct ieee80211_chanctx_conf *chanctx_conf; - struct ieee80211_chanctx *chanctx; - struct ieee80211_if_mesh __maybe_unused *ifmsh; - int err, num_chanctx; - - lockdep_assert_held(&sdata->wdev.mtx); - - if (!list_empty(&local->roc_list) || local->scanning) - return -EBUSY; - if (sdata->wdev.cac_started) - return -EBUSY; + sdata_lock(sdata); + mutex_lock(&local->mtx); - if 
(cfg80211_chandef_identical(¶ms->chandef, - &sdata->vif.bss_conf.chandef)) - return -EINVAL; + /* AP might have been stopped while waiting for the lock. */ + if (!sdata->vif.csa_active) + goto unlock; - rcu_read_lock(); - chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); - if (!chanctx_conf) { - rcu_read_unlock(); - return -EBUSY; - } + if (!ieee80211_sdata_running(sdata)) + goto unlock; - /* don't handle for multi-VIF cases */ - chanctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf); - if (chanctx->refcount > 1) { - rcu_read_unlock(); - return -EBUSY; - } - num_chanctx = 0; - list_for_each_entry_rcu(chanctx, &local->chanctx_list, list) - num_chanctx++; - rcu_read_unlock(); + ieee80211_csa_finalize(sdata); - if (num_chanctx > 1) - return -EBUSY; +unlock: + mutex_unlock(&local->mtx); + sdata_unlock(sdata); +} - /* don't allow another channel switch if one is already active. */ - if (sdata->vif.csa_active) - return -EBUSY; +static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, + struct cfg80211_csa_settings *params, + u32 *changed) +{ + int err; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: - sdata->csa_counter_offset_beacon = - params->counter_offset_beacon; - sdata->csa_counter_offset_presp = params->counter_offset_presp; sdata->u.ap.next_beacon = cfg80211_beacon_dup(¶ms->beacon_after); if (!sdata->u.ap.next_beacon) return -ENOMEM; + /* + * With a count of 0, we don't have to wait for any + * TBTT before switching, so complete the CSA + * immediately. In theory, with a count == 1 we + * should delay the switch until just before the next + * TBTT, but that would complicate things so we switch + * immediately too. If we would delay the switch + * until the next TBTT, we would have to set the probe + * response here. + * + * TODO: A channel switch with count <= 1 without + * sending a CSA action frame is kind of useless, + * because the clients won't know we're changing + * channels. The action frame must be implemented + * either here or in the userspace. 
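Immediately after this comment the patch bounds-checks the caller-supplied counter-offset counts against IEEE80211_MAX_CSA_COUNTERS_NUM, zeroes the destination arrays so no stale offsets survive, and only then copies the u16 offset lists. A standalone sketch of that validate-then-copy step; the MAX_CSA_COUNTERS value and the struct layout below are illustrative, not the kernel's:

    #include <errno.h>
    #include <stdint.h>
    #include <string.h>

    #define MAX_CSA_COUNTERS 2              /* illustrative bound only */

    struct csa_offsets {
            uint16_t beacon[MAX_CSA_COUNTERS];
            uint16_t presp[MAX_CSA_COUNTERS];
    };

    /* Reject anything that would overflow the fixed arrays, clear them,
     * then copy exactly the number of offsets the caller provided. */
    static int csa_copy_offsets(struct csa_offsets *dst,
                                const uint16_t *beacon, size_t n_beacon,
                                const uint16_t *presp, size_t n_presp)
    {
            if (n_beacon > MAX_CSA_COUNTERS || n_presp > MAX_CSA_COUNTERS)
                    return -EINVAL;

            memset(dst, 0, sizeof(*dst));
            memcpy(dst->beacon, beacon, n_beacon * sizeof(uint16_t));
            memcpy(dst->presp, presp, n_presp * sizeof(uint16_t));
            return 0;
    }

Clearing first is what the patch's own comment calls avoiding garbage in the other counters when fewer than the maximum number of offsets is supplied.
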
+ */ + if (params->count <= 1) + break; + + if ((params->n_counter_offsets_beacon > + IEEE80211_MAX_CSA_COUNTERS_NUM) || + (params->n_counter_offsets_presp > + IEEE80211_MAX_CSA_COUNTERS_NUM)) + return -EINVAL; + + /* make sure we don't have garbage in other counters */ + memset(sdata->csa_counter_offset_beacon, 0, + sizeof(sdata->csa_counter_offset_beacon)); + memset(sdata->csa_counter_offset_presp, 0, + sizeof(sdata->csa_counter_offset_presp)); + + memcpy(sdata->csa_counter_offset_beacon, + params->counter_offsets_beacon, + params->n_counter_offsets_beacon * sizeof(u16)); + memcpy(sdata->csa_counter_offset_presp, + params->counter_offsets_presp, + params->n_counter_offsets_presp * sizeof(u16)); + err = ieee80211_assign_beacon(sdata, ¶ms->beacon_csa); if (err < 0) { kfree(sdata->u.ap.next_beacon); return err; } + *changed |= err; + break; case NL80211_IFTYPE_ADHOC: if (!sdata->vif.bss_conf.ibss_joined) @@ -3136,16 +3260,20 @@ static int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, params->chandef.chan->band) return -EINVAL; - err = ieee80211_ibss_csa_beacon(sdata, params); - if (err < 0) - return err; + /* see comments in the NL80211_IFTYPE_AP block */ + if (params->count > 1) { + err = ieee80211_ibss_csa_beacon(sdata, params); + if (err < 0) + return err; + *changed |= err; + } + + ieee80211_send_action_csa(sdata, params); + break; #ifdef CONFIG_MAC80211_MESH - case NL80211_IFTYPE_MESH_POINT: - ifmsh = &sdata->u.mesh; - - if (!ifmsh->mesh_id) - return -EINVAL; + case NL80211_IFTYPE_MESH_POINT: { + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; if (params->chandef.width != sdata->vif.bss_conf.chandef.width) return -EINVAL; @@ -3155,39 +3283,126 @@ static int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, params->chandef.chan->band) return -EINVAL; - ifmsh->chsw_init = true; - if (!ifmsh->pre_value) - ifmsh->pre_value = 1; - else - ifmsh->pre_value++; + if (ifmsh->csa_role == IEEE80211_MESH_CSA_ROLE_NONE) { + ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_INIT; + if (!ifmsh->pre_value) + ifmsh->pre_value = 1; + else + ifmsh->pre_value++; + } - err = ieee80211_mesh_csa_beacon(sdata, params, true); - if (err < 0) { - ifmsh->chsw_init = false; - return err; + /* see comments in the NL80211_IFTYPE_AP block */ + if (params->count > 1) { + err = ieee80211_mesh_csa_beacon(sdata, params); + if (err < 0) { + ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_NONE; + return err; + } + *changed |= err; } + + if (ifmsh->csa_role == IEEE80211_MESH_CSA_ROLE_INIT) + ieee80211_send_action_csa(sdata, params); + break; + } #endif default: return -EOPNOTSUPP; } - sdata->csa_radar_required = params->radar_required; + return 0; +} - if (params->block_tx) - ieee80211_stop_queues_by_reason(&local->hw, - IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_CSA); +static int +__ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_csa_settings *params) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + struct ieee80211_chanctx_conf *conf; + struct ieee80211_chanctx *chanctx; + int err, num_chanctx, changed = 0; + + sdata_assert_lock(sdata); + lockdep_assert_held(&local->mtx); + + if (!list_empty(&local->roc_list) || local->scanning) + return -EBUSY; + + if (sdata->wdev.cac_started) + return -EBUSY; + + if (cfg80211_chandef_identical(¶ms->chandef, + &sdata->vif.bss_conf.chandef)) + return -EINVAL; + + mutex_lock(&local->chanctx_mtx); + conf = 
rcu_dereference_protected(sdata->vif.chanctx_conf, + lockdep_is_held(&local->chanctx_mtx)); + if (!conf) { + mutex_unlock(&local->chanctx_mtx); + return -EBUSY; + } + + /* don't handle for multi-VIF cases */ + chanctx = container_of(conf, struct ieee80211_chanctx, conf); + if (ieee80211_chanctx_refcount(local, chanctx) > 1) { + mutex_unlock(&local->chanctx_mtx); + return -EBUSY; + } + num_chanctx = 0; + list_for_each_entry_rcu(chanctx, &local->chanctx_list, list) + num_chanctx++; + mutex_unlock(&local->chanctx_mtx); + + if (num_chanctx > 1) + return -EBUSY; + + /* don't allow another channel switch if one is already active. */ + if (sdata->vif.csa_active) + return -EBUSY; + + err = ieee80211_set_csa_beacon(sdata, params, &changed); + if (err) + return err; + sdata->csa_radar_required = params->radar_required; sdata->csa_chandef = params->chandef; + sdata->csa_block_tx = params->block_tx; + sdata->csa_current_counter = params->count; sdata->vif.csa_active = true; - ieee80211_bss_info_change_notify(sdata, err); - drv_channel_switch_beacon(sdata, ¶ms->chandef); + if (sdata->csa_block_tx) + ieee80211_stop_queues_by_reason(&local->hw, + IEEE80211_MAX_QUEUE_MAP, + IEEE80211_QUEUE_STOP_REASON_CSA); + + if (changed) { + ieee80211_bss_info_change_notify(sdata, changed); + drv_channel_switch_beacon(sdata, ¶ms->chandef); + } else { + /* if the beacon didn't change, we can finalize immediately */ + ieee80211_csa_finalize(sdata); + } return 0; } +int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_csa_settings *params) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + int err; + + mutex_lock(&local->mtx); + err = __ieee80211_channel_switch(wiphy, dev, params); + mutex_unlock(&local->mtx); + + return err; +} + static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie) @@ -3200,6 +3415,7 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, bool need_offchan = false; u32 flags; int ret; + u8 *data; if (params->dont_wait_for_ack) flags = IEEE80211_TX_CTL_NO_ACK; @@ -3293,7 +3509,20 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, } skb_reserve(skb, local->hw.extra_tx_headroom); - memcpy(skb_put(skb, params->len), params->buf, params->len); + data = skb_put(skb, params->len); + memcpy(data, params->buf, params->len); + + /* Update CSA counters */ + if (sdata->vif.csa_active && + (sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_ADHOC) && + params->n_csa_offsets) { + int i; + u8 c = sdata->csa_current_counter; + + for (i = 0; i < params->n_csa_offsets; i++) + data[params->csa_offsets[i]] = c; + } IEEE80211_SKB_CB(skb)->flags = flags; @@ -3402,320 +3631,6 @@ static int ieee80211_set_rekey_data(struct wiphy *wiphy, return 0; } -static void ieee80211_tdls_add_ext_capab(struct sk_buff *skb) -{ - u8 *pos = (void *)skb_put(skb, 7); - - *pos++ = WLAN_EID_EXT_CAPABILITY; - *pos++ = 5; /* len */ - *pos++ = 0x0; - *pos++ = 0x0; - *pos++ = 0x0; - *pos++ = 0x0; - *pos++ = WLAN_EXT_CAPA5_TDLS_ENABLED; -} - -static u16 ieee80211_get_tdls_sta_capab(struct ieee80211_sub_if_data *sdata) -{ - struct ieee80211_local *local = sdata->local; - u16 capab; - - capab = 0; - if (ieee80211_get_sdata_band(sdata) != IEEE80211_BAND_2GHZ) - return capab; - - if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE)) - capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME; - if 
(!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE)) - capab |= WLAN_CAPABILITY_SHORT_PREAMBLE; - - return capab; -} - -static void ieee80211_tdls_add_link_ie(struct sk_buff *skb, u8 *src_addr, - u8 *peer, u8 *bssid) -{ - struct ieee80211_tdls_lnkie *lnkid; - - lnkid = (void *)skb_put(skb, sizeof(struct ieee80211_tdls_lnkie)); - - lnkid->ie_type = WLAN_EID_LINK_ID; - lnkid->ie_len = sizeof(struct ieee80211_tdls_lnkie) - 2; - - memcpy(lnkid->bssid, bssid, ETH_ALEN); - memcpy(lnkid->init_sta, src_addr, ETH_ALEN); - memcpy(lnkid->resp_sta, peer, ETH_ALEN); -} - -static int -ieee80211_prep_tdls_encap_data(struct wiphy *wiphy, struct net_device *dev, - u8 *peer, u8 action_code, u8 dialog_token, - u16 status_code, struct sk_buff *skb) -{ - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); - struct ieee80211_tdls_data *tf; - - tf = (void *)skb_put(skb, offsetof(struct ieee80211_tdls_data, u)); - - memcpy(tf->da, peer, ETH_ALEN); - memcpy(tf->sa, sdata->vif.addr, ETH_ALEN); - tf->ether_type = cpu_to_be16(ETH_P_TDLS); - tf->payload_type = WLAN_TDLS_SNAP_RFTYPE; - - switch (action_code) { - case WLAN_TDLS_SETUP_REQUEST: - tf->category = WLAN_CATEGORY_TDLS; - tf->action_code = WLAN_TDLS_SETUP_REQUEST; - - skb_put(skb, sizeof(tf->u.setup_req)); - tf->u.setup_req.dialog_token = dialog_token; - tf->u.setup_req.capability = - cpu_to_le16(ieee80211_get_tdls_sta_capab(sdata)); - - ieee80211_add_srates_ie(sdata, skb, false, band); - ieee80211_add_ext_srates_ie(sdata, skb, false, band); - ieee80211_tdls_add_ext_capab(skb); - break; - case WLAN_TDLS_SETUP_RESPONSE: - tf->category = WLAN_CATEGORY_TDLS; - tf->action_code = WLAN_TDLS_SETUP_RESPONSE; - - skb_put(skb, sizeof(tf->u.setup_resp)); - tf->u.setup_resp.status_code = cpu_to_le16(status_code); - tf->u.setup_resp.dialog_token = dialog_token; - tf->u.setup_resp.capability = - cpu_to_le16(ieee80211_get_tdls_sta_capab(sdata)); - - ieee80211_add_srates_ie(sdata, skb, false, band); - ieee80211_add_ext_srates_ie(sdata, skb, false, band); - ieee80211_tdls_add_ext_capab(skb); - break; - case WLAN_TDLS_SETUP_CONFIRM: - tf->category = WLAN_CATEGORY_TDLS; - tf->action_code = WLAN_TDLS_SETUP_CONFIRM; - - skb_put(skb, sizeof(tf->u.setup_cfm)); - tf->u.setup_cfm.status_code = cpu_to_le16(status_code); - tf->u.setup_cfm.dialog_token = dialog_token; - break; - case WLAN_TDLS_TEARDOWN: - tf->category = WLAN_CATEGORY_TDLS; - tf->action_code = WLAN_TDLS_TEARDOWN; - - skb_put(skb, sizeof(tf->u.teardown)); - tf->u.teardown.reason_code = cpu_to_le16(status_code); - break; - case WLAN_TDLS_DISCOVERY_REQUEST: - tf->category = WLAN_CATEGORY_TDLS; - tf->action_code = WLAN_TDLS_DISCOVERY_REQUEST; - - skb_put(skb, sizeof(tf->u.discover_req)); - tf->u.discover_req.dialog_token = dialog_token; - break; - default: - return -EINVAL; - } - - return 0; -} - -static int -ieee80211_prep_tdls_direct(struct wiphy *wiphy, struct net_device *dev, - u8 *peer, u8 action_code, u8 dialog_token, - u16 status_code, struct sk_buff *skb) -{ - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); - struct ieee80211_mgmt *mgmt; - - mgmt = (void *)skb_put(skb, 24); - memset(mgmt, 0, 24); - memcpy(mgmt->da, peer, ETH_ALEN); - memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); - memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN); - - mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | - IEEE80211_STYPE_ACTION); - - switch (action_code) { - case 
WLAN_PUB_ACTION_TDLS_DISCOVER_RES: - skb_put(skb, 1 + sizeof(mgmt->u.action.u.tdls_discover_resp)); - mgmt->u.action.category = WLAN_CATEGORY_PUBLIC; - mgmt->u.action.u.tdls_discover_resp.action_code = - WLAN_PUB_ACTION_TDLS_DISCOVER_RES; - mgmt->u.action.u.tdls_discover_resp.dialog_token = - dialog_token; - mgmt->u.action.u.tdls_discover_resp.capability = - cpu_to_le16(ieee80211_get_tdls_sta_capab(sdata)); - - ieee80211_add_srates_ie(sdata, skb, false, band); - ieee80211_add_ext_srates_ie(sdata, skb, false, band); - ieee80211_tdls_add_ext_capab(skb); - break; - default: - return -EINVAL; - } - - return 0; -} - -static int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev, - u8 *peer, u8 action_code, u8 dialog_token, - u16 status_code, const u8 *extra_ies, - size_t extra_ies_len) -{ - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - struct ieee80211_local *local = sdata->local; - struct sk_buff *skb = NULL; - bool send_direct; - int ret; - - if (!(wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS)) - return -ENOTSUPP; - - /* make sure we are in managed mode, and associated */ - if (sdata->vif.type != NL80211_IFTYPE_STATION || - !sdata->u.mgd.associated) - return -EINVAL; - - tdls_dbg(sdata, "TDLS mgmt action %d peer %pM\n", - action_code, peer); - - skb = dev_alloc_skb(local->hw.extra_tx_headroom + - max(sizeof(struct ieee80211_mgmt), - sizeof(struct ieee80211_tdls_data)) + - 50 + /* supported rates */ - 7 + /* ext capab */ - extra_ies_len + - sizeof(struct ieee80211_tdls_lnkie)); - if (!skb) - return -ENOMEM; - - skb_reserve(skb, local->hw.extra_tx_headroom); - - switch (action_code) { - case WLAN_TDLS_SETUP_REQUEST: - case WLAN_TDLS_SETUP_RESPONSE: - case WLAN_TDLS_SETUP_CONFIRM: - case WLAN_TDLS_TEARDOWN: - case WLAN_TDLS_DISCOVERY_REQUEST: - ret = ieee80211_prep_tdls_encap_data(wiphy, dev, peer, - action_code, dialog_token, - status_code, skb); - send_direct = false; - break; - case WLAN_PUB_ACTION_TDLS_DISCOVER_RES: - ret = ieee80211_prep_tdls_direct(wiphy, dev, peer, action_code, - dialog_token, status_code, - skb); - send_direct = true; - break; - default: - ret = -ENOTSUPP; - break; - } - - if (ret < 0) - goto fail; - - if (extra_ies_len) - memcpy(skb_put(skb, extra_ies_len), extra_ies, extra_ies_len); - - /* the TDLS link IE is always added last */ - switch (action_code) { - case WLAN_TDLS_SETUP_REQUEST: - case WLAN_TDLS_SETUP_CONFIRM: - case WLAN_TDLS_TEARDOWN: - case WLAN_TDLS_DISCOVERY_REQUEST: - /* we are the initiator */ - ieee80211_tdls_add_link_ie(skb, sdata->vif.addr, peer, - sdata->u.mgd.bssid); - break; - case WLAN_TDLS_SETUP_RESPONSE: - case WLAN_PUB_ACTION_TDLS_DISCOVER_RES: - /* we are the responder */ - ieee80211_tdls_add_link_ie(skb, peer, sdata->vif.addr, - sdata->u.mgd.bssid); - break; - default: - ret = -ENOTSUPP; - goto fail; - } - - if (send_direct) { - ieee80211_tx_skb(sdata, skb); - return 0; - } - - /* - * According to 802.11z: Setup req/resp are sent in AC_BK, otherwise - * we should default to AC_VI. 
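The removed switch just below encodes the 802.11z rule this comment states: TDLS setup request/response frames go out on AC_BK with priority 2, everything else defaults to AC_VI with priority 5 (the Makefile hunk earlier adds tdls.o, so this logic presumably moves there rather than disappearing). A reduced sketch of the selection; the enum values are stand-ins for the WLAN_TDLS_* action codes:

    /* Stand-ins for the WLAN_TDLS_* action codes used in the removed hunk. */
    enum tdls_action { TDLS_SETUP_REQUEST, TDLS_SETUP_RESPONSE, TDLS_OTHER };

    /* 802.11z queue selection: setup request/response on AC_BK (priority 2),
     * all other TDLS frames on AC_VI (priority 5). */
    static int tdls_frame_priority(enum tdls_action action)
    {
            switch (action) {
            case TDLS_SETUP_REQUEST:
            case TDLS_SETUP_RESPONSE:
                    return 2;       /* AC_BK */
            default:
                    return 5;       /* AC_VI */
            }
    }
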
- */ - switch (action_code) { - case WLAN_TDLS_SETUP_REQUEST: - case WLAN_TDLS_SETUP_RESPONSE: - skb_set_queue_mapping(skb, IEEE80211_AC_BK); - skb->priority = 2; - break; - default: - skb_set_queue_mapping(skb, IEEE80211_AC_VI); - skb->priority = 5; - break; - } - - /* disable bottom halves when entering the Tx path */ - local_bh_disable(); - ret = ieee80211_subif_start_xmit(skb, dev); - local_bh_enable(); - - return ret; - -fail: - dev_kfree_skb(skb); - return ret; -} - -static int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, - u8 *peer, enum nl80211_tdls_operation oper) -{ - struct sta_info *sta; - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - - if (!(wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS)) - return -ENOTSUPP; - - if (sdata->vif.type != NL80211_IFTYPE_STATION) - return -EINVAL; - - tdls_dbg(sdata, "TDLS oper %d peer %pM\n", oper, peer); - - switch (oper) { - case NL80211_TDLS_ENABLE_LINK: - rcu_read_lock(); - sta = sta_info_get(sdata, peer); - if (!sta) { - rcu_read_unlock(); - return -ENOLINK; - } - - set_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH); - rcu_read_unlock(); - break; - case NL80211_TDLS_DISABLE_LINK: - return sta_info_destroy_addr(sdata, peer); - case NL80211_TDLS_TEARDOWN: - case NL80211_TDLS_SETUP: - case NL80211_TDLS_DISCOVERY_REQ: - /* We don't support in-driver setup/teardown/discovery */ - return -ENOTSUPP; - default: - return -ENOTSUPP; - } - - return 0; -} - static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, u64 *cookie) { @@ -3829,7 +3744,47 @@ static void ieee80211_set_wakeup(struct wiphy *wiphy, bool enabled) } #endif -struct cfg80211_ops mac80211_config_ops = { +static int ieee80211_set_qos_map(struct wiphy *wiphy, + struct net_device *dev, + struct cfg80211_qos_map *qos_map) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct mac80211_qos_map *new_qos_map, *old_qos_map; + + if (qos_map) { + new_qos_map = kzalloc(sizeof(*new_qos_map), GFP_KERNEL); + if (!new_qos_map) + return -ENOMEM; + memcpy(&new_qos_map->qos_map, qos_map, sizeof(*qos_map)); + } else { + /* A NULL qos_map was passed to disable QoS mapping */ + new_qos_map = NULL; + } + + old_qos_map = sdata_dereference(sdata->qos_map, sdata); + rcu_assign_pointer(sdata->qos_map, new_qos_map); + if (old_qos_map) + kfree_rcu(old_qos_map, rcu_head); + + return 0; +} + +static int ieee80211_set_ap_chanwidth(struct wiphy *wiphy, + struct net_device *dev, + struct cfg80211_chan_def *chandef) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + int ret; + u32 changed = 0; + + ret = ieee80211_vif_change_bandwidth(sdata, chandef, &changed); + if (ret == 0) + ieee80211_bss_info_change_notify(sdata, changed); + + return ret; +} + +const struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, .change_virtual_intf = ieee80211_change_iface, @@ -3908,4 +3863,6 @@ struct cfg80211_ops mac80211_config_ops = { .get_channel = ieee80211_cfg_get_channel, .start_radar_detection = ieee80211_start_radar_detection, .channel_switch = ieee80211_channel_switch, + .set_qos_map = ieee80211_set_qos_map, + .set_ap_chanwidth = ieee80211_set_ap_chanwidth, }; diff --git a/net/mac80211/cfg.h b/net/mac80211/cfg.h index 7d7879f5b00..2d51f62dc76 100644 --- a/net/mac80211/cfg.h +++ b/net/mac80211/cfg.h @@ -4,6 +4,6 @@ #ifndef __CFG_H #define __CFG_H -extern struct cfg80211_ops mac80211_config_ops; +extern const struct cfg80211_ops 
mac80211_config_ops; #endif /* __CFG_H */ diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index a57d5d9466b..a310e33972d 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -9,6 +9,170 @@ #include "ieee80211_i.h" #include "driver-ops.h" +static int ieee80211_chanctx_num_assigned(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx) +{ + struct ieee80211_sub_if_data *sdata; + int num = 0; + + lockdep_assert_held(&local->chanctx_mtx); + + list_for_each_entry(sdata, &ctx->assigned_vifs, assigned_chanctx_list) + num++; + + return num; +} + +static int ieee80211_chanctx_num_reserved(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx) +{ + struct ieee80211_sub_if_data *sdata; + int num = 0; + + lockdep_assert_held(&local->chanctx_mtx); + + list_for_each_entry(sdata, &ctx->reserved_vifs, reserved_chanctx_list) + num++; + + return num; +} + +int ieee80211_chanctx_refcount(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx) +{ + return ieee80211_chanctx_num_assigned(local, ctx) + + ieee80211_chanctx_num_reserved(local, ctx); +} + +static int ieee80211_num_chanctx(struct ieee80211_local *local) +{ + struct ieee80211_chanctx *ctx; + int num = 0; + + lockdep_assert_held(&local->chanctx_mtx); + + list_for_each_entry(ctx, &local->chanctx_list, list) + num++; + + return num; +} + +static bool ieee80211_can_create_new_chanctx(struct ieee80211_local *local) +{ + lockdep_assert_held(&local->chanctx_mtx); + return ieee80211_num_chanctx(local) < ieee80211_max_num_channels(local); +} + +static const struct cfg80211_chan_def * +ieee80211_chanctx_reserved_chandef(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx, + const struct cfg80211_chan_def *compat) +{ + struct ieee80211_sub_if_data *sdata; + + lockdep_assert_held(&local->chanctx_mtx); + + list_for_each_entry(sdata, &ctx->reserved_vifs, + reserved_chanctx_list) { + if (!compat) + compat = &sdata->reserved_chandef; + + compat = cfg80211_chandef_compatible(&sdata->reserved_chandef, + compat); + if (!compat) + break; + } + + return compat; +} + +static const struct cfg80211_chan_def * +ieee80211_chanctx_non_reserved_chandef(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx, + const struct cfg80211_chan_def *compat) +{ + struct ieee80211_sub_if_data *sdata; + + lockdep_assert_held(&local->chanctx_mtx); + + list_for_each_entry(sdata, &ctx->assigned_vifs, + assigned_chanctx_list) { + if (sdata->reserved_chanctx != NULL) + continue; + + if (!compat) + compat = &sdata->vif.bss_conf.chandef; + + compat = cfg80211_chandef_compatible( + &sdata->vif.bss_conf.chandef, compat); + if (!compat) + break; + } + + return compat; +} + +static const struct cfg80211_chan_def * +ieee80211_chanctx_combined_chandef(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx, + const struct cfg80211_chan_def *compat) +{ + lockdep_assert_held(&local->chanctx_mtx); + + compat = ieee80211_chanctx_reserved_chandef(local, ctx, compat); + if (!compat) + return NULL; + + compat = ieee80211_chanctx_non_reserved_chandef(local, ctx, compat); + if (!compat) + return NULL; + + return compat; +} + +static bool +ieee80211_chanctx_can_reserve_chandef(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx, + const struct cfg80211_chan_def *def) +{ + lockdep_assert_held(&local->chanctx_mtx); + + if (ieee80211_chanctx_combined_chandef(local, ctx, def)) + return true; + + if (!list_empty(&ctx->reserved_vifs) && + ieee80211_chanctx_reserved_chandef(local, ctx, def)) + return true; + + return false; +} + +static 
struct ieee80211_chanctx * +ieee80211_find_reservation_chanctx(struct ieee80211_local *local, + const struct cfg80211_chan_def *chandef, + enum ieee80211_chanctx_mode mode) +{ + struct ieee80211_chanctx *ctx; + + lockdep_assert_held(&local->chanctx_mtx); + + if (mode == IEEE80211_CHANCTX_EXCLUSIVE) + return NULL; + + list_for_each_entry(ctx, &local->chanctx_list, list) { + if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE) + continue; + + if (!ieee80211_chanctx_can_reserve_chandef(local, ctx, + chandef)) + continue; + + return ctx; + } + + return NULL; +} + static enum nl80211_chan_width ieee80211_get_sta_bw(struct ieee80211_sta *sta) { switch (sta->bandwidth) { @@ -100,6 +264,12 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, } max_bw = max(max_bw, width); } + + /* use the configured bandwidth in case of monitor interface */ + sdata = rcu_dereference(local->monitor_sdata); + if (sdata && rcu_access_pointer(sdata->vif.chanctx_conf) == conf) + max_bw = max(max_bw, conf->def.width); + rcu_read_unlock(); return max_bw; @@ -184,6 +354,11 @@ ieee80211_find_chanctx(struct ieee80211_local *local, if (!compat) continue; + compat = ieee80211_chanctx_reserved_chandef(local, ctx, + compat); + if (!compat) + continue; + ieee80211_change_chanctx(local, ctx, compat); return ctx; @@ -196,6 +371,8 @@ static bool ieee80211_is_radar_required(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; + lockdep_assert_held(&local->mtx); + rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (sdata->radar_required) { @@ -209,67 +386,91 @@ static bool ieee80211_is_radar_required(struct ieee80211_local *local) } static struct ieee80211_chanctx * -ieee80211_new_chanctx(struct ieee80211_local *local, - const struct cfg80211_chan_def *chandef, - enum ieee80211_chanctx_mode mode) +ieee80211_alloc_chanctx(struct ieee80211_local *local, + const struct cfg80211_chan_def *chandef, + enum ieee80211_chanctx_mode mode) { struct ieee80211_chanctx *ctx; - u32 changed; - int err; lockdep_assert_held(&local->chanctx_mtx); ctx = kzalloc(sizeof(*ctx) + local->hw.chanctx_data_size, GFP_KERNEL); if (!ctx) - return ERR_PTR(-ENOMEM); + return NULL; + INIT_LIST_HEAD(&ctx->assigned_vifs); + INIT_LIST_HEAD(&ctx->reserved_vifs); ctx->conf.def = *chandef; ctx->conf.rx_chains_static = 1; ctx->conf.rx_chains_dynamic = 1; ctx->mode = mode; ctx->conf.radar_enabled = ieee80211_is_radar_required(local); ieee80211_recalc_chanctx_min_def(local, ctx); + + return ctx; +} + +static int ieee80211_add_chanctx(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx) +{ + u32 changed; + int err; + + lockdep_assert_held(&local->mtx); + lockdep_assert_held(&local->chanctx_mtx); + if (!local->use_chanctx) local->hw.conf.radar_enabled = ctx->conf.radar_enabled; - /* acquire mutex to prevent idle from changing */ - mutex_lock(&local->mtx); /* turn idle off *before* setting channel -- some drivers need that */ changed = ieee80211_idle_off(local); if (changed) ieee80211_hw_config(local, changed); if (!local->use_chanctx) { - local->_oper_chandef = *chandef; - ieee80211_hw_config(local, 0); + local->_oper_chandef = ctx->conf.def; + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); } else { err = drv_add_chanctx(local, ctx); if (err) { - kfree(ctx); - ctx = ERR_PTR(err); - ieee80211_recalc_idle(local); - goto out; + return err; } } - /* and keep the mutex held until the new chanctx is on the list */ - list_add_rcu(&ctx->list, &local->chanctx_list); + return 0; +} - out: - 
mutex_unlock(&local->mtx); +static struct ieee80211_chanctx * +ieee80211_new_chanctx(struct ieee80211_local *local, + const struct cfg80211_chan_def *chandef, + enum ieee80211_chanctx_mode mode) +{ + struct ieee80211_chanctx *ctx; + int err; + + lockdep_assert_held(&local->mtx); + lockdep_assert_held(&local->chanctx_mtx); + + ctx = ieee80211_alloc_chanctx(local, chandef, mode); + if (!ctx) + return ERR_PTR(-ENOMEM); + + err = ieee80211_add_chanctx(local, ctx); + if (err) { + kfree(ctx); + return ERR_PTR(err); + } + list_add_rcu(&ctx->list, &local->chanctx_list); return ctx; } -static void ieee80211_free_chanctx(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx) +static void ieee80211_del_chanctx(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx) { - bool check_single_channel = false; lockdep_assert_held(&local->chanctx_mtx); - WARN_ON_ONCE(ctx->refcount != 0); - if (!local->use_chanctx) { struct cfg80211_chan_def *chandef = &local->_oper_chandef; chandef->width = NL80211_CHAN_WIDTH_20_NOHT; @@ -279,50 +480,29 @@ static void ieee80211_free_chanctx(struct ieee80211_local *local, /* NOTE: Disabling radar is only valid here for * single channel context. To be sure, check it ... */ - if (local->hw.conf.radar_enabled) - check_single_channel = true; + WARN_ON(local->hw.conf.radar_enabled && + !list_empty(&local->chanctx_list)); + local->hw.conf.radar_enabled = false; - ieee80211_hw_config(local, 0); + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); } else { drv_remove_chanctx(local, ctx); } - list_del_rcu(&ctx->list); - kfree_rcu(ctx, rcu_head); - - /* throw a warning if this wasn't the only channel context. */ - WARN_ON(check_single_channel && !list_empty(&local->chanctx_list)); - - mutex_lock(&local->mtx); ieee80211_recalc_idle(local); - mutex_unlock(&local->mtx); } -static int ieee80211_assign_vif_chanctx(struct ieee80211_sub_if_data *sdata, - struct ieee80211_chanctx *ctx) +static void ieee80211_free_chanctx(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx) { - struct ieee80211_local *local = sdata->local; - int ret; - lockdep_assert_held(&local->chanctx_mtx); - ret = drv_assign_vif_chanctx(local, sdata, ctx); - if (ret) - return ret; - - rcu_assign_pointer(sdata->vif.chanctx_conf, &ctx->conf); - ctx->refcount++; - - ieee80211_recalc_txpower(sdata); - ieee80211_recalc_chanctx_min_def(local, ctx); - sdata->vif.bss_conf.idle = false; - - if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && - sdata->vif.type != NL80211_IFTYPE_MONITOR) - ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_IDLE); + WARN_ON_ONCE(ieee80211_chanctx_refcount(local, ctx) != 0); - return 0; + list_del_rcu(&ctx->list); + ieee80211_del_chanctx(local, ctx); + kfree_rcu(ctx, rcu_head); } static void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, @@ -358,73 +538,106 @@ static void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, ieee80211_change_chanctx(local, ctx, compat); } -static void ieee80211_unassign_vif_chanctx(struct ieee80211_sub_if_data *sdata, - struct ieee80211_chanctx *ctx) +static void ieee80211_recalc_radar_chanctx(struct ieee80211_local *local, + struct ieee80211_chanctx *chanctx) { - struct ieee80211_local *local = sdata->local; + bool radar_enabled; lockdep_assert_held(&local->chanctx_mtx); + /* for setting local->radar_detect_enabled */ + lockdep_assert_held(&local->mtx); - ctx->refcount--; - rcu_assign_pointer(sdata->vif.chanctx_conf, NULL); - - sdata->vif.bss_conf.idle = true; + radar_enabled = 
ieee80211_is_radar_required(local); - if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && - sdata->vif.type != NL80211_IFTYPE_MONITOR) - ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_IDLE); + if (radar_enabled == chanctx->conf.radar_enabled) + return; - drv_unassign_vif_chanctx(local, sdata, ctx); + chanctx->conf.radar_enabled = radar_enabled; + local->radar_detect_enabled = chanctx->conf.radar_enabled; - if (ctx->refcount > 0) { - ieee80211_recalc_chanctx_chantype(sdata->local, ctx); - ieee80211_recalc_smps_chanctx(local, ctx); - ieee80211_recalc_radar_chanctx(local, ctx); - ieee80211_recalc_chanctx_min_def(local, ctx); + if (!local->use_chanctx) { + local->hw.conf.radar_enabled = chanctx->conf.radar_enabled; + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); } + + drv_change_chanctx(local, chanctx, IEEE80211_CHANCTX_CHANGE_RADAR); } -static void __ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata) +static int ieee80211_assign_vif_chanctx(struct ieee80211_sub_if_data *sdata, + struct ieee80211_chanctx *new_ctx) { struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *conf; - struct ieee80211_chanctx *ctx; - - lockdep_assert_held(&local->chanctx_mtx); + struct ieee80211_chanctx *curr_ctx = NULL; + int ret = 0; conf = rcu_dereference_protected(sdata->vif.chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); - if (!conf) - return; - ctx = container_of(conf, struct ieee80211_chanctx, conf); + if (conf) { + curr_ctx = container_of(conf, struct ieee80211_chanctx, conf); - ieee80211_unassign_vif_chanctx(sdata, ctx); - if (ctx->refcount == 0) - ieee80211_free_chanctx(local, ctx); + drv_unassign_vif_chanctx(local, sdata, curr_ctx); + conf = NULL; + list_del(&sdata->assigned_chanctx_list); + } + + if (new_ctx) { + ret = drv_assign_vif_chanctx(local, sdata, new_ctx); + if (ret) + goto out; + + conf = &new_ctx->conf; + list_add(&sdata->assigned_chanctx_list, + &new_ctx->assigned_vifs); + } + +out: + rcu_assign_pointer(sdata->vif.chanctx_conf, conf); + + sdata->vif.bss_conf.idle = !conf; + + if (curr_ctx && ieee80211_chanctx_num_assigned(local, curr_ctx) > 0) { + ieee80211_recalc_chanctx_chantype(local, curr_ctx); + ieee80211_recalc_smps_chanctx(local, curr_ctx); + ieee80211_recalc_radar_chanctx(local, curr_ctx); + ieee80211_recalc_chanctx_min_def(local, curr_ctx); + } + + if (new_ctx && ieee80211_chanctx_num_assigned(local, new_ctx) > 0) { + ieee80211_recalc_txpower(sdata); + ieee80211_recalc_chanctx_min_def(local, new_ctx); + } + + if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && + sdata->vif.type != NL80211_IFTYPE_MONITOR) + ieee80211_bss_info_change_notify(sdata, + BSS_CHANGED_IDLE); + + return ret; } -void ieee80211_recalc_radar_chanctx(struct ieee80211_local *local, - struct ieee80211_chanctx *chanctx) +static void __ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata) { - bool radar_enabled; + struct ieee80211_local *local = sdata->local; + struct ieee80211_chanctx_conf *conf; + struct ieee80211_chanctx *ctx; lockdep_assert_held(&local->chanctx_mtx); - radar_enabled = ieee80211_is_radar_required(local); - - if (radar_enabled == chanctx->conf.radar_enabled) + conf = rcu_dereference_protected(sdata->vif.chanctx_conf, + lockdep_is_held(&local->chanctx_mtx)); + if (!conf) return; - chanctx->conf.radar_enabled = radar_enabled; - local->radar_detect_enabled = chanctx->conf.radar_enabled; + ctx = container_of(conf, struct ieee80211_chanctx, conf); - if (!local->use_chanctx) { - local->hw.conf.radar_enabled = 
chanctx->conf.radar_enabled; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); - } + if (sdata->reserved_chanctx) + ieee80211_vif_unreserve_chanctx(sdata); - drv_change_chanctx(local, chanctx, IEEE80211_CHANCTX_CHANGE_RADAR); + ieee80211_assign_vif_chanctx(sdata, NULL); + if (ieee80211_chanctx_refcount(local, ctx) == 0) + ieee80211_free_chanctx(local, ctx); } void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, @@ -489,6 +702,13 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, rx_chains_static = max(rx_chains_static, needed_static); rx_chains_dynamic = max(rx_chains_dynamic, needed_dynamic); } + + /* Disable SMPS for the monitor interface */ + sdata = rcu_dereference(local->monitor_sdata); + if (sdata && + rcu_access_pointer(sdata->vif.chanctx_conf) == &chanctx->conf) + rx_chains_dynamic = rx_chains_static = local->rx_chains; + rcu_read_unlock(); if (!local->use_chanctx) { @@ -516,11 +736,30 @@ int ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx *ctx; + u8 radar_detect_width = 0; int ret; + lockdep_assert_held(&local->mtx); + WARN_ON(sdata->dev && netif_carrier_ok(sdata->dev)); mutex_lock(&local->chanctx_mtx); + + ret = cfg80211_chandef_dfs_required(local->hw.wiphy, + chandef, + sdata->wdev.iftype); + if (ret < 0) + goto out; + if (ret > 0) + radar_detect_width = BIT(chandef->width); + + sdata->radar_required = ret; + + ret = ieee80211_check_combinations(sdata, chandef, mode, + radar_detect_width); + if (ret < 0) + goto out; + __ieee80211_vif_release_channel(sdata); ctx = ieee80211_find_chanctx(local, chandef, mode); @@ -536,7 +775,7 @@ int ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata, ret = ieee80211_assign_vif_chanctx(sdata, ctx); if (ret) { /* if assign fails refcount stays the same */ - if (ctx->refcount == 0) + if (ieee80211_chanctx_refcount(local, ctx) == 0) ieee80211_free_chanctx(local, ctx); goto out; } @@ -548,25 +787,142 @@ int ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata, return ret; } +static int __ieee80211_vif_change_channel(struct ieee80211_sub_if_data *sdata, + struct ieee80211_chanctx *ctx, + u32 *changed) +{ + struct ieee80211_local *local = sdata->local; + const struct cfg80211_chan_def *chandef = &sdata->csa_chandef; + u32 chanctx_changed = 0; + + if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef, + IEEE80211_CHAN_DISABLED)) + return -EINVAL; + + if (ieee80211_chanctx_refcount(local, ctx) != 1) + return -EINVAL; + + if (sdata->vif.bss_conf.chandef.width != chandef->width) { + chanctx_changed = IEEE80211_CHANCTX_CHANGE_WIDTH; + *changed |= BSS_CHANGED_BANDWIDTH; + } + + sdata->vif.bss_conf.chandef = *chandef; + ctx->conf.def = *chandef; + + chanctx_changed |= IEEE80211_CHANCTX_CHANGE_CHANNEL; + drv_change_chanctx(local, ctx, chanctx_changed); + + ieee80211_recalc_chanctx_chantype(local, ctx); + ieee80211_recalc_smps_chanctx(local, ctx); + ieee80211_recalc_radar_chanctx(local, ctx); + ieee80211_recalc_chanctx_min_def(local, ctx); + + return 0; +} + int ieee80211_vif_change_channel(struct ieee80211_sub_if_data *sdata, u32 *changed) { struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *ctx; - const struct cfg80211_chan_def *chandef = &sdata->csa_chandef; int ret; - u32 chanctx_changed = 0; + + lockdep_assert_held(&local->mtx); /* should never be called if not performing a channel switch. 
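The refcount test used above in __ieee80211_vif_change_channel (ieee80211_chanctx_refcount(local, ctx) != 1) comes from the new helpers added at the top of chan.c: instead of keeping a stored ctx->refcount, the count is derived on demand by walking the context's assigned and reserved vif lists, so it cannot drift out of sync with the list membership. A tiny userspace sketch of that derive-instead-of-store idea, with a hypothetical singly linked list standing in for the kernel's list_head:

    #include <stddef.h>

    struct vif_node {                       /* hypothetical vif entry */
            struct vif_node *next;
    };

    struct chanctx {
            struct vif_node *assigned_vifs; /* vifs currently using the context */
            struct vif_node *reserved_vifs; /* vifs that reserved it for a switch */
    };

    static int count_list(const struct vif_node *n)
    {
            int num = 0;

            for (; n; n = n->next)
                    num++;
            return num;
    }

    /* Reference count derived from the two lists rather than stored. */
    static int chanctx_refcount(const struct chanctx *ctx)
    {
            return count_list(ctx->assigned_vifs) + count_list(ctx->reserved_vifs);
    }
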
*/ if (WARN_ON(!sdata->vif.csa_active)) return -EINVAL; - if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef, - IEEE80211_CHAN_DISABLED)) + mutex_lock(&local->chanctx_mtx); + conf = rcu_dereference_protected(sdata->vif.chanctx_conf, + lockdep_is_held(&local->chanctx_mtx)); + if (!conf) { + ret = -EINVAL; + goto out; + } + + ctx = container_of(conf, struct ieee80211_chanctx, conf); + + ret = __ieee80211_vif_change_channel(sdata, ctx, changed); + out: + mutex_unlock(&local->chanctx_mtx); + return ret; +} + +static void +__ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata, + bool clear) +{ + struct ieee80211_local *local __maybe_unused = sdata->local; + struct ieee80211_sub_if_data *vlan; + struct ieee80211_chanctx_conf *conf; + + if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_AP)) + return; + + lockdep_assert_held(&local->mtx); + + /* Check that conf exists, even when clearing this function + * must be called with the AP's channel context still there + * as it would otherwise cause VLANs to have an invalid + * channel context pointer for a while, possibly pointing + * to a channel context that has already been freed. + */ + conf = rcu_dereference_protected(sdata->vif.chanctx_conf, + lockdep_is_held(&local->chanctx_mtx)); + WARN_ON(!conf); + + if (clear) + conf = NULL; + + list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) + rcu_assign_pointer(vlan->vif.chanctx_conf, conf); +} + +void ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata, + bool clear) +{ + struct ieee80211_local *local = sdata->local; + + mutex_lock(&local->chanctx_mtx); + + __ieee80211_vif_copy_chanctx_to_vlans(sdata, clear); + + mutex_unlock(&local->chanctx_mtx); +} + +int ieee80211_vif_unreserve_chanctx(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_chanctx *ctx = sdata->reserved_chanctx; + + lockdep_assert_held(&sdata->local->chanctx_mtx); + + if (WARN_ON(!ctx)) return -EINVAL; + list_del(&sdata->reserved_chanctx_list); + sdata->reserved_chanctx = NULL; + + if (ieee80211_chanctx_refcount(sdata->local, ctx) == 0) + ieee80211_free_chanctx(sdata->local, ctx); + + return 0; +} + +int ieee80211_vif_reserve_chanctx(struct ieee80211_sub_if_data *sdata, + const struct cfg80211_chan_def *chandef, + enum ieee80211_chanctx_mode mode, + bool radar_required) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_chanctx_conf *conf; + struct ieee80211_chanctx *new_ctx, *curr_ctx; + int ret = 0; + mutex_lock(&local->chanctx_mtx); + conf = rcu_dereference_protected(sdata->vif.chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); if (!conf) { @@ -574,30 +930,108 @@ int ieee80211_vif_change_channel(struct ieee80211_sub_if_data *sdata, goto out; } - ctx = container_of(conf, struct ieee80211_chanctx, conf); - if (ctx->refcount != 1) { + curr_ctx = container_of(conf, struct ieee80211_chanctx, conf); + + new_ctx = ieee80211_find_reservation_chanctx(local, chandef, mode); + if (!new_ctx) { + if (ieee80211_chanctx_refcount(local, curr_ctx) == 1 && + (local->hw.flags & IEEE80211_HW_CHANGE_RUNNING_CHANCTX)) { + /* if we're the only users of the chanctx and + * the driver supports changing a running + * context, reserve our current context + */ + new_ctx = curr_ctx; + } else if (ieee80211_can_create_new_chanctx(local)) { + /* create a new context and reserve it */ + new_ctx = ieee80211_new_chanctx(local, chandef, mode); + if (IS_ERR(new_ctx)) { + ret = PTR_ERR(new_ctx); + goto out; + } + } else { + ret = -EBUSY; + goto out; + } + } + + 
list_add(&sdata->reserved_chanctx_list, &new_ctx->reserved_vifs); + sdata->reserved_chanctx = new_ctx; + sdata->reserved_chandef = *chandef; + sdata->reserved_radar_required = radar_required; +out: + mutex_unlock(&local->chanctx_mtx); + return ret; +} + +int ieee80211_vif_use_reserved_context(struct ieee80211_sub_if_data *sdata, + u32 *changed) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_chanctx *ctx; + struct ieee80211_chanctx *old_ctx; + struct ieee80211_chanctx_conf *conf; + int ret; + u32 tmp_changed = *changed; + + /* TODO: need to recheck if the chandef is usable etc.? */ + + lockdep_assert_held(&local->mtx); + + mutex_lock(&local->chanctx_mtx); + + ctx = sdata->reserved_chanctx; + if (WARN_ON(!ctx)) { ret = -EINVAL; goto out; } - if (sdata->vif.bss_conf.chandef.width != chandef->width) { - chanctx_changed = IEEE80211_CHANCTX_CHANGE_WIDTH; - *changed |= BSS_CHANGED_BANDWIDTH; + conf = rcu_dereference_protected(sdata->vif.chanctx_conf, + lockdep_is_held(&local->chanctx_mtx)); + if (!conf) { + ret = -EINVAL; + goto out; } - sdata->vif.bss_conf.chandef = *chandef; - ctx->conf.def = *chandef; + old_ctx = container_of(conf, struct ieee80211_chanctx, conf); - chanctx_changed |= IEEE80211_CHANCTX_CHANGE_CHANNEL; - drv_change_chanctx(local, ctx, chanctx_changed); + if (sdata->vif.bss_conf.chandef.width != sdata->reserved_chandef.width) + tmp_changed |= BSS_CHANGED_BANDWIDTH; + + sdata->vif.bss_conf.chandef = sdata->reserved_chandef; + + /* unref our reservation */ + sdata->reserved_chanctx = NULL; + sdata->radar_required = sdata->reserved_radar_required; + list_del(&sdata->reserved_chanctx_list); + + if (old_ctx == ctx) { + /* This is our own context, just change it */ + ret = __ieee80211_vif_change_channel(sdata, old_ctx, + &tmp_changed); + if (ret) + goto out; + } else { + ret = ieee80211_assign_vif_chanctx(sdata, ctx); + if (ieee80211_chanctx_refcount(local, old_ctx) == 0) + ieee80211_free_chanctx(local, old_ctx); + if (ret) { + /* if assign fails refcount stays the same */ + if (ieee80211_chanctx_refcount(local, ctx) == 0) + ieee80211_free_chanctx(local, ctx); + goto out; + } + + if (sdata->vif.type == NL80211_IFTYPE_AP) + __ieee80211_vif_copy_chanctx_to_vlans(sdata, false); + } + + *changed = tmp_changed; ieee80211_recalc_chanctx_chantype(local, ctx); ieee80211_recalc_smps_chanctx(local, ctx); ieee80211_recalc_radar_chanctx(local, ctx); ieee80211_recalc_chanctx_min_def(local, ctx); - - ret = 0; - out: +out: mutex_unlock(&local->chanctx_mtx); return ret; } @@ -655,6 +1089,8 @@ void ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata) { WARN_ON(sdata->dev && netif_carrier_ok(sdata->dev)); + lockdep_assert_held(&sdata->local->mtx); + mutex_lock(&sdata->local->chanctx_mtx); __ieee80211_vif_release_channel(sdata); mutex_unlock(&sdata->local->chanctx_mtx); @@ -679,40 +1115,6 @@ void ieee80211_vif_vlan_copy_chanctx(struct ieee80211_sub_if_data *sdata) mutex_unlock(&local->chanctx_mtx); } -void ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata, - bool clear) -{ - struct ieee80211_local *local = sdata->local; - struct ieee80211_sub_if_data *vlan; - struct ieee80211_chanctx_conf *conf; - - ASSERT_RTNL(); - - if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_AP)) - return; - - mutex_lock(&local->chanctx_mtx); - - /* - * Check that conf exists, even when clearing this function - * must be called with the AP's channel context still there - * as it would otherwise cause VLANs to have an invalid - * channel context pointer for a while, possibly 
pointing - * to a channel context that has already been freed. - */ - conf = rcu_dereference_protected(sdata->vif.chanctx_conf, - lockdep_is_held(&local->chanctx_mtx)); - WARN_ON(!conf); - - if (clear) - conf = NULL; - - list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) - rcu_assign_pointer(vlan->vif.chanctx_conf, conf); - - mutex_unlock(&local->chanctx_mtx); -} - void ieee80211_iter_chan_contexts_atomic( struct ieee80211_hw *hw, void (*iter)(struct ieee80211_hw *hw, diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index fa16e54980a..0e963bc1cea 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -128,7 +128,7 @@ static ssize_t sta_tx_latency_stat_write(struct file *file, if (!strcmp(buf, TX_LATENCY_DISABLED)) { if (!tx_latency) goto unlock; - rcu_assign_pointer(local->tx_latency, NULL); + RCU_INIT_POINTER(local->tx_latency, NULL); synchronize_rcu(); kfree(tx_latency); goto unlock; diff --git a/net/mac80211/debugfs.h b/net/mac80211/debugfs.h index 214ed4ecd73..60c35afee29 100644 --- a/net/mac80211/debugfs.h +++ b/net/mac80211/debugfs.h @@ -1,6 +1,8 @@ #ifndef __MAC80211_DEBUGFS_H #define __MAC80211_DEBUGFS_H +#include "ieee80211_i.h" + #ifdef CONFIG_MAC80211_DEBUGFS void debugfs_hw_add(struct ieee80211_local *local); int __printf(4, 5) mac80211_format_buffer(char __user *userbuf, size_t count, diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 04b5a14c8a0..e205ebabfa5 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -34,8 +34,7 @@ static ssize_t ieee80211_if_read( ssize_t ret = -EINVAL; read_lock(&dev_base_lock); - if (sdata->dev->reg_state == NETREG_REGISTERED) - ret = (*format)(sdata, buf, sizeof(buf)); + ret = (*format)(sdata, buf, sizeof(buf)); read_unlock(&dev_base_lock); if (ret >= 0) @@ -62,8 +61,7 @@ static ssize_t ieee80211_if_write( ret = -ENODEV; rtnl_lock(); - if (sdata->dev->reg_state == NETREG_REGISTERED) - ret = (*write)(sdata, buf, count); + ret = (*write)(sdata, buf, count); rtnl_unlock(); return ret; @@ -133,7 +131,15 @@ static ssize_t ieee80211_if_fmt_##name( \ jiffies_to_msecs(sdata->field)); \ } -#define __IEEE80211_IF_FILE(name, _write) \ +#define _IEEE80211_IF_FILE_OPS(name, _read, _write) \ +static const struct file_operations name##_ops = { \ + .read = (_read), \ + .write = (_write), \ + .open = simple_open, \ + .llseek = generic_file_llseek, \ +} + +#define _IEEE80211_IF_FILE_R_FN(name) \ static ssize_t ieee80211_if_read_##name(struct file *file, \ char __user *userbuf, \ size_t count, loff_t *ppos) \ @@ -141,28 +147,34 @@ static ssize_t ieee80211_if_read_##name(struct file *file, \ return ieee80211_if_read(file->private_data, \ userbuf, count, ppos, \ ieee80211_if_fmt_##name); \ -} \ -static const struct file_operations name##_ops = { \ - .read = ieee80211_if_read_##name, \ - .write = (_write), \ - .open = simple_open, \ - .llseek = generic_file_llseek, \ } -#define __IEEE80211_IF_FILE_W(name) \ +#define _IEEE80211_IF_FILE_W_FN(name) \ static ssize_t ieee80211_if_write_##name(struct file *file, \ const char __user *userbuf, \ size_t count, loff_t *ppos) \ { \ return ieee80211_if_write(file->private_data, userbuf, count, \ ppos, ieee80211_if_parse_##name); \ -} \ -__IEEE80211_IF_FILE(name, ieee80211_if_write_##name) +} + +#define IEEE80211_IF_FILE_R(name) \ + _IEEE80211_IF_FILE_R_FN(name) \ + _IEEE80211_IF_FILE_OPS(name, ieee80211_if_read_##name, NULL) + +#define IEEE80211_IF_FILE_W(name) \ + _IEEE80211_IF_FILE_W_FN(name) \ + _IEEE80211_IF_FILE_OPS(name, NULL, 
ieee80211_if_write_##name) +#define IEEE80211_IF_FILE_RW(name) \ + _IEEE80211_IF_FILE_R_FN(name) \ + _IEEE80211_IF_FILE_W_FN(name) \ + _IEEE80211_IF_FILE_OPS(name, ieee80211_if_read_##name, \ + ieee80211_if_write_##name) #define IEEE80211_IF_FILE(name, field, format) \ - IEEE80211_IF_FMT_##format(name, field) \ - __IEEE80211_IF_FILE(name, NULL) + IEEE80211_IF_FMT_##format(name, field) \ + IEEE80211_IF_FILE_R(name) /* common attributes */ IEEE80211_IF_FILE(drop_unencrypted, drop_unencrypted, DEC); @@ -199,7 +211,7 @@ ieee80211_if_fmt_hw_queues(const struct ieee80211_sub_if_data *sdata, return len; } -__IEEE80211_IF_FILE(hw_queues, NULL); +IEEE80211_IF_FILE_R(hw_queues); /* STA attributes */ IEEE80211_IF_FILE(bssid, u.mgd.bssid, MAC); @@ -275,14 +287,7 @@ static ssize_t ieee80211_if_parse_smps(struct ieee80211_sub_if_data *sdata, return -EINVAL; } - -__IEEE80211_IF_FILE_W(smps); - -static ssize_t ieee80211_if_fmt_tkip_mic_test( - const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) -{ - return -EOPNOTSUPP; -} +IEEE80211_IF_FILE_RW(smps); static ssize_t ieee80211_if_parse_tkip_mic_test( struct ieee80211_sub_if_data *sdata, const char *buf, int buflen) @@ -349,8 +354,19 @@ static ssize_t ieee80211_if_parse_tkip_mic_test( return buflen; } +IEEE80211_IF_FILE_W(tkip_mic_test); + +static ssize_t ieee80211_if_parse_beacon_loss( + struct ieee80211_sub_if_data *sdata, const char *buf, int buflen) +{ + if (!ieee80211_sdata_running(sdata) || !sdata->vif.bss_conf.assoc) + return -ENOTCONN; -__IEEE80211_IF_FILE_W(tkip_mic_test); + ieee80211_beacon_loss(&sdata->vif); + + return buflen; +} +IEEE80211_IF_FILE_W(beacon_loss); static ssize_t ieee80211_if_fmt_uapsd_queues( const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) @@ -378,7 +394,7 @@ static ssize_t ieee80211_if_parse_uapsd_queues( return buflen; } -__IEEE80211_IF_FILE_W(uapsd_queues); +IEEE80211_IF_FILE_RW(uapsd_queues); static ssize_t ieee80211_if_fmt_uapsd_max_sp_len( const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) @@ -406,7 +422,7 @@ static ssize_t ieee80211_if_parse_uapsd_max_sp_len( return buflen; } -__IEEE80211_IF_FILE_W(uapsd_max_sp_len); +IEEE80211_IF_FILE_RW(uapsd_max_sp_len); /* AP attributes */ IEEE80211_IF_FILE(num_mcast_sta, u.ap.num_mcast_sta, ATOMIC); @@ -419,7 +435,7 @@ static ssize_t ieee80211_if_fmt_num_buffered_multicast( return scnprintf(buf, buflen, "%u\n", skb_queue_len(&sdata->u.ap.ps.bc_buf)); } -__IEEE80211_IF_FILE(num_buffered_multicast, NULL); +IEEE80211_IF_FILE_R(num_buffered_multicast); /* IBSS attributes */ static ssize_t ieee80211_if_fmt_tsf( @@ -468,9 +484,10 @@ static ssize_t ieee80211_if_parse_tsf( } } + ieee80211_recalc_dtim(local, sdata); return buflen; } -__IEEE80211_IF_FILE_W(tsf); +IEEE80211_IF_FILE_RW(tsf); /* WDS attributes */ @@ -562,6 +579,7 @@ static void add_sta_files(struct ieee80211_sub_if_data *sdata) DEBUGFS_ADD(beacon_timeout); DEBUGFS_ADD_MODE(smps, 0600); DEBUGFS_ADD_MODE(tkip_mic_test, 0200); + DEBUGFS_ADD_MODE(beacon_loss, 0200); DEBUGFS_ADD_MODE(uapsd_queues, 0600); DEBUGFS_ADD_MODE(uapsd_max_sp_len, 0600); } diff --git a/net/mac80211/debugfs_netdev.h b/net/mac80211/debugfs_netdev.h index 79025e79f4d..9f5501a9a79 100644 --- a/net/mac80211/debugfs_netdev.h +++ b/net/mac80211/debugfs_netdev.h @@ -3,6 +3,8 @@ #ifndef __IEEE80211_DEBUGFS_NETDEV_H #define __IEEE80211_DEBUGFS_NETDEV_H +#include "ieee80211_i.h" + #ifdef CONFIG_MAC80211_DEBUGFS void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata); void ieee80211_debugfs_remove_netdev(struct 
ieee80211_sub_if_data *sdata); diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 80194b557a0..2ecb4deddb5 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -195,7 +195,7 @@ static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf, static ssize_t sta_agg_status_write(struct file *file, const char __user *userbuf, size_t count, loff_t *ppos) { - char _buf[12], *buf = _buf; + char _buf[12] = {}, *buf = _buf; struct sta_info *sta = file->private_data; bool start, tx; unsigned long tid; diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 5d03c47c0a4..bd782dcffcc 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -5,11 +5,11 @@ #include "ieee80211_i.h" #include "trace.h" -static inline void check_sdata_in_driver(struct ieee80211_sub_if_data *sdata) +static inline bool check_sdata_in_driver(struct ieee80211_sub_if_data *sdata) { - WARN(!(sdata->flags & IEEE80211_SDATA_IN_DRIVER), - "%s: Failed check-sdata-in-driver check, flags: 0x%x\n", - sdata->dev ? sdata->dev->name : sdata->name, sdata->flags); + return !WARN(!(sdata->flags & IEEE80211_SDATA_IN_DRIVER), + "%s: Failed check-sdata-in-driver check, flags: 0x%x\n", + sdata->dev ? sdata->dev->name : sdata->name, sdata->flags); } static inline struct ieee80211_sub_if_data * @@ -168,7 +168,8 @@ static inline int drv_change_interface(struct ieee80211_local *local, might_sleep(); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return -EIO; trace_drv_change_interface(local, sdata, type, p2p); ret = local->ops->change_interface(&local->hw, &sdata->vif, type, p2p); @@ -181,7 +182,8 @@ static inline void drv_remove_interface(struct ieee80211_local *local, { might_sleep(); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return; trace_drv_remove_interface(local, sdata); local->ops->remove_interface(&local->hw, &sdata->vif); @@ -219,7 +221,8 @@ static inline void drv_bss_info_changed(struct ieee80211_local *local, sdata->vif.type == NL80211_IFTYPE_MONITOR)) return; - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return; trace_drv_bss_info_changed(local, sdata, info, changed); if (local->ops->bss_info_changed) @@ -242,22 +245,6 @@ static inline u64 drv_prepare_multicast(struct ieee80211_local *local, return ret; } -static inline void drv_set_multicast_list(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - struct netdev_hw_addr_list *mc_list) -{ - bool allmulti = sdata->flags & IEEE80211_SDATA_ALLMULTI; - - trace_drv_set_multicast_list(local, sdata, mc_list->count); - - check_sdata_in_driver(sdata); - - if (local->ops->set_multicast_list) - local->ops->set_multicast_list(&local->hw, &sdata->vif, - allmulti, mc_list); - trace_drv_return_void(local); -} - static inline void drv_configure_filter(struct ieee80211_local *local, unsigned int changed_flags, unsigned int *total_flags, @@ -294,7 +281,8 @@ static inline int drv_set_key(struct ieee80211_local *local, might_sleep(); sdata = get_bss_sdata(sdata); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return -EIO; trace_drv_set_key(local, cmd, sdata, sta, key); ret = local->ops->set_key(&local->hw, cmd, &sdata->vif, sta, key); @@ -314,7 +302,8 @@ static inline void drv_update_tkip_key(struct ieee80211_local *local, ista = &sta->sta; sdata = get_bss_sdata(sdata); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return; trace_drv_update_tkip_key(local, sdata, conf, ista, 
iv32); if (local->ops->update_tkip_key) @@ -331,7 +320,8 @@ static inline int drv_hw_scan(struct ieee80211_local *local, might_sleep(); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return -EIO; trace_drv_hw_scan(local, sdata); ret = local->ops->hw_scan(&local->hw, &sdata->vif, req); @@ -344,7 +334,8 @@ static inline void drv_cancel_hw_scan(struct ieee80211_local *local, { might_sleep(); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return; trace_drv_cancel_hw_scan(local, sdata); local->ops->cancel_hw_scan(&local->hw, &sdata->vif); @@ -361,7 +352,8 @@ drv_sched_scan_start(struct ieee80211_local *local, might_sleep(); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return -EIO; trace_drv_sched_scan_start(local, sdata); ret = local->ops->sched_scan_start(&local->hw, &sdata->vif, @@ -370,16 +362,21 @@ drv_sched_scan_start(struct ieee80211_local *local, return ret; } -static inline void drv_sched_scan_stop(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata) +static inline int drv_sched_scan_stop(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) { + int ret; + might_sleep(); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return -EIO; trace_drv_sched_scan_stop(local, sdata); - local->ops->sched_scan_stop(&local->hw, &sdata->vif); - trace_drv_return_void(local); + ret = local->ops->sched_scan_stop(&local->hw, &sdata->vif); + trace_drv_return_int(local, ret); + + return ret; } static inline void drv_sw_scan_start(struct ieee80211_local *local) @@ -474,7 +471,8 @@ static inline void drv_sta_notify(struct ieee80211_local *local, struct ieee80211_sta *sta) { sdata = get_bss_sdata(sdata); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return; trace_drv_sta_notify(local, sdata, cmd, sta); if (local->ops->sta_notify) @@ -491,7 +489,8 @@ static inline int drv_sta_add(struct ieee80211_local *local, might_sleep(); sdata = get_bss_sdata(sdata); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return -EIO; trace_drv_sta_add(local, sdata, sta); if (local->ops->sta_add) @@ -509,7 +508,8 @@ static inline void drv_sta_remove(struct ieee80211_local *local, might_sleep(); sdata = get_bss_sdata(sdata); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return; trace_drv_sta_remove(local, sdata, sta); if (local->ops->sta_remove) @@ -527,7 +527,8 @@ static inline void drv_sta_add_debugfs(struct ieee80211_local *local, might_sleep(); sdata = get_bss_sdata(sdata); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return; if (local->ops->sta_add_debugfs) local->ops->sta_add_debugfs(&local->hw, &sdata->vif, @@ -550,6 +551,23 @@ static inline void drv_sta_remove_debugfs(struct ieee80211_local *local, } #endif +static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct sta_info *sta) +{ + might_sleep(); + + sdata = get_bss_sdata(sdata); + if (!check_sdata_in_driver(sdata)) + return; + + trace_drv_sta_pre_rcu_remove(local, sdata, &sta->sta); + if (local->ops->sta_pre_rcu_remove) + local->ops->sta_pre_rcu_remove(&local->hw, &sdata->vif, + &sta->sta); + trace_drv_return_void(local); +} + static inline __must_check int drv_sta_state(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, @@ -562,7 +580,8 @@ int drv_sta_state(struct ieee80211_local *local, might_sleep(); sdata = get_bss_sdata(sdata); - check_sdata_in_driver(sdata); + if 
(!check_sdata_in_driver(sdata)) + return -EIO; trace_drv_sta_state(local, sdata, &sta->sta, old_state, new_state); if (local->ops->sta_state) { @@ -586,7 +605,8 @@ static inline void drv_sta_rc_update(struct ieee80211_local *local, struct ieee80211_sta *sta, u32 changed) { sdata = get_bss_sdata(sdata); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return; WARN_ON(changed & IEEE80211_RC_SUPP_RATES_CHANGED && (sdata->vif.type != NL80211_IFTYPE_ADHOC && @@ -608,7 +628,8 @@ static inline int drv_conf_tx(struct ieee80211_local *local, might_sleep(); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return -EIO; trace_drv_conf_tx(local, sdata, ac, params); if (local->ops->conf_tx) @@ -625,7 +646,8 @@ static inline u64 drv_get_tsf(struct ieee80211_local *local, might_sleep(); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return ret; trace_drv_get_tsf(local, sdata); if (local->ops->get_tsf) @@ -640,7 +662,8 @@ static inline void drv_set_tsf(struct ieee80211_local *local, { might_sleep(); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return; trace_drv_set_tsf(local, sdata, tsf); if (local->ops->set_tsf) @@ -653,7 +676,8 @@ static inline void drv_reset_tsf(struct ieee80211_local *local, { might_sleep(); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return; trace_drv_reset_tsf(local, sdata); if (local->ops->reset_tsf) @@ -685,7 +709,8 @@ static inline int drv_ampdu_action(struct ieee80211_local *local, might_sleep(); sdata = get_bss_sdata(sdata); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return -EIO; trace_drv_ampdu_action(local, sdata, action, sta, tid, ssn, buf_size); @@ -722,13 +747,19 @@ static inline void drv_rfkill_poll(struct ieee80211_local *local) } static inline void drv_flush(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, u32 queues, bool drop) { + struct ieee80211_vif *vif = sdata ? 
&sdata->vif : NULL; + might_sleep(); + if (sdata && !check_sdata_in_driver(sdata)) + return; + trace_drv_flush(local, queues, drop); if (local->ops->flush) - local->ops->flush(&local->hw, queues, drop); + local->ops->flush(&local->hw, vif, queues, drop); trace_drv_return_void(local); } @@ -844,7 +875,8 @@ static inline int drv_set_bitrate_mask(struct ieee80211_local *local, might_sleep(); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return -EIO; trace_drv_set_bitrate_mask(local, sdata, mask); if (local->ops->set_bitrate_mask) @@ -859,7 +891,8 @@ static inline void drv_set_rekey_data(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_gtk_rekey_data *data) { - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return; trace_drv_set_rekey_data(local, sdata, data); if (local->ops->set_rekey_data) @@ -927,7 +960,8 @@ static inline void drv_mgd_prepare_tx(struct ieee80211_local *local, { might_sleep(); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return; WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); trace_drv_mgd_prepare_tx(local, sdata); @@ -954,6 +988,9 @@ static inline int drv_add_chanctx(struct ieee80211_local *local, static inline void drv_remove_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { + if (WARN_ON(!ctx->driver_present)) + return; + trace_drv_remove_chanctx(local, ctx); if (local->ops->remove_chanctx) local->ops->remove_chanctx(&local->hw, &ctx->conf); @@ -979,7 +1016,8 @@ static inline int drv_assign_vif_chanctx(struct ieee80211_local *local, { int ret = 0; - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return -EIO; trace_drv_assign_vif_chanctx(local, sdata, ctx); if (local->ops->assign_vif_chanctx) { @@ -997,7 +1035,8 @@ static inline void drv_unassign_vif_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_chanctx *ctx) { - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return; trace_drv_unassign_vif_chanctx(local, sdata, ctx); if (local->ops->unassign_vif_chanctx) { @@ -1009,12 +1048,66 @@ static inline void drv_unassign_vif_chanctx(struct ieee80211_local *local, trace_drv_return_void(local); } +static inline int +drv_switch_vif_chanctx(struct ieee80211_local *local, + struct ieee80211_vif_chanctx_switch *vifs, + int n_vifs, + enum ieee80211_chanctx_switch_mode mode) +{ + int ret = 0; + int i; + + if (!local->ops->switch_vif_chanctx) + return -EOPNOTSUPP; + + for (i = 0; i < n_vifs; i++) { + struct ieee80211_chanctx *new_ctx = + container_of(vifs[i].new_ctx, + struct ieee80211_chanctx, + conf); + struct ieee80211_chanctx *old_ctx = + container_of(vifs[i].old_ctx, + struct ieee80211_chanctx, + conf); + + WARN_ON_ONCE(!old_ctx->driver_present); + WARN_ON_ONCE((mode == CHANCTX_SWMODE_SWAP_CONTEXTS && + new_ctx->driver_present) || + (mode == CHANCTX_SWMODE_REASSIGN_VIF && + !new_ctx->driver_present)); + } + + trace_drv_switch_vif_chanctx(local, vifs, n_vifs, mode); + ret = local->ops->switch_vif_chanctx(&local->hw, + vifs, n_vifs, mode); + trace_drv_return_int(local, ret); + + if (!ret && mode == CHANCTX_SWMODE_SWAP_CONTEXTS) { + for (i = 0; i < n_vifs; i++) { + struct ieee80211_chanctx *new_ctx = + container_of(vifs[i].new_ctx, + struct ieee80211_chanctx, + conf); + struct ieee80211_chanctx *old_ctx = + container_of(vifs[i].old_ctx, + struct ieee80211_chanctx, + conf); + + new_ctx->driver_present = true; + old_ctx->driver_present = false; + } + } + + return 
ret; +} + static inline int drv_start_ap(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { int ret = 0; - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return -EIO; trace_drv_start_ap(local, sdata, &sdata->vif.bss_conf); if (local->ops->start_ap) @@ -1026,7 +1119,8 @@ static inline int drv_start_ap(struct ieee80211_local *local, static inline void drv_stop_ap(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return; trace_drv_stop_ap(local, sdata); if (local->ops->stop_ap) @@ -1049,7 +1143,8 @@ drv_set_default_unicast_key(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, int key_idx) { - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return; WARN_ON_ONCE(key_idx < -1 || key_idx > 3); @@ -1091,7 +1186,8 @@ static inline int drv_join_ibss(struct ieee80211_local *local, int ret = 0; might_sleep(); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return -EIO; trace_drv_join_ibss(local, sdata, &sdata->vif.bss_conf); if (local->ops->join_ibss) @@ -1104,7 +1200,8 @@ static inline void drv_leave_ibss(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); - check_sdata_in_driver(sdata); + if (!check_sdata_in_driver(sdata)) + return; trace_drv_leave_ibss(local, sdata); if (local->ops->leave_ibss) @@ -1112,4 +1209,17 @@ static inline void drv_leave_ibss(struct ieee80211_local *local, trace_drv_return_void(local); } +static inline u32 drv_get_expected_throughput(struct ieee80211_local *local, + struct ieee80211_sta *sta) +{ + u32 ret = 0; + + trace_drv_get_expected_throughput(sta); + if (local->ops->get_expected_throughput) + ret = local->ops->get_expected_throughput(sta); + trace_drv_return_u32(local, ret); + + return ret; +} + #endif /* __MAC80211_DRIVER_OPS */ diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 9a8be8f6922..15702ff64a4 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -31,6 +31,18 @@ static void __check_htcap_disable(struct ieee80211_ht_cap *ht_capa, } } +static void __check_htcap_enable(struct ieee80211_ht_cap *ht_capa, + struct ieee80211_ht_cap *ht_capa_mask, + struct ieee80211_sta_ht_cap *ht_cap, + u16 flag) +{ + __le16 le_flag = cpu_to_le16(flag); + + if ((ht_capa_mask->cap_info & le_flag) && + (ht_capa->cap_info & le_flag)) + ht_cap->cap |= flag; +} + void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_ht_cap *ht_cap) { @@ -59,7 +71,7 @@ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, smask = (u8 *)(&ht_capa_mask->mcs.rx_mask); /* NOTE: If you add more over-rides here, update register_hw - * ht_capa_mod_msk logic in main.c as well. + * ht_capa_mod_mask logic in main.c as well. * And, if this method can ever change ht_cap.ht_supported, fix * the check in ieee80211_add_ht_ie. */ @@ -86,6 +98,14 @@ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, __check_htcap_disable(ht_capa, ht_capa_mask, ht_cap, IEEE80211_HT_CAP_MAX_AMSDU); + /* Allow user to disable LDPC */ + __check_htcap_disable(ht_capa, ht_capa_mask, ht_cap, + IEEE80211_HT_CAP_LDPC_CODING); + + /* Allow user to enable 40 MHz intolerant bit. 
*/ + __check_htcap_enable(ht_capa, ht_capa_mask, ht_cap, + IEEE80211_HT_CAP_40MHZ_INTOLERANT); + /* Allow user to decrease AMPDU factor */ if (ht_capa_mask->ampdu_params_info & IEEE80211_HT_AMPDU_PARM_FACTOR) { @@ -375,7 +395,7 @@ void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, mgmt->u.action.u.delba.params = cpu_to_le16(params); mgmt->u.action.u.delba.reason_code = cpu_to_le16(reason_code); - ieee80211_tx_skb_tid(sdata, skb, tid); + ieee80211_tx_skb(sdata, skb); } void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, @@ -466,7 +486,9 @@ void ieee80211_request_smps_ap_work(struct work_struct *work) u.ap.request_smps_work); sdata_lock(sdata); - __ieee80211_request_smps_ap(sdata, sdata->u.ap.driver_smps_mode); + if (sdata_dereference(sdata->u.ap.beacon, sdata)) + __ieee80211_request_smps_ap(sdata, + sdata->u.ap.driver_smps_mode); sdata_unlock(sdata); } @@ -479,9 +501,6 @@ void ieee80211_request_smps(struct ieee80211_vif *vif, vif->type != NL80211_IFTYPE_AP)) return; - if (WARN_ON(smps_mode == IEEE80211_SMPS_OFF)) - smps_mode = IEEE80211_SMPS_AUTOMATIC; - if (vif->type == NL80211_IFTYPE_STATION) { if (sdata->u.mgd.driver_smps_mode == smps_mode) return; diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 2eda7b13124..18ee0a256b1 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -143,7 +143,7 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata, *pos++ = csa_settings->block_tx ? 1 : 0; *pos++ = ieee80211_frequency_to_channel( csa_settings->chandef.chan->center_freq); - sdata->csa_counter_offset_beacon = (pos - presp->head); + sdata->csa_counter_offset_beacon[0] = (pos - presp->head); *pos++ = csa_settings->count; } @@ -220,7 +220,6 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_local *local = sdata->local; - struct ieee80211_supported_band *sband; struct ieee80211_mgmt *mgmt; struct cfg80211_bss *bss; u32 bss_change; @@ -229,7 +228,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, struct beacon_data *presp; enum nl80211_bss_scan_width scan_width; bool have_higher_than_11mbit; - bool radar_required = false; + bool radar_required; int err; sdata_assert_lock(sdata); @@ -254,7 +253,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, presp = rcu_dereference_protected(ifibss->presp, lockdep_is_held(&sdata->wdev.mtx)); - rcu_assign_pointer(ifibss->presp, NULL); + RCU_INIT_POINTER(ifibss->presp, NULL); if (presp) kfree_rcu(presp, rcu_head); @@ -263,7 +262,8 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, /* make a copy of the chandef, it could be modified below. 
*/ chandef = *req_chandef; chan = chandef.chan; - if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chandef)) { + if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chandef, + NL80211_IFTYPE_ADHOC)) { if (chandef.width == NL80211_CHAN_WIDTH_5 || chandef.width == NL80211_CHAN_WIDTH_10 || chandef.width == NL80211_CHAN_WIDTH_20_NOHT || @@ -275,7 +275,8 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, chandef.width = NL80211_CHAN_WIDTH_20; chandef.center_freq1 = chan->center_freq; /* check again for downgraded chandef */ - if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chandef)) { + if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chandef, + NL80211_IFTYPE_ADHOC)) { sdata_info(sdata, "Failed to join IBSS, beacons forbidden\n"); return; @@ -283,29 +284,34 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, } err = cfg80211_chandef_dfs_required(sdata->local->hw.wiphy, - &chandef); - if (err > 0) { - if (!ifibss->userspace_handles_dfs) { - sdata_info(sdata, - "Failed to join IBSS, DFS channel without control program\n"); - return; - } - radar_required = true; + &chandef, NL80211_IFTYPE_ADHOC); + if (err < 0) { + sdata_info(sdata, + "Failed to join IBSS, invalid chandef\n"); + return; + } + if (err > 0 && !ifibss->userspace_handles_dfs) { + sdata_info(sdata, + "Failed to join IBSS, DFS channel without control program\n"); + return; } - ieee80211_vif_release_channel(sdata); + radar_required = err; + + mutex_lock(&local->mtx); if (ieee80211_vif_use_channel(sdata, &chandef, ifibss->fixed_channel ? IEEE80211_CHANCTX_SHARED : IEEE80211_CHANCTX_EXCLUSIVE)) { sdata_info(sdata, "Failed to join IBSS, no channel context\n"); + mutex_unlock(&local->mtx); return; } + sdata->radar_required = radar_required; + mutex_unlock(&local->mtx); memcpy(ifibss->bssid, bssid, ETH_ALEN); - sband = local->hw.wiphy->bands[chan->band]; - presp = ieee80211_ibss_build_presp(sdata, beacon_int, basic_rates, capability, tsf, &chandef, &have_higher_than_11mbit, NULL); @@ -315,7 +321,6 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, rcu_assign_pointer(ifibss->presp, presp); mgmt = (void *)presp->head; - sdata->radar_required = radar_required; sdata->vif.bss_conf.enable_beacon = true; sdata->vif.bss_conf.beacon_int = beacon_int; sdata->vif.bss_conf.basic_rates = basic_rates; @@ -363,7 +368,9 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.ssid_len = 0; RCU_INIT_POINTER(ifibss->presp, NULL); kfree_rcu(presp, rcu_head); + mutex_lock(&local->mtx); ieee80211_vif_release_channel(sdata); + mutex_unlock(&local->mtx); sdata_info(sdata, "Failed to join IBSS, driver failure: %d\n", err); return; @@ -381,7 +388,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, presp->head_len, 0, GFP_KERNEL); cfg80211_put_bss(local->hw.wiphy, bss); netif_carrier_on(sdata->dev); - cfg80211_ibss_joined(sdata->dev, ifibss->bssid, GFP_KERNEL); + cfg80211_ibss_joined(sdata->dev, ifibss->bssid, chan, GFP_KERNEL); } static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, @@ -516,13 +523,7 @@ int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata, if (old_presp) kfree_rcu(old_presp, rcu_head); - /* it might not send the beacon for a while. send an action frame - * immediately to announce the channel switch. 
- */ - if (csa_settings) - ieee80211_send_action_csa(sdata, csa_settings); - - ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON); + return BSS_CHANGED_BEACON; out: return ret; } @@ -531,10 +532,11 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct cfg80211_bss *cbss; - int err; + int err, changed = 0; u16 capability; - sdata_lock(sdata); + sdata_assert_lock(sdata); + /* update cfg80211 bss information with the new channel */ if (!is_zero_ether_addr(ifibss->bssid)) { capability = WLAN_CAPABILITY_IBSS; @@ -559,11 +561,12 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata) /* generate the beacon */ err = ieee80211_ibss_csa_beacon(sdata, NULL); - sdata_unlock(sdata); if (err < 0) return err; - return 0; + changed |= err; + + return changed; } void ieee80211_ibss_stop(struct ieee80211_sub_if_data *sdata) @@ -687,12 +690,9 @@ static void ieee80211_ibss_disconnect(struct ieee80211_sub_if_data *sdata) struct cfg80211_bss *cbss; struct beacon_data *presp; struct sta_info *sta; - int active_ibss; u16 capability; - active_ibss = ieee80211_sta_active_ibss(sdata); - - if (!active_ibss && !is_zero_ether_addr(ifibss->bssid)) { + if (!is_zero_ether_addr(ifibss->bssid)) { capability = WLAN_CAPABILITY_IBSS; if (ifibss->privacy) @@ -744,7 +744,9 @@ static void ieee80211_ibss_disconnect(struct ieee80211_sub_if_data *sdata) ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED | BSS_CHANGED_IBSS); drv_leave_ibss(local, sdata); + mutex_lock(&local->mtx); ieee80211_vif_release_channel(sdata); + mutex_unlock(&local->mtx); } static void ieee80211_csa_connection_drop_work(struct work_struct *work) @@ -753,12 +755,16 @@ static void ieee80211_csa_connection_drop_work(struct work_struct *work) container_of(work, struct ieee80211_sub_if_data, u.ibss.csa_connection_drop_work); + sdata_lock(sdata); + ieee80211_ibss_disconnect(sdata); synchronize_rcu(); skb_queue_purge(&sdata->skb_queue); /* trigger a scan to find another IBSS network to join */ ieee80211_queue_work(&sdata->local->hw, &sdata->work); + + sdata_unlock(sdata); } static void ieee80211_ibss_csa_mark_radar(struct ieee80211_sub_if_data *sdata) @@ -770,7 +776,8 @@ static void ieee80211_ibss_csa_mark_radar(struct ieee80211_sub_if_data *sdata) * unavailable. 
*/ err = cfg80211_chandef_dfs_required(sdata->local->hw.wiphy, - &ifibss->chandef); + &ifibss->chandef, + NL80211_IFTYPE_ADHOC); if (err > 0) cfg80211_radar_event(sdata->local->hw.wiphy, &ifibss->chandef, GFP_ATOMIC); @@ -784,17 +791,11 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings params; struct ieee80211_csa_ie csa_ie; struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; - struct ieee80211_chanctx_conf *chanctx_conf; - struct ieee80211_chanctx *chanctx; enum nl80211_channel_type ch_type; - int err, num_chanctx; + int err; u32 sta_flags; - if (sdata->vif.csa_active) - return true; - - if (!sdata->vif.bss_conf.ibss_joined) - return false; + sdata_assert_lock(sdata); sta_flags = IEEE80211_STA_DISABLE_VHT; switch (ifibss->chandef.width) { @@ -830,9 +831,6 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, params.count = csa_ie.count; params.chandef = csa_ie.chandef; - if (ifibss->chandef.chan->band != params.chandef.chan->band) - goto disconnect; - switch (ifibss->chandef.width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: @@ -865,7 +863,8 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, goto disconnect; } - if (!cfg80211_reg_can_beacon(sdata->local->hw.wiphy, &params.chandef)) { + if (!cfg80211_reg_can_beacon(sdata->local->hw.wiphy, &params.chandef, + NL80211_IFTYPE_ADHOC)) { sdata_info(sdata, "IBSS %pM switches to unsupported channel (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n", ifibss->bssid, @@ -877,39 +876,23 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, } err = cfg80211_chandef_dfs_required(sdata->local->hw.wiphy, - &params.chandef); + &params.chandef, + NL80211_IFTYPE_ADHOC); if (err < 0) goto disconnect; - if (err) { + if (err > 0 && !ifibss->userspace_handles_dfs) { /* IBSS-DFS only allowed with a control program */ - if (!ifibss->userspace_handles_dfs) - goto disconnect; - - params.radar_required = true; - } - - rcu_read_lock(); - chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); - if (!chanctx_conf) { - rcu_read_unlock(); goto disconnect; } - /* don't handle for multi-VIF cases */ - chanctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf); - if (chanctx->refcount > 1) { - rcu_read_unlock(); - goto disconnect; - } - num_chanctx = 0; - list_for_each_entry_rcu(chanctx, &sdata->local->chanctx_list, list) - num_chanctx++; + params.radar_required = err; - if (num_chanctx > 1) { - rcu_read_unlock(); - goto disconnect; + if (cfg80211_chandef_identical(&params.chandef, + &sdata->vif.bss_conf.chandef)) { + ibss_dbg(sdata, + "received csa with an identical chandef, ignoring\n"); + return true; } - rcu_read_unlock(); /* all checks done, now perform the channel switch. 
*/ ibss_dbg(sdata, @@ -918,19 +901,9 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, params.block_tx = !!csa_ie.mode; - ieee80211_ibss_csa_beacon(sdata, &params); - sdata->csa_radar_required = params.radar_required; - - if (params.block_tx) - ieee80211_stop_queues_by_reason(&sdata->local->hw, - IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_CSA); - - sdata->csa_chandef = params.chandef; - sdata->vif.csa_active = true; - - ieee80211_bss_info_change_notify(sdata, err); - drv_channel_switch_beacon(sdata, &params.chandef); + if (ieee80211_channel_switch(sdata->local->hw.wiphy, sdata->dev, + &params)) + goto disconnect; ieee80211_ibss_csa_mark_radar(sdata); @@ -966,7 +939,8 @@ ieee80211_rx_mgmt_spectrum_mgmt(struct ieee80211_sub_if_data *sdata, if (len < required_len) return; - ieee80211_ibss_process_chanswitch(sdata, elems, false); + if (!sdata->vif.csa_active) + ieee80211_ibss_process_chanswitch(sdata, elems, false); } static void ieee80211_rx_mgmt_deauth_ibss(struct ieee80211_sub_if_data *sdata, @@ -1020,7 +994,6 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems) { struct ieee80211_local *local = sdata->local; - int freq; struct cfg80211_bss *cbss; struct ieee80211_bss *bss; struct sta_info *sta; @@ -1032,15 +1005,8 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; bool rates_updated = false; - if (elems->ds_params) - freq = ieee80211_channel_to_frequency(elems->ds_params[0], - band); - else - freq = rx_status->freq; - - channel = ieee80211_get_channel(local->hw.wiphy, freq); - - if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) + channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq); + if (!channel) return; if (sdata->vif.type == NL80211_IFTYPE_ADHOC && @@ -1147,7 +1113,8 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, goto put_bss; /* process channel switch */ - if (ieee80211_ibss_process_chanswitch(sdata, elems, true)) + if (sdata->vif.csa_active || + ieee80211_ibss_process_chanswitch(sdata, elems, true)) goto put_bss; /* same BSSID */ @@ -1492,6 +1459,11 @@ static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata, memcpy(((struct ieee80211_mgmt *) skb->data)->da, mgmt->sa, ETH_ALEN); ibss_dbg(sdata, "Sending ProbeResp to %pM\n", mgmt->sa); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + + /* avoid excessive retries for probe request to wildcard SSIDs */ + if (pos[1] == 0) + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_NO_ACK; + ieee80211_tx_skb(sdata, skb); } @@ -1667,7 +1639,33 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, u32 changed = 0; u32 rate_flags; struct ieee80211_supported_band *sband; + enum ieee80211_chanctx_mode chanmode; + struct ieee80211_local *local = sdata->local; + int radar_detect_width = 0; int i; + int ret; + + ret = cfg80211_chandef_dfs_required(local->hw.wiphy, + &params->chandef, + sdata->wdev.iftype); + if (ret < 0) + return ret; + + if (ret > 0) { + if (!params->userspace_handles_dfs) + return -EINVAL; + radar_detect_width = BIT(params->chandef.width); + } + + chanmode = (params->channel_fixed && !ret) ? 
+ IEEE80211_CHANCTX_SHARED : IEEE80211_CHANCTX_EXCLUSIVE; + + mutex_lock(&local->chanctx_mtx); + ret = ieee80211_check_combinations(sdata, &params->chandef, chanmode, + radar_detect_width); + mutex_unlock(&local->chanctx_mtx); + if (ret < 0) + return ret; if (params->bssid) { memcpy(sdata->u.ibss.bssid, params->bssid, ETH_ALEN); @@ -1679,10 +1677,11 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, sdata->u.ibss.control_port = params->control_port; sdata->u.ibss.userspace_handles_dfs = params->userspace_handles_dfs; sdata->u.ibss.basic_rates = params->basic_rates; + sdata->u.ibss.last_scan_completed = jiffies; /* fix basic_rates if channel does not support these rates */ rate_flags = ieee80211_chandef_rate_flags(&params->chandef); - sband = sdata->local->hw.wiphy->bands[params->chandef.chan->band]; + sband = local->hw.wiphy->bands[params->chandef.chan->band]; for (i = 0; i < sband->n_bitrates; i++) { if ((rate_flags & sband->bitrates[i].flags) != rate_flags) sdata->u.ibss.basic_rates &= ~BIT(i); @@ -1731,9 +1730,9 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, ieee80211_bss_info_change_notify(sdata, changed); sdata->smps_mode = IEEE80211_SMPS_OFF; - sdata->needed_rx_chains = sdata->local->rx_chains; + sdata->needed_rx_chains = local->rx_chains; - ieee80211_queue_work(&sdata->local->hw, &sdata->work); + ieee80211_queue_work(&local->hw, &sdata->work); return 0; } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index ed5bf8b4b5c..ac9836e0aab 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -232,6 +232,7 @@ struct ieee80211_rx_data { struct beacon_data { u8 *head, *tail; int head_len, tail_len; + struct ieee80211_meshconf_ie *meshconf; struct rcu_head rcu_head; }; @@ -245,7 +246,8 @@ struct ps_data { /* yes, this looks ugly, but guarantees that we can later use * bitmap_empty :) * NB: don't touch this bitmap, use sta_info_{set,clear}_tim_bit */ - u8 tim[sizeof(unsigned long) * BITS_TO_LONGS(IEEE80211_MAX_AID + 1)]; + u8 tim[sizeof(unsigned long) * BITS_TO_LONGS(IEEE80211_MAX_AID + 1)] + __aligned(__alignof__(unsigned long)); struct sk_buff_head bc_buf; atomic_t num_sta_ps; /* number of stations in PS mode */ int dtim_count; @@ -258,7 +260,7 @@ struct ieee80211_if_ap { /* to be used after channel switch. 
*/ struct cfg80211_beacon_data *next_beacon; - struct list_head vlans; + struct list_head vlans; /* write-protected with RTNL and local->mtx */ struct ps_data ps; atomic_t num_mcast_sta; /* number of stations receiving multicast */ @@ -274,7 +276,7 @@ struct ieee80211_if_wds { }; struct ieee80211_if_vlan { - struct list_head list; + struct list_head list; /* write-protected with RTNL and local->mtx */ /* used for all tx if the VLAN is configured to 4-addr mode */ struct sta_info __rcu *sta; @@ -315,6 +317,7 @@ struct ieee80211_roc_work { bool started, abort, hw_begun, notified; bool to_be_freed; + bool on_channel; unsigned long hw_start_time; @@ -540,7 +543,10 @@ struct ieee80211_mesh_sync_ops { struct ieee80211_mgmt *mgmt, struct ieee802_11_elems *elems, struct ieee80211_rx_status *rx_status); - void (*adjust_tbtt)(struct ieee80211_sub_if_data *sdata); + + /* should be called with beacon_data under RCU read lock */ + void (*adjust_tbtt)(struct ieee80211_sub_if_data *sdata, + struct beacon_data *beacon); /* add other framework functions here */ }; @@ -611,9 +617,16 @@ struct ieee80211_if_mesh { struct ps_data ps; /* Channel Switching Support */ struct mesh_csa_settings __rcu *csa; - bool chsw_init; + enum { + IEEE80211_MESH_CSA_ROLE_NONE, + IEEE80211_MESH_CSA_ROLE_INIT, + IEEE80211_MESH_CSA_ROLE_REPEATER, + } csa_role; u8 chsw_ttl; u16 pre_value; + + /* offset from skb->data while building IE */ + int meshconf_offset; }; #ifdef CONFIG_MAC80211_MESH @@ -679,13 +692,20 @@ struct ieee80211_chanctx { struct list_head list; struct rcu_head rcu_head; + struct list_head assigned_vifs; + struct list_head reserved_vifs; + enum ieee80211_chanctx_mode mode; - int refcount; bool driver_present; struct ieee80211_chanctx_conf conf; }; +struct mac80211_qos_map { + struct cfg80211_qos_map qos_map; + struct rcu_head rcu_head; +}; + struct ieee80211_sub_if_data { struct list_head list; @@ -731,13 +751,24 @@ struct ieee80211_sub_if_data { int encrypt_headroom; struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS]; + struct mac80211_qos_map __rcu *qos_map; struct work_struct csa_finalize_work; - int csa_counter_offset_beacon; - int csa_counter_offset_presp; + u16 csa_counter_offset_beacon[IEEE80211_MAX_CSA_COUNTERS_NUM]; + u16 csa_counter_offset_presp[IEEE80211_MAX_CSA_COUNTERS_NUM]; bool csa_radar_required; + bool csa_block_tx; /* write-protected by sdata_lock and local->mtx */ struct cfg80211_chan_def csa_chandef; + struct list_head assigned_chanctx_list; /* protected by chanctx_mtx */ + struct list_head reserved_chanctx_list; /* protected by chanctx_mtx */ + + /* context reservation -- protected with chanctx_mtx */ + struct ieee80211_chanctx *reserved_chanctx; + struct cfg80211_chan_def reserved_chandef; + bool reserved_radar_required; + u8 csa_current_counter; + /* used to reconfigure hardware SM PS */ struct work_struct recalc_smps; @@ -776,10 +807,6 @@ struct ieee80211_sub_if_data { u32 mntr_flags; } u; - spinlock_t cleanup_stations_lock; - struct list_head cleanup_stations; - struct work_struct cleanup_stations_wk; - #ifdef CONFIG_MAC80211_DEBUGFS struct { struct dentry *subdir_stations; @@ -1117,6 +1144,7 @@ struct ieee80211_local { struct work_struct sched_scan_stopped_work; struct ieee80211_sub_if_data __rcu *sched_scan_sdata; + struct cfg80211_sched_scan_request *sched_scan_req; unsigned long leave_oper_channel_time; enum mac80211_scan_state next_scan_state; @@ -1227,6 +1255,8 @@ struct ieee80211_local { struct ieee80211_sub_if_data __rcu *p2p_sdata; + struct napi_struct *napi; + /* virtual 
monitor interface */ struct ieee80211_sub_if_data __rcu *monitor_sdata; struct cfg80211_chan_def monitor_chandef; @@ -1374,6 +1404,7 @@ void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata); void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata); void ieee80211_mgd_conn_tx_status(struct ieee80211_sub_if_data *sdata, __le16 fc, bool acked); +void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata); /* IBSS code */ @@ -1397,8 +1428,7 @@ void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata); void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); int ieee80211_mesh_csa_beacon(struct ieee80211_sub_if_data *sdata, - struct cfg80211_csa_settings *csa_settings, - bool csa_action); + struct cfg80211_csa_settings *csa_settings); int ieee80211_mesh_finish_csa(struct ieee80211_sub_if_data *sdata); /* scan/BSS handling */ @@ -1425,9 +1455,13 @@ void ieee80211_rx_bss_put(struct ieee80211_local *local, struct ieee80211_bss *bss); /* scheduled scan handling */ +int +__ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, + struct cfg80211_sched_scan_request *req); int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req); int ieee80211_request_sched_scan_stop(struct ieee80211_sub_if_data *sdata); +void ieee80211_sched_scan_end(struct ieee80211_local *local); void ieee80211_sched_scan_stopped_work(struct work_struct *work); /* off-channel helpers */ @@ -1442,7 +1476,10 @@ void ieee80211_sw_roc_work(struct work_struct *work); void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc); /* channel switch handling */ +bool ieee80211_csa_needs_block_tx(struct ieee80211_local *local); void ieee80211_csa_finalize_work(struct work_struct *work); +int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_csa_settings *params); /* interface handling */ int ieee80211_iface_init(void); @@ -1465,8 +1502,6 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local); bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata); void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata); -int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, - struct cfg80211_beacon_data *params); static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata) { @@ -1539,6 +1574,9 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, struct sta_info *sta); enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta); void ieee80211_sta_set_rx_nss(struct sta_info *sta); +u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, u8 opmode, + enum ieee80211_band band, bool nss_only); void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, enum ieee80211_band band, bool nss_only); @@ -1591,7 +1629,7 @@ static inline int __ieee80211_resume(struct ieee80211_hw *hw) } /* utility functions/constants */ -extern void *mac80211_wiphy_privid; /* for wiphy privid */ +extern const void *const mac80211_wiphy_privid; /* for wiphy privid */ u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, enum nl80211_iftype type); int ieee80211_frame_duration(enum ieee80211_band band, size_t len, @@ -1678,14 +1716,8 @@ void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, void 
ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue); void ieee80211_add_pending_skb(struct ieee80211_local *local, struct sk_buff *skb); -void ieee80211_add_pending_skbs_fn(struct ieee80211_local *local, - struct sk_buff_head *skbs, - void (*fn)(void *data), void *data); -static inline void ieee80211_add_pending_skbs(struct ieee80211_local *local, - struct sk_buff_head *skbs) -{ - ieee80211_add_pending_skbs_fn(local, skbs, NULL, NULL); -} +void ieee80211_add_pending_skbs(struct ieee80211_local *local, + struct sk_buff_head *skbs); void ieee80211_flush_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); @@ -1754,6 +1786,16 @@ ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode mode); int __must_check +ieee80211_vif_reserve_chanctx(struct ieee80211_sub_if_data *sdata, + const struct cfg80211_chan_def *chandef, + enum ieee80211_chanctx_mode mode, + bool radar_required); +int __must_check +ieee80211_vif_use_reserved_context(struct ieee80211_sub_if_data *sdata, + u32 *changed); +int ieee80211_vif_unreserve_chanctx(struct ieee80211_sub_if_data *sdata); + +int __must_check ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, u32 *changed); @@ -1765,11 +1807,11 @@ void ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata); void ieee80211_vif_vlan_copy_chanctx(struct ieee80211_sub_if_data *sdata); void ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata, bool clear); +int ieee80211_chanctx_refcount(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx); void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *chanctx); -void ieee80211_recalc_radar_chanctx(struct ieee80211_local *local, - struct ieee80211_chanctx *chanctx); void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, struct ieee80211_chanctx *ctx); @@ -1788,6 +1830,22 @@ ieee80211_cs_get(struct ieee80211_local *local, u32 cipher, int ieee80211_cs_headroom(struct ieee80211_local *local, struct cfg80211_crypto_settings *crypto, enum nl80211_iftype iftype); +void ieee80211_recalc_dtim(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata); +int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata, + const struct cfg80211_chan_def *chandef, + enum ieee80211_chanctx_mode chanmode, + u8 radar_detect); +int ieee80211_max_num_channels(struct ieee80211_local *local); + +/* TDLS */ +int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev, + const u8 *peer, u8 action_code, u8 dialog_token, + u16 status_code, u32 peer_capability, + const u8 *extra_ies, size_t extra_ies_len); +int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, + const u8 *peer, enum nl80211_tdls_operation oper); + #ifdef CONFIG_MAC80211_NOINLINE #define debug_noinline noinline diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 7aa9f9dea9d..388b863e821 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -101,9 +101,8 @@ static u32 __ieee80211_idle_on(struct ieee80211_local *local) static u32 __ieee80211_recalc_idle(struct ieee80211_local *local, bool force_active) { - bool working = false, scanning, active; + bool working, scanning, active; unsigned int led_trig_start = 0, led_trig_stop = 0; - struct ieee80211_roc_work *roc; lockdep_assert_held(&local->mtx); @@ -111,12 +110,8 @@ static u32 __ieee80211_recalc_idle(struct ieee80211_local 
*local, !list_empty(&local->chanctx_list) || local->monitors; - if (!local->ops->remain_on_channel) { - list_for_each_entry(roc, &local->roc_list, list) { - working = true; - break; - } - } + working = !local->ops->remain_on_channel && + !list_empty(&local->roc_list); scanning = test_bit(SCAN_SW_SCANNING, &local->scanning) || test_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning); @@ -255,6 +250,7 @@ static int ieee80211_check_concurrent_iface(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *nsdata; + int ret; ASSERT_RTNL(); @@ -305,7 +301,10 @@ static int ieee80211_check_concurrent_iface(struct ieee80211_sub_if_data *sdata, } } - return 0; + mutex_lock(&local->chanctx_mtx); + ret = ieee80211_check_combinations(sdata, NULL, 0, 0); + mutex_unlock(&local->chanctx_mtx); + return ret; } static int ieee80211_check_queues(struct ieee80211_sub_if_data *sdata, @@ -400,6 +399,7 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) sdata->vif.type = NL80211_IFTYPE_MONITOR; snprintf(sdata->name, IFNAMSIZ, "%s-monitor", wiphy_name(local->hw.wiphy)); + sdata->wdev.iftype = NL80211_IFTYPE_MONITOR; sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM; @@ -418,18 +418,24 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) return ret; } + mutex_lock(&local->iflist_mtx); + rcu_assign_pointer(local->monitor_sdata, sdata); + mutex_unlock(&local->iflist_mtx); + + mutex_lock(&local->mtx); ret = ieee80211_vif_use_channel(sdata, &local->monitor_chandef, IEEE80211_CHANCTX_EXCLUSIVE); + mutex_unlock(&local->mtx); if (ret) { + mutex_lock(&local->iflist_mtx); + RCU_INIT_POINTER(local->monitor_sdata, NULL); + mutex_unlock(&local->iflist_mtx); + synchronize_net(); drv_remove_interface(local, sdata); kfree(sdata); return ret; } - mutex_lock(&local->iflist_mtx); - rcu_assign_pointer(local->monitor_sdata, sdata); - mutex_unlock(&local->iflist_mtx); - return 0; } @@ -451,12 +457,14 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local) return; } - rcu_assign_pointer(local->monitor_sdata, NULL); + RCU_INIT_POINTER(local->monitor_sdata, NULL); mutex_unlock(&local->iflist_mtx); synchronize_net(); + mutex_lock(&local->mtx); ieee80211_vif_release_channel(sdata); + mutex_unlock(&local->mtx); drv_remove_interface(local, sdata); @@ -489,7 +497,9 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) if (!sdata->bss) return -ENOLINK; + mutex_lock(&local->mtx); list_add(&sdata->u.vlan.list, &sdata->bss->vlans); + mutex_unlock(&local->mtx); master = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); @@ -719,8 +729,11 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) drv_stop(local); err_del_bss: sdata->bss = NULL; - if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { + mutex_lock(&local->mtx); list_del(&sdata->u.vlan.list); + mutex_unlock(&local->mtx); + } /* might already be clear but that doesn't matter */ clear_bit(SDATA_STATE_RUNNING, &sdata->state); return res; @@ -766,12 +779,19 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, ieee80211_roc_purge(local, sdata); - if (sdata->vif.type == NL80211_IFTYPE_STATION) + switch (sdata->vif.type) { + case NL80211_IFTYPE_STATION: ieee80211_mgd_stop(sdata); - - if (sdata->vif.type == NL80211_IFTYPE_ADHOC) + break; + case NL80211_IFTYPE_ADHOC: ieee80211_ibss_stop(sdata); - + break; + case NL80211_IFTYPE_AP: + cancel_work_sync(&sdata->u.ap.request_smps_work); + break; + default: + 
break; + } /* * Remove all stations associated with this interface. @@ -786,10 +806,8 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, * This is relevant only in WDS mode, in all other modes we've * already removed all stations when disconnecting or similar, * so warn otherwise. - * - * We call sta_info_flush_cleanup() later, to combine RCU waits. */ - flushed = sta_info_flush_defer(sdata); + flushed = sta_info_flush(sdata); WARN_ON_ONCE((sdata->vif.type != NL80211_IFTYPE_WDS && flushed > 0) || (sdata->vif.type == NL80211_IFTYPE_WDS && flushed != 1)); @@ -820,7 +838,16 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, cancel_work_sync(&local->dynamic_ps_enable_work); cancel_work_sync(&sdata->recalc_smps); + sdata_lock(sdata); + mutex_lock(&local->mtx); sdata->vif.csa_active = false; + if (!ieee80211_csa_needs_block_tx(local)) + ieee80211_wake_queues_by_reason(&local->hw, + IEEE80211_MAX_QUEUE_MAP, + IEEE80211_QUEUE_STOP_REASON_CSA); + mutex_unlock(&local->mtx); + sdata_unlock(sdata); + cancel_work_sync(&sdata->csa_finalize_work); cancel_delayed_work_sync(&sdata->dfs_cac_timer_work); @@ -828,9 +855,9 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, if (sdata->wdev.cac_started) { chandef = sdata->vif.bss_conf.chandef; WARN_ON(local->suspended); - mutex_lock(&local->iflist_mtx); + mutex_lock(&local->mtx); ieee80211_vif_release_channel(sdata); - mutex_unlock(&local->iflist_mtx); + mutex_unlock(&local->mtx); cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_ABORTED, GFP_KERNEL); @@ -865,8 +892,10 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: + mutex_lock(&local->mtx); list_del(&sdata->u.vlan.list); - rcu_assign_pointer(sdata->vif.chanctx_conf, NULL); + mutex_unlock(&local->mtx); + RCU_INIT_POINTER(sdata->vif.chanctx_conf, NULL); /* no need to tell driver */ break; case NL80211_IFTYPE_MONITOR: @@ -885,29 +914,21 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, break; case NL80211_IFTYPE_P2P_DEVICE: /* relies on synchronize_rcu() below */ - rcu_assign_pointer(local->p2p_sdata, NULL); + RCU_INIT_POINTER(local->p2p_sdata, NULL); /* fall through */ default: cancel_work_sync(&sdata->work); /* * When we get here, the interface is marked down. + * Free the remaining keys, if there are any + * (shouldn't be, except maybe in WDS mode?) * - * sta_info_flush_cleanup() requires rcu_barrier() - * first to wait for the station call_rcu() calls - * to complete, and we also need synchronize_rcu() - * to wait for the RX path in case it is using the - * interface and enqueuing frames at this very time on + * Force the key freeing to always synchronize_net() + * to wait for the RX path in case it is using this + * interface enqueuing frames * at this very time on * another CPU. */ - synchronize_rcu(); - rcu_barrier(); - sta_info_flush_cleanup(sdata); - - /* - * Free all remaining keys, there shouldn't be any, - * except maybe in WDS mode? - */ - ieee80211_free_keys(sdata); + ieee80211_free_keys(sdata, true); /* fall through */ case NL80211_IFTYPE_AP: @@ -1018,17 +1039,6 @@ static void ieee80211_set_multicast_list(struct net_device *dev) atomic_dec(&local->iff_promiscs); sdata->flags ^= IEEE80211_SDATA_PROMISC; } - - /* - * TODO: If somebody needs this on AP interfaces, - * it can be enabled easily but multicast - * addresses from VLANs need to be synced. 
- */ - if (sdata->vif.type != NL80211_IFTYPE_MONITOR && - sdata->vif.type != NL80211_IFTYPE_AP_VLAN && - sdata->vif.type != NL80211_IFTYPE_AP) - drv_set_multicast_list(local, sdata, &dev->mc); - spin_lock_bh(&local->filter_lock); __hw_addr_sync(&local->mc_list, &dev->mc, dev->addr_len); spin_unlock_bh(&local->filter_lock); @@ -1044,7 +1054,7 @@ static void ieee80211_teardown_sdata(struct ieee80211_sub_if_data *sdata) int i; /* free extra data */ - ieee80211_free_keys(sdata); + ieee80211_free_keys(sdata, false); ieee80211_debugfs_remove_netdev(sdata); @@ -1062,7 +1072,9 @@ static void ieee80211_uninit(struct net_device *dev) } static u16 ieee80211_netdev_select_queue(struct net_device *dev, - struct sk_buff *skb) + struct sk_buff *skb, + void *accel_priv, + select_queue_fallback_t fallback) { return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); } @@ -1079,7 +1091,9 @@ static const struct net_device_ops ieee80211_dataif_ops = { }; static u16 ieee80211_monitor_select_queue(struct net_device *dev, - struct sk_buff *skb) + struct sk_buff *skb, + void *accel_priv, + select_queue_fallback_t fallback) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; @@ -1272,6 +1286,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, sdata->control_port_protocol = cpu_to_be16(ETH_P_PAE); sdata->control_port_no_encrypt = false; sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM; + sdata->vif.bss_conf.idle = true; sdata->noack_map = 0; @@ -1285,6 +1300,8 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, INIT_WORK(&sdata->work, ieee80211_iface_work); INIT_WORK(&sdata->recalc_smps, ieee80211_recalc_smps_work); INIT_WORK(&sdata->csa_finalize_work, ieee80211_csa_finalize_work); + INIT_LIST_HEAD(&sdata->assigned_chanctx_list); + INIT_LIST_HEAD(&sdata->reserved_chanctx_list); switch (type) { case NL80211_IFTYPE_P2P_GO: @@ -1497,8 +1514,8 @@ static void ieee80211_assign_perm_addr(struct ieee80211_local *local, bool used = false; list_for_each_entry(sdata, &local->interfaces, list) { - if (memcmp(local->hw.wiphy->addresses[i].addr, - sdata->vif.addr, ETH_ALEN) == 0) { + if (ether_addr_equal(local->hw.wiphy->addresses[i].addr, + sdata->vif.addr)) { used = true; break; } @@ -1558,8 +1575,7 @@ static void ieee80211_assign_perm_addr(struct ieee80211_local *local, val += inc; list_for_each_entry(sdata, &local->interfaces, list) { - if (memcmp(tmp_addr, sdata->vif.addr, - ETH_ALEN) == 0) { + if (ether_addr_equal(tmp_addr, sdata->vif.addr)) { used = true; break; } @@ -1579,15 +1595,6 @@ static void ieee80211_assign_perm_addr(struct ieee80211_local *local, mutex_unlock(&local->iflist_mtx); } -static void ieee80211_cleanup_sdata_stas_wk(struct work_struct *wk) -{ - struct ieee80211_sub_if_data *sdata; - - sdata = container_of(wk, struct ieee80211_sub_if_data, cleanup_stations_wk); - - ieee80211_cleanup_sdata_stas(sdata); -} - int ieee80211_if_add(struct ieee80211_local *local, const char *name, struct wireless_dev **new_wdev, enum nl80211_iftype type, struct vif_params *params) @@ -1660,9 +1667,6 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, INIT_LIST_HEAD(&sdata->key_list); - spin_lock_init(&sdata->cleanup_stations_lock); - INIT_LIST_HEAD(&sdata->cleanup_stations); - INIT_WORK(&sdata->cleanup_stations_wk, ieee80211_cleanup_sdata_stas_wk); INIT_DELAYED_WORK(&sdata->dfs_cac_timer_work, ieee80211_dfs_cac_timer_work); INIT_DELAYED_WORK(&sdata->dec_tailroom_needed_wk, @@ -1776,7 
+1780,6 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local) } mutex_unlock(&local->iflist_mtx); unregister_netdevice_many(&unreg_list); - list_del(&unreg_list); list_for_each_entry_safe(sdata, tmp, &wdev_list, list) { list_del(&sdata->list); @@ -1792,20 +1795,19 @@ static int netdev_notify(struct notifier_block *nb, struct ieee80211_sub_if_data *sdata; if (state != NETDEV_CHANGENAME) - return 0; + return NOTIFY_DONE; if (!dev->ieee80211_ptr || !dev->ieee80211_ptr->wiphy) - return 0; + return NOTIFY_DONE; if (dev->ieee80211_ptr->wiphy->privid != mac80211_wiphy_privid) - return 0; + return NOTIFY_DONE; sdata = IEEE80211_DEV_TO_SUB_IF(dev); - memcpy(sdata->name, dev->name, IFNAMSIZ); - ieee80211_debugfs_rename_netdev(sdata); - return 0; + + return NOTIFY_OK; } static struct notifier_block mac80211_netdev_notifier = { diff --git a/net/mac80211/key.c b/net/mac80211/key.c index e568d98167d..16d97f044a2 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -325,7 +325,8 @@ ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, struct ieee80211_key *key; int i, j, err; - BUG_ON(idx < 0 || idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS); + if (WARN_ON(idx < 0 || idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)) + return ERR_PTR(-EINVAL); key = kzalloc(sizeof(struct ieee80211_key) + key_len, GFP_KERNEL); if (!key) @@ -481,8 +482,8 @@ int ieee80211_key_link(struct ieee80211_key *key, int idx, ret; bool pairwise; - BUG_ON(!sdata); - BUG_ON(!key); + if (WARN_ON(!sdata || !key)) + return -EINVAL; pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE; idx = key->conf.keyidx; @@ -589,14 +590,10 @@ void ieee80211_iter_keys(struct ieee80211_hw *hw, } EXPORT_SYMBOL(ieee80211_iter_keys); -void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata) +static void ieee80211_free_keys_iface(struct ieee80211_sub_if_data *sdata, + struct list_head *keys) { struct ieee80211_key *key, *tmp; - LIST_HEAD(keys); - - cancel_delayed_work_sync(&sdata->dec_tailroom_needed_wk); - - mutex_lock(&sdata->local->key_mtx); sdata->crypto_tx_tailroom_needed_cnt -= sdata->crypto_tx_tailroom_pending_dec; @@ -608,28 +605,51 @@ void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata) ieee80211_key_replace(key->sdata, key->sta, key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, key, NULL); - list_add_tail(&key->list, &keys); + list_add_tail(&key->list, keys); } ieee80211_debugfs_key_update_default(sdata); +} - if (!list_empty(&keys)) { - synchronize_net(); - list_for_each_entry_safe(key, tmp, &keys, list) - __ieee80211_key_destroy(key, false); +void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata, + bool force_synchronize) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_sub_if_data *vlan; + struct ieee80211_key *key, *tmp; + LIST_HEAD(keys); + + cancel_delayed_work_sync(&sdata->dec_tailroom_needed_wk); + + mutex_lock(&local->key_mtx); + + ieee80211_free_keys_iface(sdata, &keys); + + if (sdata->vif.type == NL80211_IFTYPE_AP) { + list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) + ieee80211_free_keys_iface(vlan, &keys); } + if (!list_empty(&keys) || force_synchronize) + synchronize_net(); + list_for_each_entry_safe(key, tmp, &keys, list) + __ieee80211_key_destroy(key, false); + WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt || sdata->crypto_tx_tailroom_pending_dec); + if (sdata->vif.type == NL80211_IFTYPE_AP) { + list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) + WARN_ON_ONCE(vlan->crypto_tx_tailroom_needed_cnt || + vlan->crypto_tx_tailroom_pending_dec); + } - 
mutex_unlock(&sdata->local->key_mtx); + mutex_unlock(&local->key_mtx); } void ieee80211_free_sta_keys(struct ieee80211_local *local, struct sta_info *sta) { - struct ieee80211_key *key, *tmp; - LIST_HEAD(keys); + struct ieee80211_key *key; int i; mutex_lock(&local->key_mtx); @@ -640,7 +660,7 @@ void ieee80211_free_sta_keys(struct ieee80211_local *local, ieee80211_key_replace(key->sdata, key->sta, key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, key, NULL); - list_add(&key->list, &keys); + __ieee80211_key_destroy(key, true); } for (i = 0; i < NUM_DEFAULT_KEYS; i++) { @@ -650,17 +670,8 @@ void ieee80211_free_sta_keys(struct ieee80211_local *local, ieee80211_key_replace(key->sdata, key->sta, key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, key, NULL); - list_add(&key->list, &keys); - } - - /* - * NB: the station code relies on this being - * done even if there aren't any keys - */ - synchronize_net(); - - list_for_each_entry_safe(key, tmp, &keys, list) __ieee80211_key_destroy(key, true); + } mutex_unlock(&local->key_mtx); } diff --git a/net/mac80211/key.h b/net/mac80211/key.h index 0aebb889cab..19db68663d7 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -136,7 +136,8 @@ void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx, bool uni, bool multi); void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, int idx); -void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata); +void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata, + bool force_synchronize); void ieee80211_free_sta_keys(struct ieee80211_local *local, struct sta_info *sta); void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index fa34cd2344b..d17c26d6e36 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -148,6 +148,8 @@ static u32 ieee80211_hw_conf_chan(struct ieee80211_local *local) list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!rcu_access_pointer(sdata->vif.chanctx_conf)) continue; + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + continue; power = min(power, sdata->vif.bss_conf.txpower); } rcu_read_unlock(); @@ -199,7 +201,7 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; - if (!changed) + if (!changed || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) return; drv_bss_info_changed(local, sdata, &sdata->vif.bss_conf, changed); @@ -250,12 +252,8 @@ static void ieee80211_restart_work(struct work_struct *work) /* wait for scan work complete */ flush_workqueue(local->workqueue); - mutex_lock(&local->mtx); - WARN(test_bit(SCAN_HW_SCANNING, &local->scanning) || - rcu_dereference_protected(local->sched_scan_sdata, - lockdep_is_held(&local->mtx)), - "%s called with hardware scan in progress\n", __func__); - mutex_unlock(&local->mtx); + WARN(test_bit(SCAN_HW_SCANNING, &local->scanning), + "%s called with hardware scan in progress\n", __func__); rtnl_lock(); ieee80211_scan_cancel(local); @@ -342,7 +340,7 @@ static int ieee80211_ifa_changed(struct notifier_block *nb, sdata_unlock(sdata); - return NOTIFY_DONE; + return NOTIFY_OK; } #endif @@ -373,7 +371,7 @@ static int ieee80211_ifa6_changed(struct notifier_block *nb, drv_ipv6_addr_change(local, sdata, idev); - return NOTIFY_DONE; + return NOTIFY_OK; } #endif @@ -448,7 +446,9 @@ static const struct ieee80211_ht_cap mac80211_ht_capa_mod_mask = { .cap_info = cpu_to_le16(IEEE80211_HT_CAP_SUP_WIDTH_20_40 | IEEE80211_HT_CAP_MAX_AMSDU | IEEE80211_HT_CAP_SGI_20 | - 
IEEE80211_HT_CAP_SGI_40), + IEEE80211_HT_CAP_SGI_40 | + IEEE80211_HT_CAP_LDPC_CODING | + IEEE80211_HT_CAP_40MHZ_INTOLERANT), .mcs = { .rx_mask = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, }, @@ -850,17 +850,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) /* TODO: consider VHT for RX chains, hopefully it's the same */ } - local->int_scan_req = kzalloc(sizeof(*local->int_scan_req) + - sizeof(void *) * channels, GFP_KERNEL); - if (!local->int_scan_req) - return -ENOMEM; - - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { - if (!local->hw.wiphy->bands[band]) - continue; - local->int_scan_req->rates[band] = (u32) -1; - } - /* if low-level driver supports AP, we also support VLAN */ if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_AP)) { hw->wiphy->interface_modes |= BIT(NL80211_IFTYPE_AP_VLAN); @@ -884,6 +873,17 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) return -EINVAL; } + local->int_scan_req = kzalloc(sizeof(*local->int_scan_req) + + sizeof(void *) * channels, GFP_KERNEL); + if (!local->int_scan_req) + return -ENOMEM; + + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + if (!local->hw.wiphy->bands[band]) + continue; + local->int_scan_req->rates[band] = (u32) -1; + } + #ifndef CONFIG_MAC80211_MESH /* mesh depends on Kconfig, but drivers should set it if they want */ local->hw.wiphy->interface_modes &= ~BIT(NL80211_IFTYPE_MESH_POINT); @@ -897,10 +897,15 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) /* mac80211 supports control port protocol changing */ local->hw.wiphy->flags |= WIPHY_FLAG_CONTROL_PORT_PROTOCOL; - if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) + if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) { local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM; - else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) + } else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) { local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_UNSPEC; + if (hw->max_signal <= 0) { + result = -EINVAL; + goto fail_wiphy_register; + } + } WARN((local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD) && (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK), @@ -951,6 +956,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (local->hw.wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS) local->hw.wiphy->flags |= WIPHY_FLAG_TDLS_EXTERNAL_SETUP; + local->hw.wiphy->max_num_csa_counters = IEEE80211_MAX_CSA_COUNTERS_NUM; + result = wiphy_register(local->hw.wiphy); if (result < 0) goto fail_wiphy_register; @@ -1075,6 +1082,18 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) } EXPORT_SYMBOL(ieee80211_register_hw); +void ieee80211_napi_add(struct ieee80211_hw *hw, struct napi_struct *napi, + struct net_device *napi_dev, + int (*poll)(struct napi_struct *, int), + int weight) +{ + struct ieee80211_local *local = hw_to_local(hw); + + netif_napi_add(napi_dev, napi, poll, weight); + local->napi = napi; +} +EXPORT_SYMBOL_GPL(ieee80211_napi_add); + void ieee80211_unregister_hw(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 89df62b2b68..6495a3f0428 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -259,6 +259,9 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata, *pos++ = WLAN_EID_MESH_CONFIG; *pos++ = meshconf_len; + /* save a pointer for quick updates in pre-tbtt */ + ifmsh->meshconf_offset = pos - skb->data; + /* Active path selection protocol ID */ *pos++ = ifmsh->mesh_pp_id; /* Active path selection metric ID */ @@ -363,20 +366,15 @@ int 
mesh_add_rsn_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) return 0; /* find RSN IE */ - data = ifmsh->ie; - while (data < ifmsh->ie + ifmsh->ie_len) { - if (*data == WLAN_EID_RSN) { - len = data[1] + 2; - break; - } - data++; - } + data = cfg80211_find_ie(WLAN_EID_RSN, ifmsh->ie, ifmsh->ie_len); + if (!data) + return 0; - if (len) { - if (skb_tailroom(skb) < len) - return -ENOMEM; - memcpy(skb_put(skb, len), data, len); - } + len = data[1] + 2; + + if (skb_tailroom(skb) < len) + return -ENOMEM; + memcpy(skb_put(skb, len), data, len); return 0; } @@ -681,11 +679,11 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) *pos++ = 0x0; *pos++ = ieee80211_frequency_to_channel( csa->settings.chandef.chan->center_freq); - sdata->csa_counter_offset_beacon = hdr_len + 6; + sdata->csa_counter_offset_beacon[0] = hdr_len + 6; *pos++ = csa->settings.count; *pos++ = WLAN_EID_CHAN_SWITCH_PARAM; *pos++ = 6; - if (ifmsh->chsw_init) { + if (ifmsh->csa_role == IEEE80211_MESH_CSA_ROLE_INIT) { *pos++ = ifmsh->mshcfg.dot11MeshTTL; *pos |= WLAN_EID_CHAN_SWITCH_PARAM_INITIATOR; } else { @@ -723,6 +721,8 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) bcn->tail_len = skb->len; memcpy(bcn->tail, skb->data, bcn->tail_len); + bcn->meshconf = (struct ieee80211_meshconf_ie *) + (bcn->tail + ifmsh->meshconf_offset); dev_kfree_skb(skb); rcu_assign_pointer(ifmsh->beacon, bcn); @@ -802,6 +802,7 @@ int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata) return -ENOMEM; } + ieee80211_recalc_dtim(local, sdata); ieee80211_bss_info_change_notify(sdata, changed); netif_carrier_on(sdata->dev); @@ -823,7 +824,7 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata) ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED); bcn = rcu_dereference_protected(ifmsh->beacon, lockdep_is_held(&sdata->wdev.mtx)); - rcu_assign_pointer(ifmsh->beacon, NULL); + RCU_INIT_POINTER(ifmsh->beacon, NULL); kfree_rcu(bcn, rcu_head); /* flush STAs and mpaths on this iface */ @@ -853,18 +854,12 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, { struct cfg80211_csa_settings params; struct ieee80211_csa_ie csa_ie; - struct ieee80211_chanctx_conf *chanctx_conf; - struct ieee80211_chanctx *chanctx; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; enum ieee80211_band band = ieee80211_get_sdata_band(sdata); - int err, num_chanctx; + int err; u32 sta_flags; - if (sdata->vif.csa_active) - return true; - - if (!ifmsh->mesh_id) - return false; + sdata_assert_lock(sdata); sta_flags = IEEE80211_STA_DISABLE_VHT; switch (sdata->vif.bss_conf.chandef.width) { @@ -890,10 +885,6 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, params.chandef = csa_ie.chandef; params.count = csa_ie.count; - if (sdata->vif.bss_conf.chandef.chan->band != - params.chandef.chan->band) - return false; - if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, &params.chandef, IEEE80211_CHAN_DISABLED)) { sdata_info(sdata, @@ -907,33 +898,22 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, } err = cfg80211_chandef_dfs_required(sdata->local->hw.wiphy, - &params.chandef); + &params.chandef, + NL80211_IFTYPE_MESH_POINT); if (err < 0) return false; - if (err) { - params.radar_required = true; + if (err > 0) /* TODO: DFS not (yet) supported */ return false; - } - - rcu_read_lock(); - chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); - if (!chanctx_conf) - goto failed_chswitch; - - /* don't handle for multi-VIF cases */ - chanctx = container_of(chanctx_conf, struct 
ieee80211_chanctx, conf); - if (chanctx->refcount > 1) - goto failed_chswitch; - num_chanctx = 0; - list_for_each_entry_rcu(chanctx, &sdata->local->chanctx_list, list) - num_chanctx++; + params.radar_required = err; - if (num_chanctx > 1) - goto failed_chswitch; - - rcu_read_unlock(); + if (cfg80211_chandef_identical(&params.chandef, + &sdata->vif.bss_conf.chandef)) { + mcsa_dbg(sdata, + "received csa with an identical chandef, ignoring\n"); + return true; + } mcsa_dbg(sdata, "received channel switch announcement to go to channel %d MHz\n", @@ -947,30 +927,16 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, ifmsh->pre_value = csa_ie.pre_value; } - if (ifmsh->chsw_ttl < ifmsh->mshcfg.dot11MeshTTL) { - if (ieee80211_mesh_csa_beacon(sdata, &params, false) < 0) - return false; - } else { + if (ifmsh->chsw_ttl >= ifmsh->mshcfg.dot11MeshTTL) return false; - } - sdata->csa_radar_required = params.radar_required; + ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_REPEATER; - if (params.block_tx) - ieee80211_stop_queues_by_reason(&sdata->local->hw, - IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_CSA); - - sdata->csa_chandef = params.chandef; - sdata->vif.csa_active = true; - - ieee80211_bss_info_change_notify(sdata, err); - drv_channel_switch_beacon(sdata, &params.chandef); + if (ieee80211_channel_switch(sdata->local->hw.wiphy, sdata->dev, + &params) < 0) + return false; return true; -failed_chswitch: - rcu_read_unlock(); - return false; } static void @@ -1080,7 +1046,8 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, ifmsh->sync_ops->rx_bcn_presp(sdata, stype, mgmt, &elems, rx_status); - if (!ifmsh->chsw_init) + if (ifmsh->csa_role != IEEE80211_MESH_CSA_ROLE_INIT && + !sdata->vif.csa_active) ieee80211_mesh_process_chnswitch(sdata, &elems, true); } @@ -1089,29 +1056,30 @@ int ieee80211_mesh_finish_csa(struct ieee80211_sub_if_data *sdata) struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct mesh_csa_settings *tmp_csa_settings; int ret = 0; + int changed = 0; /* Reset the TTL value and Initiator flag */ - ifmsh->chsw_init = false; + ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_NONE; ifmsh->chsw_ttl = 0; /* Remove the CSA and MCSP elements from the beacon */ tmp_csa_settings = rcu_dereference(ifmsh->csa); - rcu_assign_pointer(ifmsh->csa, NULL); - kfree_rcu(tmp_csa_settings, rcu_head); + RCU_INIT_POINTER(ifmsh->csa, NULL); + if (tmp_csa_settings) + kfree_rcu(tmp_csa_settings, rcu_head); ret = ieee80211_mesh_rebuild_beacon(sdata); if (ret) return -EINVAL; - ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON); + changed |= BSS_CHANGED_BEACON; mcsa_dbg(sdata, "complete switching to center freq %d MHz", sdata->vif.bss_conf.chandef.chan->center_freq); - return 0; + return changed; } int ieee80211_mesh_csa_beacon(struct ieee80211_sub_if_data *sdata, - struct cfg80211_csa_settings *csa_settings, - bool csa_action) + struct cfg80211_csa_settings *csa_settings) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct mesh_csa_settings *tmp_csa_settings; @@ -1130,17 +1098,12 @@ int ieee80211_mesh_csa_beacon(struct ieee80211_sub_if_data *sdata, ret = ieee80211_mesh_rebuild_beacon(sdata); if (ret) { tmp_csa_settings = rcu_dereference(ifmsh->csa); - rcu_assign_pointer(ifmsh->csa, NULL); + RCU_INIT_POINTER(ifmsh->csa, NULL); kfree_rcu(tmp_csa_settings, rcu_head); return ret; } - ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON); - - if (csa_action) - ieee80211_send_action_csa(sdata, csa_settings); - - return 0; + return BSS_CHANGED_BEACON; } static int 
mesh_fwd_csa_frame(struct ieee80211_sub_if_data *sdata, @@ -1204,7 +1167,8 @@ static void mesh_rx_csa_frame(struct ieee80211_sub_if_data *sdata, ifmsh->pre_value = pre_value; - if (!ieee80211_mesh_process_chnswitch(sdata, &elems, false)) { + if (!sdata->vif.csa_active && + !ieee80211_mesh_process_chnswitch(sdata, &elems, false)) { mcsa_dbg(sdata, "Failed to process CSA action frame"); return; } @@ -1251,7 +1215,7 @@ void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, sdata_lock(sdata); /* mesh already went down */ - if (!sdata->wdev.mesh_id_len) + if (!sdata->u.mesh.mesh_id_len) goto out; rx_status = IEEE80211_SKB_RXCB(skb); @@ -1304,7 +1268,7 @@ void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata) sdata_lock(sdata); /* mesh already went down */ - if (!sdata->wdev.mesh_id_len) + if (!sdata->u.mesh.mesh_id_len) goto out; if (ifmsh->preq_queue_len && @@ -1359,7 +1323,7 @@ void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata) mesh_rmc_init(sdata); ifmsh->last_preq = jiffies; ifmsh->next_perr = jiffies; - ifmsh->chsw_init = false; + ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_NONE; /* Allocate all mesh structures when creating the first mesh interface. */ if (!mesh_allocated) ieee80211s_init(); diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index f9514685d45..94758b9c9ed 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -37,7 +37,7 @@ static inline u32 u32_field_get(const u8 *preq_elem, int offset, bool ae) return get_unaligned_le32(preq_elem + offset); } -static inline u32 u16_field_get(const u8 *preq_elem, int offset, bool ae) +static inline u16 u16_field_get(const u8 *preq_elem, int offset, bool ae) { if (ae) offset += 6; @@ -544,9 +544,10 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, if (time_after(jiffies, ifmsh->last_sn_update + net_traversal_jiffies(sdata)) || time_before(jiffies, ifmsh->last_sn_update)) { - target_sn = ++ifmsh->sn; + ++ifmsh->sn; ifmsh->last_sn_update = jiffies; } + target_sn = ifmsh->sn; } else if (is_broadcast_ether_addr(target_addr) && (target_flags & IEEE80211_PREQ_TO_FLAG)) { rcu_read_lock(); diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 7d050ed6fe5..cf032a8db9d 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -287,8 +287,10 @@ static void mesh_path_move_to_queue(struct mesh_path *gate_mpath, struct sk_buff_head failq; unsigned long flags; - BUG_ON(gate_mpath == from_mpath); - BUG_ON(!gate_mpath->next_hop); + if (WARN_ON(gate_mpath == from_mpath)) + return; + if (WARN_ON(!gate_mpath->next_hop)) + return; __skb_queue_head_init(&failq); diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index cf83217103f..e8f60aa2e84 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -437,6 +437,7 @@ __mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *hw_addr) sta_info_pre_move_state(sta, IEEE80211_STA_AUTHORIZED); set_sta_flag(sta, WLAN_STA_WME); + sta->sta.wme = true; return sta; } diff --git a/net/mac80211/mesh_ps.c b/net/mac80211/mesh_ps.c index 2802f9d9279..ad8b377b4b9 100644 --- a/net/mac80211/mesh_ps.c +++ b/net/mac80211/mesh_ps.c @@ -36,6 +36,7 @@ static struct sk_buff *mps_qos_null_get(struct sta_info *sta) sdata->vif.addr); nullfunc->frame_control = fc; nullfunc->duration_id = 0; + nullfunc->seq_ctrl = 0; /* no address resolution for this frame -> set addr 1 immediately */ memcpy(nullfunc->addr1, sta->sta.addr, ETH_ALEN); memset(skb_put(skb, 2), 0, 2); /* 
append QoS control field */ diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c index d1cf2d55349..09625d6205c 100644 --- a/net/mac80211/mesh_sync.c +++ b/net/mac80211/mesh_sync.c @@ -164,12 +164,15 @@ no_sync: rcu_read_unlock(); } -static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata) +static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata, + struct beacon_data *beacon) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + u8 cap; WARN_ON(ifmsh->mesh_sp_id != IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET); - BUG_ON(!rcu_read_lock_held()); + WARN_ON(!rcu_read_lock_held()); + cap = beacon->meshconf->meshconf_cap; spin_lock_bh(&ifmsh->sync_offset_lock); @@ -194,6 +197,10 @@ static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata) ifmsh->adjusting_tbtt = false; } spin_unlock_bh(&ifmsh->sync_offset_lock); + + beacon->meshconf->meshconf_cap = ifmsh->adjusting_tbtt ? + IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING | cap : + ~IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING & cap; } static const struct sync_method sync_methods[] = { diff --git a/net/mac80211/michael.h b/net/mac80211/michael.h index 3b848dad958..0e4886f881f 100644 --- a/net/mac80211/michael.h +++ b/net/mac80211/michael.h @@ -11,6 +11,7 @@ #define MICHAEL_H #include <linux/types.h> +#include <linux/ieee80211.h> #define MICHAEL_MIC_LEN 8 diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 900ead344f5..3345401be1b 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -131,13 +131,13 @@ void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata) if (unlikely(!sdata->u.mgd.associated)) return; + ifmgd->probe_send_count = 0; + if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) return; mod_timer(&sdata->u.mgd.conn_mon_timer, round_jiffies_up(jiffies + IEEE80211_CONNECTION_IDLE_TIME)); - - ifmgd->probe_send_count = 0; } static int ecw2cw(int ecw) @@ -222,6 +222,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, switch (vht_oper->chan_width) { case IEEE80211_VHT_CHANWIDTH_USE_HT: vht_chandef.width = chandef->width; + vht_chandef.center_freq1 = chandef->center_freq1; break; case IEEE80211_VHT_CHANWIDTH_80MHZ: vht_chandef.width = NL80211_CHAN_WIDTH_80; @@ -271,6 +272,28 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, ret = 0; out: + /* + * When tracking the current AP, don't do any further checks if the + * new chandef is identical to the one we're currently using for the + * connection. This keeps us from playing ping-pong with regulatory, + * without it the following can happen (for example): + * - connect to an AP with 80 MHz, world regdom allows 80 MHz + * - AP advertises regdom US + * - CRDA loads regdom US with 80 MHz prohibited (old database) + * - the code below detects an unsupported channel, downgrades, and + * we disconnect from the AP in the caller + * - disconnect causes CRDA to reload world regdomain and the game + * starts anew. + * (see https://bugzilla.kernel.org/show_bug.cgi?id=70881) + * + * It seems possible that there are still scenarios with CSA or real + * bandwidth changes where a this could happen, but those cases are + * less common and wouldn't completely prevent using the AP. 
+ */ + if (tracking && + cfg80211_chandef_identical(chandef, &sdata->vif.bss_conf.chandef)) + return ret; + /* don't print the message below for VHT mismatch if VHT is disabled */ if (ret & IEEE80211_STA_DISABLE_VHT) vht_chandef = *chandef; @@ -508,6 +531,7 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, u8 *pos; u32 cap; struct ieee80211_sta_vht_cap vht_cap; + u32 mask, ap_bf_sts, our_bf_sts; BUILD_BUG_ON(sizeof(vht_cap) != sizeof(sband->vht_cap)); @@ -535,6 +559,16 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE))) cap &= ~IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE; + mask = IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK; + + ap_bf_sts = le32_to_cpu(ap_vht_cap->vht_cap_info) & mask; + our_bf_sts = cap & mask; + + if (ap_bf_sts < our_bf_sts) { + cap &= ~mask; + cap |= ap_bf_sts; + } + /* reserve and fill IE */ pos = skb_put(skb, sizeof(struct ieee80211_vht_cap) + 2); ieee80211_ie_build_vht_cap(pos, &vht_cap, cap); @@ -745,6 +779,34 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) ieee80211_add_ht_ie(sdata, skb, assoc_data->ap_ht_param, sband, chan, sdata->smps_mode); + /* if present, add any custom IEs that go before VHT */ + if (assoc_data->ie_len) { + static const u8 before_vht[] = { + WLAN_EID_SSID, + WLAN_EID_SUPP_RATES, + WLAN_EID_EXT_SUPP_RATES, + WLAN_EID_PWR_CAPABILITY, + WLAN_EID_SUPPORTED_CHANNELS, + WLAN_EID_RSN, + WLAN_EID_QOS_CAPA, + WLAN_EID_RRM_ENABLED_CAPABILITIES, + WLAN_EID_MOBILITY_DOMAIN, + WLAN_EID_SUPPORTED_REGULATORY_CLASSES, + WLAN_EID_HT_CAPABILITY, + WLAN_EID_BSS_COEX_2040, + WLAN_EID_EXT_CAPABILITY, + WLAN_EID_QOS_TRAFFIC_CAPA, + WLAN_EID_TIM_BCAST_REQ, + WLAN_EID_INTERWORKING, + }; + noffset = ieee80211_ie_split(assoc_data->ie, assoc_data->ie_len, + before_vht, ARRAY_SIZE(before_vht), + offset); + pos = skb_put(skb, noffset - offset); + memcpy(pos, assoc_data->ie + offset, noffset - offset); + offset = noffset; + } + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) ieee80211_add_vht_ie(sdata, skb, sband, &assoc_data->ap_vht_cap); @@ -888,7 +950,9 @@ static void ieee80211_chswitch_work(struct work_struct *work) if (!ifmgd->associated) goto out; + mutex_lock(&local->mtx); ret = ieee80211_vif_change_channel(sdata, &changed); + mutex_unlock(&local->mtx); if (ret) { sdata_info(sdata, "vif channel switch failed, disconnecting\n"); @@ -911,16 +975,23 @@ static void ieee80211_chswitch_work(struct work_struct *work) /* XXX: shouldn't really modify cfg80211-owned data! */ ifmgd->associated->channel = sdata->csa_chandef.chan; + ieee80211_bss_info_change_notify(sdata, changed); + + mutex_lock(&local->mtx); + sdata->vif.csa_active = false; /* XXX: wait for a beacon first? 
*/ - ieee80211_wake_queues_by_reason(&local->hw, + if (!ieee80211_csa_needs_block_tx(local)) + ieee80211_wake_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_CSA); + mutex_unlock(&local->mtx); - ieee80211_bss_info_change_notify(sdata, changed); - - out: - sdata->vif.csa_active = false; ifmgd->flags &= ~IEEE80211_STA_CSA_RECEIVED; + + ieee80211_sta_reset_beacon_monitor(sdata); + ieee80211_sta_reset_conn_monitor(sdata); + +out: sdata_unlock(sdata); } @@ -999,7 +1070,6 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, } ifmgd->flags |= IEEE80211_STA_CSA_RECEIVED; - sdata->vif.csa_active = true; mutex_lock(&local->chanctx_mtx); if (local->use_chanctx) { @@ -1026,7 +1096,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, } chanctx = container_of(rcu_access_pointer(sdata->vif.chanctx_conf), struct ieee80211_chanctx, conf); - if (chanctx->refcount > 1) { + if (ieee80211_chanctx_refcount(local, chanctx) > 1) { sdata_info(sdata, "channel switch with multiple interfaces on the same channel, disconnecting\n"); ieee80211_queue_work(&local->hw, @@ -1038,10 +1108,15 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, sdata->csa_chandef = csa_ie.chandef; - if (csa_ie.mode) + mutex_lock(&local->mtx); + sdata->vif.csa_active = true; + sdata->csa_block_tx = csa_ie.mode; + + if (sdata->csa_block_tx) ieee80211_stop_queues_by_reason(&local->hw, - IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_CSA); + IEEE80211_MAX_QUEUE_MAP, + IEEE80211_QUEUE_STOP_REASON_CSA); + mutex_unlock(&local->mtx); if (local->ops->channel_switch) { /* use driver's channel switch callback */ @@ -1401,10 +1476,14 @@ void ieee80211_dfs_cac_timer_work(struct work_struct *work) dfs_cac_timer_work); struct cfg80211_chan_def chandef = sdata->vif.bss_conf.chandef; - ieee80211_vif_release_channel(sdata); - cfg80211_cac_event(sdata->dev, &chandef, - NL80211_RADAR_CAC_FINISHED, - GFP_KERNEL); + mutex_lock(&sdata->local->mtx); + if (sdata->wdev.cac_started) { + ieee80211_vif_release_channel(sdata); + cfg80211_cac_event(sdata->dev, &chandef, + NL80211_RADAR_CAC_FINISHED, + GFP_KERNEL); + } + mutex_unlock(&sdata->local->mtx); } /* MLME */ @@ -1698,7 +1777,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, memset(ifmgd->bssid, 0, ETH_ALEN); /* remove AP and TDLS peers */ - sta_info_flush_defer(sdata); + sta_info_flush(sdata); /* finally reset all BSS / config parameters */ changed |= ieee80211_reset_erp_info(sdata); @@ -1747,8 +1826,16 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, ifmgd->have_beacon = false; ifmgd->flags = 0; + mutex_lock(&local->mtx); ieee80211_vif_release_channel(sdata); + sdata->vif.csa_active = false; + if (!ieee80211_csa_needs_block_tx(local)) + ieee80211_wake_queues_by_reason(&local->hw, + IEEE80211_MAX_QUEUE_MAP, + IEEE80211_QUEUE_STOP_REASON_CSA); + mutex_unlock(&local->mtx); + sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM; } @@ -1975,6 +2062,7 @@ EXPORT_SYMBOL(ieee80211_ap_probereq_get); static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; @@ -1988,10 +2076,14 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, true, frame_buf); ifmgd->flags &= ~IEEE80211_STA_CSA_RECEIVED; + + mutex_lock(&local->mtx); sdata->vif.csa_active = false; - 
ieee80211_wake_queues_by_reason(&sdata->local->hw, + if (!ieee80211_csa_needs_block_tx(local)) + ieee80211_wake_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_CSA); + mutex_unlock(&local->mtx); cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf, IEEE80211_DEAUTH_FRAME_LEN); @@ -2070,7 +2162,9 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata, memset(sdata->u.mgd.bssid, 0, ETH_ALEN); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); sdata->u.mgd.flags = 0; + mutex_lock(&sdata->local->mtx); ieee80211_vif_release_channel(sdata); + mutex_unlock(&sdata->local->mtx); } cfg80211_put_bss(sdata->local->hw.wiphy, auth_data->bss); @@ -2200,6 +2294,62 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, /* ignore frame -- wait for timeout */ } +#define case_WLAN(type) \ + case WLAN_REASON_##type: return #type + +static const char *ieee80211_get_reason_code_string(u16 reason_code) +{ + switch (reason_code) { + case_WLAN(UNSPECIFIED); + case_WLAN(PREV_AUTH_NOT_VALID); + case_WLAN(DEAUTH_LEAVING); + case_WLAN(DISASSOC_DUE_TO_INACTIVITY); + case_WLAN(DISASSOC_AP_BUSY); + case_WLAN(CLASS2_FRAME_FROM_NONAUTH_STA); + case_WLAN(CLASS3_FRAME_FROM_NONASSOC_STA); + case_WLAN(DISASSOC_STA_HAS_LEFT); + case_WLAN(STA_REQ_ASSOC_WITHOUT_AUTH); + case_WLAN(DISASSOC_BAD_POWER); + case_WLAN(DISASSOC_BAD_SUPP_CHAN); + case_WLAN(INVALID_IE); + case_WLAN(MIC_FAILURE); + case_WLAN(4WAY_HANDSHAKE_TIMEOUT); + case_WLAN(GROUP_KEY_HANDSHAKE_TIMEOUT); + case_WLAN(IE_DIFFERENT); + case_WLAN(INVALID_GROUP_CIPHER); + case_WLAN(INVALID_PAIRWISE_CIPHER); + case_WLAN(INVALID_AKMP); + case_WLAN(UNSUPP_RSN_VERSION); + case_WLAN(INVALID_RSN_IE_CAP); + case_WLAN(IEEE8021X_FAILED); + case_WLAN(CIPHER_SUITE_REJECTED); + case_WLAN(DISASSOC_UNSPECIFIED_QOS); + case_WLAN(DISASSOC_QAP_NO_BANDWIDTH); + case_WLAN(DISASSOC_LOW_ACK); + case_WLAN(DISASSOC_QAP_EXCEED_TXOP); + case_WLAN(QSTA_LEAVE_QBSS); + case_WLAN(QSTA_NOT_USE); + case_WLAN(QSTA_REQUIRE_SETUP); + case_WLAN(QSTA_TIMEOUT); + case_WLAN(QSTA_CIPHER_NOT_SUPP); + case_WLAN(MESH_PEER_CANCELED); + case_WLAN(MESH_MAX_PEERS); + case_WLAN(MESH_CONFIG); + case_WLAN(MESH_CLOSE); + case_WLAN(MESH_MAX_RETRIES); + case_WLAN(MESH_CONFIRM_TIMEOUT); + case_WLAN(MESH_INVALID_GTK); + case_WLAN(MESH_INCONSISTENT_PARAM); + case_WLAN(MESH_INVALID_SECURITY); + case_WLAN(MESH_PATH_ERROR); + case_WLAN(MESH_PATH_NOFORWARD); + case_WLAN(MESH_PATH_DEST_UNREACHABLE); + case_WLAN(MAC_EXISTS_IN_MBSS); + case_WLAN(MESH_CHAN_REGULATORY); + case_WLAN(MESH_CHAN); + default: return "<unknown>"; + } +} static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) @@ -2221,8 +2371,8 @@ static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata, reason_code = le16_to_cpu(mgmt->u.deauth.reason_code); - sdata_info(sdata, "deauthenticated from %pM (Reason: %u)\n", - bssid, reason_code); + sdata_info(sdata, "deauthenticated from %pM (Reason: %u=%s)\n", + bssid, reason_code, ieee80211_get_reason_code_string(reason_code)); ieee80211_set_disassoc(sdata, 0, 0, false, NULL); @@ -2319,7 +2469,9 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, memset(sdata->u.mgd.bssid, 0, ETH_ALEN); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); sdata->u.mgd.flags = 0; + mutex_lock(&sdata->local->mtx); ieee80211_vif_release_channel(sdata); + mutex_unlock(&sdata->local->mtx); } kfree(assoc_data); @@ -2653,28 +2805,20 @@ static void 
ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems) { struct ieee80211_local *local = sdata->local; - int freq; struct ieee80211_bss *bss; struct ieee80211_channel *channel; sdata_assert_lock(sdata); - if (elems->ds_params) - freq = ieee80211_channel_to_frequency(elems->ds_params[0], - rx_status->band); - else - freq = rx_status->freq; - - channel = ieee80211_get_channel(local->hw.wiphy, freq); - - if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) + channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq); + if (!channel) return; bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems, channel); if (bss) { - ieee80211_rx_bss_put(local, bss); sdata->vif.bss_conf.beacon_rate = bss->beacon_rate; + ieee80211_rx_bss_put(local, bss); } } @@ -3424,6 +3568,9 @@ static void ieee80211_sta_bcn_mon_timer(unsigned long data) if (local->quiescing) return; + if (sdata->vif.csa_active) + return; + sdata->u.mgd.connection_loss = false; ieee80211_queue_work(&sdata->local->hw, &sdata->u.mgd.beacon_connection_loss_work); @@ -3439,6 +3586,9 @@ static void ieee80211_sta_conn_mon_timer(unsigned long data) if (local->quiescing) return; + if (sdata->vif.csa_active) + return; + ieee80211_queue_work(&local->hw, &ifmgd->monitor_work); } @@ -3469,6 +3619,38 @@ static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata) } #ifdef CONFIG_PM +void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; + + sdata_lock(sdata); + + if (ifmgd->auth_data || ifmgd->assoc_data) { + const u8 *bssid = ifmgd->auth_data ? + ifmgd->auth_data->bss->bssid : + ifmgd->assoc_data->bss->bssid; + + /* + * If we are trying to authenticate / associate while suspending, + * cfg80211 won't know and won't actually abort those attempts, + * thus we need to do that ourselves. + */ + ieee80211_send_deauth_disassoc(sdata, bssid, + IEEE80211_STYPE_DEAUTH, + WLAN_REASON_DEAUTH_LEAVING, + false, frame_buf); + if (ifmgd->assoc_data) + ieee80211_destroy_assoc_data(sdata, false); + if (ifmgd->auth_data) + ieee80211_destroy_auth_data(sdata, false); + cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf, + IEEE80211_DEAUTH_FRAME_LEN); + } + + sdata_unlock(sdata); +} + void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; @@ -3553,7 +3735,7 @@ int ieee80211_max_network_latency(struct notifier_block *nb, ieee80211_recalc_ps(local, latency_usec); mutex_unlock(&local->iflist_mtx); - return 0; + return NOTIFY_OK; } static u8 ieee80211_ht_vht_rx_chains(struct ieee80211_sub_if_data *sdata, @@ -3670,6 +3852,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, /* will change later if needed */ sdata->smps_mode = IEEE80211_SMPS_OFF; + mutex_lock(&local->mtx); /* * If this fails (possibly due to channel context sharing * on incompatible channels, e.g. 80+80 and 160 sharing the @@ -3681,13 +3864,15 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, /* don't downgrade for 5 and 10 MHz channels, though. 
*/ if (chandef.width == NL80211_CHAN_WIDTH_5 || chandef.width == NL80211_CHAN_WIDTH_10) - return ret; + goto out; while (ret && chandef.width != NL80211_CHAN_WIDTH_20_NOHT) { ifmgd->flags |= ieee80211_chandef_downgrade(&chandef); ret = ieee80211_vif_use_channel(sdata, &chandef, IEEE80211_CHANCTX_SHARED); } + out: + mutex_unlock(&local->mtx); return ret; } @@ -3738,6 +3923,7 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (WARN_ON(!chanctx_conf)) { rcu_read_unlock(); + sta_info_free(local, new_sta); return -EINVAL; } rate_flags = ieee80211_chandef_rate_flags(&chanctx_conf->def); @@ -4283,37 +4469,41 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; bool tx = !req->local_state_change; - bool report_frame = false; - sdata_info(sdata, - "deauthenticating from %pM by local choice (reason=%d)\n", - req->bssid, req->reason_code); + if (ifmgd->auth_data && + ether_addr_equal(ifmgd->auth_data->bss->bssid, req->bssid)) { + sdata_info(sdata, + "aborting authentication with %pM by local choice (Reason: %u=%s)\n", + req->bssid, req->reason_code, + ieee80211_get_reason_code_string(req->reason_code)); - if (ifmgd->auth_data) { drv_mgd_prepare_tx(sdata->local, sdata); ieee80211_send_deauth_disassoc(sdata, req->bssid, IEEE80211_STYPE_DEAUTH, req->reason_code, tx, frame_buf); ieee80211_destroy_auth_data(sdata, false); + cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf, + IEEE80211_DEAUTH_FRAME_LEN); - report_frame = true; - goto out; + return 0; } if (ifmgd->associated && ether_addr_equal(ifmgd->associated->bssid, req->bssid)) { + sdata_info(sdata, + "deauthenticating from %pM by local choice (Reason: %u=%s)\n", + req->bssid, req->reason_code, + ieee80211_get_reason_code_string(req->reason_code)); + ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, req->reason_code, tx, frame_buf); - report_frame = true; - } - - out: - if (report_frame) cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf, IEEE80211_DEAUTH_FRAME_LEN); + return 0; + } - return 0; + return -ENOTCONN; } int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata, @@ -4333,8 +4523,8 @@ int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata, return -ENOLINK; sdata_info(sdata, - "disassociating from %pM by local choice (reason=%d)\n", - req->bss->bssid, req->reason_code); + "disassociating from %pM by local choice (Reason: %u=%s)\n", + req->bss->bssid, req->reason_code, ieee80211_get_reason_code_string(req->reason_code)); memcpy(bssid, req->bss->bssid, ETH_ALEN); ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DISASSOC, diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 0c2a29484c0..7a17decd27f 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -333,7 +333,7 @@ void ieee80211_sw_roc_work(struct work_struct *work) container_of(work, struct ieee80211_roc_work, work.work); struct ieee80211_sub_if_data *sdata = roc->sdata; struct ieee80211_local *local = sdata->local; - bool started; + bool started, on_channel; mutex_lock(&local->mtx); @@ -354,13 +354,26 @@ void ieee80211_sw_roc_work(struct work_struct *work) if (!roc->started) { struct ieee80211_roc_work *dep; - /* start this ROC */ + WARN_ON(local->use_chanctx); + + /* If actually operating on the desired channel (with at least + * 20 MHz channel width) don't stop all the operations but still + * treat it as though the ROC operation started properly, so + * 
other ROC operations won't interfere with this one. + */ + roc->on_channel = roc->chan == local->_oper_chandef.chan && + local->_oper_chandef.width != NL80211_CHAN_WIDTH_5 && + local->_oper_chandef.width != NL80211_CHAN_WIDTH_10; - /* switch channel etc */ + /* start this ROC */ ieee80211_recalc_idle(local); - local->tmp_channel = roc->chan; - ieee80211_hw_config(local, 0); + if (!roc->on_channel) { + ieee80211_offchannel_stop_vifs(local); + + local->tmp_channel = roc->chan; + ieee80211_hw_config(local, 0); + } /* tell userspace or send frame */ ieee80211_handle_roc_started(roc); @@ -379,9 +392,10 @@ void ieee80211_sw_roc_work(struct work_struct *work) finish: list_del(&roc->list); started = roc->started; + on_channel = roc->on_channel; ieee80211_roc_notify_destroy(roc, !roc->abort); - if (started) { + if (started && !on_channel) { ieee80211_flush_queues(local, NULL); local->tmp_channel = NULL; diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index 34012620434..d478b880a0a 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -37,9 +37,8 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_SUSPEND); - /* flush out all packets and station cleanup call_rcu()s */ + /* flush out all packets */ synchronize_net(); - rcu_barrier(); ieee80211_flush_queues(local, NULL); @@ -101,10 +100,18 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) /* remove all interfaces that were created in the driver */ list_for_each_entry(sdata, &local->interfaces, list) { - if (!ieee80211_sdata_running(sdata) || - sdata->vif.type == NL80211_IFTYPE_AP_VLAN || - sdata->vif.type == NL80211_IFTYPE_MONITOR) + if (!ieee80211_sdata_running(sdata)) continue; + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_MONITOR: + continue; + case NL80211_IFTYPE_STATION: + ieee80211_mgd_quiesce(sdata); + break; + default: + break; + } drv_remove_interface(local, sdata); } diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 22b223f13c9..8fdadfd94ba 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -10,15 +10,15 @@ #include <linux/kernel.h> #include <linux/rtnetlink.h> -#include <linux/slab.h> #include <linux/module.h> +#include <linux/slab.h> #include "rate.h" #include "ieee80211_i.h" #include "debugfs.h" struct rate_control_alg { struct list_head list; - struct rate_control_ops *ops; + const struct rate_control_ops *ops; }; static LIST_HEAD(rate_ctrl_algs); @@ -29,7 +29,7 @@ module_param(ieee80211_default_rc_algo, charp, 0644); MODULE_PARM_DESC(ieee80211_default_rc_algo, "Default rate control algorithm for mac80211 to use"); -int ieee80211_rate_control_register(struct rate_control_ops *ops) +int ieee80211_rate_control_register(const struct rate_control_ops *ops) { struct rate_control_alg *alg; @@ -60,7 +60,7 @@ int ieee80211_rate_control_register(struct rate_control_ops *ops) } EXPORT_SYMBOL(ieee80211_rate_control_register); -void ieee80211_rate_control_unregister(struct rate_control_ops *ops) +void ieee80211_rate_control_unregister(const struct rate_control_ops *ops) { struct rate_control_alg *alg; @@ -76,32 +76,31 @@ void ieee80211_rate_control_unregister(struct rate_control_ops *ops) } EXPORT_SYMBOL(ieee80211_rate_control_unregister); -static struct rate_control_ops * +static const struct rate_control_ops * ieee80211_try_rate_control_ops_get(const char *name) { struct rate_control_alg *alg; - struct rate_control_ops *ops = NULL; + const struct rate_control_ops 
*ops = NULL; if (!name) return NULL; mutex_lock(&rate_ctrl_mutex); list_for_each_entry(alg, &rate_ctrl_algs, list) { - if (!strcmp(alg->ops->name, name)) - if (try_module_get(alg->ops->module)) { - ops = alg->ops; - break; - } + if (!strcmp(alg->ops->name, name)) { + ops = alg->ops; + break; + } } mutex_unlock(&rate_ctrl_mutex); return ops; } /* Get the rate control algorithm. */ -static struct rate_control_ops * +static const struct rate_control_ops * ieee80211_rate_control_ops_get(const char *name) { - struct rate_control_ops *ops; + const struct rate_control_ops *ops; const char *alg_name; kparam_block_sysfs_write(ieee80211_default_rc_algo); @@ -111,10 +110,6 @@ ieee80211_rate_control_ops_get(const char *name) alg_name = name; ops = ieee80211_try_rate_control_ops_get(alg_name); - if (!ops) { - request_module("rc80211_%s", alg_name); - ops = ieee80211_try_rate_control_ops_get(alg_name); - } if (!ops && name) /* try default if specific alg requested but not found */ ops = ieee80211_try_rate_control_ops_get(ieee80211_default_rc_algo); @@ -127,11 +122,6 @@ ieee80211_rate_control_ops_get(const char *name) return ops; } -static void ieee80211_rate_control_ops_put(struct rate_control_ops *ops) -{ - module_put(ops->module); -} - #ifdef CONFIG_MAC80211_DEBUGFS static ssize_t rcname_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) @@ -158,11 +148,11 @@ static struct rate_control_ref *rate_control_alloc(const char *name, ref = kmalloc(sizeof(struct rate_control_ref), GFP_KERNEL); if (!ref) - goto fail_ref; + return NULL; ref->local = local; ref->ops = ieee80211_rate_control_ops_get(name); if (!ref->ops) - goto fail_ops; + goto free; #ifdef CONFIG_MAC80211_DEBUGFS debugfsdir = debugfs_create_dir("rc", local->hw.wiphy->debugfsdir); @@ -172,14 +162,11 @@ static struct rate_control_ref *rate_control_alloc(const char *name, ref->priv = ref->ops->alloc(&local->hw, debugfsdir); if (!ref->priv) - goto fail_priv; + goto free; return ref; -fail_priv: - ieee80211_rate_control_ops_put(ref->ops); -fail_ops: +free: kfree(ref); -fail_ref: return NULL; } @@ -192,7 +179,6 @@ static void rate_control_free(struct rate_control_ref *ctrl_ref) ctrl_ref->local->debugfs.rcdir = NULL; #endif - ieee80211_rate_control_ops_put(ctrl_ref->ops); kfree(ctrl_ref); } diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index b95e16c0708..9aa2a1190a8 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -21,7 +21,7 @@ struct rate_control_ref { struct ieee80211_local *local; - struct rate_control_ops *ops; + const struct rate_control_ops *ops; void *priv; }; diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index d2f19f7e709..1c1469c36dc 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -135,7 +135,7 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) u32 usecs; int i; - for (i=0; i < MAX_THR_RATES; i++) + for (i = 0; i < MAX_THR_RATES; i++) tmp_tp_rate[i] = 0; for (i = 0; i < mi->n_rates; i++) { @@ -190,7 +190,7 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) * choose the maximum throughput rate as max_prob_rate * (2) if all success probabilities < 95%, the rate with * highest success probability is choosen as max_prob_rate */ - if (mr->probability >= MINSTREL_FRAC(95,100)) { + if (mr->probability >= MINSTREL_FRAC(95, 100)) { if (mr->cur_tp >= mi->r[tmp_prob_rate].cur_tp) tmp_prob_rate = i; } else { @@ -220,7 +220,7 @@ minstrel_update_stats(struct minstrel_priv *mp, struct 
minstrel_sta_info *mi) static void minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband, - struct ieee80211_sta *sta, void *priv_sta, + struct ieee80211_sta *sta, void *priv_sta, struct sk_buff *skb) { struct minstrel_priv *mp = priv; @@ -260,7 +260,7 @@ minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband, static inline unsigned int minstrel_get_retry_count(struct minstrel_rate *mr, - struct ieee80211_tx_info *info) + struct ieee80211_tx_info *info) { unsigned int retry = mr->adjusted_retry_count; @@ -657,7 +657,18 @@ minstrel_free(void *priv) kfree(priv); } -struct rate_control_ops mac80211_minstrel = { +static u32 minstrel_get_expected_throughput(void *priv_sta) +{ + struct minstrel_sta_info *mi = priv_sta; + int idx = mi->max_tp_rate[0]; + + /* convert pkt per sec in kbps (1200 is the average pkt size used for + * computing cur_tp + */ + return MINSTREL_TRUNC(mi->r[idx].cur_tp) * 1200 * 8 / 1024; +} + +const struct rate_control_ops mac80211_minstrel = { .name = "minstrel", .tx_status = minstrel_tx_status, .get_rate = minstrel_get_rate, @@ -670,6 +681,7 @@ struct rate_control_ops mac80211_minstrel = { .add_sta_debugfs = minstrel_add_sta_debugfs, .remove_sta_debugfs = minstrel_remove_sta_debugfs, #endif + .get_expected_throughput = minstrel_get_expected_throughput, }; int __init diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h index f4301f4b2e4..046d1bd598a 100644 --- a/net/mac80211/rc80211_minstrel.h +++ b/net/mac80211/rc80211_minstrel.h @@ -123,7 +123,7 @@ struct minstrel_debugfs_info { char buf[]; }; -extern struct rate_control_ops mac80211_minstrel; +extern const struct rate_control_ops mac80211_minstrel; void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); void minstrel_remove_sta_debugfs(void *priv, void *priv_sta); diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index d2ed18d82fe..85c1e74b771 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -22,7 +22,7 @@ #define MCS_NBITS (AVG_PKT_SIZE << 3) /* Number of symbols for a packet with (bps) bits per symbol */ -#define MCS_NSYMS(bps) ((MCS_NBITS + (bps) - 1) / (bps)) +#define MCS_NSYMS(bps) DIV_ROUND_UP(MCS_NBITS, (bps)) /* Transmission time (nanoseconds) for a packet containing (syms) symbols */ #define MCS_SYMBOL_TIME(sgi, syms) \ @@ -63,7 +63,7 @@ #define CCK_DURATION(_bitrate, _short, _len) \ (1000 * (10 /* SIFS */ + \ - (_short ? 72 + 24 : 144 + 48 ) + \ + (_short ? 
72 + 24 : 144 + 48) + \ (8 * (_len + 4) * 10) / (_bitrate))) #define CCK_ACK_DURATION(_bitrate, _short) \ @@ -124,7 +124,7 @@ const struct mcs_group minstrel_mcs_groups[] = { #define MINSTREL_CCK_GROUP (ARRAY_SIZE(minstrel_mcs_groups) - 1) -static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES]; +static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES] __read_mostly; static void minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi); @@ -226,8 +226,9 @@ minstrel_ht_calc_tp(struct minstrel_ht_sta *mi, int group, int rate) nsecs = 1000 * mi->overhead / MINSTREL_TRUNC(mi->avg_ampdu_len); nsecs += minstrel_mcs_groups[group].duration[rate]; - tp = 1000000 * ((prob * 1000) / nsecs); + /* prob is scaled - see MINSTREL_FRAC above */ + tp = 1000000 * ((prob * 1000) / nsecs); mr->cur_tp = MINSTREL_TRUNC(tp); } @@ -1031,7 +1032,23 @@ minstrel_ht_free(void *priv) mac80211_minstrel.free(priv); } -static struct rate_control_ops mac80211_minstrel_ht = { +static u32 minstrel_ht_get_expected_throughput(void *priv_sta) +{ + struct minstrel_ht_sta_priv *msp = priv_sta; + struct minstrel_ht_sta *mi = &msp->ht; + int i, j; + + if (!msp->is_ht) + return mac80211_minstrel.get_expected_throughput(priv_sta); + + i = mi->max_tp_rate / MCS_GROUP_RATES; + j = mi->max_tp_rate % MCS_GROUP_RATES; + + /* convert cur_tp from pkt per second in kbps */ + return mi->groups[i].rates[j].cur_tp * AVG_PKT_SIZE * 8 / 1024; +} + +static const struct rate_control_ops mac80211_minstrel_ht = { .name = "minstrel_ht", .tx_status = minstrel_ht_tx_status, .get_rate = minstrel_ht_get_rate, @@ -1045,11 +1062,11 @@ static struct rate_control_ops mac80211_minstrel_ht = { .add_sta_debugfs = minstrel_ht_add_sta_debugfs, .remove_sta_debugfs = minstrel_ht_remove_sta_debugfs, #endif + .get_expected_throughput = minstrel_ht_get_expected_throughput, }; -static void -init_sample_table(void) +static void __init init_sample_table(void) { int col, i, new_idx; u8 rnd[MCS_GROUP_RATES]; diff --git a/net/mac80211/rc80211_pid_algo.c b/net/mac80211/rc80211_pid_algo.c index 958fad07b54..d0da2a70fe6 100644 --- a/net/mac80211/rc80211_pid_algo.c +++ b/net/mac80211/rc80211_pid_algo.c @@ -452,7 +452,7 @@ static void rate_control_pid_free_sta(void *priv, struct ieee80211_sta *sta, kfree(priv_sta); } -static struct rate_control_ops mac80211_rcpid = { +static const struct rate_control_ops mac80211_rcpid = { .name = "pid", .tx_status = rate_control_pid_tx_status, .get_rate = rate_control_pid_get_rate, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 2dfa7552273..394e201cde6 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -40,8 +40,6 @@ static struct sk_buff *remove_monitor_info(struct ieee80211_local *local, struct sk_buff *skb) { - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); - if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS) { if (likely(skb->len > FCS_LEN)) __pskb_trim(skb, skb->len - FCS_LEN); @@ -53,31 +51,28 @@ static struct sk_buff *remove_monitor_info(struct ieee80211_local *local, } } - if (status->vendor_radiotap_len) - __pskb_pull(skb, status->vendor_radiotap_len); - return skb; } -static inline int should_drop_frame(struct sk_buff *skb, int present_fcs_len) +static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len) { struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); - struct ieee80211_hdr *hdr; - - hdr = (void *)(skb->data + status->vendor_radiotap_len); + struct ieee80211_hdr *hdr = (void *)skb->data; if (status->flag & (RX_FLAG_FAILED_FCS_CRC | 
RX_FLAG_FAILED_PLCP_CRC | RX_FLAG_AMPDU_IS_ZEROLEN)) - return 1; - if (unlikely(skb->len < 16 + present_fcs_len + - status->vendor_radiotap_len)) - return 1; + return true; + + if (unlikely(skb->len < 16 + present_fcs_len)) + return true; + if (ieee80211_is_ctl(hdr->frame_control) && !ieee80211_is_pspoll(hdr->frame_control) && !ieee80211_is_back_req(hdr->frame_control)) - return 1; - return 0; + return true; + + return false; } static int @@ -90,8 +85,6 @@ ieee80211_rx_radiotap_space(struct ieee80211_local *local, len = sizeof(struct ieee80211_radiotap_header) + 8; /* allocate extra bitmaps */ - if (status->vendor_radiotap_len) - len += 4; if (status->chains) len += 4 * hweight8(status->chains); @@ -127,18 +120,6 @@ ieee80211_rx_radiotap_space(struct ieee80211_local *local, len += 2 * hweight8(status->chains); } - if (status->vendor_radiotap_len) { - if (WARN_ON_ONCE(status->vendor_radiotap_align == 0)) - status->vendor_radiotap_align = 1; - /* align standard part of vendor namespace */ - len = ALIGN(len, 2); - /* allocate standard part of vendor namespace */ - len += 6; - /* align vendor-defined part */ - len = ALIGN(len, status->vendor_radiotap_align); - /* vendor-defined part is already in skb */ - } - return len; } @@ -172,7 +153,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, it_present = &rthdr->it_present; /* radiotap header, set always present flags */ - rthdr->it_len = cpu_to_le16(rtap_len + status->vendor_radiotap_len); + rthdr->it_len = cpu_to_le16(rtap_len); it_present_val = BIT(IEEE80211_RADIOTAP_FLAGS) | BIT(IEEE80211_RADIOTAP_CHANNEL) | BIT(IEEE80211_RADIOTAP_RX_FLAGS); @@ -190,14 +171,6 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, BIT(IEEE80211_RADIOTAP_DBM_ANTSIGNAL); } - if (status->vendor_radiotap_len) { - it_present_val |= BIT(IEEE80211_RADIOTAP_VENDOR_NAMESPACE) | - BIT(IEEE80211_RADIOTAP_EXT); - put_unaligned_le32(it_present_val, it_present); - it_present++; - it_present_val = status->vendor_radiotap_bitmap; - } - put_unaligned_le32(it_present_val, it_present); pos = (void *)(it_present + 1); @@ -307,6 +280,8 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, *pos |= IEEE80211_RADIOTAP_MCS_BW_40; if (status->flag & RX_FLAG_HT_GF) *pos |= IEEE80211_RADIOTAP_MCS_FMT_GF; + if (status->flag & RX_FLAG_LDPC) + *pos |= IEEE80211_RADIOTAP_MCS_FEC_LDPC; stbc = (status->flag & RX_FLAG_STBC_MASK) >> RX_FLAG_STBC_SHIFT; *pos |= stbc << IEEE80211_RADIOTAP_MCS_STBC_SHIFT; pos++; @@ -349,20 +324,25 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_VHT); /* known field - how to handle 80+80? 
*/ - if (status->flag & RX_FLAG_80P80MHZ) + if (status->vht_flag & RX_VHT_FLAG_80P80MHZ) known &= ~IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH; put_unaligned_le16(known, pos); pos += 2; /* flags */ if (status->flag & RX_FLAG_SHORT_GI) *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI; + /* in VHT, STBC is binary */ + if (status->flag & RX_FLAG_STBC_MASK) + *pos |= IEEE80211_RADIOTAP_VHT_FLAG_STBC; + if (status->vht_flag & RX_VHT_FLAG_BF) + *pos |= IEEE80211_RADIOTAP_VHT_FLAG_BEAMFORMED; pos++; /* bandwidth */ - if (status->flag & RX_FLAG_80MHZ) + if (status->vht_flag & RX_VHT_FLAG_80MHZ) *pos++ = 4; - else if (status->flag & RX_FLAG_80P80MHZ) + else if (status->vht_flag & RX_VHT_FLAG_80P80MHZ) *pos++ = 0; /* marked not known above */ - else if (status->flag & RX_FLAG_160MHZ) + else if (status->vht_flag & RX_VHT_FLAG_160MHZ) *pos++ = 11; else if (status->flag & RX_FLAG_40MHZ) *pos++ = 1; @@ -372,6 +352,8 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, *pos = (status->rate_idx << 4) | status->vht_nss; pos += 4; /* coding field */ + if (status->flag & RX_FLAG_LDPC) + *pos |= IEEE80211_RADIOTAP_CODING_LDPC_USER0; pos++; /* group ID */ pos++; @@ -383,21 +365,6 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, *pos++ = status->chain_signal[chain]; *pos++ = chain; } - - if (status->vendor_radiotap_len) { - /* ensure 2 byte alignment for the vendor field as required */ - if ((pos - (u8 *)rthdr) & 1) - *pos++ = 0; - *pos++ = status->vendor_radiotap_oui[0]; - *pos++ = status->vendor_radiotap_oui[1]; - *pos++ = status->vendor_radiotap_oui[2]; - *pos++ = status->vendor_radiotap_subns; - put_unaligned_le16(status->vendor_radiotap_len, pos); - pos += 2; - /* align the actual payload as requested */ - while ((pos - (u8 *)rthdr) & (status->vendor_radiotap_align - 1)) - *pos++ = 0; - } } /* @@ -428,8 +395,8 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS) present_fcs_len = FCS_LEN; - /* ensure hdr->frame_control and vendor radiotap data are in skb head */ - if (!pskb_may_pull(origskb, 2 + status->vendor_radiotap_len)) { + /* ensure hdr->frame_control is in skb head */ + if (!pskb_may_pull(origskb, 2)) { dev_kfree_skb(origskb); return NULL; } @@ -599,10 +566,10 @@ static int ieee80211_is_unicast_robust_mgmt_frame(struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - if (skb->len < 24 || is_multicast_ether_addr(hdr->addr1)) + if (is_multicast_ether_addr(hdr->addr1)) return 0; - return ieee80211_is_robust_mgmt_frame(hdr); + return ieee80211_is_robust_mgmt_frame(skb); } @@ -610,10 +577,10 @@ static int ieee80211_is_multicast_robust_mgmt_frame(struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - if (skb->len < 24 || !is_multicast_ether_addr(hdr->addr1)) + if (!is_multicast_ether_addr(hdr->addr1)) return 0; - return ieee80211_is_robust_mgmt_frame(hdr); + return ieee80211_is_robust_mgmt_frame(skb); } @@ -626,7 +593,7 @@ static int ieee80211_get_mmie_keyidx(struct sk_buff *skb) if (skb->len < 24 + sizeof(*mmie) || !is_multicast_ether_addr(hdr->da)) return -1; - if (!ieee80211_is_robust_mgmt_frame((struct ieee80211_hdr *) hdr)) + if (!ieee80211_is_robust_mgmt_frame(skb)) return -1; /* not a robust management frame */ mmie = (struct ieee80211_mmie *) @@ -1128,6 +1095,13 @@ static void sta_ps_end(struct sta_info *sta) sta->sta.addr, sta->sta.aid); if (test_sta_flag(sta, WLAN_STA_PS_DRIVER)) { + /* + * Clear the flag only if the other one is 
still set + * so that the TX path won't start TX'ing new frames + * directly ... In the case that the driver flag isn't + * set ieee80211_sta_ps_deliver_wakeup() will clear it. + */ + clear_sta_flag(sta, WLAN_STA_PS_STA); ps_dbg(sta->sdata, "STA %pM aid %d driver-ps-blocked\n", sta->sta.addr, sta->sta.aid); return; @@ -1258,9 +1232,11 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) if (ether_addr_equal(bssid, rx->sdata->u.ibss.bssid) && test_sta_flag(sta, WLAN_STA_AUTHORIZED)) { sta->last_rx = jiffies; - if (ieee80211_is_data(hdr->frame_control)) { + if (ieee80211_is_data(hdr->frame_control) && + !is_multicast_ether_addr(hdr->addr1)) { sta->last_rx_rate_idx = status->rate_idx; sta->last_rx_rate_flag = status->flag; + sta->last_rx_rate_vht_flag = status->vht_flag; sta->last_rx_rate_vht_nss = status->vht_nss; } } @@ -1273,6 +1249,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) if (ieee80211_is_data(hdr->frame_control)) { sta->last_rx_rate_idx = status->rate_idx; sta->last_rx_rate_flag = status->flag; + sta->last_rx_rate_vht_flag = status->vht_flag; sta->last_rx_rate_vht_nss = status->vht_nss; } } @@ -1311,18 +1288,15 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) !ieee80211_has_morefrags(hdr->frame_control) && !(status->rx_flags & IEEE80211_RX_DEFERRED_RELEASE) && (rx->sdata->vif.type == NL80211_IFTYPE_AP || - rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN)) { + rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) && + /* PM bit is only checked in frames where it isn't reserved, + * in AP mode it's reserved in non-bufferable management frames + * (cf. IEEE 802.11-2012 8.2.4.1.7 Power Management field) + */ + (!ieee80211_is_mgmt(hdr->frame_control) || + ieee80211_is_bufferable_mmpdu(hdr->frame_control))) { if (test_sta_flag(sta, WLAN_STA_PS_STA)) { - /* - * Ignore doze->wake transitions that are - * indicated by non-data frames, the standard - * is unclear here, but for example going to - * PS mode and then scanning would cause a - * doze->wake transition for the probe request, - * and that is clearly undesirable. - */ - if (ieee80211_is_data(hdr->frame_control) && - !ieee80211_has_pm(hdr->frame_control)) + if (!ieee80211_has_pm(hdr->frame_control)) sta_ps_end(sta); } else { if (ieee80211_has_pm(hdr->frame_control)) @@ -1845,8 +1819,7 @@ static int ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx) * having configured keys. */ if (unlikely(ieee80211_is_action(fc) && !rx->key && - ieee80211_is_robust_mgmt_frame( - (struct ieee80211_hdr *) rx->skb->data))) + ieee80211_is_robust_mgmt_frame(rx->skb))) return -EACCES; } @@ -1963,20 +1936,17 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) } } - if (skb) { - int align __maybe_unused; - #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - /* - * 'align' will only take the values 0 or 2 here - * since all frames are required to be aligned - * to 2-byte boundaries when being passed to - * mac80211; the code here works just as well if - * that isn't true, but mac80211 assumes it can - * access fields as 2-byte aligned (e.g. for - * compare_ether_addr) + if (skb) { + /* 'align' will only take the values 0 or 2 here since all + * frames are required to be aligned to 2-byte boundaries + * when being passed to mac80211; the code here works just + * as well if that isn't true, but mac80211 assumes it can + * access fields as 2-byte aligned (e.g. 
for ether_addr_equal) */ - align = ((unsigned long)(skb->data + sizeof(struct ethhdr))) & 3; + int align; + + align = (unsigned long)(skb->data + sizeof(struct ethhdr)) & 3; if (align) { if (WARN_ON(skb_headroom(skb) < 3)) { dev_kfree_skb(skb); @@ -1989,14 +1959,17 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) skb_set_tail_pointer(skb, len); } } + } #endif - if (skb) { - /* deliver to local stack */ - skb->protocol = eth_type_trans(skb, dev); - memset(skb->cb, 0, sizeof(skb->cb)); + if (skb) { + /* deliver to local stack */ + skb->protocol = eth_type_trans(skb, dev); + memset(skb->cb, 0, sizeof(skb->cb)); + if (rx->local->napi) + napi_gro_receive(rx->local->napi, skb); + else netif_receive_skb(skb); - } } if (xmit_skb) { @@ -3079,8 +3052,8 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) /* main receive path */ -static int prepare_for_handlers(struct ieee80211_rx_data *rx, - struct ieee80211_hdr *hdr) +static bool prepare_for_handlers(struct ieee80211_rx_data *rx, + struct ieee80211_hdr *hdr) { struct ieee80211_sub_if_data *sdata = rx->sdata; struct sk_buff *skb = rx->skb; @@ -3091,29 +3064,29 @@ static int prepare_for_handlers(struct ieee80211_rx_data *rx, switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: if (!bssid && !sdata->u.mgd.use_4addr) - return 0; + return false; if (!multicast && !ether_addr_equal(sdata->vif.addr, hdr->addr1)) { if (!(sdata->dev->flags & IFF_PROMISC) || sdata->u.mgd.use_4addr) - return 0; + return false; status->rx_flags &= ~IEEE80211_RX_RA_MATCH; } break; case NL80211_IFTYPE_ADHOC: if (!bssid) - return 0; + return false; if (ether_addr_equal(sdata->vif.addr, hdr->addr2) || ether_addr_equal(sdata->u.ibss.bssid, hdr->addr2)) - return 0; + return false; if (ieee80211_is_beacon(hdr->frame_control)) { - return 1; + return true; } else if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) { - return 0; + return false; } else if (!multicast && !ether_addr_equal(sdata->vif.addr, hdr->addr1)) { if (!(sdata->dev->flags & IFF_PROMISC)) - return 0; + return false; status->rx_flags &= ~IEEE80211_RX_RA_MATCH; } else if (!rx->sta) { int rate_idx; @@ -3129,7 +3102,7 @@ static int prepare_for_handlers(struct ieee80211_rx_data *rx, if (!multicast && !ether_addr_equal(sdata->vif.addr, hdr->addr1)) { if (!(sdata->dev->flags & IFF_PROMISC)) - return 0; + return false; status->rx_flags &= ~IEEE80211_RX_RA_MATCH; } @@ -3138,7 +3111,7 @@ static int prepare_for_handlers(struct ieee80211_rx_data *rx, case NL80211_IFTYPE_AP: if (!bssid) { if (!ether_addr_equal(sdata->vif.addr, hdr->addr1)) - return 0; + return false; } else if (!ieee80211_bssid_match(bssid, sdata->vif.addr)) { /* * Accept public action frames even when the @@ -3148,26 +3121,26 @@ static int prepare_for_handlers(struct ieee80211_rx_data *rx, */ if (!multicast && !ether_addr_equal(sdata->vif.addr, hdr->addr1)) - return 0; + return false; if (ieee80211_is_public_action(hdr, skb->len)) - return 1; + return true; if (!ieee80211_is_beacon(hdr->frame_control)) - return 0; + return false; status->rx_flags &= ~IEEE80211_RX_RA_MATCH; } break; case NL80211_IFTYPE_WDS: if (bssid || !ieee80211_is_data(hdr->frame_control)) - return 0; + return false; if (!ether_addr_equal(sdata->u.wds.remote_addr, hdr->addr2)) - return 0; + return false; break; case NL80211_IFTYPE_P2P_DEVICE: if (!ieee80211_is_public_action(hdr, skb->len) && !ieee80211_is_probe_req(hdr->frame_control) && !ieee80211_is_probe_resp(hdr->frame_control) && !ieee80211_is_beacon(hdr->frame_control)) - return 0; + return false; if 
(!ether_addr_equal(sdata->vif.addr, hdr->addr1) && !multicast) status->rx_flags &= ~IEEE80211_RX_RA_MATCH; @@ -3178,7 +3151,7 @@ static int prepare_for_handlers(struct ieee80211_rx_data *rx, break; } - return 1; + return true; } /* @@ -3194,13 +3167,11 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, struct ieee80211_sub_if_data *sdata = rx->sdata; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_hdr *hdr = (void *)skb->data; - int prepares; rx->skb = skb; status->rx_flags |= IEEE80211_RX_RA_MATCH; - prepares = prepare_for_handlers(rx, hdr); - if (!prepares) + if (!prepare_for_handlers(rx, hdr)) return false; if (!consume) { @@ -3221,7 +3192,7 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, } /* - * This is the actual Rx frames handler. as it blongs to Rx path it must + * This is the actual Rx frames handler. as it belongs to Rx path it must * be called with rcu_read_lock protection. */ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 4d73c46df86..f40661eb75b 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -271,10 +271,11 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) return true; } -static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted, - bool was_hw_scan) +static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) { struct ieee80211_local *local = hw_to_local(hw); + bool hw_scan = local->ops->hw_scan; + bool was_scanning = local->scanning; lockdep_assert_held(&local->mtx); @@ -290,7 +291,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted, if (WARN_ON(!local->scan_req)) return; - if (was_hw_scan && !aborted && ieee80211_prep_hw_scan(local)) { + if (hw_scan && !aborted && ieee80211_prep_hw_scan(local)) { int rc; rc = drv_hw_scan(local, @@ -308,7 +309,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted, if (local->scan_req != local->int_scan_req) cfg80211_scan_done(local->scan_req, aborted); local->scan_req = NULL; - rcu_assign_pointer(local->scan_sdata, NULL); + RCU_INIT_POINTER(local->scan_sdata, NULL); local->scanning = 0; local->scan_chandef.chan = NULL; @@ -316,7 +317,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted, /* Set power back to normal operating levels. 
*/ ieee80211_hw_config(local, 0); - if (!was_hw_scan) { + if (!hw_scan) { ieee80211_configure_filter(local); drv_sw_scan_complete(local); ieee80211_offchannel_return(local); @@ -327,7 +328,8 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted, ieee80211_mlme_notify_scan_completed(local); ieee80211_ibss_notify_scan_completed(local); ieee80211_mesh_notify_scan_completed(local); - ieee80211_start_next_roc(local); + if (was_scanning) + ieee80211_start_next_roc(local); } void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) @@ -470,9 +472,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, if (local->ops->hw_scan) { u8 *ies; - local->hw_scan_ies_bufsize = 2 + IEEE80211_MAX_SSID_LEN + - local->scan_ies_len + - req->ie_len; + local->hw_scan_ies_bufsize = local->scan_ies_len + req->ie_len; local->hw_scan_req = kmalloc( sizeof(*local->hw_scan_req) + req->n_channels * sizeof(req->channels[0]) + @@ -559,7 +559,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, ieee80211_recalc_idle(local); local->scan_req = NULL; - rcu_assign_pointer(local->scan_sdata, NULL); + RCU_INIT_POINTER(local->scan_sdata, NULL); } return rc; @@ -747,7 +747,7 @@ void ieee80211_scan_work(struct work_struct *work) container_of(work, struct ieee80211_local, scan_work.work); struct ieee80211_sub_if_data *sdata; unsigned long next_delay = 0; - bool aborted, hw_scan; + bool aborted; mutex_lock(&local->mtx); @@ -773,7 +773,7 @@ void ieee80211_scan_work(struct work_struct *work) int rc; local->scan_req = NULL; - rcu_assign_pointer(local->scan_sdata, NULL); + RCU_INIT_POINTER(local->scan_sdata, NULL); rc = __ieee80211_start_scan(sdata, req); if (rc) { @@ -786,14 +786,6 @@ void ieee80211_scan_work(struct work_struct *work) } /* - * Avoid re-scheduling when the sdata is going away. 
- */ - if (!ieee80211_sdata_running(sdata)) { - aborted = true; - goto out_complete; - } - - /* * as long as no delay is required advance immediately * without scheduling a new work */ @@ -834,8 +826,7 @@ void ieee80211_scan_work(struct work_struct *work) goto out; out_complete: - hw_scan = test_bit(SCAN_HW_SCANNING, &local->scanning); - __ieee80211_scan_completed(&local->hw, aborted, hw_scan); + __ieee80211_scan_completed(&local->hw, aborted); out: mutex_unlock(&local->mtx); } @@ -973,33 +964,25 @@ void ieee80211_scan_cancel(struct ieee80211_local *local) */ cancel_delayed_work(&local->scan_work); /* and clean up */ - __ieee80211_scan_completed(&local->hw, true, false); + __ieee80211_scan_completed(&local->hw, true); out: mutex_unlock(&local->mtx); } -int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, - struct cfg80211_sched_scan_request *req) +int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, + struct cfg80211_sched_scan_request *req) { struct ieee80211_local *local = sdata->local; struct ieee80211_sched_scan_ies sched_scan_ies = {}; struct cfg80211_chan_def chandef; int ret, i, iebufsz; - iebufsz = 2 + IEEE80211_MAX_SSID_LEN + - local->scan_ies_len + req->ie_len; + iebufsz = local->scan_ies_len + req->ie_len; - mutex_lock(&local->mtx); + lockdep_assert_held(&local->mtx); - if (rcu_access_pointer(local->sched_scan_sdata)) { - ret = -EBUSY; - goto out; - } - - if (!local->ops->sched_scan_start) { - ret = -ENOTSUPP; - goto out; - } + if (!local->ops->sched_scan_start) + return -ENOTSUPP; for (i = 0; i < IEEE80211_NUM_BANDS; i++) { if (!local->hw.wiphy->bands[i]) @@ -1020,13 +1003,39 @@ int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, } ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies); - if (ret == 0) + if (ret == 0) { rcu_assign_pointer(local->sched_scan_sdata, sdata); + local->sched_scan_req = req; + } out_free: while (i > 0) kfree(sched_scan_ies.ie[--i]); -out: + + if (ret) { + /* Clean in case of failure after HW restart or upon resume. */ + RCU_INIT_POINTER(local->sched_scan_sdata, NULL); + local->sched_scan_req = NULL; + } + + return ret; +} + +int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, + struct cfg80211_sched_scan_request *req) +{ + struct ieee80211_local *local = sdata->local; + int ret; + + mutex_lock(&local->mtx); + + if (rcu_access_pointer(local->sched_scan_sdata)) { + mutex_unlock(&local->mtx); + return -EBUSY; + } + + ret = __ieee80211_request_sched_scan_start(sdata, req); + mutex_unlock(&local->mtx); return ret; } @@ -1043,9 +1052,14 @@ int ieee80211_request_sched_scan_stop(struct ieee80211_sub_if_data *sdata) goto out; } - if (rcu_access_pointer(local->sched_scan_sdata)) - drv_sched_scan_stop(local, sdata); + /* We don't want to restart sched scan anymore. 
*/ + local->sched_scan_req = NULL; + if (rcu_access_pointer(local->sched_scan_sdata)) { + ret = drv_sched_scan_stop(local, sdata); + if (!ret) + rcu_assign_pointer(local->sched_scan_sdata, NULL); + } out: mutex_unlock(&local->mtx); @@ -1062,12 +1076,8 @@ void ieee80211_sched_scan_results(struct ieee80211_hw *hw) } EXPORT_SYMBOL(ieee80211_sched_scan_results); -void ieee80211_sched_scan_stopped_work(struct work_struct *work) +void ieee80211_sched_scan_end(struct ieee80211_local *local) { - struct ieee80211_local *local = - container_of(work, struct ieee80211_local, - sched_scan_stopped_work); - mutex_lock(&local->mtx); if (!rcu_access_pointer(local->sched_scan_sdata)) { @@ -1075,13 +1085,25 @@ void ieee80211_sched_scan_stopped_work(struct work_struct *work) return; } - rcu_assign_pointer(local->sched_scan_sdata, NULL); + RCU_INIT_POINTER(local->sched_scan_sdata, NULL); + + /* If sched scan was aborted by the driver. */ + local->sched_scan_req = NULL; mutex_unlock(&local->mtx); cfg80211_sched_scan_stopped(local->hw.wiphy); } +void ieee80211_sched_scan_stopped_work(struct work_struct *work) +{ + struct ieee80211_local *local = + container_of(work, struct ieee80211_local, + sched_scan_stopped_work); + + ieee80211_sched_scan_end(local); +} + void ieee80211_sched_scan_stopped(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 8ed97f76c3c..a9b46d8ea22 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -91,7 +91,7 @@ static int sta_info_hash_del(struct ieee80211_local *local, return -ENOENT; } -static void cleanup_single_sta(struct sta_info *sta) +static void __cleanup_single_sta(struct sta_info *sta) { int ac, i; struct tid_ampdu_tx *tid_tx; @@ -99,24 +99,8 @@ static void cleanup_single_sta(struct sta_info *sta) struct ieee80211_local *local = sdata->local; struct ps_data *ps; - /* - * At this point, when being called as call_rcu callback, - * neither mac80211 nor the driver can reference this - * sta struct any more except by still existing timers - * associated with this station that we clean up below. - * - * Note though that this still uses the sdata and even - * calls the driver in AP and mesh mode, so interfaces - * of those types mush use call sta_info_flush_cleanup() - * (typically via sta_info_flush()) before deconfiguring - * the driver. - * - * In station mode, nothing happens here so it doesn't - * have to (and doesn't) do that, this is intentional to - * speed up roaming. 
- */ - - if (test_sta_flag(sta, WLAN_STA_PS_STA)) { + if (test_sta_flag(sta, WLAN_STA_PS_STA) || + test_sta_flag(sta, WLAN_STA_PS_DRIVER)) { if (sta->sdata->vif.type == NL80211_IFTYPE_AP || sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) ps = &sdata->bss->ps; @@ -126,6 +110,7 @@ static void cleanup_single_sta(struct sta_info *sta) return; clear_sta_flag(sta, WLAN_STA_PS_STA); + clear_sta_flag(sta, WLAN_STA_PS_DRIVER); atomic_dec(&ps->num_sta_ps); sta_info_recalc_tim(sta); @@ -156,39 +141,15 @@ static void cleanup_single_sta(struct sta_info *sta) ieee80211_purge_tx_queue(&local->hw, &tid_tx->pending); kfree(tid_tx); } - - sta_info_free(local, sta); } -void ieee80211_cleanup_sdata_stas(struct ieee80211_sub_if_data *sdata) -{ - struct sta_info *sta; - - spin_lock_bh(&sdata->cleanup_stations_lock); - while (!list_empty(&sdata->cleanup_stations)) { - sta = list_first_entry(&sdata->cleanup_stations, - struct sta_info, list); - list_del(&sta->list); - spin_unlock_bh(&sdata->cleanup_stations_lock); - - cleanup_single_sta(sta); - - spin_lock_bh(&sdata->cleanup_stations_lock); - } - - spin_unlock_bh(&sdata->cleanup_stations_lock); -} - -static void free_sta_rcu(struct rcu_head *h) +static void cleanup_single_sta(struct sta_info *sta) { - struct sta_info *sta = container_of(h, struct sta_info, rcu_head); struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_local *local = sdata->local; - spin_lock(&sdata->cleanup_stations_lock); - list_add_tail(&sta->list, &sdata->cleanup_stations); - spin_unlock(&sdata->cleanup_stations_lock); - - ieee80211_queue_work(&sdata->local->hw, &sdata->cleanup_stations_wk); + __cleanup_single_sta(sta); + sta_info_free(local, sta); } /* protected by RCU */ @@ -279,6 +240,7 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) sta_dbg(sta->sdata, "Destroyed STA %pM\n", sta->sta.addr); + kfree(rcu_dereference_raw(sta->sta.rates)); kfree(sta); } @@ -348,7 +310,37 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, if (!sta) return NULL; + rcu_read_lock(); + tx_latency = rcu_dereference(local->tx_latency); + /* init stations Tx latency statistics && TID bins */ + if (tx_latency) { + sta->tx_lat = kzalloc(IEEE80211_NUM_TIDS * + sizeof(struct ieee80211_tx_latency_stat), + GFP_ATOMIC); + if (!sta->tx_lat) { + rcu_read_unlock(); + goto free; + } + + if (tx_latency->n_ranges) { + for (i = 0; i < IEEE80211_NUM_TIDS; i++) { + /* size of bins is size of the ranges +1 */ + sta->tx_lat[i].bin_count = + tx_latency->n_ranges + 1; + sta->tx_lat[i].bins = + kcalloc(sta->tx_lat[i].bin_count, + sizeof(u32), GFP_ATOMIC); + if (!sta->tx_lat[i].bins) { + rcu_read_unlock(); + goto free; + } + } + } + } + rcu_read_unlock(); + spin_lock_init(&sta->lock); + spin_lock_init(&sta->ps_lock); INIT_WORK(&sta->drv_unblock_wk, sta_unblock); INIT_WORK(&sta->ampdu_mlme.work, ieee80211_ba_session_work); mutex_init(&sta->ampdu_mlme.mtx); @@ -372,10 +364,8 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, for (i = 0; i < ARRAY_SIZE(sta->chain_signal_avg); i++) ewma_init(&sta->chain_signal_avg[i], 1024, 8); - if (sta_prepare_rate_control(local, sta, gfp)) { - kfree(sta); - return NULL; - } + if (sta_prepare_rate_control(local, sta, gfp)) + goto free; for (i = 0; i < IEEE80211_NUM_TIDS; i++) { /* @@ -419,34 +409,17 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, } } - rcu_read_lock(); - - tx_latency = rcu_dereference(local->tx_latency); - /* init stations Tx latency statistics && TID bins */ - if (tx_latency) - 
sta->tx_lat = kzalloc(IEEE80211_NUM_TIDS * - sizeof(struct ieee80211_tx_latency_stat), - GFP_ATOMIC); - - /* - * if Tx latency and bins are enabled and the previous allocation - * succeeded - */ - if (tx_latency && tx_latency->n_ranges && sta->tx_lat) - for (i = 0; i < IEEE80211_NUM_TIDS; i++) { - /* size of bins is size of the ranges +1 */ - sta->tx_lat[i].bin_count = - tx_latency->n_ranges + 1; - sta->tx_lat[i].bins = kcalloc(sta->tx_lat[i].bin_count, - sizeof(u32), - GFP_ATOMIC); - } - - rcu_read_unlock(); - sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr); - return sta; + +free: + if (sta->tx_lat) { + for (i = 0; i < IEEE80211_NUM_TIDS; i++) + kfree(sta->tx_lat[i].bins); + kfree(sta->tx_lat); + } + kfree(sta); + return NULL; } static int sta_info_insert_check(struct sta_info *sta) @@ -525,21 +498,26 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) goto out_err; } - /* notify driver */ - err = sta_info_insert_drv_state(local, sdata, sta); - if (err) - goto out_err; - local->num_sta++; local->sta_generation++; smp_mb(); + /* simplify things and don't accept BA sessions yet */ + set_sta_flag(sta, WLAN_STA_BLOCK_BA); + /* make the station visible */ sta_info_hash_add(local, sta); list_add_rcu(&sta->list, &local->sta_list); + /* notify driver */ + err = sta_info_insert_drv_state(local, sdata, sta); + if (err) + goto out_remove; + set_sta_flag(sta, WLAN_STA_INSERTED); + /* accept BA sessions now */ + clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_recalc_min_chandef(sdata); ieee80211_sta_debugfs_add(sta); @@ -560,6 +538,12 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) mesh_accept_plinks_update(sdata); return 0; + out_remove: + sta_info_hash_del(local, sta); + list_del_rcu(&sta->list); + local->num_sta--; + synchronize_net(); + __cleanup_single_sta(sta); out_err: mutex_unlock(&local->sta_mtx); rcu_read_lock(); @@ -569,7 +553,7 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU) { struct ieee80211_local *local = sta->local; - int err = 0; + int err; might_sleep(); @@ -587,7 +571,6 @@ int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU) return 0; out_free: - BUG_ON(!err); sta_info_free(local, sta); return err; } @@ -842,7 +825,7 @@ static bool sta_info_cleanup_expire_buffered(struct ieee80211_local *local, return have_buffered; } -int __must_check __sta_info_destroy(struct sta_info *sta) +static int __must_check __sta_info_destroy_part1(struct sta_info *sta) { struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; @@ -868,12 +851,35 @@ int __must_check __sta_info_destroy(struct sta_info *sta) ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_DESTROY_STA); ret = sta_info_hash_del(local, sta); - if (ret) + if (WARN_ON(ret)) return ret; list_del_rcu(&sta->list); - /* this always calls synchronize_net() */ + drv_sta_pre_rcu_remove(local, sta->sdata, sta); + + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && + rcu_access_pointer(sdata->u.vlan.sta) == sta) + RCU_INIT_POINTER(sdata->u.vlan.sta, NULL); + + return 0; +} + +static void __sta_info_destroy_part2(struct sta_info *sta) +{ + struct ieee80211_local *local = sta->local; + struct ieee80211_sub_if_data *sdata = sta->sdata; + int ret; + + /* + * NOTE: This assumes at least synchronize_net() was done + * after _part1 and before _part2! 
+ */ + + might_sleep(); + lockdep_assert_held(&local->sta_mtx); + + /* now keys can no longer be reached */ ieee80211_free_sta_keys(local, sta); sta->dead = true; @@ -881,9 +887,6 @@ int __must_check __sta_info_destroy(struct sta_info *sta) local->num_sta--; local->sta_generation++; - if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) - RCU_INIT_POINTER(sdata->u.vlan.sta, NULL); - while (sta->sta_state > IEEE80211_STA_NONE) { ret = sta_info_move_state(sta, sta->sta_state - 1); if (ret) { @@ -906,7 +909,19 @@ int __must_check __sta_info_destroy(struct sta_info *sta) ieee80211_sta_debugfs_remove(sta); ieee80211_recalc_min_chandef(sdata); - call_rcu(&sta->rcu_head, free_sta_rcu); + cleanup_single_sta(sta); +} + +int __must_check __sta_info_destroy(struct sta_info *sta) +{ + int err = __sta_info_destroy_part1(sta); + + if (err) + return err; + + synchronize_net(); + + __sta_info_destroy_part2(sta); return 0; } @@ -976,32 +991,38 @@ void sta_info_stop(struct ieee80211_local *local) } -int sta_info_flush_defer(struct ieee80211_sub_if_data *sdata) +int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans) { struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; + LIST_HEAD(free_list); int ret = 0; might_sleep(); + WARN_ON(vlans && sdata->vif.type != NL80211_IFTYPE_AP); + WARN_ON(vlans && !sdata->bss); + mutex_lock(&local->sta_mtx); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { - if (sdata == sta->sdata) { - WARN_ON(__sta_info_destroy(sta)); + if (sdata == sta->sdata || + (vlans && sdata->bss == sta->sdata->bss)) { + if (!WARN_ON(__sta_info_destroy_part1(sta))) + list_add(&sta->free_list, &free_list); ret++; } } + + if (!list_empty(&free_list)) { + synchronize_net(); + list_for_each_entry_safe(sta, tmp, &free_list, free_list) + __sta_info_destroy_part2(sta); + } mutex_unlock(&local->sta_mtx); return ret; } -void sta_info_flush_cleanup(struct ieee80211_sub_if_data *sdata) -{ - ieee80211_cleanup_sdata_stas(sdata); - cancel_work_sync(&sdata->cleanup_stations_wk); -} - void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, unsigned long exp_time) { @@ -1071,10 +1092,14 @@ struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif, } EXPORT_SYMBOL(ieee80211_find_sta); -static void clear_sta_ps_flags(void *_sta) +/* powersave support code */ +void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) { - struct sta_info *sta = _sta; struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_local *local = sdata->local; + struct sk_buff_head pending; + int filtered = 0, buffered = 0, ac; + unsigned long flags; struct ps_data *ps; if (sdata->vif.type == NL80211_IFTYPE_AP || @@ -1085,20 +1110,6 @@ static void clear_sta_ps_flags(void *_sta) else return; - clear_sta_flag(sta, WLAN_STA_PS_DRIVER); - if (test_and_clear_sta_flag(sta, WLAN_STA_PS_STA)) - atomic_dec(&ps->num_sta_ps); -} - -/* powersave support code */ -void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) -{ - struct ieee80211_sub_if_data *sdata = sta->sdata; - struct ieee80211_local *local = sdata->local; - struct sk_buff_head pending; - int filtered = 0, buffered = 0, ac; - unsigned long flags; - clear_sta_flag(sta, WLAN_STA_SP); BUILD_BUG_ON(BITS_TO_LONGS(IEEE80211_NUM_TIDS) > 1); @@ -1109,6 +1120,8 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) skb_queue_head_init(&pending); + /* sync with ieee80211_tx_h_unicast_ps_buf */ + spin_lock(&sta->ps_lock); /* Send all buffered frames to the station */ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { int count = 
skb_queue_len(&pending), tmp; @@ -1127,10 +1140,16 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) buffered += tmp - count; } - ieee80211_add_pending_skbs_fn(local, &pending, clear_sta_ps_flags, sta); + ieee80211_add_pending_skbs(local, &pending); + clear_sta_flag(sta, WLAN_STA_PS_DRIVER); + clear_sta_flag(sta, WLAN_STA_PS_STA); + spin_unlock(&sta->ps_lock); + + atomic_dec(&ps->num_sta_ps); /* This station just woke up and isn't aware of our SMPS state */ - if (!ieee80211_smps_is_restrictive(sta->known_smps_mode, + if (!ieee80211_vif_is_mesh(&sdata->vif) && + !ieee80211_smps_is_restrictive(sta->known_smps_mode, sdata->smps_mode) && sta->known_smps_mode != sdata->bss->req_smps && sta_info_tx_streams(sta) != 1) { @@ -1153,7 +1172,8 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) static void ieee80211_send_null_response(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, int tid, - enum ieee80211_frame_release_type reason) + enum ieee80211_frame_release_type reason, + bool call_driver) { struct ieee80211_local *local = sdata->local; struct ieee80211_qos_hdr *nullfunc; @@ -1187,6 +1207,7 @@ static void ieee80211_send_null_response(struct ieee80211_sub_if_data *sdata, memcpy(nullfunc->addr1, sta->sta.addr, ETH_ALEN); memcpy(nullfunc->addr2, sdata->vif.addr, ETH_ALEN); memcpy(nullfunc->addr3, sdata->vif.addr, ETH_ALEN); + nullfunc->seq_ctrl = 0; skb->priority = tid; skb_set_queue_mapping(skb, ieee802_1d_to_ac[tid]); @@ -1211,7 +1232,9 @@ static void ieee80211_send_null_response(struct ieee80211_sub_if_data *sdata, IEEE80211_TX_STATUS_EOSP | IEEE80211_TX_CTL_REQ_TX_STATUS; - drv_allow_buffered_frames(local, sta, BIT(tid), 1, reason, false); + if (call_driver) + drv_allow_buffered_frames(local, sta, BIT(tid), 1, + reason, false); skb->dev = sdata->dev; @@ -1227,6 +1250,17 @@ static void ieee80211_send_null_response(struct ieee80211_sub_if_data *sdata, rcu_read_unlock(); } +static int find_highest_prio_tid(unsigned long tids) +{ + /* lower 3 TIDs aren't ordered perfectly */ + if (tids & 0xF8) + return fls(tids) - 1; + /* TID 0 is BE just like TID 3 */ + if (tids & BIT(0)) + return 0; + return fls(tids) - 1; +} + static void ieee80211_sta_ps_deliver_response(struct sta_info *sta, int n_frames, u8 ignored_acs, @@ -1234,7 +1268,6 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; - bool found = false; bool more_data = false; int ac; unsigned long driver_release_tids = 0; @@ -1245,9 +1278,7 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, __skb_queue_head_init(&frames); - /* - * Get response frame(s) and more data bit for it. - */ + /* Get response frame(s) and more data bit for the last one. 
*/ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { unsigned long tids; @@ -1256,43 +1287,48 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, tids = ieee80211_tids_for_ac(ac); - if (!found) { - driver_release_tids = sta->driver_buffered_tids & tids; - if (driver_release_tids) { - found = true; - } else { - struct sk_buff *skb; - - while (n_frames > 0) { - skb = skb_dequeue(&sta->tx_filtered[ac]); - if (!skb) { - skb = skb_dequeue( - &sta->ps_tx_buf[ac]); - if (skb) - local->total_ps_buffered--; - } - if (!skb) - break; - n_frames--; - found = true; - __skb_queue_tail(&frames, skb); - } - } + /* if we already have frames from software, then we can't also + * release from hardware queues + */ + if (skb_queue_empty(&frames)) + driver_release_tids |= sta->driver_buffered_tids & tids; - /* - * If the driver has data on more than one TID then + if (driver_release_tids) { + /* If the driver has data on more than one TID then * certainly there's more data if we release just a - * single frame now (from a single TID). + * single frame now (from a single TID). This will + * only happen for PS-Poll. */ if (reason == IEEE80211_FRAME_RELEASE_PSPOLL && hweight16(driver_release_tids) > 1) { more_data = true; driver_release_tids = - BIT(ffs(driver_release_tids) - 1); + BIT(find_highest_prio_tid( + driver_release_tids)); break; } + } else { + struct sk_buff *skb; + + while (n_frames > 0) { + skb = skb_dequeue(&sta->tx_filtered[ac]); + if (!skb) { + skb = skb_dequeue( + &sta->ps_tx_buf[ac]); + if (skb) + local->total_ps_buffered--; + } + if (!skb) + break; + n_frames--; + __skb_queue_tail(&frames, skb); + } } + /* If we have more frames buffered on this AC, then set the + * more-data bit and abort the loop since we can't send more + * data from other ACs before the buffered frames from this. + */ if (!skb_queue_empty(&sta->tx_filtered[ac]) || !skb_queue_empty(&sta->ps_tx_buf[ac])) { more_data = true; @@ -1300,7 +1336,7 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, } } - if (!found) { + if (skb_queue_empty(&frames) && !driver_release_tids) { int tid; /* @@ -1321,15 +1357,13 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, /* This will evaluate to 1, 3, 5 or 7. */ tid = 7 - ((ffs(~ignored_acs) - 1) << 1); - ieee80211_send_null_response(sdata, sta, tid, reason); - return; - } - - if (!driver_release_tids) { + ieee80211_send_null_response(sdata, sta, tid, reason, true); + } else if (!driver_release_tids) { struct sk_buff_head pending; struct sk_buff *skb; int num = 0; u16 tids = 0; + bool need_null = false; skb_queue_head_init(&pending); @@ -1363,22 +1397,57 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, ieee80211_is_qos_nullfunc(hdr->frame_control)) qoshdr = ieee80211_get_qos_ctl(hdr); - /* end service period after last frame */ - if (skb_queue_empty(&frames)) { - if (reason == IEEE80211_FRAME_RELEASE_UAPSD && - qoshdr) - *qoshdr |= IEEE80211_QOS_CTL_EOSP; + tids |= BIT(skb->priority); + + __skb_queue_tail(&pending, skb); + + /* end service period after last frame or add one */ + if (!skb_queue_empty(&frames)) + continue; + if (reason != IEEE80211_FRAME_RELEASE_UAPSD) { + /* for PS-Poll, there's only one frame */ info->flags |= IEEE80211_TX_STATUS_EOSP | IEEE80211_TX_CTL_REQ_TX_STATUS; + break; } - if (qoshdr) - tids |= BIT(*qoshdr & IEEE80211_QOS_CTL_TID_MASK); - else - tids |= BIT(0); + /* For uAPSD, things are a bit more complicated. If the + * last frame has a QoS header (i.e. is a QoS-data or + * QoS-nulldata frame) then just set the EOSP bit there + * and be done. 
+ * If the frame doesn't have a QoS header (which means + * it should be a bufferable MMPDU) then we can't set + * the EOSP bit in the QoS header; add a QoS-nulldata + * frame to the list to send it after the MMPDU. + * + * Note that this code is only in the mac80211-release + * code path, we assume that the driver will not buffer + * anything but QoS-data frames, or if it does, will + * create the QoS-nulldata frame by itself if needed. + * + * Cf. 802.11-2012 10.2.1.10 (c). + */ + if (qoshdr) { + *qoshdr |= IEEE80211_QOS_CTL_EOSP; - __skb_queue_tail(&pending, skb); + info->flags |= IEEE80211_TX_STATUS_EOSP | + IEEE80211_TX_CTL_REQ_TX_STATUS; + } else { + /* The standard isn't completely clear on this + * as it says the more-data bit should be set + * if there are more BUs. The QoS-Null frame + * we're about to send isn't buffered yet, we + * only create it below, but let's pretend it + * was buffered just in case some clients only + * expect more-data=0 when eosp=1. + */ + hdr->frame_control |= + cpu_to_le16(IEEE80211_FCTL_MOREDATA); + need_null = true; + num++; + } + break; } drv_allow_buffered_frames(local, sta, tids, num, @@ -1386,17 +1455,22 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, ieee80211_add_pending_skbs(local, &pending); + if (need_null) + ieee80211_send_null_response( + sdata, sta, find_highest_prio_tid(tids), + reason, false); + sta_info_recalc_tim(sta); } else { /* * We need to release a frame that is buffered somewhere in the * driver ... it'll have to handle that. - * Note that, as per the comment above, it'll also have to see - * if there is more than just one frame on the specific TID that - * we're releasing from, and it needs to set the more-data bit - * accordingly if we tell it that there's no more data. If we do - * tell it there's more data, then of course the more-data bit - * needs to be set anyway. + * Note that the driver also has to check the number of frames + * on the TIDs we're releasing from - if there are more than + * n_frames it has to set the more-data bit (if we didn't ask + * it to set it anyway due to other buffered frames); if there + * are fewer than n_frames it has to make sure to adjust that + * to allow the service period to end properly. */ drv_release_buffered_frames(local, sta, driver_release_tids, n_frames, reason, more_data); @@ -1404,9 +1478,9 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, /* * Note that we don't recalculate the TIM bit here as it would * most likely have no effect at all unless the driver told us - * that the TID became empty before returning here from the + * that the TID(s) became empty before returning here from the * release function. - * Either way, however, when the driver tells us that the TID + * Either way, however, when the driver tells us that the TID(s) * became empty we'll do the TIM recalculation. */ } @@ -1495,6 +1569,8 @@ void ieee80211_sta_set_buffered(struct ieee80211_sta *pubsta, if (WARN_ON(tid >= IEEE80211_NUM_TIDS)) return; + trace_api_sta_set_buffered(sta->local, pubsta, tid, buffered); + if (buffered) set_bit(tid, &sta->driver_buffered_tids); else diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 0218caf5c14..4acc5fc402f 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -247,6 +247,7 @@ struct ieee80211_tx_latency_stat { * mac80211 is communicating with. 
* * @list: global linked list entry + * @free_list: list entry for keeping track of stations to free * @hnext: hash table linked list pointer * @local: pointer to the global information * @sdata: virtual interface this station belongs to @@ -260,12 +261,14 @@ struct ieee80211_tx_latency_stat { * "the" transmit rate * @last_rx_rate_idx: rx status rate index of the last data packet * @last_rx_rate_flag: rx status flag of the last data packet + * @last_rx_rate_vht_flag: rx status vht flag of the last data packet * @last_rx_rate_vht_nss: rx status nss of last data packet * @lock: used for locking all fields that require locking, see comments * in the header file. * @drv_unblock_wk: used for driver PS unblocking * @listen_interval: listen interval of this station, when we're acting as AP * @_flags: STA flags, see &enum ieee80211_sta_info_flags, do not use directly + * @ps_lock: used for powersave (when mac80211 is the AP) related locking * @ps_tx_buf: buffers (per AC) of frames to transmit to this station * when it leaves power saving state or polls * @tx_filtered: buffers (per AC) of frames we already tried to @@ -329,7 +332,7 @@ struct ieee80211_tx_latency_stat { */ struct sta_info { /* General information, mostly static */ - struct list_head list; + struct list_head list, free_list; struct rcu_head rcu_head; struct sta_info __rcu *hnext; struct ieee80211_local *local; @@ -355,10 +358,8 @@ struct sta_info { /* use the accessors defined below */ unsigned long _flags; - /* - * STA powersave frame queues, no more than the internal - * locking required. - */ + /* STA powersave lock and frame queues */ + spinlock_t ps_lock; struct sk_buff_head ps_tx_buf[IEEE80211_NUM_ACS]; struct sk_buff_head tx_filtered[IEEE80211_NUM_ACS]; unsigned long driver_buffered_tids; @@ -396,6 +397,7 @@ struct sta_info { struct ieee80211_tx_rate last_tx_rate; int last_rx_rate_idx; u32 last_rx_rate_flag; + u32 last_rx_rate_vht_flag; u8 last_rx_rate_vht_nss; u16 tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1]; @@ -605,21 +607,6 @@ void sta_info_recalc_tim(struct sta_info *sta); void sta_info_init(struct ieee80211_local *local); void sta_info_stop(struct ieee80211_local *local); -int sta_info_flush_defer(struct ieee80211_sub_if_data *sdata); - -/** - * sta_info_flush_cleanup - flush the sta_info cleanup queue - * @sdata: the interface - * - * Flushes the sta_info cleanup queue for a given interface; - * this is necessary before the interface is removed or, for - * AP/mesh interfaces, before it is deconfigured. - * - * Note an rcu_barrier() must precede the function, after all - * stations have been flushed/removed to ensure the call_rcu() - * calls that add stations to the cleanup queue have completed. - */ -void sta_info_flush_cleanup(struct ieee80211_sub_if_data *sdata); /** * sta_info_flush - flush matching STA entries from the STA table @@ -627,15 +614,13 @@ void sta_info_flush_cleanup(struct ieee80211_sub_if_data *sdata); * Returns the number of removed STA entries. 
* * @sdata: sdata to remove all stations from + * @vlans: if the given interface is an AP interface, also flush VLANs */ +int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans); + static inline int sta_info_flush(struct ieee80211_sub_if_data *sdata) { - int ret = sta_info_flush_defer(sdata); - - rcu_barrier(); - sta_info_flush_cleanup(sdata); - - return ret; + return __sta_info_flush(sdata, false); } void sta_set_rate_info_tx(struct sta_info *sta, @@ -651,6 +636,4 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta); void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta); void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta); -void ieee80211_cleanup_sdata_stas(struct ieee80211_sub_if_data *sdata); - #endif /* STA_INFO_H */ diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 1ee85c40243..ba29ebc8614 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -314,10 +314,9 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, !is_multicast_ether_addr(hdr->addr1)) txflags |= IEEE80211_RADIOTAP_F_TX_FAIL; - if ((info->status.rates[0].flags & IEEE80211_TX_RC_USE_RTS_CTS) || - (info->status.rates[0].flags & IEEE80211_TX_RC_USE_CTS_PROTECT)) + if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_CTS_PROTECT) txflags |= IEEE80211_RADIOTAP_F_TX_CTS; - else if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_RTS_CTS) + if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_RTS_CTS) txflags |= IEEE80211_RADIOTAP_F_TX_RTS; put_unaligned_le16(txflags, pos); @@ -479,7 +478,7 @@ static void ieee80211_tx_latency_end_msrmnt(struct ieee80211_local *local, u32 msrmnt; u16 tid; u8 *qc; - int i, bin_range_count, bin_count; + int i, bin_range_count; u32 *bin_ranges; __le16 fc; struct ieee80211_tx_latency_stat *tx_lat; @@ -522,7 +521,6 @@ static void ieee80211_tx_latency_end_msrmnt(struct ieee80211_local *local, /* count how many Tx frames transmitted with the appropriate latency */ bin_range_count = tx_latency->n_ranges; bin_ranges = tx_latency->ranges; - bin_count = tx_lat->bin_count; for (i = 0; i < bin_range_count; i++) { if (msrmnt <= bin_ranges[i]) { @@ -543,6 +541,23 @@ static void ieee80211_tx_latency_end_msrmnt(struct ieee80211_local *local, */ #define STA_LOST_PKT_THRESHOLD 50 +static void ieee80211_lost_packet(struct sta_info *sta, struct sk_buff *skb) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + + /* This packet was aggregated but doesn't carry status info */ + if ((info->flags & IEEE80211_TX_CTL_AMPDU) && + !(info->flags & IEEE80211_TX_STAT_AMPDU)) + return; + + if (++sta->lost_packets < STA_LOST_PKT_THRESHOLD) + return; + + cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr, + sta->lost_packets, GFP_ATOMIC); + sta->lost_packets = 0; +} + void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) { struct sk_buff *skb2; @@ -619,6 +634,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) sta, true, acked); if ((local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) && + (ieee80211_is_data(hdr->frame_control)) && (rates_idx != -1)) sta->last_tx_rate = info->status.rates[rates_idx]; @@ -681,12 +697,8 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) if (info->flags & IEEE80211_TX_STAT_ACK) { if (sta->lost_packets) sta->lost_packets = 0; - } else if (++sta->lost_packets >= STA_LOST_PKT_THRESHOLD) { - cfg80211_cqm_pktloss_notify(sta->sdata->dev, - sta->sta.addr, - sta->lost_packets, - GFP_ATOMIC); - sta->lost_packets = 0; + } else { + 
ieee80211_lost_packet(sta, skb); } } diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c new file mode 100644 index 00000000000..652813b2d3d --- /dev/null +++ b/net/mac80211/tdls.c @@ -0,0 +1,325 @@ +/* + * mac80211 TDLS handling code + * + * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> + * Copyright 2014, Intel Corporation + * + * This file is GPLv2 as found in COPYING. + */ + +#include <linux/ieee80211.h> +#include "ieee80211_i.h" + +static void ieee80211_tdls_add_ext_capab(struct sk_buff *skb) +{ + u8 *pos = (void *)skb_put(skb, 7); + + *pos++ = WLAN_EID_EXT_CAPABILITY; + *pos++ = 5; /* len */ + *pos++ = 0x0; + *pos++ = 0x0; + *pos++ = 0x0; + *pos++ = 0x0; + *pos++ = WLAN_EXT_CAPA5_TDLS_ENABLED; +} + +static u16 ieee80211_get_tdls_sta_capab(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + u16 capab; + + capab = 0; + if (ieee80211_get_sdata_band(sdata) != IEEE80211_BAND_2GHZ) + return capab; + + if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE)) + capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME; + if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE)) + capab |= WLAN_CAPABILITY_SHORT_PREAMBLE; + + return capab; +} + +static void ieee80211_tdls_add_link_ie(struct sk_buff *skb, const u8 *src_addr, + const u8 *peer, const u8 *bssid) +{ + struct ieee80211_tdls_lnkie *lnkid; + + lnkid = (void *)skb_put(skb, sizeof(struct ieee80211_tdls_lnkie)); + + lnkid->ie_type = WLAN_EID_LINK_ID; + lnkid->ie_len = sizeof(struct ieee80211_tdls_lnkie) - 2; + + memcpy(lnkid->bssid, bssid, ETH_ALEN); + memcpy(lnkid->init_sta, src_addr, ETH_ALEN); + memcpy(lnkid->resp_sta, peer, ETH_ALEN); +} + +static int +ieee80211_prep_tdls_encap_data(struct wiphy *wiphy, struct net_device *dev, + const u8 *peer, u8 action_code, u8 dialog_token, + u16 status_code, struct sk_buff *skb) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + struct ieee80211_tdls_data *tf; + + tf = (void *)skb_put(skb, offsetof(struct ieee80211_tdls_data, u)); + + memcpy(tf->da, peer, ETH_ALEN); + memcpy(tf->sa, sdata->vif.addr, ETH_ALEN); + tf->ether_type = cpu_to_be16(ETH_P_TDLS); + tf->payload_type = WLAN_TDLS_SNAP_RFTYPE; + + switch (action_code) { + case WLAN_TDLS_SETUP_REQUEST: + tf->category = WLAN_CATEGORY_TDLS; + tf->action_code = WLAN_TDLS_SETUP_REQUEST; + + skb_put(skb, sizeof(tf->u.setup_req)); + tf->u.setup_req.dialog_token = dialog_token; + tf->u.setup_req.capability = + cpu_to_le16(ieee80211_get_tdls_sta_capab(sdata)); + + ieee80211_add_srates_ie(sdata, skb, false, band); + ieee80211_add_ext_srates_ie(sdata, skb, false, band); + ieee80211_tdls_add_ext_capab(skb); + break; + case WLAN_TDLS_SETUP_RESPONSE: + tf->category = WLAN_CATEGORY_TDLS; + tf->action_code = WLAN_TDLS_SETUP_RESPONSE; + + skb_put(skb, sizeof(tf->u.setup_resp)); + tf->u.setup_resp.status_code = cpu_to_le16(status_code); + tf->u.setup_resp.dialog_token = dialog_token; + tf->u.setup_resp.capability = + cpu_to_le16(ieee80211_get_tdls_sta_capab(sdata)); + + ieee80211_add_srates_ie(sdata, skb, false, band); + ieee80211_add_ext_srates_ie(sdata, skb, false, band); + ieee80211_tdls_add_ext_capab(skb); + break; + case WLAN_TDLS_SETUP_CONFIRM: + tf->category = WLAN_CATEGORY_TDLS; + tf->action_code = WLAN_TDLS_SETUP_CONFIRM; + + skb_put(skb, sizeof(tf->u.setup_cfm)); + tf->u.setup_cfm.status_code = cpu_to_le16(status_code); + tf->u.setup_cfm.dialog_token = dialog_token; + break; + case WLAN_TDLS_TEARDOWN: + 
tf->category = WLAN_CATEGORY_TDLS; + tf->action_code = WLAN_TDLS_TEARDOWN; + + skb_put(skb, sizeof(tf->u.teardown)); + tf->u.teardown.reason_code = cpu_to_le16(status_code); + break; + case WLAN_TDLS_DISCOVERY_REQUEST: + tf->category = WLAN_CATEGORY_TDLS; + tf->action_code = WLAN_TDLS_DISCOVERY_REQUEST; + + skb_put(skb, sizeof(tf->u.discover_req)); + tf->u.discover_req.dialog_token = dialog_token; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int +ieee80211_prep_tdls_direct(struct wiphy *wiphy, struct net_device *dev, + const u8 *peer, u8 action_code, u8 dialog_token, + u16 status_code, struct sk_buff *skb) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + struct ieee80211_mgmt *mgmt; + + mgmt = (void *)skb_put(skb, 24); + memset(mgmt, 0, 24); + memcpy(mgmt->da, peer, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN); + + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + + switch (action_code) { + case WLAN_PUB_ACTION_TDLS_DISCOVER_RES: + skb_put(skb, 1 + sizeof(mgmt->u.action.u.tdls_discover_resp)); + mgmt->u.action.category = WLAN_CATEGORY_PUBLIC; + mgmt->u.action.u.tdls_discover_resp.action_code = + WLAN_PUB_ACTION_TDLS_DISCOVER_RES; + mgmt->u.action.u.tdls_discover_resp.dialog_token = + dialog_token; + mgmt->u.action.u.tdls_discover_resp.capability = + cpu_to_le16(ieee80211_get_tdls_sta_capab(sdata)); + + ieee80211_add_srates_ie(sdata, skb, false, band); + ieee80211_add_ext_srates_ie(sdata, skb, false, band); + ieee80211_tdls_add_ext_capab(skb); + break; + default: + return -EINVAL; + } + + return 0; +} + +int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev, + const u8 *peer, u8 action_code, u8 dialog_token, + u16 status_code, u32 peer_capability, + const u8 *extra_ies, size_t extra_ies_len) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb = NULL; + bool send_direct; + int ret; + + if (!(wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS)) + return -ENOTSUPP; + + /* make sure we are in managed mode, and associated */ + if (sdata->vif.type != NL80211_IFTYPE_STATION || + !sdata->u.mgd.associated) + return -EINVAL; + + tdls_dbg(sdata, "TDLS mgmt action %d peer %pM\n", + action_code, peer); + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + + max(sizeof(struct ieee80211_mgmt), + sizeof(struct ieee80211_tdls_data)) + + 50 + /* supported rates */ + 7 + /* ext capab */ + extra_ies_len + + sizeof(struct ieee80211_tdls_lnkie)); + if (!skb) + return -ENOMEM; + + skb_reserve(skb, local->hw.extra_tx_headroom); + + switch (action_code) { + case WLAN_TDLS_SETUP_REQUEST: + case WLAN_TDLS_SETUP_RESPONSE: + case WLAN_TDLS_SETUP_CONFIRM: + case WLAN_TDLS_TEARDOWN: + case WLAN_TDLS_DISCOVERY_REQUEST: + ret = ieee80211_prep_tdls_encap_data(wiphy, dev, peer, + action_code, dialog_token, + status_code, skb); + send_direct = false; + break; + case WLAN_PUB_ACTION_TDLS_DISCOVER_RES: + ret = ieee80211_prep_tdls_direct(wiphy, dev, peer, action_code, + dialog_token, status_code, + skb); + send_direct = true; + break; + default: + ret = -ENOTSUPP; + break; + } + + if (ret < 0) + goto fail; + + if (extra_ies_len) + memcpy(skb_put(skb, extra_ies_len), extra_ies, extra_ies_len); + + /* the TDLS link IE is always added last */ + switch (action_code) { + case WLAN_TDLS_SETUP_REQUEST: + case WLAN_TDLS_SETUP_CONFIRM: 
+ case WLAN_TDLS_TEARDOWN: + case WLAN_TDLS_DISCOVERY_REQUEST: + /* we are the initiator */ + ieee80211_tdls_add_link_ie(skb, sdata->vif.addr, peer, + sdata->u.mgd.bssid); + break; + case WLAN_TDLS_SETUP_RESPONSE: + case WLAN_PUB_ACTION_TDLS_DISCOVER_RES: + /* we are the responder */ + ieee80211_tdls_add_link_ie(skb, peer, sdata->vif.addr, + sdata->u.mgd.bssid); + break; + default: + ret = -ENOTSUPP; + goto fail; + } + + if (send_direct) { + ieee80211_tx_skb(sdata, skb); + return 0; + } + + /* + * According to 802.11z: Setup req/resp are sent in AC_BK, otherwise + * we should default to AC_VI. + */ + switch (action_code) { + case WLAN_TDLS_SETUP_REQUEST: + case WLAN_TDLS_SETUP_RESPONSE: + skb_set_queue_mapping(skb, IEEE80211_AC_BK); + skb->priority = 2; + break; + default: + skb_set_queue_mapping(skb, IEEE80211_AC_VI); + skb->priority = 5; + break; + } + + /* disable bottom halves when entering the Tx path */ + local_bh_disable(); + ret = ieee80211_subif_start_xmit(skb, dev); + local_bh_enable(); + + return ret; + +fail: + dev_kfree_skb(skb); + return ret; +} + +int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, + const u8 *peer, enum nl80211_tdls_operation oper) +{ + struct sta_info *sta; + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + if (!(wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS)) + return -ENOTSUPP; + + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return -EINVAL; + + tdls_dbg(sdata, "TDLS oper %d peer %pM\n", oper, peer); + + switch (oper) { + case NL80211_TDLS_ENABLE_LINK: + rcu_read_lock(); + sta = sta_info_get(sdata, peer); + if (!sta) { + rcu_read_unlock(); + return -ENOLINK; + } + + set_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH); + rcu_read_unlock(); + break; + case NL80211_TDLS_DISABLE_LINK: + return sta_info_destroy_addr(sdata, peer); + case NL80211_TDLS_TEARDOWN: + case NL80211_TDLS_SETUP: + case NL80211_TDLS_DISCOVERY_REQ: + /* We don't support in-driver setup/teardown/discovery */ + return -ENOTSUPP; + default: + return -ENOTSUPP; + } + + return 0; +} diff --git a/net/mac80211/tkip.c b/net/mac80211/tkip.c index 124b1fdc20d..0ae207771a5 100644 --- a/net/mac80211/tkip.c +++ b/net/mac80211/tkip.c @@ -186,7 +186,7 @@ void ieee80211_get_tkip_p1k_iv(struct ieee80211_key_conf *keyconf, EXPORT_SYMBOL(ieee80211_get_tkip_p1k_iv); void ieee80211_get_tkip_rx_p1k(struct ieee80211_key_conf *keyconf, - const u8 *ta, u32 iv32, u16 *p1k) + const u8 *ta, u32 iv32, u16 *p1k) { const u8 *tk = &keyconf->key[NL80211_TKIP_DATA_OFFSET_ENCR_KEY]; struct tkip_ctx ctx; diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index e9ccf22f6dd..cfe1a0688b5 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -21,10 +21,10 @@ #define VIF_ENTRY __field(enum nl80211_iftype, vif_type) __field(void *, sdata) \ __field(bool, p2p) \ - __string(vif_name, sdata->dev ? sdata->dev->name : "<nodev>") + __string(vif_name, sdata->name) #define VIF_ASSIGN __entry->vif_type = sdata->vif.type; __entry->sdata = sdata; \ __entry->p2p = sdata->vif.p2p; \ - __assign_str(vif_name, sdata->dev ? sdata->dev->name : sdata->name) + __assign_str(vif_name, sdata->name) #define VIF_PR_FMT " vif:%s(%d%s)" #define VIF_PR_ARG __get_str(vif_name), __entry->vif_type, __entry->p2p ? 
"/p2p" : "" @@ -184,6 +184,20 @@ TRACE_EVENT(drv_return_bool, "true" : "false") ); +TRACE_EVENT(drv_return_u32, + TP_PROTO(struct ieee80211_local *local, u32 ret), + TP_ARGS(local, ret), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(u32, ret) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->ret = ret; + ), + TP_printk(LOCAL_PR_FMT " - %u", LOCAL_PR_ARG, __entry->ret) +); + TRACE_EVENT(drv_return_u64, TP_PROTO(struct ieee80211_local *local, u64 ret), TP_ARGS(local, ret), @@ -443,30 +457,6 @@ TRACE_EVENT(drv_prepare_multicast, ) ); -TRACE_EVENT(drv_set_multicast_list, - TP_PROTO(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, int mc_count), - - TP_ARGS(local, sdata, mc_count), - - TP_STRUCT__entry( - LOCAL_ENTRY - __field(bool, allmulti) - __field(int, mc_count) - ), - - TP_fast_assign( - LOCAL_ASSIGN; - __entry->allmulti = sdata->flags & IEEE80211_SDATA_ALLMULTI; - __entry->mc_count = mc_count; - ), - - TP_printk( - LOCAL_PR_FMT " configure mc filter, count=%d, allmulti=%d", - LOCAL_PR_ARG, __entry->mc_count, __entry->allmulti - ) -); - TRACE_EVENT(drv_configure_filter, TP_PROTO(struct ieee80211_local *local, unsigned int changed_flags, @@ -577,7 +567,7 @@ TRACE_EVENT(drv_update_tkip_key, TP_printk( LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " iv32:%#x", - LOCAL_PR_ARG,VIF_PR_ARG,STA_PR_ARG, __entry->iv32 + LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->iv32 ) ); @@ -790,7 +780,7 @@ TRACE_EVENT(drv_sta_rc_update, ) ); -TRACE_EVENT(drv_sta_add, +DECLARE_EVENT_CLASS(sta_event, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta), @@ -815,29 +805,25 @@ TRACE_EVENT(drv_sta_add, ) ); -TRACE_EVENT(drv_sta_remove, +DEFINE_EVENT(sta_event, drv_sta_add, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta), + TP_ARGS(local, sdata, sta) +); - TP_ARGS(local, sdata, sta), - - TP_STRUCT__entry( - LOCAL_ENTRY - VIF_ENTRY - STA_ENTRY - ), - - TP_fast_assign( - LOCAL_ASSIGN; - VIF_ASSIGN; - STA_ASSIGN; - ), +DEFINE_EVENT(sta_event, drv_sta_remove, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta), + TP_ARGS(local, sdata, sta) +); - TP_printk( - LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT, - LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG - ) +DEFINE_EVENT(sta_event, drv_sta_pre_rcu_remove, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta), + TP_ARGS(local, sdata, sta) ); TRACE_EVENT(drv_conf_tx, @@ -1403,6 +1389,91 @@ TRACE_EVENT(drv_change_chanctx, ) ); +#if !defined(__TRACE_VIF_ENTRY) +#define __TRACE_VIF_ENTRY +struct trace_vif_entry { + enum nl80211_iftype vif_type; + bool p2p; + char vif_name[IFNAMSIZ]; +} __packed; + +struct trace_chandef_entry { + u32 control_freq; + u32 chan_width; + u32 center_freq1; + u32 center_freq2; +} __packed; + +struct trace_switch_entry { + struct trace_vif_entry vif; + struct trace_chandef_entry old_chandef; + struct trace_chandef_entry new_chandef; +} __packed; + +#define SWITCH_ENTRY_ASSIGN(to, from) local_vifs[i].to = vifs[i].from +#endif + +TRACE_EVENT(drv_switch_vif_chanctx, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_vif_chanctx_switch *vifs, + int n_vifs, enum ieee80211_chanctx_switch_mode mode), + TP_ARGS(local, vifs, n_vifs, mode), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(int, n_vifs) + __field(u32, mode) + __dynamic_array(u8, vifs, + sizeof(struct trace_switch_entry) * n_vifs) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + 
__entry->n_vifs = n_vifs; + __entry->mode = mode; + { + struct trace_switch_entry *local_vifs = + __get_dynamic_array(vifs); + int i; + + for (i = 0; i < n_vifs; i++) { + struct ieee80211_sub_if_data *sdata; + + sdata = container_of(vifs[i].vif, + struct ieee80211_sub_if_data, + vif); + + SWITCH_ENTRY_ASSIGN(vif.vif_type, vif->type); + SWITCH_ENTRY_ASSIGN(vif.p2p, vif->p2p); + strncpy(local_vifs[i].vif.vif_name, + sdata->name, + sizeof(local_vifs[i].vif.vif_name)); + SWITCH_ENTRY_ASSIGN(old_chandef.control_freq, + old_ctx->def.chan->center_freq); + SWITCH_ENTRY_ASSIGN(old_chandef.chan_width, + old_ctx->def.width); + SWITCH_ENTRY_ASSIGN(old_chandef.center_freq1, + old_ctx->def.center_freq1); + SWITCH_ENTRY_ASSIGN(old_chandef.center_freq2, + old_ctx->def.center_freq2); + SWITCH_ENTRY_ASSIGN(new_chandef.control_freq, + new_ctx->def.chan->center_freq); + SWITCH_ENTRY_ASSIGN(new_chandef.chan_width, + new_ctx->def.width); + SWITCH_ENTRY_ASSIGN(new_chandef.center_freq1, + new_ctx->def.center_freq1); + SWITCH_ENTRY_ASSIGN(new_chandef.center_freq2, + new_ctx->def.center_freq2); + } + } + ), + + TP_printk( + LOCAL_PR_FMT " n_vifs:%d mode:%d", + LOCAL_PR_ARG, __entry->n_vifs, __entry->mode + ) +); + DECLARE_EVENT_CLASS(local_sdata_chanctx, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, @@ -1527,6 +1598,24 @@ DEFINE_EVENT(local_sdata_evt, drv_leave_ibss, TP_ARGS(local, sdata) ); +TRACE_EVENT(drv_get_expected_throughput, + TP_PROTO(struct ieee80211_sta *sta), + + TP_ARGS(sta), + + TP_STRUCT__entry( + STA_ENTRY + ), + + TP_fast_assign( + STA_ASSIGN; + ), + + TP_printk( + STA_PR_FMT, STA_PR_ARG + ) +); + /* * Tracing for API calls that drivers call. */ @@ -1863,6 +1952,33 @@ TRACE_EVENT(api_eosp, ) ); +TRACE_EVENT(api_sta_set_buffered, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sta *sta, + u8 tid, bool buffered), + + TP_ARGS(local, sta, tid, buffered), + + TP_STRUCT__entry( + LOCAL_ENTRY + STA_ENTRY + __field(u8, tid) + __field(bool, buffered) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + STA_ASSIGN; + __entry->tid = tid; + __entry->buffered = buffered; + ), + + TP_printk( + LOCAL_PR_FMT STA_PR_FMT " tid:%d buffered:%d", + LOCAL_PR_ARG, STA_PR_ARG, __entry->tid, __entry->buffered + ) +); + /* * Tracing for internal functions * (which may also be called in response to driver calls) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 6d59e21cdb9..1a252c606ad 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -414,6 +414,9 @@ ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx) if (ieee80211_has_order(hdr->frame_control)) return TX_CONTINUE; + if (ieee80211_is_probe_req(hdr->frame_control)) + return TX_CONTINUE; + if (tx->local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) info->hw_queue = tx->sdata->vif.cab_queue; @@ -452,8 +455,7 @@ static int ieee80211_use_mfp(__le16 fc, struct sta_info *sta, if (sta == NULL || !test_sta_flag(sta, WLAN_STA_MFP)) return 0; - if (!ieee80211_is_robust_mgmt_frame((struct ieee80211_hdr *) - skb->data)) + if (!ieee80211_is_robust_mgmt_frame(skb)) return 0; return 1; @@ -475,11 +477,8 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx) !(info->flags & IEEE80211_TX_CTL_NO_PS_BUFFER))) { int ac = skb_get_queue_mapping(tx->skb); - /* only deauth, disassoc and action are bufferable MMPDUs */ if (ieee80211_is_mgmt(hdr->frame_control) && - !ieee80211_is_deauth(hdr->frame_control) && - !ieee80211_is_disassoc(hdr->frame_control) && - !ieee80211_is_action(hdr->frame_control)) { + 
!ieee80211_is_bufferable_mmpdu(hdr->frame_control)) { info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER; return TX_CONTINUE; } @@ -488,6 +487,20 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx) sta->sta.addr, sta->sta.aid, ac); if (tx->local->total_ps_buffered >= TOTAL_MAX_TX_BUFFER) purge_old_ps_buffers(tx->local); + + /* sync with ieee80211_sta_ps_deliver_wakeup */ + spin_lock(&sta->ps_lock); + /* + * STA woke up the meantime and all the frames on ps_tx_buf have + * been queued to pending queue. No reordering can happen, go + * ahead and Tx the packet. + */ + if (!test_sta_flag(sta, WLAN_STA_PS_STA) && + !test_sta_flag(sta, WLAN_STA_PS_DRIVER)) { + spin_unlock(&sta->ps_lock); + return TX_CONTINUE; + } + if (skb_queue_len(&sta->ps_tx_buf[ac]) >= STA_MAX_TX_BUFFER) { struct sk_buff *old = skb_dequeue(&sta->ps_tx_buf[ac]); ps_dbg(tx->sdata, @@ -500,7 +513,9 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx) info->control.jiffies = jiffies; info->control.vif = &tx->sdata->vif; info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING; + info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; skb_queue_tail(&sta->ps_tx_buf[ac], tx->skb); + spin_unlock(&sta->ps_lock); if (!timer_pending(&local->sta_cleanup)) mod_timer(&local->sta_cleanup, @@ -563,7 +578,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) tx->key = key; else if (ieee80211_is_mgmt(hdr->frame_control) && is_multicast_ether_addr(hdr->addr1) && - ieee80211_is_robust_mgmt_frame(hdr) && + ieee80211_is_robust_mgmt_frame(tx->skb) && (key = rcu_dereference(tx->sdata->default_mgmt_key))) tx->key = key; else if (is_multicast_ether_addr(hdr->addr1) && @@ -578,12 +593,12 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) tx->key = NULL; else if (tx->skb->protocol == tx->sdata->control_port_protocol) tx->key = NULL; - else if (ieee80211_is_robust_mgmt_frame(hdr) && + else if (ieee80211_is_robust_mgmt_frame(tx->skb) && !(ieee80211_is_action(hdr->frame_control) && tx->sta && test_sta_flag(tx->sta, WLAN_STA_MFP))) tx->key = NULL; else if (ieee80211_is_mgmt(hdr->frame_control) && - !ieee80211_is_robust_mgmt_frame(hdr)) + !ieee80211_is_robust_mgmt_frame(tx->skb)) tx->key = NULL; else { I802_DEBUG_INC(tx->local->tx_handlers_drop_unencrypted); @@ -874,7 +889,7 @@ static int ieee80211_fragment(struct ieee80211_tx_data *tx, } /* adjust first fragment's length */ - skb->len = hdrlen + per_fragm; + skb_trim(skb, hdrlen + per_fragm); return 0; } @@ -1073,6 +1088,7 @@ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx, queued = true; info->control.vif = &tx->sdata->vif; info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING; + info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; __skb_queue_tail(&tid_tx->pending, skb); if (skb_queue_len(&tid_tx->pending) > STA_MAX_TX_BUFFER) purge_skb = __skb_dequeue(&tid_tx->pending); @@ -2161,7 +2177,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, if (ieee80211_is_data_qos(fc)) { __le16 *qos_control; - qos_control = (__le16*) skb_push(skb, 2); + qos_control = (__le16 *) skb_push(skb, 2); memcpy(skb_push(skb, hdrlen - 2), &hdr, hdrlen - 2); /* * Maybe we could actually set some fields here, for now just @@ -2312,7 +2328,8 @@ void ieee80211_tx_pending(unsigned long data) /* functions for drivers to get certain frames */ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, - struct ps_data *ps, struct sk_buff *skb) + struct ps_data *ps, struct sk_buff *skb, + bool is_template) { u8 *pos, *tim; int aid0 = 0; @@ -2323,13 +2340,14 @@ static void 
__ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, if (atomic_read(&ps->num_sta_ps) > 0) /* in the hope that this is faster than * checking byte-for-byte */ - have_bits = !bitmap_empty((unsigned long*)ps->tim, + have_bits = !bitmap_empty((unsigned long *)ps->tim, IEEE80211_MAX_AID+1); - - if (ps->dtim_count == 0) - ps->dtim_count = sdata->vif.bss_conf.dtim_period - 1; - else - ps->dtim_count--; + if (!is_template) { + if (ps->dtim_count == 0) + ps->dtim_count = sdata->vif.bss_conf.dtim_period - 1; + else + ps->dtim_count--; + } tim = pos = (u8 *) skb_put(skb, 6); *pos++ = WLAN_EID_TIM; @@ -2375,7 +2393,8 @@ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, } static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, - struct ps_data *ps, struct sk_buff *skb) + struct ps_data *ps, struct sk_buff *skb, + bool is_template) { struct ieee80211_local *local = sdata->local; @@ -2387,33 +2406,24 @@ static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, * of the tim bitmap in mac80211 and the driver. */ if (local->tim_in_locked_section) { - __ieee80211_beacon_add_tim(sdata, ps, skb); + __ieee80211_beacon_add_tim(sdata, ps, skb, is_template); } else { spin_lock_bh(&local->tim_lock); - __ieee80211_beacon_add_tim(sdata, ps, skb); + __ieee80211_beacon_add_tim(sdata, ps, skb, is_template); spin_unlock_bh(&local->tim_lock); } return 0; } -void ieee80211_csa_finish(struct ieee80211_vif *vif) -{ - struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); - - ieee80211_queue_work(&sdata->local->hw, - &sdata->csa_finalize_work); -} -EXPORT_SYMBOL(ieee80211_csa_finish); - -static void ieee80211_update_csa(struct ieee80211_sub_if_data *sdata, - struct beacon_data *beacon) +static void ieee80211_set_csa(struct ieee80211_sub_if_data *sdata, + struct beacon_data *beacon) { struct probe_resp *resp; - int counter_offset_beacon = sdata->csa_counter_offset_beacon; - int counter_offset_presp = sdata->csa_counter_offset_presp; u8 *beacon_data; size_t beacon_data_len; + int i; + u8 count = sdata->csa_current_counter; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: @@ -2431,36 +2441,57 @@ static void ieee80211_update_csa(struct ieee80211_sub_if_data *sdata, default: return; } - if (WARN_ON(counter_offset_beacon >= beacon_data_len)) - return; - /* warn if the driver did not check for/react to csa completeness */ - if (WARN_ON(beacon_data[counter_offset_beacon] == 0)) - return; + for (i = 0; i < IEEE80211_MAX_CSA_COUNTERS_NUM; ++i) { + u16 counter_offset_beacon = + sdata->csa_counter_offset_beacon[i]; + u16 counter_offset_presp = sdata->csa_counter_offset_presp[i]; - beacon_data[counter_offset_beacon]--; + if (counter_offset_beacon) { + if (WARN_ON(counter_offset_beacon >= beacon_data_len)) + return; - if (sdata->vif.type == NL80211_IFTYPE_AP && counter_offset_presp) { - rcu_read_lock(); - resp = rcu_dereference(sdata->u.ap.probe_resp); + beacon_data[counter_offset_beacon] = count; + } - /* if nl80211 accepted the offset, this should not happen. */ - if (WARN_ON(!resp)) { + if (sdata->vif.type == NL80211_IFTYPE_AP && + counter_offset_presp) { + rcu_read_lock(); + resp = rcu_dereference(sdata->u.ap.probe_resp); + + /* If nl80211 accepted the offset, this should + * not happen. 
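With the countdown now kept in sdata->csa_current_counter and written into every registered beacon/probe-response offset, a driver that manages its own beacon transmission can step the counter itself through the exported helpers. A rough sketch of such driver-side use (the driver function name is hypothetical; ieee80211_csa_update_counter(), ieee80211_csa_is_complete() and ieee80211_csa_finish() are the real mac80211 entry points):

static void example_drv_beacon_tbtt(struct ieee80211_vif *vif)
{
        if (!vif->csa_active)
                return;

        /* one beacon interval has elapsed: step the countdown */
        ieee80211_csa_update_counter(vif);

        /* counter reached 1, the switch happens at the next TBTT */
        if (ieee80211_csa_is_complete(vif))
                ieee80211_csa_finish(vif);
}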
+ */ + if (WARN_ON(!resp)) { + rcu_read_unlock(); + return; + } + resp->data[counter_offset_presp] = count; rcu_read_unlock(); - return; } - resp->data[counter_offset_presp]--; - rcu_read_unlock(); } } +u8 ieee80211_csa_update_counter(struct ieee80211_vif *vif) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + sdata->csa_current_counter--; + + /* the counter should never reach 0 */ + WARN_ON(!sdata->csa_current_counter); + + return sdata->csa_current_counter; +} +EXPORT_SYMBOL(ieee80211_csa_update_counter); + bool ieee80211_csa_is_complete(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct beacon_data *beacon = NULL; u8 *beacon_data; size_t beacon_data_len; - int counter_beacon = sdata->csa_counter_offset_beacon; + int counter_beacon = sdata->csa_counter_offset_beacon[0]; int ret = false; if (!ieee80211_sdata_running(sdata)) @@ -2501,7 +2532,7 @@ bool ieee80211_csa_is_complete(struct ieee80211_vif *vif) if (WARN_ON(counter_beacon > beacon_data_len)) goto out; - if (beacon_data[counter_beacon] == 0) + if (beacon_data[counter_beacon] == 1) ret = true; out: rcu_read_unlock(); @@ -2510,9 +2541,11 @@ bool ieee80211_csa_is_complete(struct ieee80211_vif *vif) } EXPORT_SYMBOL(ieee80211_csa_is_complete); -struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, - struct ieee80211_vif *vif, - u16 *tim_offset, u16 *tim_length) +static struct sk_buff * +__ieee80211_beacon_get(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_mutable_offsets *offs, + bool is_template) { struct ieee80211_local *local = hw_to_local(hw); struct sk_buff *skb = NULL; @@ -2521,6 +2554,7 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, enum ieee80211_band band; struct ieee80211_tx_rate_control txrc; struct ieee80211_chanctx_conf *chanctx_conf; + int csa_off_base = 0; rcu_read_lock(); @@ -2530,18 +2564,20 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, if (!ieee80211_sdata_running(sdata) || !chanctx_conf) goto out; - if (tim_offset) - *tim_offset = 0; - if (tim_length) - *tim_length = 0; + if (offs) + memset(offs, 0, sizeof(*offs)); if (sdata->vif.type == NL80211_IFTYPE_AP) { struct ieee80211_if_ap *ap = &sdata->u.ap; struct beacon_data *beacon = rcu_dereference(ap->beacon); if (beacon) { - if (sdata->vif.csa_active) - ieee80211_update_csa(sdata, beacon); + if (sdata->vif.csa_active) { + if (!is_template) + ieee80211_csa_update_counter(vif); + + ieee80211_set_csa(sdata, beacon); + } /* * headroom, head length, @@ -2549,7 +2585,8 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, */ skb = dev_alloc_skb(local->tx_headroom + beacon->head_len + - beacon->tail_len + 256); + beacon->tail_len + 256 + + local->hw.extra_beacon_tailroom); if (!skb) goto out; @@ -2557,12 +2594,16 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, memcpy(skb_put(skb, beacon->head_len), beacon->head, beacon->head_len); - ieee80211_beacon_add_tim(sdata, &ap->ps, skb); + ieee80211_beacon_add_tim(sdata, &ap->ps, skb, + is_template); - if (tim_offset) - *tim_offset = beacon->head_len; - if (tim_length) - *tim_length = skb->len - beacon->head_len; + if (offs) { + offs->tim_offset = beacon->head_len; + offs->tim_length = skb->len - beacon->head_len; + + /* for AP the csa offsets are from tail */ + csa_off_base = skb->len; + } if (beacon->tail) memcpy(skb_put(skb, beacon->tail_len), @@ -2577,11 +2618,15 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, if (!presp) goto out; - if 
(sdata->vif.csa_active) - ieee80211_update_csa(sdata, presp); + if (sdata->vif.csa_active) { + if (!is_template) + ieee80211_csa_update_counter(vif); + ieee80211_set_csa(sdata, presp); + } - skb = dev_alloc_skb(local->tx_headroom + presp->head_len); + skb = dev_alloc_skb(local->tx_headroom + presp->head_len + + local->hw.extra_beacon_tailroom); if (!skb) goto out; skb_reserve(skb, local->tx_headroom); @@ -2598,28 +2643,57 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, if (!bcn) goto out; - if (sdata->vif.csa_active) - ieee80211_update_csa(sdata, bcn); + if (sdata->vif.csa_active) { + if (!is_template) + /* TODO: For mesh csa_counter is in TU, so + * decrementing it by one isn't correct, but + * for now we leave it consistent with overall + * mac80211's behavior. + */ + ieee80211_csa_update_counter(vif); + + ieee80211_set_csa(sdata, bcn); + } if (ifmsh->sync_ops) - ifmsh->sync_ops->adjust_tbtt( - sdata); + ifmsh->sync_ops->adjust_tbtt(sdata, bcn); skb = dev_alloc_skb(local->tx_headroom + bcn->head_len + 256 + /* TIM IE */ - bcn->tail_len); + bcn->tail_len + + local->hw.extra_beacon_tailroom); if (!skb) goto out; skb_reserve(skb, local->tx_headroom); memcpy(skb_put(skb, bcn->head_len), bcn->head, bcn->head_len); - ieee80211_beacon_add_tim(sdata, &ifmsh->ps, skb); + ieee80211_beacon_add_tim(sdata, &ifmsh->ps, skb, is_template); + + if (offs) { + offs->tim_offset = bcn->head_len; + offs->tim_length = skb->len - bcn->head_len; + } + memcpy(skb_put(skb, bcn->tail_len), bcn->tail, bcn->tail_len); } else { WARN_ON(1); goto out; } + /* CSA offsets */ + if (offs) { + int i; + + for (i = 0; i < IEEE80211_MAX_CSA_COUNTERS_NUM; i++) { + u16 csa_off = sdata->csa_counter_offset_beacon[i]; + + if (!csa_off) + continue; + + offs->csa_counter_offs[i] = csa_off_base + csa_off; + } + } + band = chanctx_conf->def.chan->band; info = IEEE80211_SKB_CB(skb); @@ -2650,6 +2724,32 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, out: rcu_read_unlock(); return skb; + +} + +struct sk_buff * +ieee80211_beacon_get_template(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_mutable_offsets *offs) +{ + return __ieee80211_beacon_get(hw, vif, offs, true); +} +EXPORT_SYMBOL(ieee80211_beacon_get_template); + +struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + u16 *tim_offset, u16 *tim_length) +{ + struct ieee80211_mutable_offsets offs = {}; + struct sk_buff *bcn = __ieee80211_beacon_get(hw, vif, &offs, false); + + if (tim_offset) + *tim_offset = offs.tim_offset; + + if (tim_length) + *tim_length = offs.tim_length; + + return bcn; } EXPORT_SYMBOL(ieee80211_beacon_get_tim); @@ -2887,7 +2987,7 @@ ieee80211_get_buffered_bc(struct ieee80211_hw *hw, cpu_to_le16(IEEE80211_FCTL_MOREDATA); } - if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + if (sdata->vif.type == NL80211_IFTYPE_AP) sdata = IEEE80211_DEV_TO_SUB_IF(skb->dev); if (!ieee80211_tx_prepare(sdata, &tx, skb)) break; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 875e172c001..a6cda52ed92 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -34,7 +34,7 @@ #include "wep.h" /* privid for wiphys to determine whether they belong to us or not */ -void *mac80211_wiphy_privid = &mac80211_wiphy_privid; +const void *const mac80211_wiphy_privid = &mac80211_wiphy_privid; struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy) { @@ -76,7 +76,7 @@ u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, } if (ieee80211_is_ctl(fc)) { - 
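The new template path returns the same beacon but reports the mutable byte positions instead of mutating state, so a driver can upload the frame to firmware once and patch the TIM and CSA bytes on its own. A hedged sketch of the intended call pattern (the driver function name is invented for illustration):

static int example_drv_set_beacon(struct ieee80211_hw *hw,
                                  struct ieee80211_vif *vif)
{
        struct ieee80211_mutable_offsets offs = {};
        struct sk_buff *skb;

        skb = ieee80211_beacon_get_template(hw, vif, &offs);
        if (!skb)
                return -ENOENT;

        /* offs.tim_offset/offs.tim_length locate the TIM element,
         * offs.csa_counter_offs[] the channel-switch countdown bytes.
         */

        /* ... hand skb to the device ... */
        dev_kfree_skb(skb);
        return 0;
}

Unlike ieee80211_beacon_get_tim(), this path passes is_template = true, so neither the DTIM count nor the CSA counter is decremented when the template is fetched.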
if(ieee80211_is_pspoll(fc)) + if (ieee80211_is_pspoll(fc)) return hdr->addr1; if (ieee80211_is_back_req(fc)) { @@ -435,9 +435,8 @@ void ieee80211_add_pending_skb(struct ieee80211_local *local, spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } -void ieee80211_add_pending_skbs_fn(struct ieee80211_local *local, - struct sk_buff_head *skbs, - void (*fn)(void *data), void *data) +void ieee80211_add_pending_skbs(struct ieee80211_local *local, + struct sk_buff_head *skbs) { struct ieee80211_hw *hw = &local->hw; struct sk_buff *skb; @@ -461,9 +460,6 @@ void ieee80211_add_pending_skbs_fn(struct ieee80211_local *local, __skb_queue_tail(&local->pending[queue], skb); } - if (fn) - fn(data); - for (i = 0; i < hw->queues; i++) __ieee80211_wake_queue(hw, i, IEEE80211_QUEUE_STOP_REASON_SKB_ADD); @@ -558,7 +554,7 @@ void ieee80211_flush_queues(struct ieee80211_local *local, ieee80211_stop_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_FLUSH); - drv_flush(local, queues, false); + drv_flush(local, sdata, queues, false); ieee80211_wake_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_FLUSH); @@ -642,6 +638,17 @@ void ieee80211_iterate_active_interfaces_rtnl( } EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_rtnl); +struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + + if (!ieee80211_sdata_running(sdata) || + !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) + return NULL; + return &sdata->vif; +} +EXPORT_SYMBOL_GPL(wdev_to_ieee80211_vif); + /* * Nothing should have been stuffed into the workqueue during * the suspend->resume cycle. If this WARN is seen then there @@ -1089,11 +1096,12 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, int err; /* 24 + 6 = header + auth_algo + auth_transaction + status_code */ - skb = dev_alloc_skb(local->hw.extra_tx_headroom + 24 + 6 + extra_len); + skb = dev_alloc_skb(local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN + + 24 + 6 + extra_len + IEEE80211_WEP_ICV_LEN); if (!skb) return; - skb_reserve(skb, local->hw.extra_tx_headroom); + skb_reserve(skb, local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN); mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24 + 6); memset(mgmt, 0, 24 + 6); @@ -1270,13 +1278,32 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, * that calculates local->scan_ies_len. 
*/ - /* add any remaining custom IEs */ + /* insert custom IEs that go before VHT */ if (ie && ie_len) { - noffset = ie_len; + static const u8 before_vht[] = { + WLAN_EID_SSID, + WLAN_EID_SUPP_RATES, + WLAN_EID_REQUEST, + WLAN_EID_EXT_SUPP_RATES, + WLAN_EID_DS_PARAMS, + WLAN_EID_SUPPORTED_REGULATORY_CLASSES, + WLAN_EID_HT_CAPABILITY, + WLAN_EID_BSS_COEX_2040, + WLAN_EID_EXT_CAPABILITY, + WLAN_EID_SSID_LIST, + WLAN_EID_CHANNEL_USAGE, + WLAN_EID_INTERWORKING, + /* mesh ID can't happen here */ + /* 60 GHz can't happen here right now */ + }; + noffset = ieee80211_ie_split(ie, ie_len, + before_vht, ARRAY_SIZE(before_vht), + offset); if (end - pos < noffset - offset) goto out_err; memcpy(pos, ie + offset, noffset - offset); pos += noffset - offset; + offset = noffset; } if (sband->vht_cap.vht_supported) { @@ -1286,6 +1313,15 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, sband->vht_cap.cap); } + /* add any remaining custom IEs */ + if (ie && ie_len) { + noffset = ie_len; + if (end - pos < noffset - offset) + goto out_err; + memcpy(pos, ie + offset, noffset - offset); + pos += noffset - offset; + } + return pos - buffer; out_err: WARN_ONCE(1, "not enough space for preq IEs\n"); @@ -1363,7 +1399,6 @@ u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, enum ieee80211_band band, u32 *basic_rates) { struct ieee80211_supported_band *sband; - struct ieee80211_rate *bitrates; size_t num_rates; u32 supp_rates, rate_flags; int i, j, shift; @@ -1375,7 +1410,6 @@ u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, if (WARN_ON(!sband)) return 1; - bitrates = sband->bitrates; num_rates = sband->n_bitrates; supp_rates = 0; for (i = 0; i < elems->supp_rates_len + @@ -1424,6 +1458,44 @@ void ieee80211_stop_device(struct ieee80211_local *local) drv_stop(local); } +static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local) +{ + struct ieee80211_sub_if_data *sdata; + struct ieee80211_chanctx *ctx; + + /* + * We get here if during resume the device can't be restarted properly. + * We might also get here if this happens during HW reset, which is a + * slightly different situation and we need to drop all connections in + * the latter case. + * + * Ask cfg80211 to turn off all interfaces, this will result in more + * warnings but at least we'll then get into a clean stopped state. + */ + + local->resuming = false; + local->suspended = false; + local->started = false; + + /* scheduled scan clearly can't be running any more, but tell + * cfg80211 and clear local state + */ + ieee80211_sched_scan_end(local); + + list_for_each_entry(sdata, &local->interfaces, list) + sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER; + + /* Mark channel contexts as not being in the driver any more to avoid + * removing them from the driver during the shutdown process... 
+ */ + mutex_lock(&local->chanctx_mtx); + list_for_each_entry(ctx, &local->chanctx_list, list) + ctx->driver_present = false; + mutex_unlock(&local->chanctx_mtx); + + cfg80211_shutdown_all_interfaces(local->hw.wiphy); +} + static void ieee80211_assign_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { @@ -1451,6 +1523,8 @@ int ieee80211_reconfig(struct ieee80211_local *local) struct sta_info *sta; int res, i; bool reconfig_due_to_wowlan = false; + struct ieee80211_sub_if_data *sched_scan_sdata; + bool sched_scan_stopped = false; #ifdef CONFIG_PM if (local->suspended) @@ -1485,9 +1559,11 @@ int ieee80211_reconfig(struct ieee80211_local *local) */ res = drv_start(local); if (res) { - WARN(local->suspended, "Hardware became unavailable " - "upon resume. This could be a software issue " - "prior to suspend or a hardware issue.\n"); + if (local->suspended) + WARN(1, "Hardware became unavailable upon resume. This could be a software issue prior to suspend or a hardware issue.\n"); + else + WARN(1, "Hardware became unavailable during restart.\n"); + ieee80211_handle_reconfig_failure(local); return res; } @@ -1511,7 +1587,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) WARN_ON(local->resuming); res = drv_add_interface(local, sdata); if (WARN_ON(res)) { - rcu_assign_pointer(local->monitor_sdata, NULL); + RCU_INIT_POINTER(local->monitor_sdata, NULL); synchronize_net(); kfree(sdata); } @@ -1530,17 +1606,17 @@ int ieee80211_reconfig(struct ieee80211_local *local) list_for_each_entry(ctx, &local->chanctx_list, list) WARN_ON(drv_add_chanctx(local, ctx)); mutex_unlock(&local->chanctx_mtx); - } - list_for_each_entry(sdata, &local->interfaces, list) { - if (!ieee80211_sdata_running(sdata)) - continue; - ieee80211_assign_chanctx(local, sdata); - } + list_for_each_entry(sdata, &local->interfaces, list) { + if (!ieee80211_sdata_running(sdata)) + continue; + ieee80211_assign_chanctx(local, sdata); + } - sdata = rtnl_dereference(local->monitor_sdata); - if (sdata && ieee80211_sdata_running(sdata)) - ieee80211_assign_chanctx(local, sdata); + sdata = rtnl_dereference(local->monitor_sdata); + if (sdata && ieee80211_sdata_running(sdata)) + ieee80211_assign_chanctx(local, sdata); + } /* add STAs back */ mutex_lock(&local->sta_mtx); @@ -1636,13 +1712,10 @@ int ieee80211_reconfig(struct ieee80211_local *local) } break; case NL80211_IFTYPE_WDS: - break; case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: - /* ignore virtual */ - break; case NL80211_IFTYPE_P2P_DEVICE: - changed = BSS_CHANGED_IDLE; + /* nothing to do */ break; case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: @@ -1728,6 +1801,26 @@ int ieee80211_reconfig(struct ieee80211_local *local) IEEE80211_QUEUE_STOP_REASON_SUSPEND); /* + * Reconfigure sched scan if it was interrupted by FW restart or + * suspend. + */ + mutex_lock(&local->mtx); + sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata, + lockdep_is_held(&local->mtx)); + if (sched_scan_sdata && local->sched_scan_req) + /* + * Sched scan stopped, but we don't want to report it. Instead, + * we're trying to reschedule. + */ + if (__ieee80211_request_sched_scan_start(sched_scan_sdata, + local->sched_scan_req)) + sched_scan_stopped = true; + mutex_unlock(&local->mtx); + + if (sched_scan_stopped) + cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy); + + /* * If this is for hw restart things are still running. * We may want to change that later, however. 
*/ @@ -1754,6 +1847,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) #else WARN_ON(1); #endif + return 0; } @@ -2238,11 +2332,11 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, ri.nss = status->vht_nss; if (status->flag & RX_FLAG_40MHZ) ri.flags |= RATE_INFO_FLAGS_40_MHZ_WIDTH; - if (status->flag & RX_FLAG_80MHZ) + if (status->vht_flag & RX_VHT_FLAG_80MHZ) ri.flags |= RATE_INFO_FLAGS_80_MHZ_WIDTH; - if (status->flag & RX_FLAG_80P80MHZ) + if (status->vht_flag & RX_VHT_FLAG_80P80MHZ) ri.flags |= RATE_INFO_FLAGS_80P80_MHZ_WIDTH; - if (status->flag & RX_FLAG_160MHZ) + if (status->vht_flag & RX_VHT_FLAG_160MHZ) ri.flags |= RATE_INFO_FLAGS_160_MHZ_WIDTH; if (status->flag & RX_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; @@ -2281,9 +2375,14 @@ void ieee80211_dfs_cac_cancel(struct ieee80211_local *local) struct ieee80211_sub_if_data *sdata; struct cfg80211_chan_def chandef; + mutex_lock(&local->mtx); mutex_lock(&local->iflist_mtx); list_for_each_entry(sdata, &local->interfaces, list) { - cancel_delayed_work_sync(&sdata->dfs_cac_timer_work); + /* it might be waiting for the local->mtx, but then + * by the time it gets it, sdata->wdev.cac_started + * will no longer be true + */ + cancel_delayed_work(&sdata->dfs_cac_timer_work); if (sdata->wdev.cac_started) { chandef = sdata->vif.bss_conf.chandef; @@ -2295,6 +2394,7 @@ void ieee80211_dfs_cac_cancel(struct ieee80211_local *local) } } mutex_unlock(&local->iflist_mtx); + mutex_unlock(&local->mtx); } void ieee80211_dfs_radar_detected_work(struct work_struct *work) @@ -2554,3 +2654,302 @@ int ieee80211_cs_headroom(struct ieee80211_local *local, return headroom; } + +static bool +ieee80211_extend_noa_desc(struct ieee80211_noa_data *data, u32 tsf, int i) +{ + s32 end = data->desc[i].start + data->desc[i].duration - (tsf + 1); + int skip; + + if (end > 0) + return false; + + /* End time is in the past, check for repetitions */ + skip = DIV_ROUND_UP(-end, data->desc[i].interval); + if (data->count[i] < 255) { + if (data->count[i] <= skip) { + data->count[i] = 0; + return false; + } + + data->count[i] -= skip; + } + + data->desc[i].start += skip * data->desc[i].interval; + + return true; +} + +static bool +ieee80211_extend_absent_time(struct ieee80211_noa_data *data, u32 tsf, + s32 *offset) +{ + bool ret = false; + int i; + + for (i = 0; i < IEEE80211_P2P_NOA_DESC_MAX; i++) { + s32 cur; + + if (!data->count[i]) + continue; + + if (ieee80211_extend_noa_desc(data, tsf + *offset, i)) + ret = true; + + cur = data->desc[i].start - tsf; + if (cur > *offset) + continue; + + cur = data->desc[i].start + data->desc[i].duration - tsf; + if (cur > *offset) + *offset = cur; + } + + return ret; +} + +static u32 +ieee80211_get_noa_absent_time(struct ieee80211_noa_data *data, u32 tsf) +{ + s32 offset = 0; + int tries = 0; + /* + * arbitrary limit, used to avoid infinite loops when combined NoA + * descriptors cover the full time period. 
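To make the descriptor-extension arithmetic above concrete (all numbers made up): with start = 1000, duration = 100, interval = 500, count = 10 and tsf = 2049, end = 1000 + 100 - 2050 = -950, so the current absence period has already passed; skip = DIV_ROUND_UP(950, 500) = 2, count drops to 8 and start advances to 1000 + 2 * 500 = 2000. The descriptor then describes the first absence period (2000..2100) that has not fully elapsed at the given TSF, which is what ieee80211_update_p2p_noa() uses to compute the absent bitmap and the next TSF of interest.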
+ */ + int max_tries = 5; + + ieee80211_extend_absent_time(data, tsf, &offset); + do { + if (!ieee80211_extend_absent_time(data, tsf, &offset)) + break; + + tries++; + } while (tries < max_tries); + + return offset; +} + +void ieee80211_update_p2p_noa(struct ieee80211_noa_data *data, u32 tsf) +{ + u32 next_offset = BIT(31) - 1; + int i; + + data->absent = 0; + data->has_next_tsf = false; + for (i = 0; i < IEEE80211_P2P_NOA_DESC_MAX; i++) { + s32 start; + + if (!data->count[i]) + continue; + + ieee80211_extend_noa_desc(data, tsf, i); + start = data->desc[i].start - tsf; + if (start <= 0) + data->absent |= BIT(i); + + if (next_offset > start) + next_offset = start; + + data->has_next_tsf = true; + } + + if (data->absent) + next_offset = ieee80211_get_noa_absent_time(data, tsf); + + data->next_tsf = tsf + next_offset; +} +EXPORT_SYMBOL(ieee80211_update_p2p_noa); + +int ieee80211_parse_p2p_noa(const struct ieee80211_p2p_noa_attr *attr, + struct ieee80211_noa_data *data, u32 tsf) +{ + int ret = 0; + int i; + + memset(data, 0, sizeof(*data)); + + for (i = 0; i < IEEE80211_P2P_NOA_DESC_MAX; i++) { + const struct ieee80211_p2p_noa_desc *desc = &attr->desc[i]; + + if (!desc->count || !desc->duration) + continue; + + data->count[i] = desc->count; + data->desc[i].start = le32_to_cpu(desc->start_time); + data->desc[i].duration = le32_to_cpu(desc->duration); + data->desc[i].interval = le32_to_cpu(desc->interval); + + if (data->count[i] > 1 && + data->desc[i].interval < data->desc[i].duration) + continue; + + ieee80211_extend_noa_desc(data, tsf, i); + ret++; + } + + if (ret) + ieee80211_update_p2p_noa(data, tsf); + + return ret; +} +EXPORT_SYMBOL(ieee80211_parse_p2p_noa); + +void ieee80211_recalc_dtim(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + u64 tsf = drv_get_tsf(local, sdata); + u64 dtim_count = 0; + u16 beacon_int = sdata->vif.bss_conf.beacon_int * 1024; + u8 dtim_period = sdata->vif.bss_conf.dtim_period; + struct ps_data *ps; + u8 bcns_from_dtim; + + if (tsf == -1ULL || !beacon_int || !dtim_period) + return; + + if (sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { + if (!sdata->bss) + return; + + ps = &sdata->bss->ps; + } else if (ieee80211_vif_is_mesh(&sdata->vif)) { + ps = &sdata->u.mesh.ps; + } else { + return; + } + + /* + * actually finds last dtim_count, mac80211 will update in + * __beacon_add_tim(). 
+ * dtim_count = dtim_period - (tsf / bcn_int) % dtim_period + */ + do_div(tsf, beacon_int); + bcns_from_dtim = do_div(tsf, dtim_period); + /* just had a DTIM */ + if (!bcns_from_dtim) + dtim_count = 0; + else + dtim_count = dtim_period - bcns_from_dtim; + + ps->dtim_count = dtim_count; +} + +int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata, + const struct cfg80211_chan_def *chandef, + enum ieee80211_chanctx_mode chanmode, + u8 radar_detect) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_sub_if_data *sdata_iter; + enum nl80211_iftype iftype = sdata->wdev.iftype; + int num[NUM_NL80211_IFTYPES]; + struct ieee80211_chanctx *ctx; + int num_different_channels = 0; + int total = 1; + + lockdep_assert_held(&local->chanctx_mtx); + + if (WARN_ON(hweight32(radar_detect) > 1)) + return -EINVAL; + + if (WARN_ON(chandef && chanmode == IEEE80211_CHANCTX_SHARED && + !chandef->chan)) + return -EINVAL; + + if (chandef) + num_different_channels = 1; + + if (WARN_ON(iftype >= NUM_NL80211_IFTYPES)) + return -EINVAL; + + /* Always allow software iftypes */ + if (local->hw.wiphy->software_iftypes & BIT(iftype)) { + if (radar_detect) + return -EINVAL; + return 0; + } + + memset(num, 0, sizeof(num)); + + if (iftype != NL80211_IFTYPE_UNSPECIFIED) + num[iftype] = 1; + + list_for_each_entry(ctx, &local->chanctx_list, list) { + if (ctx->conf.radar_enabled) + radar_detect |= BIT(ctx->conf.def.width); + if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE) { + num_different_channels++; + continue; + } + if (chandef && chanmode == IEEE80211_CHANCTX_SHARED && + cfg80211_chandef_compatible(chandef, + &ctx->conf.def)) + continue; + num_different_channels++; + } + + list_for_each_entry_rcu(sdata_iter, &local->interfaces, list) { + struct wireless_dev *wdev_iter; + + wdev_iter = &sdata_iter->wdev; + + if (sdata_iter == sdata || + rcu_access_pointer(sdata_iter->vif.chanctx_conf) == NULL || + local->hw.wiphy->software_iftypes & BIT(wdev_iter->iftype)) + continue; + + num[wdev_iter->iftype]++; + total++; + } + + if (total == 1 && !radar_detect) + return 0; + + return cfg80211_check_combinations(local->hw.wiphy, + num_different_channels, + radar_detect, num); +} + +static void +ieee80211_iter_max_chans(const struct ieee80211_iface_combination *c, + void *data) +{ + u32 *max_num_different_channels = data; + + *max_num_different_channels = max(*max_num_different_channels, + c->num_different_channels); +} + +int ieee80211_max_num_channels(struct ieee80211_local *local) +{ + struct ieee80211_sub_if_data *sdata; + int num[NUM_NL80211_IFTYPES] = {}; + struct ieee80211_chanctx *ctx; + int num_different_channels = 0; + u8 radar_detect = 0; + u32 max_num_different_channels = 1; + int err; + + lockdep_assert_held(&local->chanctx_mtx); + + list_for_each_entry(ctx, &local->chanctx_list, list) { + num_different_channels++; + + if (ctx->conf.radar_enabled) + radar_detect |= BIT(ctx->conf.def.width); + } + + list_for_each_entry_rcu(sdata, &local->interfaces, list) + num[sdata->wdev.iftype]++; + + err = cfg80211_iter_combinations(local->hw.wiphy, + num_different_channels, radar_detect, + num, ieee80211_iter_max_chans, + &max_num_different_channels); + if (err < 0) + return err; + + return max_num_different_channels; +} diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index d75f35c6e1a..9265adfdabf 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -129,9 +129,12 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, if (!vht_cap_ie || !sband->vht_cap.vht_supported) return; - 
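A quick worked example of the formula above (numbers invented): if the TSF corresponds to 10 elapsed beacon intervals and dtim_period = 3, the first do_div() leaves tsf = 10, the second gives 10 mod 3 = 1 beacon since the last DTIM, so ps->dtim_count becomes 3 - 1 = 2, i.e. two more beacons until the next DTIM beacon.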
/* A VHT STA must support 40 MHz */ - if (!(sta->sta.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40)) - return; + /* + * A VHT STA must support 40 MHz, but if we verify that here + * then we break a few things - some APs (e.g. Netgear R6300v2 + * and others based on the BCM4360 chipset) will unset this + * capability bit when operating in 20 MHz. + */ vht_cap->vht_supported = true; @@ -349,9 +352,9 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta) sta->sta.rx_nss = max_t(u8, 1, ht_rx_nss); } -void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, u8 opmode, - enum ieee80211_band band, bool nss_only) +u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, u8 opmode, + enum ieee80211_band band, bool nss_only) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; @@ -363,7 +366,7 @@ void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, /* ignore - no support for BF yet */ if (opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF) - return; + return 0; nss = opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK; nss >>= IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT; @@ -375,7 +378,7 @@ void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, } if (nss_only) - goto change; + return changed; switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) { case IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: @@ -398,7 +401,19 @@ void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, changed |= IEEE80211_RC_BW_CHANGED; } - change: - if (changed) + return changed; +} + +void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, u8 opmode, + enum ieee80211_band band, bool nss_only) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; + + u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, opmode, + band, nss_only); + + if (changed > 0) rate_control_rate_update(local, sband, sta, changed); } diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index afba19cb6f8..d51422c778d 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -106,6 +106,7 @@ u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, struct sta_info *sta = NULL; const u8 *ra = NULL; bool qos = false; + struct mac80211_qos_map *qos_map; if (local->hw.queues < IEEE80211_NUM_ACS || skb->len < 6) { skb->priority = 0; /* required for correct WPA/11i MIC */ @@ -153,9 +154,18 @@ u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, return IEEE80211_AC_BE; } + if (skb->protocol == sdata->control_port_protocol) { + skb->priority = 7; + return ieee80211_downgrade_queue(sdata, skb); + } + /* use the data classifier to determine what 802.1d tag the * data frame has */ - skb->priority = cfg80211_classify8021d(skb); + rcu_read_lock(); + qos_map = rcu_dereference(sdata->qos_map); + skb->priority = cfg80211_classify8021d(skb, qos_map ? + &qos_map->qos_map : NULL); + rcu_read_unlock(); return ieee80211_downgrade_queue(sdata, skb); } diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 7313d379c0d..9b3dcc20114 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -127,7 +127,7 @@ ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx) * APs with pairwise keys should never receive Michael MIC * errors for non-zero keyidx because these are reserved for * group keys and only the AP is sending real multicast - * frames in the BSS. ( + * frames in the BSS. 
+ * frames in the BSS.
*/ return RX_DROP_UNUSABLE; } @@ -301,8 +301,7 @@ ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx) } -static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *b_0, u8 *aad, - int encrypted) +static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *b_0, u8 *aad) { __le16 mask_fc; int a4_included, mgmt; @@ -407,7 +406,10 @@ static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) if (info->control.hw_key && !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV) && - !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) { + !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) && + !((info->control.hw_key->flags & + IEEE80211_KEY_FLAG_GENERATE_IV_MGMT) && + ieee80211_is_mgmt(hdr->frame_control))) { /* * hwaccel has no need for preallocated room for CCMP * header or MIC fields @@ -456,7 +458,7 @@ static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) return 0; pos += IEEE80211_CCMP_HDR_LEN; - ccmp_special_blocks(skb, pn, b_0, aad, 0); + ccmp_special_blocks(skb, pn, b_0, aad); ieee80211_aes_ccm_encrypt(key->u.ccmp.tfm, b_0, aad, pos, len, skb_put(skb, IEEE80211_CCMP_MIC_LEN)); @@ -495,7 +497,7 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx) hdrlen = ieee80211_hdrlen(hdr->frame_control); if (!ieee80211_is_data(hdr->frame_control) && - !ieee80211_is_robust_mgmt_frame(hdr)) + !ieee80211_is_robust_mgmt_frame(skb)) return RX_CONTINUE; data_len = skb->len - hdrlen - IEEE80211_CCMP_HDR_LEN - @@ -524,7 +526,7 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx) u8 aad[2 * AES_BLOCK_SIZE]; u8 b_0[AES_BLOCK_SIZE]; /* hardware didn't decrypt/verify MIC */ - ccmp_special_blocks(skb, pn, b_0, aad, 1); + ccmp_special_blocks(skb, pn, b_0, aad); if (ieee80211_aes_ccm_decrypt( key->u.ccmp.tfm, b_0, aad, diff --git a/net/mac802154/Kconfig b/net/mac802154/Kconfig index b33dd76d430..1818a99b308 100644 --- a/net/mac802154/Kconfig +++ b/net/mac802154/Kconfig @@ -2,6 +2,10 @@ config MAC802154 tristate "Generic IEEE 802.15.4 Soft Networking Stack (mac802154)" depends on IEEE802154 select CRC_CCITT + select CRYPTO_AUTHENC + select CRYPTO_CCM + select CRYPTO_CTR + select CRYPTO_AES ---help--- This option enables the hardware independent IEEE 802.15.4 networking stack for SoftMAC devices (the ones implementing diff --git a/net/mac802154/Makefile b/net/mac802154/Makefile index 57cf5d1a2e4..9723d6f3f3e 100644 --- a/net/mac802154/Makefile +++ b/net/mac802154/Makefile @@ -1,2 +1,5 @@ obj-$(CONFIG_MAC802154) += mac802154.o -mac802154-objs := ieee802154_dev.o rx.o tx.o mac_cmd.o mib.o monitor.o wpan.o +mac802154-objs := ieee802154_dev.o rx.o tx.o mac_cmd.o mib.o \ + monitor.o wpan.o llsec.o + +ccflags-y += -D__CHECK_ENDIAN__ diff --git a/net/mac802154/ieee802154_dev.c b/net/mac802154/ieee802154_dev.c index 52ae6646a41..2cf66d885e6 100644 --- a/net/mac802154/ieee802154_dev.c +++ b/net/mac802154/ieee802154_dev.c @@ -27,6 +27,7 @@ #include <net/netlink.h> #include <linux/nl802154.h> #include <net/mac802154.h> +#include <net/ieee802154_netdev.h> #include <net/route.h> #include <net/wpan-phy.h> @@ -35,9 +36,28 @@ int mac802154_slave_open(struct net_device *dev) { struct mac802154_sub_if_data *priv = netdev_priv(dev); + struct mac802154_sub_if_data *subif; struct mac802154_priv *ipriv = priv->hw; int res = 0; + ASSERT_RTNL(); + + if (priv->type == IEEE802154_DEV_WPAN) { + mutex_lock(&priv->hw->slaves_mtx); + list_for_each_entry(subif, &priv->hw->slaves, list) { + if (subif != priv && subif->type == 
priv->type && + subif->running) { + mutex_unlock(&priv->hw->slaves_mtx); + return -EBUSY; + } + } + mutex_unlock(&priv->hw->slaves_mtx); + } + + mutex_lock(&priv->hw->slaves_mtx); + priv->running = true; + mutex_unlock(&priv->hw->slaves_mtx); + if (ipriv->open_count++ == 0) { res = ipriv->ops->start(&ipriv->hw); WARN_ON(res); @@ -46,7 +66,9 @@ int mac802154_slave_open(struct net_device *dev) } if (ipriv->ops->ieee_addr) { - res = ipriv->ops->ieee_addr(&ipriv->hw, dev->dev_addr); + __le64 addr = ieee802154_devaddr_from_raw(dev->dev_addr); + + res = ipriv->ops->ieee_addr(&ipriv->hw, addr); WARN_ON(res); if (res) goto err; @@ -66,8 +88,14 @@ int mac802154_slave_close(struct net_device *dev) struct mac802154_sub_if_data *priv = netdev_priv(dev); struct mac802154_priv *ipriv = priv->hw; + ASSERT_RTNL(); + netif_stop_queue(dev); + mutex_lock(&priv->hw->slaves_mtx); + priv->running = false; + mutex_unlock(&priv->hw->slaves_mtx); + if (!--ipriv->open_count) ipriv->ops->stop(&ipriv->hw); @@ -165,6 +193,49 @@ err: return ERR_PTR(err); } +static int mac802154_set_txpower(struct wpan_phy *phy, int db) +{ + struct mac802154_priv *priv = wpan_phy_priv(phy); + + return priv->ops->set_txpower(&priv->hw, db); +} + +static int mac802154_set_lbt(struct wpan_phy *phy, bool on) +{ + struct mac802154_priv *priv = wpan_phy_priv(phy); + + return priv->ops->set_lbt(&priv->hw, on); +} + +static int mac802154_set_cca_mode(struct wpan_phy *phy, u8 mode) +{ + struct mac802154_priv *priv = wpan_phy_priv(phy); + + return priv->ops->set_cca_mode(&priv->hw, mode); +} + +static int mac802154_set_cca_ed_level(struct wpan_phy *phy, s32 level) +{ + struct mac802154_priv *priv = wpan_phy_priv(phy); + + return priv->ops->set_cca_ed_level(&priv->hw, level); +} + +static int mac802154_set_csma_params(struct wpan_phy *phy, u8 min_be, + u8 max_be, u8 retries) +{ + struct mac802154_priv *priv = wpan_phy_priv(phy); + + return priv->ops->set_csma_params(&priv->hw, min_be, max_be, retries); +} + +static int mac802154_set_frame_retries(struct wpan_phy *phy, s8 retries) +{ + struct mac802154_priv *priv = wpan_phy_priv(phy); + + return priv->ops->set_frame_retries(&priv->hw, retries); +} + struct ieee802154_dev * ieee802154_alloc_device(size_t priv_data_len, struct ieee802154_ops *ops) { @@ -242,6 +313,18 @@ int ieee802154_register_device(struct ieee802154_dev *dev) priv->phy->add_iface = mac802154_add_iface; priv->phy->del_iface = mac802154_del_iface; + if (priv->ops->set_txpower) + priv->phy->set_txpower = mac802154_set_txpower; + if (priv->ops->set_lbt) + priv->phy->set_lbt = mac802154_set_lbt; + if (priv->ops->set_cca_mode) + priv->phy->set_cca_mode = mac802154_set_cca_mode; + if (priv->ops->set_cca_ed_level) + priv->phy->set_cca_ed_level = mac802154_set_cca_ed_level; + if (priv->ops->set_csma_params) + priv->phy->set_csma_params = mac802154_set_csma_params; + if (priv->ops->set_frame_retries) + priv->phy->set_frame_retries = mac802154_set_frame_retries; rc = wpan_phy_register(priv->phy); if (rc < 0) diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c new file mode 100644 index 00000000000..1456f73b02b --- /dev/null +++ b/net/mac802154/llsec.c @@ -0,0 +1,1070 @@ +/* + * Copyright (C) 2014 Fraunhofer ITWM + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Written by: + * Phoebe Buckheister <phoebe.buckheister@itwm.fraunhofer.de> + */ + +#include <linux/err.h> +#include <linux/bug.h> +#include <linux/completion.h> +#include <net/ieee802154.h> +#include <crypto/algapi.h> + +#include "mac802154.h" +#include "llsec.h" + +static void llsec_key_put(struct mac802154_llsec_key *key); +static bool llsec_key_id_equal(const struct ieee802154_llsec_key_id *a, + const struct ieee802154_llsec_key_id *b); + +static void llsec_dev_free(struct mac802154_llsec_device *dev); + +void mac802154_llsec_init(struct mac802154_llsec *sec) +{ + memset(sec, 0, sizeof(*sec)); + + memset(&sec->params.default_key_source, 0xFF, IEEE802154_ADDR_LEN); + + INIT_LIST_HEAD(&sec->table.security_levels); + INIT_LIST_HEAD(&sec->table.devices); + INIT_LIST_HEAD(&sec->table.keys); + hash_init(sec->devices_short); + hash_init(sec->devices_hw); + rwlock_init(&sec->lock); +} + +void mac802154_llsec_destroy(struct mac802154_llsec *sec) +{ + struct ieee802154_llsec_seclevel *sl, *sn; + struct ieee802154_llsec_device *dev, *dn; + struct ieee802154_llsec_key_entry *key, *kn; + + list_for_each_entry_safe(sl, sn, &sec->table.security_levels, list) { + struct mac802154_llsec_seclevel *msl; + + msl = container_of(sl, struct mac802154_llsec_seclevel, level); + list_del(&sl->list); + kfree(msl); + } + + list_for_each_entry_safe(dev, dn, &sec->table.devices, list) { + struct mac802154_llsec_device *mdev; + + mdev = container_of(dev, struct mac802154_llsec_device, dev); + list_del(&dev->list); + llsec_dev_free(mdev); + } + + list_for_each_entry_safe(key, kn, &sec->table.keys, list) { + struct mac802154_llsec_key *mkey; + + mkey = container_of(key->key, struct mac802154_llsec_key, key); + list_del(&key->list); + llsec_key_put(mkey); + kfree(key); + } +} + + + +int mac802154_llsec_get_params(struct mac802154_llsec *sec, + struct ieee802154_llsec_params *params) +{ + read_lock_bh(&sec->lock); + *params = sec->params; + read_unlock_bh(&sec->lock); + + return 0; +} + +int mac802154_llsec_set_params(struct mac802154_llsec *sec, + const struct ieee802154_llsec_params *params, + int changed) +{ + write_lock_bh(&sec->lock); + + if (changed & IEEE802154_LLSEC_PARAM_ENABLED) + sec->params.enabled = params->enabled; + if (changed & IEEE802154_LLSEC_PARAM_FRAME_COUNTER) + sec->params.frame_counter = params->frame_counter; + if (changed & IEEE802154_LLSEC_PARAM_OUT_LEVEL) + sec->params.out_level = params->out_level; + if (changed & IEEE802154_LLSEC_PARAM_OUT_KEY) + sec->params.out_key = params->out_key; + if (changed & IEEE802154_LLSEC_PARAM_KEY_SOURCE) + sec->params.default_key_source = params->default_key_source; + if (changed & IEEE802154_LLSEC_PARAM_PAN_ID) + sec->params.pan_id = params->pan_id; + if (changed & IEEE802154_LLSEC_PARAM_HWADDR) + sec->params.hwaddr = params->hwaddr; + if (changed & IEEE802154_LLSEC_PARAM_COORD_HWADDR) + sec->params.coord_hwaddr = params->coord_hwaddr; + if (changed & IEEE802154_LLSEC_PARAM_COORD_SHORTADDR) + sec->params.coord_shortaddr = params->coord_shortaddr; + + write_unlock_bh(&sec->lock); + + return 0; +} + + + +static struct mac802154_llsec_key* +llsec_key_alloc(const struct ieee802154_llsec_key *template) +{ + const int authsizes[3] = { 4, 8, 16 }; + struct mac802154_llsec_key *key; + int i; + + key = 
kzalloc(sizeof(*key), GFP_KERNEL); + if (!key) + return NULL; + + kref_init(&key->ref); + key->key = *template; + + BUILD_BUG_ON(ARRAY_SIZE(authsizes) != ARRAY_SIZE(key->tfm)); + + for (i = 0; i < ARRAY_SIZE(key->tfm); i++) { + key->tfm[i] = crypto_alloc_aead("ccm(aes)", 0, + CRYPTO_ALG_ASYNC); + if (!key->tfm[i]) + goto err_tfm; + if (crypto_aead_setkey(key->tfm[i], template->key, + IEEE802154_LLSEC_KEY_SIZE)) + goto err_tfm; + if (crypto_aead_setauthsize(key->tfm[i], authsizes[i])) + goto err_tfm; + } + + key->tfm0 = crypto_alloc_blkcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC); + if (!key->tfm0) + goto err_tfm; + + if (crypto_blkcipher_setkey(key->tfm0, template->key, + IEEE802154_LLSEC_KEY_SIZE)) + goto err_tfm0; + + return key; + +err_tfm0: + crypto_free_blkcipher(key->tfm0); +err_tfm: + for (i = 0; i < ARRAY_SIZE(key->tfm); i++) + if (key->tfm[i]) + crypto_free_aead(key->tfm[i]); + + kfree(key); + return NULL; +} + +static void llsec_key_release(struct kref *ref) +{ + struct mac802154_llsec_key *key; + int i; + + key = container_of(ref, struct mac802154_llsec_key, ref); + + for (i = 0; i < ARRAY_SIZE(key->tfm); i++) + crypto_free_aead(key->tfm[i]); + + crypto_free_blkcipher(key->tfm0); + kfree(key); +} + +static struct mac802154_llsec_key* +llsec_key_get(struct mac802154_llsec_key *key) +{ + kref_get(&key->ref); + return key; +} + +static void llsec_key_put(struct mac802154_llsec_key *key) +{ + kref_put(&key->ref, llsec_key_release); +} + +static bool llsec_key_id_equal(const struct ieee802154_llsec_key_id *a, + const struct ieee802154_llsec_key_id *b) +{ + if (a->mode != b->mode) + return false; + + if (a->mode == IEEE802154_SCF_KEY_IMPLICIT) + return ieee802154_addr_equal(&a->device_addr, &b->device_addr); + + if (a->id != b->id) + return false; + + switch (a->mode) { + case IEEE802154_SCF_KEY_INDEX: + return true; + case IEEE802154_SCF_KEY_SHORT_INDEX: + return a->short_source == b->short_source; + case IEEE802154_SCF_KEY_HW_INDEX: + return a->extended_source == b->extended_source; + } + + return false; +} + +int mac802154_llsec_key_add(struct mac802154_llsec *sec, + const struct ieee802154_llsec_key_id *id, + const struct ieee802154_llsec_key *key) +{ + struct mac802154_llsec_key *mkey = NULL; + struct ieee802154_llsec_key_entry *pos, *new; + + if (!(key->frame_types & (1 << IEEE802154_FC_TYPE_MAC_CMD)) && + key->cmd_frame_ids) + return -EINVAL; + + list_for_each_entry(pos, &sec->table.keys, list) { + if (llsec_key_id_equal(&pos->id, id)) + return -EEXIST; + + if (memcmp(pos->key->key, key->key, + IEEE802154_LLSEC_KEY_SIZE)) + continue; + + mkey = container_of(pos->key, struct mac802154_llsec_key, key); + + /* Don't allow multiple instances of the same AES key to have + * different allowed frame types/command frame ids, as this is + * not possible in the 802.15.4 PIB. 
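llsec_key_alloc() above pre-allocates one ccm(aes) transform per MIC length plus a ctr(aes) cipher for encryption-only frames. The mapping of those transforms to 802.15.4 security levels is not spelled out in this hunk, so here is the assumed correspondence as a sketch (helper name invented; the level encoding is taken from 802.15.4-2011, not from this patch):

/* levels 1-3 and 5-7 carry a 4/8/16 byte MIC; level 4 is ENC only */
static struct crypto_aead *
llsec_tfm_for_level(struct mac802154_llsec_key *key, u8 sec_level)
{
        switch (sec_level & 0x3) {      /* low two bits select the MIC size */
        case 1: return key->tfm[0];     /* MIC-32  */
        case 2: return key->tfm[1];     /* MIC-64  */
        case 3: return key->tfm[2];     /* MIC-128 */
        default: return NULL;           /* level 0 (none) or 4 (ENC only, tfm0) */
        }
}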
+ */ + if (pos->key->frame_types != key->frame_types || + pos->key->cmd_frame_ids != key->cmd_frame_ids) + return -EEXIST; + + break; + } + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return -ENOMEM; + + if (!mkey) + mkey = llsec_key_alloc(key); + else + mkey = llsec_key_get(mkey); + + if (!mkey) + goto fail; + + new->id = *id; + new->key = &mkey->key; + + list_add_rcu(&new->list, &sec->table.keys); + + return 0; + +fail: + kfree(new); + return -ENOMEM; +} + +int mac802154_llsec_key_del(struct mac802154_llsec *sec, + const struct ieee802154_llsec_key_id *key) +{ + struct ieee802154_llsec_key_entry *pos; + + list_for_each_entry(pos, &sec->table.keys, list) { + struct mac802154_llsec_key *mkey; + + mkey = container_of(pos->key, struct mac802154_llsec_key, key); + + if (llsec_key_id_equal(&pos->id, key)) { + list_del_rcu(&pos->list); + llsec_key_put(mkey); + return 0; + } + } + + return -ENOENT; +} + + + +static bool llsec_dev_use_shortaddr(__le16 short_addr) +{ + return short_addr != cpu_to_le16(IEEE802154_ADDR_UNDEF) && + short_addr != cpu_to_le16(0xffff); +} + +static u32 llsec_dev_hash_short(__le16 short_addr, __le16 pan_id) +{ + return ((__force u16) short_addr) << 16 | (__force u16) pan_id; +} + +static u64 llsec_dev_hash_long(__le64 hwaddr) +{ + return (__force u64) hwaddr; +} + +static struct mac802154_llsec_device* +llsec_dev_find_short(struct mac802154_llsec *sec, __le16 short_addr, + __le16 pan_id) +{ + struct mac802154_llsec_device *dev; + u32 key = llsec_dev_hash_short(short_addr, pan_id); + + hash_for_each_possible_rcu(sec->devices_short, dev, bucket_s, key) { + if (dev->dev.short_addr == short_addr && + dev->dev.pan_id == pan_id) + return dev; + } + + return NULL; +} + +static struct mac802154_llsec_device* +llsec_dev_find_long(struct mac802154_llsec *sec, __le64 hwaddr) +{ + struct mac802154_llsec_device *dev; + u64 key = llsec_dev_hash_long(hwaddr); + + hash_for_each_possible_rcu(sec->devices_hw, dev, bucket_hw, key) { + if (dev->dev.hwaddr == hwaddr) + return dev; + } + + return NULL; +} + +static void llsec_dev_free(struct mac802154_llsec_device *dev) +{ + struct ieee802154_llsec_device_key *pos, *pn; + struct mac802154_llsec_device_key *devkey; + + list_for_each_entry_safe(pos, pn, &dev->dev.keys, list) { + devkey = container_of(pos, struct mac802154_llsec_device_key, + devkey); + + list_del(&pos->list); + kfree(devkey); + } + + kfree(dev); +} + +int mac802154_llsec_dev_add(struct mac802154_llsec *sec, + const struct ieee802154_llsec_device *dev) +{ + struct mac802154_llsec_device *entry; + u32 skey = llsec_dev_hash_short(dev->short_addr, dev->pan_id); + u64 hwkey = llsec_dev_hash_long(dev->hwaddr); + + BUILD_BUG_ON(sizeof(hwkey) != IEEE802154_ADDR_LEN); + + if ((llsec_dev_use_shortaddr(dev->short_addr) && + llsec_dev_find_short(sec, dev->short_addr, dev->pan_id)) || + llsec_dev_find_long(sec, dev->hwaddr)) + return -EEXIST; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->dev = *dev; + spin_lock_init(&entry->lock); + INIT_LIST_HEAD(&entry->dev.keys); + + if (llsec_dev_use_shortaddr(dev->short_addr)) + hash_add_rcu(sec->devices_short, &entry->bucket_s, skey); + else + INIT_HLIST_NODE(&entry->bucket_s); + + hash_add_rcu(sec->devices_hw, &entry->bucket_hw, hwkey); + list_add_tail_rcu(&entry->dev.list, &sec->table.devices); + + return 0; +} + +static void llsec_dev_free_rcu(struct rcu_head *rcu) +{ + llsec_dev_free(container_of(rcu, struct mac802154_llsec_device, rcu)); +} + +int mac802154_llsec_dev_del(struct 
mac802154_llsec *sec, __le64 device_addr) +{ + struct mac802154_llsec_device *pos; + + pos = llsec_dev_find_long(sec, device_addr); + if (!pos) + return -ENOENT; + + hash_del_rcu(&pos->bucket_s); + hash_del_rcu(&pos->bucket_hw); + call_rcu(&pos->rcu, llsec_dev_free_rcu); + + return 0; +} + + + +static struct mac802154_llsec_device_key* +llsec_devkey_find(struct mac802154_llsec_device *dev, + const struct ieee802154_llsec_key_id *key) +{ + struct ieee802154_llsec_device_key *devkey; + + list_for_each_entry_rcu(devkey, &dev->dev.keys, list) { + if (!llsec_key_id_equal(key, &devkey->key_id)) + continue; + + return container_of(devkey, struct mac802154_llsec_device_key, + devkey); + } + + return NULL; +} + +int mac802154_llsec_devkey_add(struct mac802154_llsec *sec, + __le64 dev_addr, + const struct ieee802154_llsec_device_key *key) +{ + struct mac802154_llsec_device *dev; + struct mac802154_llsec_device_key *devkey; + + dev = llsec_dev_find_long(sec, dev_addr); + + if (!dev) + return -ENOENT; + + if (llsec_devkey_find(dev, &key->key_id)) + return -EEXIST; + + devkey = kmalloc(sizeof(*devkey), GFP_KERNEL); + if (!devkey) + return -ENOMEM; + + devkey->devkey = *key; + list_add_tail_rcu(&devkey->devkey.list, &dev->dev.keys); + return 0; +} + +int mac802154_llsec_devkey_del(struct mac802154_llsec *sec, + __le64 dev_addr, + const struct ieee802154_llsec_device_key *key) +{ + struct mac802154_llsec_device *dev; + struct mac802154_llsec_device_key *devkey; + + dev = llsec_dev_find_long(sec, dev_addr); + + if (!dev) + return -ENOENT; + + devkey = llsec_devkey_find(dev, &key->key_id); + if (!devkey) + return -ENOENT; + + list_del_rcu(&devkey->devkey.list); + kfree_rcu(devkey, rcu); + return 0; +} + + + +static struct mac802154_llsec_seclevel* +llsec_find_seclevel(const struct mac802154_llsec *sec, + const struct ieee802154_llsec_seclevel *sl) +{ + struct ieee802154_llsec_seclevel *pos; + + list_for_each_entry(pos, &sec->table.security_levels, list) { + if (pos->frame_type != sl->frame_type || + (pos->frame_type == IEEE802154_FC_TYPE_MAC_CMD && + pos->cmd_frame_id != sl->cmd_frame_id) || + pos->device_override != sl->device_override || + pos->sec_levels != sl->sec_levels) + continue; + + return container_of(pos, struct mac802154_llsec_seclevel, + level); + } + + return NULL; +} + +int mac802154_llsec_seclevel_add(struct mac802154_llsec *sec, + const struct ieee802154_llsec_seclevel *sl) +{ + struct mac802154_llsec_seclevel *entry; + + if (llsec_find_seclevel(sec, sl)) + return -EEXIST; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->level = *sl; + + list_add_tail_rcu(&entry->level.list, &sec->table.security_levels); + + return 0; +} + +int mac802154_llsec_seclevel_del(struct mac802154_llsec *sec, + const struct ieee802154_llsec_seclevel *sl) +{ + struct mac802154_llsec_seclevel *pos; + + pos = llsec_find_seclevel(sec, sl); + if (!pos) + return -ENOENT; + + list_del_rcu(&pos->level.list); + kfree_rcu(pos, rcu); + + return 0; +} + + + +static int llsec_recover_addr(struct mac802154_llsec *sec, + struct ieee802154_addr *addr) +{ + __le16 caddr = sec->params.coord_shortaddr; + addr->pan_id = sec->params.pan_id; + + if (caddr == cpu_to_le16(IEEE802154_ADDR_BROADCAST)) { + return -EINVAL; + } else if (caddr == cpu_to_le16(IEEE802154_ADDR_UNDEF)) { + addr->extended_addr = sec->params.coord_hwaddr; + addr->mode = IEEE802154_ADDR_LONG; + } else { + addr->short_addr = sec->params.coord_shortaddr; + addr->mode = IEEE802154_ADDR_SHORT; + } + + return 0; +} + +static 
struct mac802154_llsec_key* +llsec_lookup_key(struct mac802154_llsec *sec, + const struct ieee802154_hdr *hdr, + const struct ieee802154_addr *addr, + struct ieee802154_llsec_key_id *key_id) +{ + struct ieee802154_addr devaddr = *addr; + u8 key_id_mode = hdr->sec.key_id_mode; + struct ieee802154_llsec_key_entry *key_entry; + struct mac802154_llsec_key *key; + + if (key_id_mode == IEEE802154_SCF_KEY_IMPLICIT && + devaddr.mode == IEEE802154_ADDR_NONE) { + if (hdr->fc.type == IEEE802154_FC_TYPE_BEACON) { + devaddr.extended_addr = sec->params.coord_hwaddr; + devaddr.mode = IEEE802154_ADDR_LONG; + } else if (llsec_recover_addr(sec, &devaddr) < 0) { + return NULL; + } + } + + list_for_each_entry_rcu(key_entry, &sec->table.keys, list) { + const struct ieee802154_llsec_key_id *id = &key_entry->id; + + if (!(key_entry->key->frame_types & BIT(hdr->fc.type))) + continue; + + if (id->mode != key_id_mode) + continue; + + if (key_id_mode == IEEE802154_SCF_KEY_IMPLICIT) { + if (ieee802154_addr_equal(&devaddr, &id->device_addr)) + goto found; + } else { + if (id->id != hdr->sec.key_id) + continue; + + if ((key_id_mode == IEEE802154_SCF_KEY_INDEX) || + (key_id_mode == IEEE802154_SCF_KEY_SHORT_INDEX && + id->short_source == hdr->sec.short_src) || + (key_id_mode == IEEE802154_SCF_KEY_HW_INDEX && + id->extended_source == hdr->sec.extended_src)) + goto found; + } + } + + return NULL; + +found: + key = container_of(key_entry->key, struct mac802154_llsec_key, key); + if (key_id) + *key_id = key_entry->id; + return llsec_key_get(key); +} + + +static void llsec_geniv(u8 iv[16], __le64 addr, + const struct ieee802154_sechdr *sec) +{ + __be64 addr_bytes = (__force __be64) swab64((__force u64) addr); + __be32 frame_counter = (__force __be32) swab32((__force u32) sec->frame_counter); + + iv[0] = 1; /* L' = L - 1 = 1 */ + memcpy(iv + 1, &addr_bytes, sizeof(addr_bytes)); + memcpy(iv + 9, &frame_counter, sizeof(frame_counter)); + iv[13] = sec->level; + iv[14] = 0; + iv[15] = 1; +} + +static int +llsec_do_encrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec, + const struct ieee802154_hdr *hdr, + struct mac802154_llsec_key *key) +{ + u8 iv[16]; + struct scatterlist src; + struct blkcipher_desc req = { + .tfm = key->tfm0, + .info = iv, + .flags = 0, + }; + + llsec_geniv(iv, sec->params.hwaddr, &hdr->sec); + sg_init_one(&src, skb->data, skb->len); + return crypto_blkcipher_encrypt_iv(&req, &src, &src, skb->len); +} + +static struct crypto_aead* +llsec_tfm_by_len(struct mac802154_llsec_key *key, int authlen) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(key->tfm); i++) + if (crypto_aead_authsize(key->tfm[i]) == authlen) + return key->tfm[i]; + + BUG(); +} + +static int +llsec_do_encrypt_auth(struct sk_buff *skb, const struct mac802154_llsec *sec, + const struct ieee802154_hdr *hdr, + struct mac802154_llsec_key *key) +{ + u8 iv[16]; + unsigned char *data; + int authlen, assoclen, datalen, rc; + struct scatterlist src, assoc[2], dst[2]; + struct aead_request *req; + + authlen = ieee802154_sechdr_authtag_len(&hdr->sec); + llsec_geniv(iv, sec->params.hwaddr, &hdr->sec); + + req = aead_request_alloc(llsec_tfm_by_len(key, authlen), GFP_ATOMIC); + if (!req) + return -ENOMEM; + + sg_init_table(assoc, 2); + sg_set_buf(&assoc[0], skb_mac_header(skb), skb->mac_len); + assoclen = skb->mac_len; + + data = skb_mac_header(skb) + skb->mac_len; + datalen = skb_tail_pointer(skb) - data; + + if (hdr->sec.level & IEEE802154_SCF_SECLEVEL_ENC) { + sg_set_buf(&assoc[1], data, 0); + } else { + sg_set_buf(&assoc[1], data, datalen); + 
assoclen += datalen; + datalen = 0; + } + + sg_init_one(&src, data, datalen); + + sg_init_table(dst, 2); + sg_set_buf(&dst[0], data, datalen); + sg_set_buf(&dst[1], skb_put(skb, authlen), authlen); + + aead_request_set_callback(req, 0, NULL, NULL); + aead_request_set_assoc(req, assoc, assoclen); + aead_request_set_crypt(req, &src, dst, datalen, iv); + + rc = crypto_aead_encrypt(req); + + kfree(req); + + return rc; +} + +static int llsec_do_encrypt(struct sk_buff *skb, + const struct mac802154_llsec *sec, + const struct ieee802154_hdr *hdr, + struct mac802154_llsec_key *key) +{ + if (hdr->sec.level == IEEE802154_SCF_SECLEVEL_ENC) + return llsec_do_encrypt_unauth(skb, sec, hdr, key); + else + return llsec_do_encrypt_auth(skb, sec, hdr, key); +} + +int mac802154_llsec_encrypt(struct mac802154_llsec *sec, struct sk_buff *skb) +{ + struct ieee802154_hdr hdr; + int rc, authlen, hlen; + struct mac802154_llsec_key *key; + u32 frame_ctr; + + hlen = ieee802154_hdr_pull(skb, &hdr); + + if (hlen < 0 || hdr.fc.type != IEEE802154_FC_TYPE_DATA) + return -EINVAL; + + if (!hdr.fc.security_enabled || hdr.sec.level == 0) { + skb_push(skb, hlen); + return 0; + } + + authlen = ieee802154_sechdr_authtag_len(&hdr.sec); + + if (skb->len + hlen + authlen + IEEE802154_MFR_SIZE > IEEE802154_MTU) + return -EMSGSIZE; + + rcu_read_lock(); + + read_lock_bh(&sec->lock); + + if (!sec->params.enabled) { + rc = -EINVAL; + goto fail_read; + } + + key = llsec_lookup_key(sec, &hdr, &hdr.dest, NULL); + if (!key) { + rc = -ENOKEY; + goto fail_read; + } + + read_unlock_bh(&sec->lock); + + write_lock_bh(&sec->lock); + + frame_ctr = be32_to_cpu(sec->params.frame_counter); + hdr.sec.frame_counter = cpu_to_le32(frame_ctr); + if (frame_ctr == 0xFFFFFFFF) { + write_unlock_bh(&sec->lock); + llsec_key_put(key); + rc = -EOVERFLOW; + goto fail; + } + + sec->params.frame_counter = cpu_to_be32(frame_ctr + 1); + + write_unlock_bh(&sec->lock); + + rcu_read_unlock(); + + skb->mac_len = ieee802154_hdr_push(skb, &hdr); + skb_reset_mac_header(skb); + + rc = llsec_do_encrypt(skb, sec, &hdr, key); + llsec_key_put(key); + + return rc; + +fail_read: + read_unlock_bh(&sec->lock); +fail: + rcu_read_unlock(); + return rc; +} + + + +static struct mac802154_llsec_device* +llsec_lookup_dev(struct mac802154_llsec *sec, + const struct ieee802154_addr *addr) +{ + struct ieee802154_addr devaddr = *addr; + struct mac802154_llsec_device *dev = NULL; + + if (devaddr.mode == IEEE802154_ADDR_NONE && + llsec_recover_addr(sec, &devaddr) < 0) + return NULL; + + if (devaddr.mode == IEEE802154_ADDR_SHORT) { + u32 key = llsec_dev_hash_short(devaddr.short_addr, + devaddr.pan_id); + + hash_for_each_possible_rcu(sec->devices_short, dev, + bucket_s, key) { + if (dev->dev.pan_id == devaddr.pan_id && + dev->dev.short_addr == devaddr.short_addr) + return dev; + } + } else { + u64 key = llsec_dev_hash_long(devaddr.extended_addr); + + hash_for_each_possible_rcu(sec->devices_hw, dev, + bucket_hw, key) { + if (dev->dev.hwaddr == devaddr.extended_addr) + return dev; + } + } + + return NULL; +} + +static int +llsec_lookup_seclevel(const struct mac802154_llsec *sec, + u8 frame_type, u8 cmd_frame_id, + struct ieee802154_llsec_seclevel *rlevel) +{ + struct ieee802154_llsec_seclevel *level; + + list_for_each_entry_rcu(level, &sec->table.security_levels, list) { + if (level->frame_type == frame_type && + (frame_type != IEEE802154_FC_TYPE_MAC_CMD || + level->cmd_frame_id == cmd_frame_id)) { + *rlevel = *level; + return 0; + } + } + + return -EINVAL; +} + +static int 
+llsec_do_decrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec, + const struct ieee802154_hdr *hdr, + struct mac802154_llsec_key *key, __le64 dev_addr) +{ + u8 iv[16]; + unsigned char *data; + int datalen; + struct scatterlist src; + struct blkcipher_desc req = { + .tfm = key->tfm0, + .info = iv, + .flags = 0, + }; + + llsec_geniv(iv, dev_addr, &hdr->sec); + data = skb_mac_header(skb) + skb->mac_len; + datalen = skb_tail_pointer(skb) - data; + + sg_init_one(&src, data, datalen); + + return crypto_blkcipher_decrypt_iv(&req, &src, &src, datalen); +} + +static int +llsec_do_decrypt_auth(struct sk_buff *skb, const struct mac802154_llsec *sec, + const struct ieee802154_hdr *hdr, + struct mac802154_llsec_key *key, __le64 dev_addr) +{ + u8 iv[16]; + unsigned char *data; + int authlen, datalen, assoclen, rc; + struct scatterlist src, assoc[2]; + struct aead_request *req; + + authlen = ieee802154_sechdr_authtag_len(&hdr->sec); + llsec_geniv(iv, dev_addr, &hdr->sec); + + req = aead_request_alloc(llsec_tfm_by_len(key, authlen), GFP_ATOMIC); + if (!req) + return -ENOMEM; + + sg_init_table(assoc, 2); + sg_set_buf(&assoc[0], skb_mac_header(skb), skb->mac_len); + assoclen = skb->mac_len; + + data = skb_mac_header(skb) + skb->mac_len; + datalen = skb_tail_pointer(skb) - data; + + if (hdr->sec.level & IEEE802154_SCF_SECLEVEL_ENC) { + sg_set_buf(&assoc[1], data, 0); + } else { + sg_set_buf(&assoc[1], data, datalen - authlen); + assoclen += datalen - authlen; + data += datalen - authlen; + datalen = authlen; + } + + sg_init_one(&src, data, datalen); + + aead_request_set_callback(req, 0, NULL, NULL); + aead_request_set_assoc(req, assoc, assoclen); + aead_request_set_crypt(req, &src, &src, datalen, iv); + + rc = crypto_aead_decrypt(req); + + kfree(req); + skb_trim(skb, skb->len - authlen); + + return rc; +} + +static int +llsec_do_decrypt(struct sk_buff *skb, const struct mac802154_llsec *sec, + const struct ieee802154_hdr *hdr, + struct mac802154_llsec_key *key, __le64 dev_addr) +{ + if (hdr->sec.level == IEEE802154_SCF_SECLEVEL_ENC) + return llsec_do_decrypt_unauth(skb, sec, hdr, key, dev_addr); + else + return llsec_do_decrypt_auth(skb, sec, hdr, key, dev_addr); +} + +static int +llsec_update_devkey_record(struct mac802154_llsec_device *dev, + const struct ieee802154_llsec_key_id *in_key) +{ + struct mac802154_llsec_device_key *devkey; + + devkey = llsec_devkey_find(dev, in_key); + + if (!devkey) { + struct mac802154_llsec_device_key *next; + + next = kzalloc(sizeof(*devkey), GFP_ATOMIC); + if (!next) + return -ENOMEM; + + next->devkey.key_id = *in_key; + + spin_lock_bh(&dev->lock); + + devkey = llsec_devkey_find(dev, in_key); + if (!devkey) + list_add_rcu(&next->devkey.list, &dev->dev.keys); + else + kfree(next); + + spin_unlock_bh(&dev->lock); + } + + return 0; +} + +static int +llsec_update_devkey_info(struct mac802154_llsec_device *dev, + const struct ieee802154_llsec_key_id *in_key, + u32 frame_counter) +{ + struct mac802154_llsec_device_key *devkey = NULL; + + if (dev->dev.key_mode == IEEE802154_LLSEC_DEVKEY_RESTRICT) { + devkey = llsec_devkey_find(dev, in_key); + if (!devkey) + return -ENOENT; + } + + if (dev->dev.key_mode == IEEE802154_LLSEC_DEVKEY_RECORD) { + int rc = llsec_update_devkey_record(dev, in_key); + + if (rc < 0) + return rc; + } + + spin_lock_bh(&dev->lock); + + if ((!devkey && frame_counter < dev->dev.frame_counter) || + (devkey && frame_counter < devkey->devkey.frame_counter)) { + spin_unlock_bh(&dev->lock); + return -EINVAL; + } + + if (devkey) + 
devkey->devkey.frame_counter = frame_counter + 1; + else + dev->dev.frame_counter = frame_counter + 1; + + spin_unlock_bh(&dev->lock); + + return 0; +} + +int mac802154_llsec_decrypt(struct mac802154_llsec *sec, struct sk_buff *skb) +{ + struct ieee802154_hdr hdr; + struct mac802154_llsec_key *key; + struct ieee802154_llsec_key_id key_id; + struct mac802154_llsec_device *dev; + struct ieee802154_llsec_seclevel seclevel; + int err; + __le64 dev_addr; + u32 frame_ctr; + + if (ieee802154_hdr_peek(skb, &hdr) < 0) + return -EINVAL; + if (!hdr.fc.security_enabled) + return 0; + if (hdr.fc.version == 0) + return -EINVAL; + + read_lock_bh(&sec->lock); + if (!sec->params.enabled) { + read_unlock_bh(&sec->lock); + return -EINVAL; + } + read_unlock_bh(&sec->lock); + + rcu_read_lock(); + + key = llsec_lookup_key(sec, &hdr, &hdr.source, &key_id); + if (!key) { + err = -ENOKEY; + goto fail; + } + + dev = llsec_lookup_dev(sec, &hdr.source); + if (!dev) { + err = -EINVAL; + goto fail_dev; + } + + if (llsec_lookup_seclevel(sec, hdr.fc.type, 0, &seclevel) < 0) { + err = -EINVAL; + goto fail_dev; + } + + if (!(seclevel.sec_levels & BIT(hdr.sec.level)) && + (hdr.sec.level == 0 && seclevel.device_override && + !dev->dev.seclevel_exempt)) { + err = -EINVAL; + goto fail_dev; + } + + frame_ctr = le32_to_cpu(hdr.sec.frame_counter); + + if (frame_ctr == 0xffffffff) { + err = -EOVERFLOW; + goto fail_dev; + } + + err = llsec_update_devkey_info(dev, &key_id, frame_ctr); + if (err) + goto fail_dev; + + dev_addr = dev->dev.hwaddr; + + rcu_read_unlock(); + + err = llsec_do_decrypt(skb, sec, &hdr, key, dev_addr); + llsec_key_put(key); + return err; + +fail_dev: + llsec_key_put(key); +fail: + rcu_read_unlock(); + return err; +} diff --git a/net/mac802154/llsec.h b/net/mac802154/llsec.h new file mode 100644 index 00000000000..950578e1d7b --- /dev/null +++ b/net/mac802154/llsec.h @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2014 Fraunhofer ITWM + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * Written by: + * Phoebe Buckheister <phoebe.buckheister@itwm.fraunhofer.de> + */ + +#ifndef MAC802154_LLSEC_H +#define MAC802154_LLSEC_H + +#include <linux/slab.h> +#include <linux/hashtable.h> +#include <linux/crypto.h> +#include <linux/kref.h> +#include <linux/spinlock.h> +#include <net/af_ieee802154.h> +#include <net/ieee802154_netdev.h> + +struct mac802154_llsec_key { + struct ieee802154_llsec_key key; + + /* one tfm for each authsize (4/8/16) */ + struct crypto_aead *tfm[3]; + struct crypto_blkcipher *tfm0; + + struct kref ref; +}; + +struct mac802154_llsec_device_key { + struct ieee802154_llsec_device_key devkey; + + struct rcu_head rcu; +}; + +struct mac802154_llsec_device { + struct ieee802154_llsec_device dev; + + struct hlist_node bucket_s; + struct hlist_node bucket_hw; + + /* protects dev.frame_counter and the elements of dev.keys */ + spinlock_t lock; + + struct rcu_head rcu; +}; + +struct mac802154_llsec_seclevel { + struct ieee802154_llsec_seclevel level; + + struct rcu_head rcu; +}; + +struct mac802154_llsec { + struct ieee802154_llsec_params params; + struct ieee802154_llsec_table table; + + DECLARE_HASHTABLE(devices_short, 6); + DECLARE_HASHTABLE(devices_hw, 6); + + /* protects params, all other fields are fine with RCU */ + rwlock_t lock; +}; + +void mac802154_llsec_init(struct mac802154_llsec *sec); +void mac802154_llsec_destroy(struct mac802154_llsec *sec); + +int mac802154_llsec_get_params(struct mac802154_llsec *sec, + struct ieee802154_llsec_params *params); +int mac802154_llsec_set_params(struct mac802154_llsec *sec, + const struct ieee802154_llsec_params *params, + int changed); + +int mac802154_llsec_key_add(struct mac802154_llsec *sec, + const struct ieee802154_llsec_key_id *id, + const struct ieee802154_llsec_key *key); +int mac802154_llsec_key_del(struct mac802154_llsec *sec, + const struct ieee802154_llsec_key_id *key); + +int mac802154_llsec_dev_add(struct mac802154_llsec *sec, + const struct ieee802154_llsec_device *dev); +int mac802154_llsec_dev_del(struct mac802154_llsec *sec, + __le64 device_addr); + +int mac802154_llsec_devkey_add(struct mac802154_llsec *sec, + __le64 dev_addr, + const struct ieee802154_llsec_device_key *key); +int mac802154_llsec_devkey_del(struct mac802154_llsec *sec, + __le64 dev_addr, + const struct ieee802154_llsec_device_key *key); + +int mac802154_llsec_seclevel_add(struct mac802154_llsec *sec, + const struct ieee802154_llsec_seclevel *sl); +int mac802154_llsec_seclevel_del(struct mac802154_llsec *sec, + const struct ieee802154_llsec_seclevel *sl); + +int mac802154_llsec_encrypt(struct mac802154_llsec *sec, struct sk_buff *skb); +int mac802154_llsec_decrypt(struct mac802154_llsec *sec, struct sk_buff *skb); + +#endif /* MAC802154_LLSEC_H */ diff --git a/net/mac802154/mac802154.h b/net/mac802154/mac802154.h index d48422e2711..762a6f849c6 100644 --- a/net/mac802154/mac802154.h +++ b/net/mac802154/mac802154.h @@ -23,6 +23,12 @@ #ifndef MAC802154_H #define MAC802154_H +#include <linux/mutex.h> +#include <net/mac802154.h> +#include <net/ieee802154_netdev.h> + +#include "llsec.h" + /* mac802154 device private data */ struct mac802154_priv { struct ieee802154_dev hw; @@ -71,19 +77,30 @@ struct mac802154_sub_if_data { struct net_device *dev; int type; + bool running; spinlock_t mib_lock; __le16 pan_id; __le16 short_addr; + __le64 extended_addr; u8 chan; u8 page; + struct ieee802154_mac_params mac_params; + /* MAC BSN field */ u8 bsn; /* MAC DSN field */ u8 dsn; + + /* protects sec from concurrent access by netlink. 
access by + * encrypt/decrypt/header_create safe without additional protection. + */ + struct mutex sec_mtx; + + struct mac802154_llsec sec; }; #define mac802154_to_priv(_hw) container_of(_hw, struct mac802154_priv, hw) @@ -106,12 +123,50 @@ netdev_tx_t mac802154_tx(struct mac802154_priv *priv, struct sk_buff *skb, u8 page, u8 chan); /* MIB callbacks */ -void mac802154_dev_set_short_addr(struct net_device *dev, u16 val); -u16 mac802154_dev_get_short_addr(const struct net_device *dev); +void mac802154_dev_set_short_addr(struct net_device *dev, __le16 val); +__le16 mac802154_dev_get_short_addr(const struct net_device *dev); void mac802154_dev_set_ieee_addr(struct net_device *dev); -u16 mac802154_dev_get_pan_id(const struct net_device *dev); -void mac802154_dev_set_pan_id(struct net_device *dev, u16 val); +__le16 mac802154_dev_get_pan_id(const struct net_device *dev); +void mac802154_dev_set_pan_id(struct net_device *dev, __le16 val); void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan); u8 mac802154_dev_get_dsn(const struct net_device *dev); +int mac802154_set_mac_params(struct net_device *dev, + const struct ieee802154_mac_params *params); +void mac802154_get_mac_params(struct net_device *dev, + struct ieee802154_mac_params *params); + +int mac802154_get_params(struct net_device *dev, + struct ieee802154_llsec_params *params); +int mac802154_set_params(struct net_device *dev, + const struct ieee802154_llsec_params *params, + int changed); + +int mac802154_add_key(struct net_device *dev, + const struct ieee802154_llsec_key_id *id, + const struct ieee802154_llsec_key *key); +int mac802154_del_key(struct net_device *dev, + const struct ieee802154_llsec_key_id *id); + +int mac802154_add_dev(struct net_device *dev, + const struct ieee802154_llsec_device *llsec_dev); +int mac802154_del_dev(struct net_device *dev, __le64 dev_addr); + +int mac802154_add_devkey(struct net_device *dev, + __le64 device_addr, + const struct ieee802154_llsec_device_key *key); +int mac802154_del_devkey(struct net_device *dev, + __le64 device_addr, + const struct ieee802154_llsec_device_key *key); + +int mac802154_add_seclevel(struct net_device *dev, + const struct ieee802154_llsec_seclevel *sl); +int mac802154_del_seclevel(struct net_device *dev, + const struct ieee802154_llsec_seclevel *sl); + +void mac802154_lock_table(struct net_device *dev); +void mac802154_get_table(struct net_device *dev, + struct ieee802154_llsec_table **t); +void mac802154_unlock_table(struct net_device *dev); + #endif /* MAC802154_H */ diff --git a/net/mac802154/mac_cmd.c b/net/mac802154/mac_cmd.c index a99910d4d52..bf809131eef 100644 --- a/net/mac802154/mac_cmd.c +++ b/net/mac802154/mac_cmd.c @@ -40,19 +40,41 @@ static int mac802154_mlme_start_req(struct net_device *dev, u8 pan_coord, u8 blx, u8 coord_realign) { - BUG_ON(addr->addr_type != IEEE802154_ADDR_SHORT); + struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); + int rc = 0; + + BUG_ON(addr->mode != IEEE802154_ADDR_SHORT); mac802154_dev_set_pan_id(dev, addr->pan_id); mac802154_dev_set_short_addr(dev, addr->short_addr); mac802154_dev_set_ieee_addr(dev); mac802154_dev_set_page_channel(dev, page, channel); + if (ops->llsec) { + struct ieee802154_llsec_params params; + int changed = 0; + + params.coord_shortaddr = addr->short_addr; + changed |= IEEE802154_LLSEC_PARAM_COORD_SHORTADDR; + + params.pan_id = addr->pan_id; + changed |= IEEE802154_LLSEC_PARAM_PAN_ID; + + params.hwaddr = ieee802154_devaddr_from_raw(dev->dev_addr); + changed |= 
IEEE802154_LLSEC_PARAM_HWADDR; + + params.coord_hwaddr = params.hwaddr; + changed |= IEEE802154_LLSEC_PARAM_COORD_HWADDR; + + rc = ops->llsec->set_params(dev, &params, changed); + } + /* FIXME: add validation for unused parameters to be sane * for SoftMAC */ ieee802154_nl_start_confirm(dev, IEEE802154_SUCCESS); - return 0; + return rc; } static struct wpan_phy *mac802154_get_phy(const struct net_device *dev) @@ -64,6 +86,22 @@ static struct wpan_phy *mac802154_get_phy(const struct net_device *dev) return to_phy(get_device(&priv->hw->phy->dev)); } +static struct ieee802154_llsec_ops mac802154_llsec_ops = { + .get_params = mac802154_get_params, + .set_params = mac802154_set_params, + .add_key = mac802154_add_key, + .del_key = mac802154_del_key, + .add_dev = mac802154_add_dev, + .del_dev = mac802154_del_dev, + .add_devkey = mac802154_add_devkey, + .del_devkey = mac802154_del_devkey, + .add_seclevel = mac802154_add_seclevel, + .del_seclevel = mac802154_del_seclevel, + .lock_table = mac802154_lock_table, + .get_table = mac802154_get_table, + .unlock_table = mac802154_unlock_table, +}; + struct ieee802154_reduced_mlme_ops mac802154_mlme_reduced = { .get_phy = mac802154_get_phy, }; @@ -74,4 +112,9 @@ struct ieee802154_mlme_ops mac802154_mlme_wpan = { .get_pan_id = mac802154_dev_get_pan_id, .get_short_addr = mac802154_dev_get_short_addr, .get_dsn = mac802154_dev_get_dsn, + + .llsec = &mac802154_llsec_ops, + + .set_mac_params = mac802154_set_mac_params, + .get_mac_params = mac802154_get_mac_params, }; diff --git a/net/mac802154/mib.c b/net/mac802154/mib.c index 8ded97cf1c3..15aa2f2b03a 100644 --- a/net/mac802154/mib.c +++ b/net/mac802154/mib.c @@ -24,6 +24,7 @@ #include <linux/if_arp.h> #include <net/mac802154.h> +#include <net/ieee802154_netdev.h> #include <net/wpan-phy.h> #include "mac802154.h" @@ -62,8 +63,6 @@ static void hw_addr_notify(struct work_struct *work) pr_debug("failed changed mask %lx\n", nw->changed); kfree(nw); - - return; } static void set_hw_addr_filt(struct net_device *dev, unsigned long changed) @@ -79,11 +78,9 @@ static void set_hw_addr_filt(struct net_device *dev, unsigned long changed) work->dev = dev; work->changed = changed; queue_work(priv->hw->dev_workqueue, &work->work); - - return; } -void mac802154_dev_set_short_addr(struct net_device *dev, u16 val) +void mac802154_dev_set_short_addr(struct net_device *dev, __le16 val) { struct mac802154_sub_if_data *priv = netdev_priv(dev); @@ -100,10 +97,10 @@ void mac802154_dev_set_short_addr(struct net_device *dev, u16 val) } } -u16 mac802154_dev_get_short_addr(const struct net_device *dev) +__le16 mac802154_dev_get_short_addr(const struct net_device *dev) { struct mac802154_sub_if_data *priv = netdev_priv(dev); - u16 ret; + __le16 ret; BUG_ON(dev->type != ARPHRD_IEEE802154); @@ -119,19 +116,19 @@ void mac802154_dev_set_ieee_addr(struct net_device *dev) struct mac802154_sub_if_data *priv = netdev_priv(dev); struct mac802154_priv *mac = priv->hw; + priv->extended_addr = ieee802154_devaddr_from_raw(dev->dev_addr); + if (mac->ops->set_hw_addr_filt && - memcmp(mac->hw.hw_filt.ieee_addr, - dev->dev_addr, IEEE802154_ADDR_LEN)) { - memcpy(mac->hw.hw_filt.ieee_addr, - dev->dev_addr, IEEE802154_ADDR_LEN); + mac->hw.hw_filt.ieee_addr != priv->extended_addr) { + mac->hw.hw_filt.ieee_addr = priv->extended_addr; set_hw_addr_filt(dev, IEEE802515_AFILT_IEEEADDR_CHANGED); } } -u16 mac802154_dev_get_pan_id(const struct net_device *dev) +__le16 mac802154_dev_get_pan_id(const struct net_device *dev) { struct mac802154_sub_if_data *priv = 
netdev_priv(dev); - u16 ret; + __le16 ret; BUG_ON(dev->type != ARPHRD_IEEE802154); @@ -142,7 +139,7 @@ u16 mac802154_dev_get_pan_id(const struct net_device *dev) return ret; } -void mac802154_dev_set_pan_id(struct net_device *dev, u16 val) +void mac802154_dev_set_pan_id(struct net_device *dev, __le16 val) { struct mac802154_sub_if_data *priv = netdev_priv(dev); @@ -216,3 +213,190 @@ void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan) } else mutex_unlock(&priv->hw->phy->pib_lock); } + + +int mac802154_get_params(struct net_device *dev, + struct ieee802154_llsec_params *params) +{ + struct mac802154_sub_if_data *priv = netdev_priv(dev); + int res; + + BUG_ON(dev->type != ARPHRD_IEEE802154); + + mutex_lock(&priv->sec_mtx); + res = mac802154_llsec_get_params(&priv->sec, params); + mutex_unlock(&priv->sec_mtx); + + return res; +} + +int mac802154_set_params(struct net_device *dev, + const struct ieee802154_llsec_params *params, + int changed) +{ + struct mac802154_sub_if_data *priv = netdev_priv(dev); + int res; + + BUG_ON(dev->type != ARPHRD_IEEE802154); + + mutex_lock(&priv->sec_mtx); + res = mac802154_llsec_set_params(&priv->sec, params, changed); + mutex_unlock(&priv->sec_mtx); + + return res; +} + + +int mac802154_add_key(struct net_device *dev, + const struct ieee802154_llsec_key_id *id, + const struct ieee802154_llsec_key *key) +{ + struct mac802154_sub_if_data *priv = netdev_priv(dev); + int res; + + BUG_ON(dev->type != ARPHRD_IEEE802154); + + mutex_lock(&priv->sec_mtx); + res = mac802154_llsec_key_add(&priv->sec, id, key); + mutex_unlock(&priv->sec_mtx); + + return res; +} + +int mac802154_del_key(struct net_device *dev, + const struct ieee802154_llsec_key_id *id) +{ + struct mac802154_sub_if_data *priv = netdev_priv(dev); + int res; + + BUG_ON(dev->type != ARPHRD_IEEE802154); + + mutex_lock(&priv->sec_mtx); + res = mac802154_llsec_key_del(&priv->sec, id); + mutex_unlock(&priv->sec_mtx); + + return res; +} + + +int mac802154_add_dev(struct net_device *dev, + const struct ieee802154_llsec_device *llsec_dev) +{ + struct mac802154_sub_if_data *priv = netdev_priv(dev); + int res; + + BUG_ON(dev->type != ARPHRD_IEEE802154); + + mutex_lock(&priv->sec_mtx); + res = mac802154_llsec_dev_add(&priv->sec, llsec_dev); + mutex_unlock(&priv->sec_mtx); + + return res; +} + +int mac802154_del_dev(struct net_device *dev, __le64 dev_addr) +{ + struct mac802154_sub_if_data *priv = netdev_priv(dev); + int res; + + BUG_ON(dev->type != ARPHRD_IEEE802154); + + mutex_lock(&priv->sec_mtx); + res = mac802154_llsec_dev_del(&priv->sec, dev_addr); + mutex_unlock(&priv->sec_mtx); + + return res; +} + + +int mac802154_add_devkey(struct net_device *dev, + __le64 device_addr, + const struct ieee802154_llsec_device_key *key) +{ + struct mac802154_sub_if_data *priv = netdev_priv(dev); + int res; + + BUG_ON(dev->type != ARPHRD_IEEE802154); + + mutex_lock(&priv->sec_mtx); + res = mac802154_llsec_devkey_add(&priv->sec, device_addr, key); + mutex_unlock(&priv->sec_mtx); + + return res; +} + +int mac802154_del_devkey(struct net_device *dev, + __le64 device_addr, + const struct ieee802154_llsec_device_key *key) +{ + struct mac802154_sub_if_data *priv = netdev_priv(dev); + int res; + + BUG_ON(dev->type != ARPHRD_IEEE802154); + + mutex_lock(&priv->sec_mtx); + res = mac802154_llsec_devkey_del(&priv->sec, device_addr, key); + mutex_unlock(&priv->sec_mtx); + + return res; +} + + +int mac802154_add_seclevel(struct net_device *dev, + const struct ieee802154_llsec_seclevel *sl) +{ + struct 
mac802154_sub_if_data *priv = netdev_priv(dev); + int res; + + BUG_ON(dev->type != ARPHRD_IEEE802154); + + mutex_lock(&priv->sec_mtx); + res = mac802154_llsec_seclevel_add(&priv->sec, sl); + mutex_unlock(&priv->sec_mtx); + + return res; +} + +int mac802154_del_seclevel(struct net_device *dev, + const struct ieee802154_llsec_seclevel *sl) +{ + struct mac802154_sub_if_data *priv = netdev_priv(dev); + int res; + + BUG_ON(dev->type != ARPHRD_IEEE802154); + + mutex_lock(&priv->sec_mtx); + res = mac802154_llsec_seclevel_del(&priv->sec, sl); + mutex_unlock(&priv->sec_mtx); + + return res; +} + + +void mac802154_lock_table(struct net_device *dev) +{ + struct mac802154_sub_if_data *priv = netdev_priv(dev); + + BUG_ON(dev->type != ARPHRD_IEEE802154); + + mutex_lock(&priv->sec_mtx); +} + +void mac802154_get_table(struct net_device *dev, + struct ieee802154_llsec_table **t) +{ + struct mac802154_sub_if_data *priv = netdev_priv(dev); + + BUG_ON(dev->type != ARPHRD_IEEE802154); + + *t = &priv->sec.table; +} + +void mac802154_unlock_table(struct net_device *dev) +{ + struct mac802154_sub_if_data *priv = netdev_priv(dev); + + BUG_ON(dev->type != ARPHRD_IEEE802154); + + mutex_unlock(&priv->sec_mtx); +} diff --git a/net/mac802154/monitor.c b/net/mac802154/monitor.c index 434a26f76a8..a68230e2b25 100644 --- a/net/mac802154/monitor.c +++ b/net/mac802154/monitor.c @@ -70,7 +70,8 @@ void mac802154_monitors_rx(struct mac802154_priv *priv, struct sk_buff *skb) rcu_read_lock(); list_for_each_entry_rcu(sdata, &priv->slaves, list) { - if (sdata->type != IEEE802154_DEV_MONITOR) + if (sdata->type != IEEE802154_DEV_MONITOR || + !netif_running(sdata->dev)) continue; skb2 = skb_clone(skb, GFP_ATOMIC); diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index 38548ec2098..7f820a108a9 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -59,28 +59,28 @@ mac802154_subif_rx(struct ieee802154_dev *hw, struct sk_buff *skb, u8 lqi) skb->protocol = htons(ETH_P_IEEE802154); skb_reset_mac_header(skb); - BUILD_BUG_ON(sizeof(struct ieee802154_mac_cb) > sizeof(skb->cb)); - if (!(priv->hw.flags & IEEE802154_HW_OMIT_CKSUM)) { u16 crc; if (skb->len < 2) { pr_debug("got invalid frame\n"); - goto out; + goto fail; } crc = crc_ccitt(0, skb->data, skb->len); if (crc) { pr_debug("CRC mismatch\n"); - goto out; + goto fail; } skb_trim(skb, skb->len - 2); /* CRC */ } mac802154_monitors_rx(priv, skb); mac802154_wpans_rx(priv, skb); -out: - dev_kfree_skb(skb); + return; + +fail: + kfree_skb(skb); } static void mac802154_rx_worker(struct work_struct *work) diff --git a/net/mac802154/wpan.c b/net/mac802154/wpan.c index e24bcf97729..3c3069fd697 100644 --- a/net/mac802154/wpan.c +++ b/net/mac802154/wpan.c @@ -35,33 +35,26 @@ #include "mac802154.h" -static inline int mac802154_fetch_skb_u8(struct sk_buff *skb, u8 *val) +static int mac802154_wpan_update_llsec(struct net_device *dev) { - if (unlikely(!pskb_may_pull(skb, 1))) - return -EINVAL; + struct mac802154_sub_if_data *priv = netdev_priv(dev); + struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); + int rc = 0; - *val = skb->data[0]; - skb_pull(skb, 1); + if (ops->llsec) { + struct ieee802154_llsec_params params; + int changed = 0; - return 0; -} + params.pan_id = priv->pan_id; + changed |= IEEE802154_LLSEC_PARAM_PAN_ID; -static inline int mac802154_fetch_skb_u16(struct sk_buff *skb, u16 *val) -{ - if (unlikely(!pskb_may_pull(skb, 2))) - return -EINVAL; - - *val = skb->data[0] | (skb->data[1] << 8); - skb_pull(skb, 2); + params.hwaddr = priv->extended_addr; + changed |= 
IEEE802154_LLSEC_PARAM_HWADDR; - return 0; -} + rc = ops->llsec->set_params(dev, &params, changed); + } -static inline void mac802154_haddr_copy_swap(u8 *dest, const u8 *src) -{ - int i; - for (i = 0; i < IEEE802154_ADDR_LEN; i++) - dest[IEEE802154_ADDR_LEN - i - 1] = src[i]; + return rc; } static int @@ -76,19 +69,25 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) switch (cmd) { case SIOCGIFADDR: - if (priv->pan_id == IEEE802154_PANID_BROADCAST || - priv->short_addr == IEEE802154_ADDR_BROADCAST) { + { + u16 pan_id, short_addr; + + pan_id = le16_to_cpu(priv->pan_id); + short_addr = le16_to_cpu(priv->short_addr); + if (pan_id == IEEE802154_PANID_BROADCAST || + short_addr == IEEE802154_ADDR_BROADCAST) { err = -EADDRNOTAVAIL; break; } sa->family = AF_IEEE802154; sa->addr.addr_type = IEEE802154_ADDR_SHORT; - sa->addr.pan_id = priv->pan_id; - sa->addr.short_addr = priv->short_addr; + sa->addr.pan_id = pan_id; + sa->addr.short_addr = short_addr; err = 0; break; + } case SIOCSIFADDR: dev_warn(&dev->dev, "Using DEBUGing ioctl SIOCSIFADDR isn't recommened!\n"); @@ -101,10 +100,10 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) break; } - priv->pan_id = sa->addr.pan_id; - priv->short_addr = sa->addr.short_addr; + priv->pan_id = cpu_to_le16(sa->addr.pan_id); + priv->short_addr = cpu_to_le16(sa->addr.short_addr); - err = 0; + err = mac802154_wpan_update_llsec(dev); break; } @@ -122,193 +121,194 @@ static int mac802154_wpan_mac_addr(struct net_device *dev, void *p) /* FIXME: validate addr */ memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); mac802154_dev_set_ieee_addr(dev); - return 0; + return mac802154_wpan_update_llsec(dev); } -static int mac802154_header_create(struct sk_buff *skb, - struct net_device *dev, - unsigned short type, - const void *_daddr, - const void *_saddr, - unsigned len) +int mac802154_set_mac_params(struct net_device *dev, + const struct ieee802154_mac_params *params) { - const struct ieee802154_addr *saddr = _saddr; - const struct ieee802154_addr *daddr = _daddr; - struct ieee802154_addr dev_addr; struct mac802154_sub_if_data *priv = netdev_priv(dev); - int pos = 2; - u8 head[MAC802154_FRAME_HARD_HEADER_LEN]; - u16 fc; - if (!daddr) - return -EINVAL; + mutex_lock(&priv->hw->slaves_mtx); + priv->mac_params = *params; + mutex_unlock(&priv->hw->slaves_mtx); - head[pos++] = mac_cb(skb)->seq; /* DSN/BSN */ - fc = mac_cb_type(skb); - if (mac_cb_is_ackreq(skb)) - fc |= IEEE802154_FC_ACK_REQ; + return 0; +} - if (!saddr) { - spin_lock_bh(&priv->mib_lock); +void mac802154_get_mac_params(struct net_device *dev, + struct ieee802154_mac_params *params) +{ + struct mac802154_sub_if_data *priv = netdev_priv(dev); - if (priv->short_addr == IEEE802154_ADDR_BROADCAST || - priv->short_addr == IEEE802154_ADDR_UNDEF || - priv->pan_id == IEEE802154_PANID_BROADCAST) { - dev_addr.addr_type = IEEE802154_ADDR_LONG; - memcpy(dev_addr.hwaddr, dev->dev_addr, - IEEE802154_ADDR_LEN); - } else { - dev_addr.addr_type = IEEE802154_ADDR_SHORT; - dev_addr.short_addr = priv->short_addr; - } + mutex_lock(&priv->hw->slaves_mtx); + *params = priv->mac_params; + mutex_unlock(&priv->hw->slaves_mtx); +} - dev_addr.pan_id = priv->pan_id; - saddr = &dev_addr; +static int mac802154_wpan_open(struct net_device *dev) +{ + int rc; + struct mac802154_sub_if_data *priv = netdev_priv(dev); + struct wpan_phy *phy = priv->hw->phy; - spin_unlock_bh(&priv->mib_lock); - } + rc = mac802154_slave_open(dev); + if (rc < 0) + return rc; - if (daddr->addr_type != IEEE802154_ADDR_NONE) 
{ - fc |= (daddr->addr_type << IEEE802154_FC_DAMODE_SHIFT); + mutex_lock(&phy->pib_lock); - head[pos++] = daddr->pan_id & 0xff; - head[pos++] = daddr->pan_id >> 8; + if (phy->set_txpower) { + rc = phy->set_txpower(phy, priv->mac_params.transmit_power); + if (rc < 0) + goto out; + } - if (daddr->addr_type == IEEE802154_ADDR_SHORT) { - head[pos++] = daddr->short_addr & 0xff; - head[pos++] = daddr->short_addr >> 8; - } else { - mac802154_haddr_copy_swap(head + pos, daddr->hwaddr); - pos += IEEE802154_ADDR_LEN; - } + if (phy->set_lbt) { + rc = phy->set_lbt(phy, priv->mac_params.lbt); + if (rc < 0) + goto out; } - if (saddr->addr_type != IEEE802154_ADDR_NONE) { - fc |= (saddr->addr_type << IEEE802154_FC_SAMODE_SHIFT); + if (phy->set_cca_mode) { + rc = phy->set_cca_mode(phy, priv->mac_params.cca_mode); + if (rc < 0) + goto out; + } - if ((saddr->pan_id == daddr->pan_id) && - (saddr->pan_id != IEEE802154_PANID_BROADCAST)) { - /* PANID compression/intra PAN */ - fc |= IEEE802154_FC_INTRA_PAN; - } else { - head[pos++] = saddr->pan_id & 0xff; - head[pos++] = saddr->pan_id >> 8; - } + if (phy->set_cca_ed_level) { + rc = phy->set_cca_ed_level(phy, priv->mac_params.cca_ed_level); + if (rc < 0) + goto out; + } - if (saddr->addr_type == IEEE802154_ADDR_SHORT) { - head[pos++] = saddr->short_addr & 0xff; - head[pos++] = saddr->short_addr >> 8; - } else { - mac802154_haddr_copy_swap(head + pos, saddr->hwaddr); - pos += IEEE802154_ADDR_LEN; - } + if (phy->set_csma_params) { + rc = phy->set_csma_params(phy, priv->mac_params.min_be, + priv->mac_params.max_be, + priv->mac_params.csma_retries); + if (rc < 0) + goto out; } - head[0] = fc; - head[1] = fc >> 8; + if (phy->set_frame_retries) { + rc = phy->set_frame_retries(phy, + priv->mac_params.frame_retries); + if (rc < 0) + goto out; + } - memcpy(skb_push(skb, pos), head, pos); - skb_reset_mac_header(skb); - skb->mac_len = pos; + mutex_unlock(&phy->pib_lock); + return 0; - return pos; +out: + mutex_unlock(&phy->pib_lock); + return rc; } -static int -mac802154_header_parse(const struct sk_buff *skb, unsigned char *haddr) +static int mac802154_set_header_security(struct mac802154_sub_if_data *priv, + struct ieee802154_hdr *hdr, + const struct ieee802154_mac_cb *cb) { - const u8 *hdr = skb_mac_header(skb); - const u8 *tail = skb_tail_pointer(skb); - struct ieee802154_addr *addr = (struct ieee802154_addr *)haddr; - u16 fc; - int da_type; + struct ieee802154_llsec_params params; + u8 level; - if (hdr + 3 > tail) - goto malformed; + mac802154_llsec_get_params(&priv->sec, ¶ms); - fc = hdr[0] | (hdr[1] << 8); + if (!params.enabled && cb->secen_override && cb->secen) + return -EINVAL; + if (!params.enabled || + (cb->secen_override && !cb->secen) || + !params.out_level) + return 0; + if (cb->seclevel_override && !cb->seclevel) + return -EINVAL; - hdr += 3; + level = cb->seclevel_override ? 
cb->seclevel : params.out_level; - da_type = IEEE802154_FC_DAMODE(fc); - addr->addr_type = IEEE802154_FC_SAMODE(fc); + hdr->fc.security_enabled = 1; + hdr->sec.level = level; + hdr->sec.key_id_mode = params.out_key.mode; + if (params.out_key.mode == IEEE802154_SCF_KEY_SHORT_INDEX) + hdr->sec.short_src = params.out_key.short_source; + else if (params.out_key.mode == IEEE802154_SCF_KEY_HW_INDEX) + hdr->sec.extended_src = params.out_key.extended_source; + hdr->sec.key_id = params.out_key.id; - switch (da_type) { - case IEEE802154_ADDR_NONE: - if (fc & IEEE802154_FC_INTRA_PAN) - goto malformed; - break; - case IEEE802154_ADDR_LONG: - if (fc & IEEE802154_FC_INTRA_PAN) { - if (hdr + 2 > tail) - goto malformed; - addr->pan_id = hdr[0] | (hdr[1] << 8); - hdr += 2; - } + return 0; +} - if (hdr + IEEE802154_ADDR_LEN > tail) - goto malformed; +static int mac802154_header_create(struct sk_buff *skb, + struct net_device *dev, + unsigned short type, + const void *daddr, + const void *saddr, + unsigned len) +{ + struct ieee802154_hdr hdr; + struct mac802154_sub_if_data *priv = netdev_priv(dev); + struct ieee802154_mac_cb *cb = mac_cb(skb); + int hlen; - hdr += IEEE802154_ADDR_LEN; - break; - case IEEE802154_ADDR_SHORT: - if (fc & IEEE802154_FC_INTRA_PAN) { - if (hdr + 2 > tail) - goto malformed; - addr->pan_id = hdr[0] | (hdr[1] << 8); - hdr += 2; - } + if (!daddr) + return -EINVAL; - if (hdr + 2 > tail) - goto malformed; + memset(&hdr.fc, 0, sizeof(hdr.fc)); + hdr.fc.type = cb->type; + hdr.fc.security_enabled = cb->secen; + hdr.fc.ack_request = cb->ackreq; + hdr.seq = ieee802154_mlme_ops(dev)->get_dsn(dev); - hdr += 2; - break; - default: - goto malformed; + if (mac802154_set_header_security(priv, &hdr, cb) < 0) + return -EINVAL; - } + if (!saddr) { + spin_lock_bh(&priv->mib_lock); - switch (addr->addr_type) { - case IEEE802154_ADDR_NONE: - break; - case IEEE802154_ADDR_LONG: - if (!(fc & IEEE802154_FC_INTRA_PAN)) { - if (hdr + 2 > tail) - goto malformed; - addr->pan_id = hdr[0] | (hdr[1] << 8); - hdr += 2; + if (priv->short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST) || + priv->short_addr == cpu_to_le16(IEEE802154_ADDR_UNDEF) || + priv->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST)) { + hdr.source.mode = IEEE802154_ADDR_LONG; + hdr.source.extended_addr = priv->extended_addr; + } else { + hdr.source.mode = IEEE802154_ADDR_SHORT; + hdr.source.short_addr = priv->short_addr; } - if (hdr + IEEE802154_ADDR_LEN > tail) - goto malformed; + hdr.source.pan_id = priv->pan_id; - mac802154_haddr_copy_swap(addr->hwaddr, hdr); - hdr += IEEE802154_ADDR_LEN; - break; - case IEEE802154_ADDR_SHORT: - if (!(fc & IEEE802154_FC_INTRA_PAN)) { - if (hdr + 2 > tail) - goto malformed; - addr->pan_id = hdr[0] | (hdr[1] << 8); - hdr += 2; - } + spin_unlock_bh(&priv->mib_lock); + } else { + hdr.source = *(const struct ieee802154_addr *)saddr; + } - if (hdr + 2 > tail) - goto malformed; + hdr.dest = *(const struct ieee802154_addr *)daddr; - addr->short_addr = hdr[0] | (hdr[1] << 8); - hdr += 2; - break; - default: - goto malformed; - } + hlen = ieee802154_hdr_push(skb, &hdr); + if (hlen < 0) + return -EINVAL; - return sizeof(struct ieee802154_addr); + skb_reset_mac_header(skb); + skb->mac_len = hlen; -malformed: - pr_debug("malformed packet\n"); - return 0; + if (len > ieee802154_max_payload(&hdr)) + return -EMSGSIZE; + + return hlen; +} + +static int +mac802154_header_parse(const struct sk_buff *skb, unsigned char *haddr) +{ + struct ieee802154_hdr hdr; + struct ieee802154_addr *addr = (struct ieee802154_addr *)haddr; + + 
if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) { + pr_debug("malformed packet\n"); + return 0; + } + + *addr = hdr.source; + return sizeof(*addr); } static netdev_tx_t @@ -316,6 +316,7 @@ mac802154_wpan_xmit(struct sk_buff *skb, struct net_device *dev) { struct mac802154_sub_if_data *priv; u8 chan, page; + int rc; priv = netdev_priv(dev); @@ -331,6 +332,13 @@ mac802154_wpan_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } + rc = mac802154_llsec_encrypt(&priv->sec, skb); + if (rc) { + pr_warn("encryption failed: %i\n", rc); + kfree_skb(skb); + return NETDEV_TX_OK; + } + skb->skb_iif = dev->ifindex; dev->stats.tx_packets++; dev->stats.tx_bytes += skb->len; @@ -344,13 +352,22 @@ static struct header_ops mac802154_header_ops = { }; static const struct net_device_ops mac802154_wpan_ops = { - .ndo_open = mac802154_slave_open, + .ndo_open = mac802154_wpan_open, .ndo_stop = mac802154_slave_close, .ndo_start_xmit = mac802154_wpan_xmit, .ndo_do_ioctl = mac802154_wpan_ioctl, .ndo_set_mac_address = mac802154_wpan_mac_addr, }; +static void mac802154_wpan_free(struct net_device *dev) +{ + struct mac802154_sub_if_data *priv = netdev_priv(dev); + + mac802154_llsec_destroy(&priv->sec); + + free_netdev(dev); +} + void mac802154_wpan_setup(struct net_device *dev) { struct mac802154_sub_if_data *priv; @@ -360,14 +377,14 @@ void mac802154_wpan_setup(struct net_device *dev) dev->hard_header_len = MAC802154_FRAME_HARD_HEADER_LEN; dev->header_ops = &mac802154_header_ops; - dev->needed_tailroom = 2; /* FCS */ + dev->needed_tailroom = 2 + 16; /* FCS + MIC */ dev->mtu = IEEE802154_MTU; dev->tx_queue_len = 300; dev->type = ARPHRD_IEEE802154; dev->flags = IFF_NOARP | IFF_BROADCAST; dev->watchdog_timeo = 0; - dev->destructor = free_netdev; + dev->destructor = mac802154_wpan_free; dev->netdev_ops = &mac802154_wpan_ops; dev->ml_priv = &mac802154_mlme_wpan; @@ -378,12 +395,21 @@ void mac802154_wpan_setup(struct net_device *dev) priv->page = 0; spin_lock_init(&priv->mib_lock); + mutex_init(&priv->sec_mtx); get_random_bytes(&priv->bsn, 1); get_random_bytes(&priv->dsn, 1); - priv->pan_id = IEEE802154_PANID_BROADCAST; - priv->short_addr = IEEE802154_ADDR_BROADCAST; + /* defaults per 802.15.4-2011 */ + priv->mac_params.min_be = 3; + priv->mac_params.max_be = 5; + priv->mac_params.csma_retries = 4; + priv->mac_params.frame_retries = -1; /* for compatibility, actual default is 3 */ + + priv->pan_id = cpu_to_le16(IEEE802154_PANID_BROADCAST); + priv->short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); + + mac802154_llsec_init(&priv->sec); } static int mac802154_process_data(struct net_device *dev, struct sk_buff *skb) @@ -392,15 +418,22 @@ static int mac802154_process_data(struct net_device *dev, struct sk_buff *skb) } static int -mac802154_subif_frame(struct mac802154_sub_if_data *sdata, struct sk_buff *skb) +mac802154_subif_frame(struct mac802154_sub_if_data *sdata, struct sk_buff *skb, + const struct ieee802154_hdr *hdr) { + __le16 span, sshort; + int rc; + pr_debug("getting packet via slave interface %s\n", sdata->dev->name); spin_lock_bh(&sdata->mib_lock); - switch (mac_cb(skb)->da.addr_type) { + span = sdata->pan_id; + sshort = sdata->short_addr; + + switch (mac_cb(skb)->dest.mode) { case IEEE802154_ADDR_NONE: - if (mac_cb(skb)->sa.addr_type != IEEE802154_ADDR_NONE) + if (mac_cb(skb)->dest.mode != IEEE802154_ADDR_NONE) /* FIXME: check if we are PAN coordinator */ skb->pkt_type = PACKET_OTHERHOST; else @@ -408,23 +441,22 @@ mac802154_subif_frame(struct mac802154_sub_if_data *sdata, struct sk_buff *skb) 
skb->pkt_type = PACKET_HOST; break; case IEEE802154_ADDR_LONG: - if (mac_cb(skb)->da.pan_id != sdata->pan_id && - mac_cb(skb)->da.pan_id != IEEE802154_PANID_BROADCAST) + if (mac_cb(skb)->dest.pan_id != span && + mac_cb(skb)->dest.pan_id != cpu_to_le16(IEEE802154_PANID_BROADCAST)) skb->pkt_type = PACKET_OTHERHOST; - else if (!memcmp(mac_cb(skb)->da.hwaddr, sdata->dev->dev_addr, - IEEE802154_ADDR_LEN)) + else if (mac_cb(skb)->dest.extended_addr == sdata->extended_addr) skb->pkt_type = PACKET_HOST; else skb->pkt_type = PACKET_OTHERHOST; break; case IEEE802154_ADDR_SHORT: - if (mac_cb(skb)->da.pan_id != sdata->pan_id && - mac_cb(skb)->da.pan_id != IEEE802154_PANID_BROADCAST) + if (mac_cb(skb)->dest.pan_id != span && + mac_cb(skb)->dest.pan_id != cpu_to_le16(IEEE802154_PANID_BROADCAST)) skb->pkt_type = PACKET_OTHERHOST; - else if (mac_cb(skb)->da.short_addr == sdata->short_addr) + else if (mac_cb(skb)->dest.short_addr == sshort) skb->pkt_type = PACKET_HOST; - else if (mac_cb(skb)->da.short_addr == - IEEE802154_ADDR_BROADCAST) + else if (mac_cb(skb)->dest.short_addr == + cpu_to_le16(IEEE802154_ADDR_BROADCAST)) skb->pkt_type = PACKET_BROADCAST; else skb->pkt_type = PACKET_OTHERHOST; @@ -437,111 +469,108 @@ mac802154_subif_frame(struct mac802154_sub_if_data *sdata, struct sk_buff *skb) skb->dev = sdata->dev; + rc = mac802154_llsec_decrypt(&sdata->sec, skb); + if (rc) { + pr_debug("decryption failed: %i\n", rc); + kfree_skb(skb); + return NET_RX_DROP; + } + sdata->dev->stats.rx_packets++; sdata->dev->stats.rx_bytes += skb->len; - switch (mac_cb_type(skb)) { + switch (mac_cb(skb)->type) { case IEEE802154_FC_TYPE_DATA: return mac802154_process_data(sdata->dev, skb); default: - pr_warning("ieee802154: bad frame received (type = %d)\n", - mac_cb_type(skb)); + pr_warn("ieee802154: bad frame received (type = %d)\n", + mac_cb(skb)->type); kfree_skb(skb); return NET_RX_DROP; } } -static int mac802154_parse_frame_start(struct sk_buff *skb) +static void mac802154_print_addr(const char *name, + const struct ieee802154_addr *addr) { - u8 *head = skb->data; - u16 fc; - - if (mac802154_fetch_skb_u16(skb, &fc) || - mac802154_fetch_skb_u8(skb, &(mac_cb(skb)->seq))) - goto err; - - pr_debug("fc: %04x dsn: %02x\n", fc, head[2]); - - mac_cb(skb)->flags = IEEE802154_FC_TYPE(fc); - mac_cb(skb)->sa.addr_type = IEEE802154_FC_SAMODE(fc); - mac_cb(skb)->da.addr_type = IEEE802154_FC_DAMODE(fc); - - if (fc & IEEE802154_FC_INTRA_PAN) - mac_cb(skb)->flags |= MAC_CB_FLAG_INTRAPAN; + if (addr->mode == IEEE802154_ADDR_NONE) + pr_debug("%s not present\n", name); - if (mac_cb(skb)->da.addr_type != IEEE802154_ADDR_NONE) { - if (mac802154_fetch_skb_u16(skb, &(mac_cb(skb)->da.pan_id))) - goto err; + pr_debug("%s PAN ID: %04x\n", name, le16_to_cpu(addr->pan_id)); + if (addr->mode == IEEE802154_ADDR_SHORT) { + pr_debug("%s is short: %04x\n", name, + le16_to_cpu(addr->short_addr)); + } else { + u64 hw = swab64((__force u64) addr->extended_addr); - /* source PAN id compression */ - if (mac_cb_is_intrapan(skb)) - mac_cb(skb)->sa.pan_id = mac_cb(skb)->da.pan_id; + pr_debug("%s is hardware: %8phC\n", name, &hw); + } +} - pr_debug("dest PAN addr: %04x\n", mac_cb(skb)->da.pan_id); +static int mac802154_parse_frame_start(struct sk_buff *skb, + struct ieee802154_hdr *hdr) +{ + int hlen; + struct ieee802154_mac_cb *cb = mac_cb_init(skb); - if (mac_cb(skb)->da.addr_type == IEEE802154_ADDR_SHORT) { - u16 *da = &(mac_cb(skb)->da.short_addr); + hlen = ieee802154_hdr_pull(skb, hdr); + if (hlen < 0) + return -EINVAL; - if (mac802154_fetch_skb_u16(skb, 
da)) - goto err; + skb->mac_len = hlen; - pr_debug("destination address is short: %04x\n", - mac_cb(skb)->da.short_addr); - } else { - if (!pskb_may_pull(skb, IEEE802154_ADDR_LEN)) - goto err; + pr_debug("fc: %04x dsn: %02x\n", le16_to_cpup((__le16 *)&hdr->fc), + hdr->seq); - mac802154_haddr_copy_swap(mac_cb(skb)->da.hwaddr, - skb->data); - skb_pull(skb, IEEE802154_ADDR_LEN); + cb->type = hdr->fc.type; + cb->ackreq = hdr->fc.ack_request; + cb->secen = hdr->fc.security_enabled; - pr_debug("destination address is hardware\n"); - } - } + mac802154_print_addr("destination", &hdr->dest); + mac802154_print_addr("source", &hdr->source); - if (mac_cb(skb)->sa.addr_type != IEEE802154_ADDR_NONE) { - /* non PAN-compression, fetch source address id */ - if (!(mac_cb_is_intrapan(skb))) { - u16 *sa_pan = &(mac_cb(skb)->sa.pan_id); - - if (mac802154_fetch_skb_u16(skb, sa_pan)) - goto err; - } + cb->source = hdr->source; + cb->dest = hdr->dest; - pr_debug("source PAN addr: %04x\n", mac_cb(skb)->da.pan_id); + if (hdr->fc.security_enabled) { + u64 key; - if (mac_cb(skb)->sa.addr_type == IEEE802154_ADDR_SHORT) { - u16 *sa = &(mac_cb(skb)->sa.short_addr); + pr_debug("seclevel %i\n", hdr->sec.level); - if (mac802154_fetch_skb_u16(skb, sa)) - goto err; + switch (hdr->sec.key_id_mode) { + case IEEE802154_SCF_KEY_IMPLICIT: + pr_debug("implicit key\n"); + break; - pr_debug("source address is short: %04x\n", - mac_cb(skb)->sa.short_addr); - } else { - if (!pskb_may_pull(skb, IEEE802154_ADDR_LEN)) - goto err; + case IEEE802154_SCF_KEY_INDEX: + pr_debug("key %02x\n", hdr->sec.key_id); + break; - mac802154_haddr_copy_swap(mac_cb(skb)->sa.hwaddr, - skb->data); - skb_pull(skb, IEEE802154_ADDR_LEN); + case IEEE802154_SCF_KEY_SHORT_INDEX: + pr_debug("key %04x:%04x %02x\n", + le32_to_cpu(hdr->sec.short_src) >> 16, + le32_to_cpu(hdr->sec.short_src) & 0xffff, + hdr->sec.key_id); + break; - pr_debug("source address is hardware\n"); + case IEEE802154_SCF_KEY_HW_INDEX: + key = swab64((__force u64) hdr->sec.extended_src); + pr_debug("key source %8phC %02x\n", &key, + hdr->sec.key_id); + break; } } return 0; -err: - return -EINVAL; } void mac802154_wpans_rx(struct mac802154_priv *priv, struct sk_buff *skb) { int ret; - struct sk_buff *sskb; struct mac802154_sub_if_data *sdata; + struct ieee802154_hdr hdr; - ret = mac802154_parse_frame_start(skb); + ret = mac802154_parse_frame_start(skb, &hdr); if (ret) { pr_debug("got invalid frame\n"); return; @@ -549,12 +578,16 @@ void mac802154_wpans_rx(struct mac802154_priv *priv, struct sk_buff *skb) rcu_read_lock(); list_for_each_entry_rcu(sdata, &priv->slaves, list) { - if (sdata->type != IEEE802154_DEV_WPAN) + if (sdata->type != IEEE802154_DEV_WPAN || + !netif_running(sdata->dev)) continue; - sskb = skb_clone(skb, GFP_ATOMIC); - if (sskb) - mac802154_subif_frame(sdata, sskb); + mac802154_subif_frame(sdata, skb, &hdr); + skb = NULL; + break; } rcu_read_unlock(); + + if (skb) + kfree_skb(skb); } diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index 851cd880b0c..6b38d083e1c 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -33,6 +33,7 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_MPLS))) goto out; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index c3398cd99b9..e9410d17619 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -414,47 +414,116 @@ config NETFILTER_SYNPROXY endif # NF_CONNTRACK config NF_TABLES - depends on NETFILTER_NETLINK + 
select NETFILTER_NETLINK tristate "Netfilter nf_tables support" + help + nftables is the new packet classification framework that intends to + replace the existing {ip,ip6,arp,eb}_tables infrastructure. It + provides a pseudo-state machine with an extensible instruction-set + (also known as expressions) that the userspace 'nft' utility + (http://www.netfilter.org/projects/nftables) uses to build the + rule-set. It also comes with the generic set infrastructure that + allows you to construct mappings between matchings and actions + for performance lookups. + + To compile it as a module, choose M here. + +config NF_TABLES_INET + depends on NF_TABLES && IPV6 + select NF_TABLES_IPV4 + select NF_TABLES_IPV6 + tristate "Netfilter nf_tables mixed IPv4/IPv6 tables support" + help + This option enables support for a mixed IPv4/IPv6 "inet" table. config NFT_EXTHDR depends on NF_TABLES tristate "Netfilter nf_tables IPv6 exthdr module" + help + This option adds the "exthdr" expression that you can use to match + IPv6 extension headers. config NFT_META depends on NF_TABLES tristate "Netfilter nf_tables meta module" + help + This option adds the "meta" expression that you can use to match and + to set packet metainformation such as the packet mark. config NFT_CT depends on NF_TABLES depends on NF_CONNTRACK tristate "Netfilter nf_tables conntrack module" + help + This option adds the "meta" expression that you can use to match + connection tracking information such as the flow state. config NFT_RBTREE depends on NF_TABLES tristate "Netfilter nf_tables rbtree set module" + help + This option adds the "rbtree" set type (Red Black tree) that is used + to build interval-based sets. config NFT_HASH depends on NF_TABLES tristate "Netfilter nf_tables hash set module" + help + This option adds the "hash" set type that is used to build one-way + mappings between matchings and actions. config NFT_COUNTER depends on NF_TABLES tristate "Netfilter nf_tables counter module" + help + This option adds the "counter" expression that you can use to + include packet and byte counters in a rule. config NFT_LOG depends on NF_TABLES tristate "Netfilter nf_tables log module" + help + This option adds the "log" expression that you can use to log + packets matching some criteria. config NFT_LIMIT depends on NF_TABLES tristate "Netfilter nf_tables limit module" + help + This option adds the "limit" expression that you can use to + ratelimit rule matchings. config NFT_NAT depends on NF_TABLES depends on NF_CONNTRACK depends on NF_NAT tristate "Netfilter nf_tables nat module" + help + This option adds the "nat" expression that you can use to perform + typical Network Address Translation (NAT) packet transformations. + +config NFT_QUEUE + depends on NF_TABLES + depends on NETFILTER_XTABLES + depends on NETFILTER_NETLINK_QUEUE + tristate "Netfilter nf_tables queue module" + help + This is required if you intend to use the userspace queueing + infrastructure (also known as NFQUEUE) from nftables. + +config NFT_REJECT + depends on NF_TABLES + default m if NETFILTER_ADVANCED=n + tristate "Netfilter nf_tables reject support" + help + This option adds the "reject" expression that you can use to + explicitly deny and notify via TCP reset/ICMP informational errors + unallowed traffic. + +config NFT_REJECT_INET + depends on NF_TABLES_INET + default NFT_REJECT + tristate config NFT_COMPAT depends on NF_TABLES @@ -858,6 +927,16 @@ config NETFILTER_XT_MATCH_BPF To compile it as a module, choose M here. If unsure, say N. 
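The NF_TABLES help text above describes a rule as a chain of "expressions" evaluated like a small instruction set. The following standalone C sketch is only a conceptual model of that idea, not kernel or nftables API; every name in it is invented for illustration.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical packet context; real expressions read registers and
 * packet/meta data instead. */
struct pkt_ctx {
        unsigned int len;
        unsigned int mark;
};

typedef bool (*expr_eval_t)(const struct pkt_ctx *pkt);

static bool expr_meta_mark(const struct pkt_ctx *pkt)
{
        return pkt->mark == 0x1;        /* e.g. "meta mark 0x1" */
}

static bool expr_payload_len(const struct pkt_ctx *pkt)
{
        return pkt->len <= 1500;
}

/* A rule is an ordered list of expressions; evaluation stops at the
 * first one that does not match. */
static bool rule_eval(const expr_eval_t *rule, size_t n,
                      const struct pkt_ctx *pkt)
{
        size_t i;

        for (i = 0; i < n; i++)
                if (!rule[i](pkt))
                        return false;
        return true;
}

int main(void)
{
        const expr_eval_t rule[] = { expr_meta_mark, expr_payload_len };
        struct pkt_ctx pkt = { .len = 64, .mark = 0x1 };

        printf("verdict: %s\n",
               rule_eval(rule, 2, &pkt) ? "matched" : "no match");
        return 0;
}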
+config NETFILTER_XT_MATCH_CGROUP + tristate '"control group" match support' + depends on NETFILTER_ADVANCED + depends on CGROUPS + select CGROUP_NET_CLASSID + ---help--- + Socket/process control group matching allows you to match locally + generated packets based on which net_cls control group processes + belong to. + config NETFILTER_XT_MATCH_CLUSTER tristate '"cluster" match support' depends on NF_CONNTRACK @@ -1035,6 +1114,15 @@ config NETFILTER_XT_MATCH_HL in the IPv6 header, or the time-to-live field in the IPv4 header of the packet. +config NETFILTER_XT_MATCH_IPCOMP + tristate '"ipcomp" match support' + depends on NETFILTER_ADVANCED + help + This match extension allows you to match a range of CPIs(16 bits) + inside IPComp header of IPSec packets. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_MATCH_IPRANGE tristate '"iprange" address range match support' depends on NETFILTER_ADVANCED @@ -1055,6 +1143,16 @@ config NETFILTER_XT_MATCH_IPVS If unsure, say N. +config NETFILTER_XT_MATCH_L2TP + tristate '"l2tp" match support' + depends on NETFILTER_ADVANCED + default L2TP + ---help--- + This option adds an "L2TP" match, which allows you to match against + L2TP protocol header fields. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_MATCH_LENGTH tristate '"length" match support' depends on NETFILTER_ADVANCED diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 394483b2c19..bffdad774da 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -70,13 +70,16 @@ nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o obj-$(CONFIG_NF_TABLES) += nf_tables.o +obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o obj-$(CONFIG_NFT_META) += nft_meta.o obj-$(CONFIG_NFT_CT) += nft_ct.o obj-$(CONFIG_NFT_LIMIT) += nft_limit.o obj-$(CONFIG_NFT_NAT) += nft_nat.o -#nf_tables-objs += nft_meta_target.o +obj-$(CONFIG_NFT_QUEUE) += nft_queue.o +obj-$(CONFIG_NFT_REJECT) += nft_reject.o +obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o obj-$(CONFIG_NFT_RBTREE) += nft_rbtree.o obj-$(CONFIG_NFT_HASH) += nft_hash.o obj-$(CONFIG_NFT_COUNTER) += nft_counter.o @@ -133,8 +136,10 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o obj-$(CONFIG_NETFILTER_XT_MATCH_HL) += xt_hl.o +obj-$(CONFIG_NETFILTER_XT_MATCH_IPCOMP) += xt_ipcomp.o obj-$(CONFIG_NETFILTER_XT_MATCH_IPRANGE) += xt_iprange.o obj-$(CONFIG_NETFILTER_XT_MATCH_IPVS) += xt_ipvs.o +obj-$(CONFIG_NETFILTER_XT_MATCH_L2TP) += xt_l2tp.o obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o @@ -142,6 +147,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o obj-$(CONFIG_NETFILTER_XT_MATCH_NFACCT) += xt_nfacct.o obj-$(CONFIG_NETFILTER_XT_MATCH_OSF) += xt_osf.o obj-$(CONFIG_NETFILTER_XT_MATCH_OWNER) += xt_owner.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CGROUP) += xt_cgroup.o obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig index a2d6263b6c6..2f7f5c32c6f 100644 --- a/net/netfilter/ipset/Kconfig +++ b/net/netfilter/ipset/Kconfig 
@@ -21,7 +21,7 @@ config IP_SET_MAX You can define here default value of the maximum number of IP sets for the kernel. - The value can be overriden by the 'max_sets' module + The value can be overridden by the 'max_sets' module parameter of the 'ip_set' module. config IP_SET_BITMAP_IP @@ -61,6 +61,15 @@ config IP_SET_HASH_IP To compile it as a module, choose M here. If unsure, say N. +config IP_SET_HASH_IPMARK + tristate "hash:ip,mark set support" + depends on IP_SET + help + This option adds the hash:ip,mark set type support, by which one + can store IPv4/IPv6 address and mark pairs. + + To compile it as a module, choose M here. If unsure, say N. + config IP_SET_HASH_IPPORT tristate "hash:ip,port set support" depends on IP_SET diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile index 44b2d38476f..231f10196cb 100644 --- a/net/netfilter/ipset/Makefile +++ b/net/netfilter/ipset/Makefile @@ -14,6 +14,7 @@ obj-$(CONFIG_IP_SET_BITMAP_PORT) += ip_set_bitmap_port.o # hash types obj-$(CONFIG_IP_SET_HASH_IP) += ip_set_hash_ip.o +obj-$(CONFIG_IP_SET_HASH_IPMARK) += ip_set_hash_ipmark.o obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o obj-$(CONFIG_IP_SET_HASH_IPPORTNET) += ip_set_hash_ipportnet.o diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index bac7e01df67..ec8114fae50 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -54,10 +54,10 @@ MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); /* When the nfnl mutex is held: */ -#define nfnl_dereference(p) \ +#define ip_set_dereference(p) \ rcu_dereference_protected(p, 1) -#define nfnl_set(inst, id) \ - nfnl_dereference((inst)->ip_set_list)[id] +#define ip_set(inst, id) \ + ip_set_dereference((inst)->ip_set_list)[id] /* * The set types are implemented in modules and registered set types @@ -271,10 +271,7 @@ ip_set_free(void *members) { pr_debug("%p: free with %s\n", members, is_vmalloc_addr(members) ? "vfree" : "kfree"); - if (is_vmalloc_addr(members)) - vfree(members); - else - kfree(members); + kvfree(members); } EXPORT_SYMBOL_GPL(ip_set_free); @@ -368,6 +365,8 @@ ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len) if (tb[IPSET_ATTR_CADT_FLAGS]) cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_WITH_FORCEADD) + set->flags |= IPSET_CREATE_FLAG_FORCEADD; for (id = 0; id < IPSET_EXT_ID_MAX; id++) { if (!add_extension(id, cadt_flags, tb)) continue; @@ -510,7 +509,7 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb, if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) - return 0; + return -IPSET_ERR_TYPE_MISMATCH; write_lock_bh(&set->lock); ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt); @@ -533,7 +532,7 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb, if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) - return 0; + return -IPSET_ERR_TYPE_MISMATCH; write_lock_bh(&set->lock); ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt); @@ -625,34 +624,6 @@ EXPORT_SYMBOL_GPL(ip_set_name_byindex); */ /* - * Find set by name, reference it once. The reference makes sure the - * thing pointed to, does not go away under our feet. - * - * The nfnl mutex is used in the function. 
- */ -ip_set_id_t -ip_set_nfnl_get(struct net *net, const char *name) -{ - ip_set_id_t i, index = IPSET_INVALID_ID; - struct ip_set *s; - struct ip_set_net *inst = ip_set_pernet(net); - - nfnl_lock(NFNL_SUBSYS_IPSET); - for (i = 0; i < inst->ip_set_max; i++) { - s = nfnl_set(inst, i); - if (s != NULL && STREQ(s->name, name)) { - __ip_set_get(s); - index = i; - break; - } - } - nfnl_unlock(NFNL_SUBSYS_IPSET); - - return index; -} -EXPORT_SYMBOL_GPL(ip_set_nfnl_get); - -/* * Find set by index, reference it once. The reference makes sure the * thing pointed to, does not go away under our feet. * @@ -668,7 +639,7 @@ ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index) return IPSET_INVALID_ID; nfnl_lock(NFNL_SUBSYS_IPSET); - set = nfnl_set(inst, index); + set = ip_set(inst, index); if (set) __ip_set_get(set); else @@ -694,7 +665,7 @@ ip_set_nfnl_put(struct net *net, ip_set_id_t index) nfnl_lock(NFNL_SUBSYS_IPSET); if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */ - set = nfnl_set(inst, index); + set = ip_set(inst, index); if (set != NULL) __ip_set_put(set); } @@ -762,7 +733,7 @@ find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id) *id = IPSET_INVALID_ID; for (i = 0; i < inst->ip_set_max; i++) { - set = nfnl_set(inst, i); + set = ip_set(inst, i); if (set != NULL && STREQ(set->name, name)) { *id = i; break; @@ -788,7 +759,7 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index, *index = IPSET_INVALID_ID; for (i = 0; i < inst->ip_set_max; i++) { - s = nfnl_set(inst, i); + s = ip_set(inst, i); if (s == NULL) { if (*index == IPSET_INVALID_ID) *index = i; @@ -911,7 +882,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, if (!list) goto cleanup; /* nfnl mutex is held, both lists are valid */ - tmp = nfnl_dereference(inst->ip_set_list); + tmp = ip_set_dereference(inst->ip_set_list); memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max); rcu_assign_pointer(inst->ip_set_list, list); /* Make sure all current packets have passed through */ @@ -928,7 +899,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, * Finally! Add our shiny new set to the list, and be done. 
*/ pr_debug("create: '%s' created with index %u!\n", set->name, index); - nfnl_set(inst, index) = set; + ip_set(inst, index) = set; return ret; @@ -953,10 +924,10 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = { static void ip_set_destroy_set(struct ip_set_net *inst, ip_set_id_t index) { - struct ip_set *set = nfnl_set(inst, index); + struct ip_set *set = ip_set(inst, index); pr_debug("set: %s\n", set->name); - nfnl_set(inst, index) = NULL; + ip_set(inst, index) = NULL; /* Must call it without holding any lock */ set->variant->destroy(set); @@ -990,7 +961,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, read_lock_bh(&ip_set_ref_lock); if (!attr[IPSET_ATTR_SETNAME]) { for (i = 0; i < inst->ip_set_max; i++) { - s = nfnl_set(inst, i); + s = ip_set(inst, i); if (s != NULL && s->ref) { ret = -IPSET_ERR_BUSY; goto out; @@ -998,7 +969,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, } read_unlock_bh(&ip_set_ref_lock); for (i = 0; i < inst->ip_set_max; i++) { - s = nfnl_set(inst, i); + s = ip_set(inst, i); if (s != NULL) ip_set_destroy_set(inst, i); } @@ -1048,7 +1019,7 @@ ip_set_flush(struct sock *ctnl, struct sk_buff *skb, if (!attr[IPSET_ATTR_SETNAME]) { for (i = 0; i < inst->ip_set_max; i++) { - s = nfnl_set(inst, i); + s = ip_set(inst, i); if (s != NULL) ip_set_flush_set(s); } @@ -1102,7 +1073,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb, name2 = nla_data(attr[IPSET_ATTR_SETNAME2]); for (i = 0; i < inst->ip_set_max; i++) { - s = nfnl_set(inst, i); + s = ip_set(inst, i); if (s != NULL && STREQ(s->name, name2)) { ret = -IPSET_ERR_EXIST_SETNAME2; goto out; @@ -1162,8 +1133,8 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb, write_lock_bh(&ip_set_ref_lock); swap(from->ref, to->ref); - nfnl_set(inst, from_id) = to; - nfnl_set(inst, to_id) = from; + ip_set(inst, from_id) = to; + ip_set(inst, to_id) = from; write_unlock_bh(&ip_set_ref_lock); return 0; @@ -1185,7 +1156,7 @@ ip_set_dump_done(struct netlink_callback *cb) struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET]; if (cb->args[IPSET_CB_ARG0]) { pr_debug("release set %s\n", - nfnl_set(inst, cb->args[IPSET_CB_INDEX])->name); + ip_set(inst, cb->args[IPSET_CB_INDEX])->name); __ip_set_put_byindex(inst, (ip_set_id_t) cb->args[IPSET_CB_INDEX]); } @@ -1282,7 +1253,7 @@ dump_last: dump_type, dump_flags, cb->args[IPSET_CB_INDEX]); for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) { index = (ip_set_id_t) cb->args[IPSET_CB_INDEX]; - set = nfnl_set(inst, index); + set = ip_set(inst, index); if (set == NULL) { if (dump_type == DUMP_ONE) { ret = -ENOENT; @@ -1360,7 +1331,7 @@ next_set: release_refcount: /* If there was an error or set is done, release set */ if (ret || !cb->args[IPSET_CB_ARG0]) { - pr_debug("release set %s\n", nfnl_set(inst, index)->name); + pr_debug("release set %s\n", ip_set(inst, index)->name); __ip_set_put_byindex(inst, index); cb->args[IPSET_CB_ARG0] = 0; } @@ -1915,7 +1886,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) find_set_and_id(inst, req_get->set.name, &id); req_get->set.index = id; if (id != IPSET_INVALID_ID) - req_get->family = nfnl_set(inst, id)->family; + req_get->family = ip_set(inst, id)->family; nfnl_unlock(NFNL_SUBSYS_IPSET); goto copy; } @@ -1929,7 +1900,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) goto done; } nfnl_lock(NFNL_SUBSYS_IPSET); - set = nfnl_set(inst, req_get->set.index); + set = ip_set(inst, req_get->set.index); strncpy(req_get->set.name, set ? 
set->name : "", IPSET_MAXNAMELEN); nfnl_unlock(NFNL_SUBSYS_IPSET); @@ -1973,7 +1944,6 @@ ip_set_net_init(struct net *net) return -ENOMEM; inst->is_deleted = 0; rcu_assign_pointer(inst->ip_set_list, list); - pr_notice("ip_set: protocol %u\n", IPSET_PROTOCOL); return 0; } @@ -1988,7 +1958,7 @@ ip_set_net_exit(struct net *net) inst->is_deleted = 1; /* flag for ip_set_nfnl_put */ for (i = 0; i < inst->ip_set_max; i++) { - set = nfnl_set(inst, i); + set = ip_set(inst, i); if (set != NULL) ip_set_destroy_set(inst, i); } @@ -2024,6 +1994,7 @@ ip_set_init(void) nfnetlink_subsys_unregister(&ip_set_netlink_subsys); return ret; } + pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL); return 0; } diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index be6932ad3a8..61c7fb05280 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -263,6 +263,9 @@ struct htype { u32 maxelem; /* max elements in the hash */ u32 elements; /* current element (vs timeout) */ u32 initval; /* random jhash init value */ +#ifdef IP_SET_HASH_WITH_MARKMASK + u32 markmask; /* markmask value for mark mask to store */ +#endif struct timer_list gc; /* garbage collection when timeout enabled */ struct mtype_elem next; /* temporary storage for uadd */ #ifdef IP_SET_HASH_WITH_MULTI @@ -454,6 +457,9 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b) #ifdef IP_SET_HASH_WITH_NETMASK x->netmask == y->netmask && #endif +#ifdef IP_SET_HASH_WITH_MARKMASK + x->markmask == y->markmask && +#endif a->extensions == b->extensions; } @@ -627,6 +633,18 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, bool flag_exist = flags & IPSET_FLAG_EXIST; u32 key, multi = 0; + if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set)) { + rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); + key = HKEY(value, h->initval, t->htable_bits); + n = hbucket(t,key); + if (n->pos) { + /* Choosing the first entry in the array to replace */ + j = 0; + goto reuse_slot; + } + rcu_read_unlock_bh(); + } if (SET_WITH_TIMEOUT(set) && h->elements >= h->maxelem) /* FIXME: when set is full, we slow down here */ mtype_expire(set, h, NLEN(set->family), set->dsize); @@ -908,6 +926,10 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask)) goto nla_put_failure; #endif +#ifdef IP_SET_HASH_WITH_MARKMASK + if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask)) + goto nla_put_failure; +#endif if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize))) goto nla_put_failure; @@ -1016,6 +1038,9 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) { u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; +#ifdef IP_SET_HASH_WITH_MARKMASK + u32 markmask; +#endif u8 hbits; #ifdef IP_SET_HASH_WITH_NETMASK u8 netmask; @@ -1026,6 +1051,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) return -IPSET_ERR_INVALID_FAMILY; + +#ifdef IP_SET_HASH_WITH_MARKMASK + markmask = 0xffffffff; +#endif #ifdef IP_SET_HASH_WITH_NETMASK netmask = set->family == NFPROTO_IPV4 ? 
32 : 128; pr_debug("Create set %s with family %s\n", @@ -1034,6 +1063,9 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || +#ifdef IP_SET_HASH_WITH_MARKMASK + !ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK) || +#endif !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; @@ -1057,6 +1089,14 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, return -IPSET_ERR_INVALID_NETMASK; } #endif +#ifdef IP_SET_HASH_WITH_MARKMASK + if (tb[IPSET_ATTR_MARKMASK]) { + markmask = ntohl(nla_get_u32(tb[IPSET_ATTR_MARKMASK])); + + if ((markmask > 4294967295u) || markmask == 0) + return -IPSET_ERR_INVALID_MARKMASK; + } +#endif hsize = sizeof(*h); #ifdef IP_SET_HASH_WITH_NETS @@ -1071,6 +1111,9 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #ifdef IP_SET_HASH_WITH_NETMASK h->netmask = netmask; #endif +#ifdef IP_SET_HASH_WITH_MARKMASK + h->markmask = markmask; +#endif get_random_bytes(&h->initval, sizeof(h->initval)); set->timeout = IPSET_NO_TIMEOUT; diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index e65fc2423d5..dd40607f878 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -25,7 +25,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Counters support */ -#define IPSET_TYPE_REV_MAX 2 /* Comments support */ +/* 2 Comments support */ +#define IPSET_TYPE_REV_MAX 3 /* Forceadd support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c new file mode 100644 index 00000000000..4eff0a29725 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -0,0 +1,321 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2013 Smoothwall Ltd. <vytas.dauksa@smoothwall.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
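The markmask handling added above is parsed at create time and rejected when zero, because a zero mask would collapse every packet mark to 0 and make the mark dimension useless. In the hash:ip,mark handlers that follow, the mark is masked before it becomes part of the element key. A minimal standalone sketch of that masking, with made-up values:

#include <stdint.h>
#include <stdio.h>

/* Illustration only: how a packet mark and the set's markmask combine
 * into the value actually stored in a hash:ip,mark element. */
static uint32_t stored_mark(uint32_t skb_mark, uint32_t markmask)
{
        return skb_mark & markmask;
}

int main(void)
{
        uint32_t markmask = 0x0000ff00;         /* hypothetical set option */

        /* both marks collapse to 0xab00, i.e. they hit the same element */
        printf("%#x\n", stored_mark(0x1234abcd, markmask));
        printf("%#x\n", stored_mark(0x5678ab99, markmask));
        return 0;
}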
+ */ + +/* Kernel module implementing an IP set type: the hash:ip,mark type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN 0 +#define IPSET_TYPE_REV_MAX 1 /* Forceadd support */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>"); +IP_SET_MODULE_DESC("hash:ip,mark", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:ip,mark"); + +/* Type specific function prefix */ +#define HTYPE hash_ipmark +#define IP_SET_HASH_WITH_MARKMASK + +/* IPv4 variant */ + +/* Member elements */ +struct hash_ipmark4_elem { + __be32 ip; + __u32 mark; +}; + +/* Common functions */ + +static inline bool +hash_ipmark4_data_equal(const struct hash_ipmark4_elem *ip1, + const struct hash_ipmark4_elem *ip2, + u32 *multi) +{ + return ip1->ip == ip2->ip && + ip1->mark == ip2->mark; +} + +static bool +hash_ipmark4_data_list(struct sk_buff *skb, + const struct hash_ipmark4_elem *data) +{ + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || + nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_ipmark4_data_next(struct hash_ipmark4_elem *next, + const struct hash_ipmark4_elem *d) +{ + next->ip = d->ip; +} + +#define MTYPE hash_ipmark4 +#define PF 4 +#define HOST_MASK 32 +#define HKEY_DATALEN sizeof(struct hash_ipmark4_elem) +#include "ip_set_hash_gen.h" + +static int +hash_ipmark4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_ipmark *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipmark4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + e.mark = skb->mark; + e.mark &= h->markmask; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_ipmark *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipmark4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 ip, ip_to = 0; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK])); + e.mark &= h->markmask; + + if (adt == IPSET_TEST || + !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) { + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_eexist(ret, flags) ? 
0 : ret; + } + + ip_to = ip = ntohl(e.ip); + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip > ip_to) + swap(ip, ip_to); + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (!cidr || cidr > 32) + return -IPSET_ERR_INVALID_CIDR; + ip_set_mask_from_to(ip, ip_to, cidr); + } + + if (retried) + ip = ntohl(h->next.ip); + for (; !before(ip_to, ip); ip++) { + e.ip = htonl(ip); + ret = adtfn(set, &e, &ext, &ext, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +/* IPv6 variant */ + +struct hash_ipmark6_elem { + union nf_inet_addr ip; + __u32 mark; +}; + +/* Common functions */ + +static inline bool +hash_ipmark6_data_equal(const struct hash_ipmark6_elem *ip1, + const struct hash_ipmark6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && + ip1->mark == ip2->mark; +} + +static bool +hash_ipmark6_data_list(struct sk_buff *skb, + const struct hash_ipmark6_elem *data) +{ + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || + nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_ipmark6_data_next(struct hash_ipmark4_elem *next, + const struct hash_ipmark6_elem *d) +{ +} + +#undef MTYPE +#undef PF +#undef HOST_MASK +#undef HKEY_DATALEN + +#define MTYPE hash_ipmark6 +#define PF 6 +#define HOST_MASK 128 +#define HKEY_DATALEN sizeof(struct hash_ipmark6_elem) +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + + +static int +hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_ipmark *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipmark6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + e.mark = skb->mark; + e.mark &= h->markmask; + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_ipmark *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipmark6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + tb[IPSET_ATTR_IP_TO] || + tb[IPSET_ATTR_CIDR])) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK])); + e.mark &= h->markmask; + + if (adt == IPSET_TEST) { + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_eexist(ret, flags) ? 
0 : ret; + } + + ret = adtfn(set, &e, &ext, &ext, flags); + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + + return ret; +} + +static struct ip_set_type hash_ipmark_type __read_mostly = { + .name = "hash:ip,mark", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_MARK, + .dimension = IPSET_DIM_TWO, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = hash_ipmark_create, + .create_policy = { + [IPSET_ATTR_MARKMASK] = { .type = NLA_U32 }, + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_MARK] = { .type = NLA_U32 }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_ipmark_init(void) +{ + return ip_set_type_register(&hash_ipmark_type); +} + +static void __exit +hash_ipmark_fini(void) +{ + ip_set_type_unregister(&hash_ipmark_type); +} + +module_init(hash_ipmark_init); +module_exit(hash_ipmark_fini); diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 525a595dd1f..7597b82a8b0 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -27,7 +27,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Counters support added */ -#define IPSET_TYPE_REV_MAX 3 /* Comments support added */ +/* 3 Comments support added */ +#define IPSET_TYPE_REV_MAX 4 /* Forceadd support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index f5636631466..672655ffd57 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -27,7 +27,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Counters support added */ -#define IPSET_TYPE_REV_MAX 3 /* Comments support added */ +/* 3 Comments support added */ +#define IPSET_TYPE_REV_MAX 4 /* Forceadd support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 5d87fe8a41f..7308d84f927 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -29,7 +29,8 @@ /* 2 Range as input support for IPv4 added */ /* 3 nomatch flag support added */ /* 4 Counters support added */ -#define IPSET_TYPE_REV_MAX 5 /* Comments support added */ +/* 5 Comments support added */ +#define IPSET_TYPE_REV_MAX 6 /* Forceadd support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 8295cf4f9fd..4c7d495783a 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ 
-26,7 +26,8 @@ /* 1 Range as input support for IPv4 added */ /* 2 nomatch flag support added */ /* 3 Counters support added */ -#define IPSET_TYPE_REV_MAX 4 /* Comments support added */ +/* 4 Comments support added */ +#define IPSET_TYPE_REV_MAX 5 /* Forceadd support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 3f64a66bf5d..db2606805b3 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -27,7 +27,8 @@ /* 1 nomatch flag support added */ /* 2 /0 support added */ /* 3 Counters support added */ -#define IPSET_TYPE_REV_MAX 4 /* Comments support added */ +/* 4 Comments support added */ +#define IPSET_TYPE_REV_MAX 5 /* Forceadd support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -46,31 +47,12 @@ struct iface_node { static void rbtree_destroy(struct rb_root *root) { - struct rb_node *p, *n = root->rb_node; - struct iface_node *node; - - /* Non-recursive destroy, like in ext3 */ - while (n) { - if (n->rb_left) { - n = n->rb_left; - continue; - } - if (n->rb_right) { - n = n->rb_right; - continue; - } - p = rb_parent(n); - node = rb_entry(n, struct iface_node, node); - if (!p) - *root = RB_ROOT; - else if (p->rb_left == n) - p->rb_left = NULL; - else if (p->rb_right == n) - p->rb_right = NULL; + struct iface_node *node, *next; + rbtree_postorder_for_each_entry_safe(node, next, root, node) kfree(node); - n = p; - } + + *root = RB_ROOT; } static int diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 2bc2dec20b0..3e99987e4bf 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -24,7 +24,7 @@ #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 -#define IPSET_TYPE_REV_MAX 0 +#define IPSET_TYPE_REV_MAX 1 /* Forceadd support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); @@ -59,7 +59,7 @@ hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1, u32 *multi) { return ip1->ipcmp == ip2->ipcmp && - ip2->ccmp == ip2->ccmp; + ip1->ccmp == ip2->ccmp; } static inline int @@ -112,10 +112,10 @@ hash_netnet4_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -334,10 +334,10 @@ hash_netnet6_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 7097fb0141b..1c645fbd09c 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -28,7 +28,8 @@ /* 2 Range as input support for IPv4 added */ /* 3 nomatch flag support added */ /* 4 Counters support added */ -#define IPSET_TYPE_REV_MAX 5 /* Comments support added */ +/* 5 Comments support added */ +#define IPSET_TYPE_REV_MAX 6 /* Forceadd support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 
703d1192a6a..c0d2ba73f8b 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -25,7 +25,8 @@ #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 -#define IPSET_TYPE_REV_MAX 0 /* Comments support added */ +/* 0 Comments support added */ +#define IPSET_TYPE_REV_MAX 1 /* Forceadd support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c index 4f29fa97044..04d15fdc99e 100644 --- a/net/netfilter/ipset/pfxlen.c +++ b/net/netfilter/ipset/pfxlen.c @@ -7,8 +7,8 @@ #define E(a, b, c, d) \ {.ip6 = { \ - __constant_htonl(a), __constant_htonl(b), \ - __constant_htonl(c), __constant_htonl(d), \ + htonl(a), htonl(b), \ + htonl(c), htonl(d), \ } } /* diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 4c8e5c0aa1a..610e19c0e13 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -797,7 +797,6 @@ static void ip_vs_conn_expire(unsigned long data) ip_vs_control_del(cp); if (cp->flags & IP_VS_CONN_F_NFCT) { - ip_vs_conn_drop_conntrack(cp); /* Do not access conntracks during subsys cleanup * because nf_conntrack_find_get can not be used after * conntrack cleanup for the net. @@ -871,11 +870,11 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, cp->protocol = p->protocol; ip_vs_addr_set(p->af, &cp->caddr, p->caddr); cp->cport = p->cport; - ip_vs_addr_set(p->af, &cp->vaddr, p->vaddr); - cp->vport = p->vport; - /* proto should only be IPPROTO_IP if d_addr is a fwmark */ + /* proto should only be IPPROTO_IP if p->vaddr is a fwmark */ ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, - &cp->daddr, daddr); + &cp->vaddr, p->vaddr); + cp->vport = p->vport; + ip_vs_addr_set(p->af, &cp->daddr, daddr); cp->dport = dport; cp->flags = flags; cp->fwmark = fwmark; @@ -1209,7 +1208,7 @@ void ip_vs_random_dropentry(struct net *net) * Randomly scan 1/32 of the whole table every second */ for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { - unsigned int hash = net_random() & ip_vs_conn_tab_mask; + unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask; hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { if (cp->flags & IP_VS_CONN_F_TEMPLATE) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 4f26ee46b51..e6836755c45 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -97,7 +97,7 @@ const char *ip_vs_proto_name(unsigned int proto) return "ICMPv6"; #endif default: - sprintf(buf, "IP_%d", proto); + sprintf(buf, "IP_%u", proto); return buf; } } @@ -1392,15 +1392,19 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) if (ipip) { __be32 info = ic->un.gateway; + __u8 type = ic->type; + __u8 code = ic->code; /* Update the MTU */ if (ic->type == ICMP_DEST_UNREACH && ic->code == ICMP_FRAG_NEEDED) { struct ip_vs_dest *dest = cp->dest; u32 mtu = ntohs(ic->un.frag.mtu); + __be16 frag_off = cih->frag_off; /* Strip outer IP and ICMP, go to IPIP header */ - __skb_pull(skb, ihl + sizeof(_icmph)); + if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL) + goto ignore_ipip; offset2 -= ihl + sizeof(_icmph); skb_reset_network_header(skb); IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", @@ -1408,7 +1412,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) ipv4_update_pmtu(skb, dev_net(skb->dev), mtu, 0, 0, 0, 0); /* Client uses PMTUD? 
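In the ip_vs_in_icmp() hunk above, type, code and frag_off are copied into locals and __skb_pull() becomes a checked pskb_pull(); once data has been pulled (and possibly reallocated), earlier header pointers such as ic and cih can no longer be trusted. A standalone toy version of that defensive pattern, with an invented buffer type rather than sk_buff:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy buffer whose "pull" reallocates, so old pointers into it dangle. */
struct buf {
        unsigned char *data;
        size_t len;
};

static int buf_pull(struct buf *b, size_t n)
{
        unsigned char *fresh;

        if (n > b->len || !(fresh = malloc(b->len - n)))
                return -1;
        memcpy(fresh, b->data + n, b->len - n);
        free(b->data);                  /* old header pointers are stale now */
        b->data = fresh;
        b->len -= n;
        return 0;
}

int main(void)
{
        struct buf b = { .data = malloc(4), .len = 4 };
        unsigned char type, code;

        memcpy(b.data, "\x03\x04ok", 4);
        type = b.data[0];               /* copy scalars before the pull ... */
        code = b.data[1];
        if (buf_pull(&b, 2) == 0)       /* ... the old b.data is gone here */
                printf("type=%u code=%u payload=%.2s\n", type, code, b.data);
        free(b.data);
        return 0;
}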
*/ - if (!(cih->frag_off & htons(IP_DF))) + if (!(frag_off & htons(IP_DF))) goto ignore_ipip; /* Prefer the resulting PMTU */ if (dest) { @@ -1427,12 +1431,13 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) /* Strip outer IP, ICMP and IPIP, go to IP header of * original request. */ - __skb_pull(skb, offset2); + if (pskb_pull(skb, offset2) == NULL) + goto ignore_ipip; skb_reset_network_header(skb); IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n", &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, - ic->type, ic->code, ntohl(info)); - icmp_send(skb, ic->type, ic->code, info); + type, code, ntohl(info)); + icmp_send(skb, type, code, info); /* ICMP can be shorter but anyways, account it */ ip_vs_out_stats(cp, skb); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 35be035ee0c..581a6584ed0 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2177,10 +2177,10 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) __u64 inbytes, outbytes; do { - start = u64_stats_fetch_begin_bh(&u->syncp); + start = u64_stats_fetch_begin_irq(&u->syncp); inbytes = u->ustats.inbytes; outbytes = u->ustats.outbytes; - } while (u64_stats_fetch_retry_bh(&u->syncp, start)); + } while (u64_stats_fetch_retry_irq(&u->syncp, start)); seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n", i, u->ustats.conns, u->ustats.inpkts, @@ -3580,7 +3580,7 @@ out: } -static const struct genl_ops ip_vs_genl_ops[] __read_mostly = { +static const struct genl_ops ip_vs_genl_ops[] = { { .cmd = IPVS_CMD_NEW_SERVICE, .flags = GENL_ADMIN_PERM, @@ -3778,6 +3778,7 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); + ip_vs_stop_estimator(net, &ipvs->tot_stats); } #else @@ -3840,7 +3841,6 @@ void __net_exit ip_vs_control_net_cleanup(struct net *net) struct netns_ipvs *ipvs = net_ipvs(net); ip_vs_trash_cleanup(net); - ip_vs_stop_estimator(net, &ipvs->tot_stats); ip_vs_control_net_cleanup_sysctl(net); remove_proc_entry("ip_vs_stats_percpu", net->proc_net); remove_proc_entry("ip_vs_stats", net->proc_net); diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index ca056a331e6..547ff33c1ef 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -238,7 +238,7 @@ static void ip_vs_lblc_flush(struct ip_vs_service *svc) spin_lock_bh(&svc->sched_lock); tbl->dead = 1; - for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { + for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { ip_vs_lblc_del(en); atomic_dec(&tbl->entries); @@ -265,7 +265,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) unsigned long now = jiffies; int i, j; - for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { + for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLC_TAB_MASK; spin_lock(&svc->sched_lock); @@ -321,7 +321,7 @@ static void ip_vs_lblc_check_expire(unsigned long data) if (goal > tbl->max_size/2) goal = tbl->max_size/2; - for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { + for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLC_TAB_MASK; spin_lock(&svc->sched_lock); @@ -340,7 +340,7 @@ static void ip_vs_lblc_check_expire(unsigned long data) tbl->rover = j; out: - mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); + 
mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); } @@ -363,7 +363,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) /* * Initialize the hash buckets */ - for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { + for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { INIT_HLIST_HEAD(&tbl->bucket[i]); } tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; @@ -536,8 +536,7 @@ out: /* * IPVS LBLC Scheduler structure */ -static struct ip_vs_scheduler ip_vs_lblc_scheduler = -{ +static struct ip_vs_scheduler ip_vs_lblc_scheduler = { .name = "lblc", .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index d5f41514f57..5882bbfd198 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -62,6 +62,7 @@ #include <net/ip_vs.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_zones.h> @@ -96,6 +97,11 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) return; + /* Applications may adjust TCP seqs */ + if (cp->app && nf_ct_protonum(ct) == IPPROTO_TCP && + !nfct_seqadj(ct) && !nfct_seqadj_ext_add(ct)) + return; + /* * The connection is not yet in the hashtable, so we update it. * CIP->VIP will remain the same, so leave the tuple in diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index f63c2388f38..db801263ee9 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1637,7 +1637,10 @@ static int sync_thread_master(void *data) continue; } while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { - int ret = __wait_event_interruptible(*sk_sleep(sk), + /* (Ab)use interruptible sleep to avoid increasing + * the load avg. + */ + __wait_event_interruptible(*sk_sleep(sk), sock_writeable(sk) || kthread_should_stop()); if (unlikely(kthread_should_stop())) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index c47444e4cf8..73ba1cc7a88 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -562,7 +562,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_send_check(iph); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); rcu_read_unlock(); @@ -590,7 +590,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); rcu_read_unlock(); @@ -684,7 +684,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, MTU problem. */ /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); rcu_read_unlock(); @@ -774,7 +774,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, MTU problem. 
*/ /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); rcu_read_unlock(); @@ -883,10 +883,10 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, iph->daddr = cp->daddr.ip; iph->saddr = saddr; iph->ttl = old_iph->ttl; - ip_select_ident(skb, &rt->dst, NULL); + ip_select_ident(skb, NULL); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) @@ -974,7 +974,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, iph->hop_limit = old_iph->hop_limit; /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) @@ -1023,7 +1023,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_send_check(ip_hdr(skb)); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); rcu_read_unlock(); @@ -1060,7 +1060,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); rcu_read_unlock(); @@ -1157,7 +1157,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_vs_nat_icmp(skb, pp, cp, 0); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); rcu_read_unlock(); @@ -1249,7 +1249,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, ip_vs_nat_icmp_v6(skb, pp, cp, 0); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); rcu_read_unlock(); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 43549eb7a7b..1f4f954c4b4 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -60,14 +60,59 @@ int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct, const struct nlattr *attr) __read_mostly; EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook); -int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int protoff); -EXPORT_SYMBOL_GPL(nf_nat_seq_adjust_hook); +__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; +EXPORT_SYMBOL_GPL(nf_conntrack_locks); -DEFINE_SPINLOCK(nf_conntrack_lock); -EXPORT_SYMBOL_GPL(nf_conntrack_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); +EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); + +static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2) +{ + h1 %= CONNTRACK_LOCKS; + h2 %= CONNTRACK_LOCKS; + spin_unlock(&nf_conntrack_locks[h1]); + if (h1 != h2) + spin_unlock(&nf_conntrack_locks[h2]); +} + +/* return true if we need to recompute hashes (in case hash table was resized) */ +static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, + unsigned int h2, unsigned int sequence) +{ + h1 %= CONNTRACK_LOCKS; + h2 %= CONNTRACK_LOCKS; + if (h1 <= h2) { + spin_lock(&nf_conntrack_locks[h1]); + if (h1 != h2) + spin_lock_nested(&nf_conntrack_locks[h2], + SINGLE_DEPTH_NESTING); + } else { + spin_lock(&nf_conntrack_locks[h2]); + spin_lock_nested(&nf_conntrack_locks[h1], + SINGLE_DEPTH_NESTING); + } + if 
(read_seqcount_retry(&net->ct.generation, sequence)) { + nf_conntrack_double_unlock(h1, h2); + return true; + } + return false; +} + +static void nf_conntrack_all_lock(void) +{ + int i; + + for (i = 0; i < CONNTRACK_LOCKS; i++) + spin_lock_nested(&nf_conntrack_locks[i], i); +} + +static void nf_conntrack_all_unlock(void) +{ + int i; + + for (i = 0; i < CONNTRACK_LOCKS; i++) + spin_unlock(&nf_conntrack_locks[i]); +} unsigned int nf_conntrack_htable_size __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); @@ -198,6 +243,50 @@ clean_from_lists(struct nf_conn *ct) nf_ct_remove_expectations(ct); } +/* must be called with local_bh_disable */ +static void nf_ct_add_to_dying_list(struct nf_conn *ct) +{ + struct ct_pcpu *pcpu; + + /* add this conntrack to the (per cpu) dying list */ + ct->cpu = smp_processor_id(); + pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); + + spin_lock(&pcpu->lock); + hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &pcpu->dying); + spin_unlock(&pcpu->lock); +} + +/* must be called with local_bh_disable */ +static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct) +{ + struct ct_pcpu *pcpu; + + /* add this conntrack to the (per cpu) unconfirmed list */ + ct->cpu = smp_processor_id(); + pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); + + spin_lock(&pcpu->lock); + hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &pcpu->unconfirmed); + spin_unlock(&pcpu->lock); +} + +/* must be called with local_bh_disable */ +static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct) +{ + struct ct_pcpu *pcpu; + + /* We overload first tuple to link into unconfirmed or dying list.*/ + pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); + + spin_lock(&pcpu->lock); + BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + spin_unlock(&pcpu->lock); +} + static void destroy_conntrack(struct nf_conntrack *nfct) { @@ -209,9 +298,6 @@ destroy_conntrack(struct nf_conntrack *nfct) NF_CT_ASSERT(atomic_read(&nfct->use) == 0); NF_CT_ASSERT(!timer_pending(&ct->timeout)); - /* To make sure we don't get any weird locking issues here: - * destroy_conntrack() MUST NOT be called with a write lock - * to nf_conntrack_lock!!! -HW */ rcu_read_lock(); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); if (l4proto && l4proto->destroy) @@ -219,19 +305,18 @@ destroy_conntrack(struct nf_conntrack *nfct) rcu_read_unlock(); - spin_lock_bh(&nf_conntrack_lock); + local_bh_disable(); /* Expectations will have been removed in clean_from_lists, * except TFTP can create an expectation on the first packet, * before connection is in the list, so we need to clean here, - * too. */ + * too. 
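nf_conntrack_double_lock() above replaces the single nf_conntrack_lock with per-bucket locks and always takes the lower-indexed lock first, so two paths locking the same pair of buckets in opposite order cannot deadlock. The ordering rule, reduced to a standalone pthread sketch (NLOCKS and the helpers are illustrative, not the kernel code):

#include <pthread.h>
#include <stdio.h>

#define NLOCKS 16

static pthread_mutex_t locks[NLOCKS];

/* Acquire the lower-indexed lock first so concurrent lockers of the
 * same bucket pair cannot wait on each other in a cycle. */
static void double_lock(unsigned int h1, unsigned int h2)
{
        h1 %= NLOCKS;
        h2 %= NLOCKS;
        if (h1 > h2) {
                unsigned int tmp = h1;

                h1 = h2;
                h2 = tmp;
        }
        pthread_mutex_lock(&locks[h1]);
        if (h1 != h2)
                pthread_mutex_lock(&locks[h2]);
}

static void double_unlock(unsigned int h1, unsigned int h2)
{
        h1 %= NLOCKS;
        h2 %= NLOCKS;
        pthread_mutex_unlock(&locks[h1]);
        if (h1 != h2)
                pthread_mutex_unlock(&locks[h2]);
}

int main(void)
{
        unsigned int i;

        for (i = 0; i < NLOCKS; i++)
                pthread_mutex_init(&locks[i], NULL);
        double_lock(3, 11);
        printf("holding buckets 3 and 11\n");
        double_unlock(3, 11);
        return 0;
}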
+ */ nf_ct_remove_expectations(ct); - /* We overload first tuple to link into unconfirmed or dying list.*/ - BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); - hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + nf_ct_del_from_dying_or_unconfirmed_list(ct); NF_CT_STAT_INC(net, delete); - spin_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); if (ct->master) nf_ct_put(ct->master); @@ -243,17 +328,28 @@ destroy_conntrack(struct nf_conntrack *nfct) static void nf_ct_delete_from_lists(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); + unsigned int hash, reply_hash; + u16 zone = nf_ct_zone(ct); + unsigned int sequence; nf_ct_helper_destroy(ct); - spin_lock_bh(&nf_conntrack_lock); - /* Inside lock so preempt is disabled on module removal path. - * Otherwise we can get spurious warnings. */ - NF_CT_STAT_INC(net, delete_list); + + local_bh_disable(); + do { + sequence = read_seqcount_begin(&net->ct.generation); + hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + reply_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); + clean_from_lists(ct); - /* add this conntrack to the dying list */ - hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &net->ct.dying); - spin_unlock_bh(&nf_conntrack_lock); + nf_conntrack_double_unlock(hash, reply_hash); + + nf_ct_add_to_dying_list(ct); + + NF_CT_STAT_INC(net, delete_list); + local_bh_enable(); } static void death_by_event(unsigned long ul_conntrack) @@ -318,12 +414,25 @@ static void death_by_timeout(unsigned long ul_conntrack) nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0); } +static inline bool +nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, + const struct nf_conntrack_tuple *tuple, + u16 zone) +{ + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + /* A conntrack can be recreated with the equal tuple, + * so we need to check that the conntrack is confirmed + */ + return nf_ct_tuple_equal(tuple, &h->tuple) && + nf_ct_zone(ct) == zone && + nf_ct_is_confirmed(ct); +} + /* * Warning : * - Caller must take a reference on returned object * and recheck nf_ct_tuple_equal(tuple, &h->tuple) - * OR - * - Caller must lock nf_conntrack_lock before calling this function */ static struct nf_conntrack_tuple_hash * ____nf_conntrack_find(struct net *net, u16 zone, @@ -339,8 +448,7 @@ ____nf_conntrack_find(struct net *net, u16 zone, local_bh_disable(); begin: hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) { - if (nf_ct_tuple_equal(tuple, &h->tuple) && - nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) { + if (nf_ct_key_equal(h, tuple, zone)) { NF_CT_STAT_INC(net, found); local_bh_enable(); return h; @@ -361,15 +469,6 @@ begin: return NULL; } -struct nf_conntrack_tuple_hash * -__nf_conntrack_find(struct net *net, u16 zone, - const struct nf_conntrack_tuple *tuple) -{ - return ____nf_conntrack_find(net, zone, tuple, - hash_conntrack_raw(tuple, zone)); -} -EXPORT_SYMBOL_GPL(__nf_conntrack_find); - /* Find a connection corresponding to a tuple. 
*/ static struct nf_conntrack_tuple_hash * __nf_conntrack_find_get(struct net *net, u16 zone, @@ -387,8 +486,7 @@ begin: !atomic_inc_not_zero(&ct->ct_general.use))) h = NULL; else { - if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) || - nf_ct_zone(ct) != zone)) { + if (unlikely(!nf_ct_key_equal(h, tuple, zone))) { nf_ct_put(ct); goto begin; } @@ -410,32 +508,36 @@ EXPORT_SYMBOL_GPL(nf_conntrack_find_get); static void __nf_conntrack_hash_insert(struct nf_conn *ct, unsigned int hash, - unsigned int repl_hash) + unsigned int reply_hash) { struct net *net = nf_ct_net(ct); hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, &net->ct.hash[hash]); hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, - &net->ct.hash[repl_hash]); + &net->ct.hash[reply_hash]); } int nf_conntrack_hash_check_insert(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); - unsigned int hash, repl_hash; + unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; u16 zone; + unsigned int sequence; zone = nf_ct_zone(ct); - hash = hash_conntrack(net, zone, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - repl_hash = hash_conntrack(net, zone, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - spin_lock_bh(&nf_conntrack_lock); + local_bh_disable(); + do { + sequence = read_seqcount_begin(&net->ct.generation); + hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + reply_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* See if there's one in the list already, including reverse */ hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) @@ -443,32 +545,57 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) &h->tuple) && zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) goto out; - hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) + hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &h->tuple) && zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) goto out; add_timer(&ct->timeout); - nf_conntrack_get(&ct->ct_general); - __nf_conntrack_hash_insert(ct, hash, repl_hash); + smp_wmb(); + /* The caller holds a reference to this object */ + atomic_set(&ct->ct_general.use, 2); + __nf_conntrack_hash_insert(ct, hash, reply_hash); + nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert); - spin_unlock_bh(&nf_conntrack_lock); - + local_bh_enable(); return 0; out: + nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert_failed); - spin_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); return -EEXIST; } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); +/* deletion from this larval template list happens via nf_ct_put() */ +void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl) +{ + struct ct_pcpu *pcpu; + + __set_bit(IPS_TEMPLATE_BIT, &tmpl->status); + __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); + nf_conntrack_get(&tmpl->ct_general); + + /* add this conntrack to the (per cpu) tmpl list */ + local_bh_disable(); + tmpl->cpu = smp_processor_id(); + pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu); + + spin_lock(&pcpu->lock); + /* Overload tuple linked list to put us in template list. 
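nf_ct_add_to_dying_list(), nf_ct_add_to_unconfirmed_list() and nf_conntrack_tmpl_insert() above keep their entries on per-cpu lists, each guarded by its own spinlock, instead of one global list under a global lock. A small standalone model of that sharding (fixed "CPU" count and names invented for the example):

#include <pthread.h>
#include <stdio.h>

#define NCPU 4

struct node {
        int value;
        struct node *next;
};

/* One list head and one lock per "CPU": threads mostly touch their own
 * shard, so they rarely contend on the same lock. */
struct shard {
        pthread_mutex_t lock;
        struct node *head;
};

static struct shard shards[NCPU];

static void shard_add(unsigned int cpu, struct node *n)
{
        struct shard *s = &shards[cpu % NCPU];

        pthread_mutex_lock(&s->lock);
        n->next = s->head;
        s->head = n;
        pthread_mutex_unlock(&s->lock);
}

int main(void)
{
        static struct node a = { .value = 1 }, b = { .value = 2 };
        unsigned int i;

        for (i = 0; i < NCPU; i++)
                pthread_mutex_init(&shards[i].lock, NULL);
        shard_add(0, &a);
        shard_add(3, &b);
        printf("cpu0 head=%d cpu3 head=%d\n",
               shards[0].head->value, shards[3].head->value);
        return 0;
}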
*/ + hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &pcpu->tmpl); + spin_unlock_bh(&pcpu->lock); +} +EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert); + /* Confirm a connection given skb; places it in hash table */ int __nf_conntrack_confirm(struct sk_buff *skb) { - unsigned int hash, repl_hash; + unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct nf_conn_help *help; @@ -477,6 +604,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) enum ip_conntrack_info ctinfo; struct net *net; u16 zone; + unsigned int sequence; ct = nf_ct_get(skb, &ctinfo); net = nf_ct_net(ct); @@ -489,31 +617,37 @@ __nf_conntrack_confirm(struct sk_buff *skb) return NF_ACCEPT; zone = nf_ct_zone(ct); - /* reuse the hash saved before */ - hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; - hash = hash_bucket(hash, net); - repl_hash = hash_conntrack(net, zone, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + local_bh_disable(); + + do { + sequence = read_seqcount_begin(&net->ct.generation); + /* reuse the hash saved before */ + hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; + hash = hash_bucket(hash, net); + reply_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* We're not in hash table, and we refuse to set up related - connections for unconfirmed conns. But packet copies and - REJECT will give spurious warnings here. */ + * connections for unconfirmed conns. But packet copies and + * REJECT will give spurious warnings here. + */ /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */ /* No external references means no one else could have - confirmed us. */ + * confirmed us. + */ NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); pr_debug("Confirming conntrack %p\n", ct); - - spin_lock_bh(&nf_conntrack_lock); - /* We have to check the DYING flag inside the lock to prevent a race against nf_ct_get_next_corpse() possibly called from user context, else we insert an already 'dead' hash, blocking further use of that particular connection -JM */ if (unlikely(nf_ct_is_dying(ct))) { - spin_unlock_bh(&nf_conntrack_lock); + nf_conntrack_double_unlock(hash, reply_hash); + local_bh_enable(); return NF_ACCEPT; } @@ -525,14 +659,13 @@ __nf_conntrack_confirm(struct sk_buff *skb) &h->tuple) && zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) goto out; - hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) + hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &h->tuple) && zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) goto out; - /* Remove from unconfirmed list */ - hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + nf_ct_del_from_dying_or_unconfirmed_list(ct); /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in @@ -555,9 +688,10 @@ __nf_conntrack_confirm(struct sk_buff *skb) * guarantee that no other CPU can find the conntrack before the above * stores are visible. 
*/ - __nf_conntrack_hash_insert(ct, hash, repl_hash); + __nf_conntrack_hash_insert(ct, hash, reply_hash); + nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert); - spin_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); help = nfct_help(ct); if (help && help->helper) @@ -568,8 +702,9 @@ __nf_conntrack_confirm(struct sk_buff *skb) return NF_ACCEPT; out: + nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert_failed); - spin_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); return NF_DROP; } EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); @@ -612,39 +747,48 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken); /* There's a small race here where we may free a just-assured connection. Too bad: we're in trouble anyway. */ -static noinline int early_drop(struct net *net, unsigned int hash) +static noinline int early_drop(struct net *net, unsigned int _hash) { /* Use oldest entry, which is roughly LRU */ struct nf_conntrack_tuple_hash *h; struct nf_conn *ct = NULL, *tmp; struct hlist_nulls_node *n; - unsigned int i, cnt = 0; + unsigned int i = 0, cnt = 0; int dropped = 0; + unsigned int hash, sequence; + spinlock_t *lockp; - rcu_read_lock(); - for (i = 0; i < net->ct.htable_size; i++) { + local_bh_disable(); +restart: + sequence = read_seqcount_begin(&net->ct.generation); + hash = hash_bucket(_hash, net); + for (; i < net->ct.htable_size; i++) { + lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS]; + spin_lock(lockp); + if (read_seqcount_retry(&net->ct.generation, sequence)) { + spin_unlock(lockp); + goto restart; + } hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) { tmp = nf_ct_tuplehash_to_ctrack(h); - if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) + if (!test_bit(IPS_ASSURED_BIT, &tmp->status) && + !nf_ct_is_dying(tmp) && + atomic_inc_not_zero(&tmp->ct_general.use)) { ct = tmp; + break; + } cnt++; } - if (ct != NULL) { - if (likely(!nf_ct_is_dying(ct) && - atomic_inc_not_zero(&ct->ct_general.use))) - break; - else - ct = NULL; - } + hash = (hash + 1) % net->ct.htable_size; + spin_unlock(lockp); - if (cnt >= NF_CT_EVICTION_RANGE) + if (ct || cnt >= NF_CT_EVICTION_RANGE) break; - hash = (hash + 1) % net->ct.htable_size; } - rcu_read_unlock(); + local_bh_enable(); if (!ct) return dropped; @@ -693,7 +837,7 @@ __nf_conntrack_alloc(struct net *net, u16 zone, if (nf_conntrack_max && unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { - if (!early_drop(net, hash_bucket(hash, net))) { + if (!early_drop(net, hash)) { atomic_dec(&net->ct.count); net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); return ERR_PTR(-ENOMEM); @@ -735,11 +879,10 @@ __nf_conntrack_alloc(struct net *net, u16 zone, nf_ct_zone->id = zone; } #endif - /* - * changes to lookup keys must be done before setting refcnt to 1 + /* Because we use RCU lookups, we set ct_general.use to zero before + * this is inserted in any list. 
*/ - smp_wmb(); - atomic_set(&ct->ct_general.use, 1); + atomic_set(&ct->ct_general.use, 0); return ct; #ifdef CONFIG_NF_CONNTRACK_ZONES @@ -763,10 +906,15 @@ void nf_conntrack_free(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); + /* A freed object has refcnt == 0, that's + * the golden rule for SLAB_DESTROY_BY_RCU + */ + NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0); + nf_ct_ext_destroy(ct); nf_ct_ext_free(ct); kmem_cache_free(net->ct.nf_conntrack_cachep, ct); - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&net->ct.count); } EXPORT_SYMBOL_GPL(nf_conntrack_free); @@ -786,7 +934,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, struct nf_conn_help *help; struct nf_conntrack_tuple repl_tuple; struct nf_conntrack_ecache *ecache; - struct nf_conntrack_expect *exp; + struct nf_conntrack_expect *exp = NULL; u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; struct nf_conn_timeout *timeout_ext; unsigned int *timeouts; @@ -830,39 +978,44 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, ecache ? ecache->expmask : 0, GFP_ATOMIC); - spin_lock_bh(&nf_conntrack_lock); - exp = nf_ct_find_expectation(net, zone, tuple); - if (exp) { - pr_debug("conntrack: expectation arrives ct=%p exp=%p\n", - ct, exp); - /* Welcome, Mr. Bond. We've been expecting you... */ - __set_bit(IPS_EXPECTED_BIT, &ct->status); - ct->master = exp->master; - if (exp->helper) { - help = nf_ct_helper_ext_add(ct, exp->helper, - GFP_ATOMIC); - if (help) - rcu_assign_pointer(help->helper, exp->helper); - } + local_bh_disable(); + if (net->ct.expect_count) { + spin_lock(&nf_conntrack_expect_lock); + exp = nf_ct_find_expectation(net, zone, tuple); + if (exp) { + pr_debug("conntrack: expectation arrives ct=%p exp=%p\n", + ct, exp); + /* Welcome, Mr. Bond. We've been expecting you... */ + __set_bit(IPS_EXPECTED_BIT, &ct->status); + /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ + ct->master = exp->master; + if (exp->helper) { + help = nf_ct_helper_ext_add(ct, exp->helper, + GFP_ATOMIC); + if (help) + rcu_assign_pointer(help->helper, exp->helper); + } #ifdef CONFIG_NF_CONNTRACK_MARK - ct->mark = exp->master->mark; + ct->mark = exp->master->mark; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK - ct->secmark = exp->master->secmark; + ct->secmark = exp->master->secmark; #endif - nf_conntrack_get(&ct->master->ct_general); - NF_CT_STAT_INC(net, expect_new); - } else { + NF_CT_STAT_INC(net, expect_new); + } + spin_unlock(&nf_conntrack_expect_lock); + } + if (!exp) { __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); NF_CT_STAT_INC(net, new); } - /* Overload tuple linked list to put us in unconfirmed list. 
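[Annotation, not part of the patch.] The hunks above change the conntrack refcount discipline: objects are now allocated with ct_general.use == 0, nf_conntrack_free() asserts the count is zero, and every lookup path (early_drop(), ____nf_conntrack_find(), __nf_conntrack_find_get()) takes its reference with atomic_inc_not_zero() and re-validates the entry via nf_ct_key_equal(). That is the standard SLAB_DESTROY_BY_RCU discipline; a compressed illustration with invented names (example_obj, example_find_get), not the patch's code:

#include <linux/types.h>
#include <linux/atomic.h>

struct example_obj {
	atomic_t	use;	/* 0 == not yet visible, or being freed */
	u32		key;
};

/* Called under rcu_read_lock() on a candidate found in a hash bucket. */
static struct example_obj *example_find_get(struct example_obj *candidate,
					    u32 key)
{
	if (!atomic_inc_not_zero(&candidate->use))
		return NULL;			/* dying or not yet inserted */

	if (candidate->key != key) {
		/* The slab slot was freed and recycled for another entry
		 * while we looked at it; drop the reference we just took
		 * (real code uses nf_ct_put(), which frees on the final put)
		 * and let the caller restart the bucket walk.
		 */
		atomic_dec(&candidate->use);
		return NULL;
	}
	return candidate;
}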
*/ - hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &net->ct.unconfirmed); + /* Now it is inserted into the unconfirmed list, bump refcount */ + nf_conntrack_get(&ct->ct_general); + nf_ct_add_to_unconfirmed_list(ct); - spin_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); if (exp) { if (exp->expectfn) @@ -1232,27 +1385,42 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct hlist_nulls_node *n; + int cpu; + spinlock_t *lockp; - spin_lock_bh(&nf_conntrack_lock); for (; *bucket < net->ct.htable_size; (*bucket)++) { - hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { - if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) - continue; + lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; + local_bh_disable(); + spin_lock(lockp); + if (*bucket < net->ct.htable_size) { + hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { + if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; + ct = nf_ct_tuplehash_to_ctrack(h); + if (iter(ct, data)) + goto found; + } + } + spin_unlock(lockp); + local_bh_enable(); + } + + for_each_possible_cpu(cpu) { + struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + + spin_lock_bh(&pcpu->lock); + hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (iter(ct, data)) - goto found; + set_bit(IPS_DYING_BIT, &ct->status); } + spin_unlock_bh(&pcpu->lock); } - hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) { - ct = nf_ct_tuplehash_to_ctrack(h); - if (iter(ct, data)) - set_bit(IPS_DYING_BIT, &ct->status); - } - spin_unlock_bh(&nf_conntrack_lock); return NULL; found: atomic_inc(&ct->ct_general.use); - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock(lockp); + local_bh_enable(); return ct; } @@ -1301,14 +1469,19 @@ static void nf_ct_release_dying_list(struct net *net) struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct hlist_nulls_node *n; + int cpu; - spin_lock_bh(&nf_conntrack_lock); - hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) { - ct = nf_ct_tuplehash_to_ctrack(h); - /* never fails to remove them, no listeners at this point */ - nf_ct_kill(ct); + for_each_possible_cpu(cpu) { + struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + + spin_lock_bh(&pcpu->lock); + hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) { + ct = nf_ct_tuplehash_to_ctrack(h); + /* never fails to remove them, no listeners at this point */ + nf_ct_kill(ct); + } + spin_unlock_bh(&pcpu->lock); } - spin_unlock_bh(&nf_conntrack_lock); } static int untrack_refs(void) @@ -1395,6 +1568,7 @@ i_see_dead_people: kmem_cache_destroy(net->ct.nf_conntrack_cachep); kfree(net->ct.slabname); free_percpu(net->ct.stat); + free_percpu(net->ct.pcpu_lists); } } @@ -1447,12 +1621,16 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) if (!hash) return -ENOMEM; + local_bh_disable(); + nf_conntrack_all_lock(); + write_seqcount_begin(&init_net.ct.generation); + /* Lookups in the old hash might happen in parallel, which means we * might get false negatives during connection lookup. New connections * created because of a false negative won't make it into the hash - * though since that required taking the lock. + * though since that required taking the locks. 
*/ - spin_lock_bh(&nf_conntrack_lock); + for (i = 0; i < init_net.ct.htable_size; i++) { while (!hlist_nulls_empty(&init_net.ct.hash[i])) { h = hlist_nulls_entry(init_net.ct.hash[i].first, @@ -1469,7 +1647,10 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) init_net.ct.htable_size = nf_conntrack_htable_size = hashsize; init_net.ct.hash = hash; - spin_unlock_bh(&nf_conntrack_lock); + + write_seqcount_end(&init_net.ct.generation); + nf_conntrack_all_unlock(); + local_bh_enable(); nf_ct_free_hashtable(old_hash, old_size); return 0; @@ -1491,7 +1672,10 @@ EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or); int nf_conntrack_init_start(void) { int max_factor = 8; - int ret, cpu; + int i, ret, cpu; + + for (i = 0; i < CONNTRACK_LOCKS; i++) + spin_lock_init(&nf_conntrack_locks[i]); /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB * machine has 512 buckets. >= 1GB machines have 16384 buckets. */ @@ -1607,37 +1791,44 @@ void nf_conntrack_init_end(void) int nf_conntrack_init_net(struct net *net) { - int ret; + int ret = -ENOMEM; + int cpu; atomic_set(&net->ct.count, 0); - INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL); - INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL); - INIT_HLIST_NULLS_HEAD(&net->ct.tmpl, TEMPLATE_NULLS_VAL); - net->ct.stat = alloc_percpu(struct ip_conntrack_stat); - if (!net->ct.stat) { - ret = -ENOMEM; + seqcount_init(&net->ct.generation); + + net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); + if (!net->ct.pcpu_lists) goto err_stat; + + for_each_possible_cpu(cpu) { + struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + + spin_lock_init(&pcpu->lock); + INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL); + INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL); + INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL); } + net->ct.stat = alloc_percpu(struct ip_conntrack_stat); + if (!net->ct.stat) + goto err_pcpu_lists; + net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net); - if (!net->ct.slabname) { - ret = -ENOMEM; + if (!net->ct.slabname) goto err_slabname; - } net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname, sizeof(struct nf_conn), 0, SLAB_DESTROY_BY_RCU, NULL); if (!net->ct.nf_conntrack_cachep) { printk(KERN_ERR "Unable to create nf_conn slab cache\n"); - ret = -ENOMEM; goto err_cache; } net->ct.htable_size = nf_conntrack_htable_size; net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1); if (!net->ct.hash) { - ret = -ENOMEM; printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); goto err_hash; } @@ -1679,6 +1870,8 @@ err_cache: kfree(net->ct.slabname); err_slabname: free_percpu(net->ct.stat); +err_pcpu_lists: + free_percpu(net->ct.pcpu_lists); err_stat: return ret; } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 4fd1ca94fd4..f87e8f68ad4 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -66,9 +66,9 @@ static void nf_ct_expectation_timed_out(unsigned long ul_expect) { struct nf_conntrack_expect *exp = (void *)ul_expect; - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); nf_ct_unlink_expect(exp); - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); nf_ct_expect_put(exp); } @@ -155,6 +155,18 @@ nf_ct_find_expectation(struct net *net, u16 zone, if (!nf_ct_is_confirmed(exp->master)) return NULL; + /* Avoid race with other CPUs, that for exp->master ct, is + * about to invoke ->destroy(), or nf_ct_delete() via timeout + 
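[Annotation, not part of the patch.] The nf_conntrack_set_hashsize() hunk above is the write side of the generation seqcount that the earlier lookup/insert loops retry on: the resizer takes every bucket lock, bumps the sequence, rehashes into the new table and releases. nf_conntrack_all_lock()/all_unlock() are not shown in this excerpt; below is a minimal sketch of that shape with invented names (lockdep nesting annotations and the actual rehash omitted):

#include <linux/spinlock.h>
#include <linux/seqlock.h>
#include <linux/bottom_half.h>

#define EXAMPLE_LOCKS 1024
static spinlock_t example_locks[EXAMPLE_LOCKS];
static seqcount_t example_generation;

static void example_all_lock(void)
{
	int i;

	/* Always taken in ascending index order so two resizers cannot
	 * deadlock against each other or against the double-lock helper.
	 */
	for (i = 0; i < EXAMPLE_LOCKS; i++)
		spin_lock(&example_locks[i]);
}

static void example_all_unlock(void)
{
	int i;

	for (i = 0; i < EXAMPLE_LOCKS; i++)
		spin_unlock(&example_locks[i]);
}

static void example_resize(void (*rehash_into_new_table)(void))
{
	local_bh_disable();
	example_all_lock();
	write_seqcount_begin(&example_generation);

	rehash_into_new_table();	/* move every entry, swap the pointer */

	write_seqcount_end(&example_generation);
	example_all_unlock();
	local_bh_enable();
}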
* or early_drop(). + * + * The atomic_inc_not_zero() check tells: If that fails, we + * know that the ct is being destroyed. If it succeeds, we + * can be sure the ct cannot disappear underneath. + */ + if (unlikely(nf_ct_is_dying(exp->master) || + !atomic_inc_not_zero(&exp->master->ct_general.use))) + return NULL; + if (exp->flags & NF_CT_EXPECT_PERMANENT) { atomic_inc(&exp->use); return exp; @@ -162,6 +174,8 @@ nf_ct_find_expectation(struct net *net, u16 zone, nf_ct_unlink_expect(exp); return exp; } + /* Undo exp->master refcnt increase, if del_timer() failed */ + nf_ct_put(exp->master); return NULL; } @@ -177,12 +191,14 @@ void nf_ct_remove_expectations(struct nf_conn *ct) if (!help) return; + spin_lock_bh(&nf_conntrack_expect_lock); hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); } } + spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_remove_expectations); @@ -217,12 +233,12 @@ static inline int expect_matches(const struct nf_conntrack_expect *a, /* Generally a bad idea to call this: could have matched already. */ void nf_ct_unexpect_related(struct nf_conntrack_expect *exp) { - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); } - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_unexpect_related); @@ -335,7 +351,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp) setup_timer(&exp->timeout, nf_ct_expectation_timed_out, (unsigned long)exp); helper = rcu_dereference_protected(master_help->helper, - lockdep_is_held(&nf_conntrack_lock)); + lockdep_is_held(&nf_conntrack_expect_lock)); if (helper) { exp->timeout.expires = jiffies + helper->expect_policy[exp->class].timeout * HZ; @@ -395,7 +411,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) } /* Will be over limit? 
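[Annotation, not part of the patch.] Besides pinning exp->master with atomic_inc_not_zero(), the expectation code above and below keeps using del_timer() as an ownership test: whoever successfully deletes the pending timeout is the one allowed to unlink the expectation and drop the list's reference; if del_timer() returns 0, the timeout handler already owns (or has finished) the teardown. A generic sketch of that pattern with invented names:

#include <linux/types.h>
#include <linux/timer.h>
#include <linux/spinlock.h>
#include <linux/list.h>

struct example_pending {
	struct list_head	node;
	struct timer_list	timeout;
};

static DEFINE_SPINLOCK(example_list_lock);

/* Returns true if we won the race against the timer and did the teardown. */
static bool example_cancel(struct example_pending *p)
{
	bool won = false;

	spin_lock_bh(&example_list_lock);
	if (del_timer(&p->timeout)) {
		/* Timer was still pending, so it can no longer fire: this
		 * path owns unlinking and dropping the list's reference.
		 */
		list_del(&p->node);
		won = true;
	}
	/* Otherwise the handler fired (or never armed) and cleans up itself. */
	spin_unlock_bh(&example_list_lock);
	return won;
}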
*/ helper = rcu_dereference_protected(master_help->helper, - lockdep_is_held(&nf_conntrack_lock)); + lockdep_is_held(&nf_conntrack_expect_lock)); if (helper) { p = &helper->expect_policy[expect->class]; if (p->max_expected && @@ -417,12 +433,12 @@ out: return ret; } -int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, +int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, u32 portid, int report) { int ret; - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); ret = __nf_ct_expect_check(expect); if (ret <= 0) goto out; @@ -430,11 +446,11 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, ret = nf_ct_expect_insert(expect); if (ret < 0) goto out; - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); return ret; out: - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); return ret; } EXPORT_SYMBOL_GPL(nf_ct_expect_related_report); diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 70866d192ef..3a3a60b126e 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -1476,7 +1476,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, nf_ct_refresh(ct, skb, info->timeout * HZ); /* Set expect timeout */ - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); exp = find_expect(ct, &ct->tuplehash[dir].tuple.dst.u3, info->sig_port[!dir]); if (exp) { @@ -1486,7 +1486,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, nf_ct_dump_tuple(&exp->tuple); set_expect_timeout(exp, info->timeout); } - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } return 0; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 974a2a4adef..5b3eae7d4c9 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -250,16 +250,14 @@ out: } EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper); +/* appropiate ct lock protecting must be taken by caller */ static inline int unhelp(struct nf_conntrack_tuple_hash *i, const struct nf_conntrack_helper *me) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i); struct nf_conn_help *help = nfct_help(ct); - if (help && rcu_dereference_protected( - help->helper, - lockdep_is_held(&nf_conntrack_lock) - ) == me) { + if (help && rcu_dereference_raw(help->helper) == me) { nf_conntrack_event(IPCT_HELPER, ct); RCU_INIT_POINTER(help->helper, NULL); } @@ -284,17 +282,17 @@ static LIST_HEAD(nf_ct_helper_expectfn_list); void nf_ct_helper_expectfn_register(struct nf_ct_helper_expectfn *n) { - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); list_add_rcu(&n->head, &nf_ct_helper_expectfn_list); - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_register); void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n) { - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); list_del_rcu(&n->head); - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister); @@ -396,15 +394,17 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, const struct hlist_node *next; const struct hlist_nulls_node *nn; unsigned int i; + int cpu; /* Get rid of expectations 
*/ + spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, &net->ct.expect_hash[i], hnode) { struct nf_conn_help *help = nfct_help(exp->master); if ((rcu_dereference_protected( help->helper, - lockdep_is_held(&nf_conntrack_lock) + lockdep_is_held(&nf_conntrack_expect_lock) ) == me || exp->helper == me) && del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); @@ -412,14 +412,27 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, } } } + spin_unlock_bh(&nf_conntrack_expect_lock); /* Get rid of expecteds, set helpers to NULL. */ - hlist_nulls_for_each_entry(h, nn, &net->ct.unconfirmed, hnnode) - unhelp(h, me); - for (i = 0; i < net->ct.htable_size; i++) { - hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) + for_each_possible_cpu(cpu) { + struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + + spin_lock_bh(&pcpu->lock); + hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode) unhelp(h, me); + spin_unlock_bh(&pcpu->lock); + } + local_bh_disable(); + for (i = 0; i < net->ct.htable_size; i++) { + spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); + if (i < net->ct.htable_size) { + hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) + unhelp(h, me); + } + spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); } + local_bh_enable(); } void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) @@ -437,10 +450,8 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) synchronize_rcu(); rtnl_lock(); - spin_lock_bh(&nf_conntrack_lock); for_each_net(net) __nf_conntrack_helper_unregister(me, net); - spin_unlock_bh(&nf_conntrack_lock); rtnl_unlock(); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 08870b85904..300ed1eec72 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -597,6 +597,9 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) #ifdef CONFIG_NF_CONNTRACK_MARK + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ #endif +#ifdef CONFIG_NF_CONNTRACK_ZONES + + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ +#endif + ctnetlink_proto_size(ct) + ctnetlink_label_size(ct) ; @@ -764,14 +767,23 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u_int8_t l3proto = nfmsg->nfgen_family; int res; + spinlock_t *lockp; + #ifdef CONFIG_NF_CONNTRACK_MARK const struct ctnetlink_dump_filter *filter = cb->data; #endif - spin_lock_bh(&nf_conntrack_lock); last = (struct nf_conn *)cb->args[1]; + + local_bh_disable(); for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) { restart: + lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS]; + spin_lock(lockp); + if (cb->args[0] >= net->ct.htable_size) { + spin_unlock(lockp); + goto out; + } hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]], hnnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) @@ -803,16 +815,18 @@ restart: if (res < 0) { nf_conntrack_get(&ct->ct_general); cb->args[1] = (unsigned long)ct; + spin_unlock(lockp); goto out; } } + spin_unlock(lockp); if (cb->args[1]) { cb->args[1] = 0; goto restart; } } out: - spin_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); if (last) nf_ct_put(last); @@ -966,7 +980,6 @@ ctnetlink_parse_help(const struct nlattr *attr, char **helper_name, return 0; } -#define __CTA_LABELS_MAX_LENGTH ((XT_CONNLABEL_MAXBIT + 1) / BITS_PER_BYTE) static const 
struct nla_policy ct_nla_policy[CTA_MAX+1] = { [CTA_TUPLE_ORIG] = { .type = NLA_NESTED }, [CTA_TUPLE_REPLY] = { .type = NLA_NESTED }, @@ -984,9 +997,9 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { [CTA_ZONE] = { .type = NLA_U16 }, [CTA_MARK_MASK] = { .type = NLA_U32 }, [CTA_LABELS] = { .type = NLA_BINARY, - .len = __CTA_LABELS_MAX_LENGTH }, + .len = NF_CT_LABELS_MAX_SIZE }, [CTA_LABELS_MASK] = { .type = NLA_BINARY, - .len = __CTA_LABELS_MAX_LENGTH }, + .len = NF_CT_LABELS_MAX_SIZE }, }; static int @@ -1138,8 +1151,7 @@ static int ctnetlink_done_list(struct netlink_callback *cb) } static int -ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, - struct hlist_nulls_head *list) +ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying) { struct nf_conn *ct, *last; struct nf_conntrack_tuple_hash *h; @@ -1147,41 +1159,57 @@ ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u_int8_t l3proto = nfmsg->nfgen_family; int res; + int cpu; + struct hlist_nulls_head *list; + struct net *net = sock_net(skb->sk); if (cb->args[2]) return 0; - spin_lock_bh(&nf_conntrack_lock); last = (struct nf_conn *)cb->args[1]; -restart: - hlist_nulls_for_each_entry(h, n, list, hnnode) { - ct = nf_ct_tuplehash_to_ctrack(h); - if (l3proto && nf_ct_l3num(ct) != l3proto) + + for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) { + struct ct_pcpu *pcpu; + + if (!cpu_possible(cpu)) continue; - if (cb->args[1]) { - if (ct != last) + + pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + spin_lock_bh(&pcpu->lock); + list = dying ? &pcpu->dying : &pcpu->unconfirmed; +restart: + hlist_nulls_for_each_entry(h, n, list, hnnode) { + ct = nf_ct_tuplehash_to_ctrack(h); + if (l3proto && nf_ct_l3num(ct) != l3proto) continue; - cb->args[1] = 0; + if (cb->args[1]) { + if (ct != last) + continue; + cb->args[1] = 0; + } + rcu_read_lock(); + res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + ct); + rcu_read_unlock(); + if (res < 0) { + if (!atomic_inc_not_zero(&ct->ct_general.use)) + continue; + cb->args[0] = cpu; + cb->args[1] = (unsigned long)ct; + spin_unlock_bh(&pcpu->lock); + goto out; + } } - rcu_read_lock(); - res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct); - rcu_read_unlock(); - if (res < 0) { - nf_conntrack_get(&ct->ct_general); - cb->args[1] = (unsigned long)ct; - goto out; + if (cb->args[1]) { + cb->args[1] = 0; + goto restart; } + spin_unlock_bh(&pcpu->lock); } - if (cb->args[1]) { - cb->args[1] = 0; - goto restart; - } else - cb->args[2] = 1; + cb->args[2] = 1; out: - spin_unlock_bh(&nf_conntrack_lock); if (last) nf_ct_put(last); @@ -1191,9 +1219,7 @@ out: static int ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = sock_net(skb->sk); - - return ctnetlink_dump_list(skb, cb, &net->ct.dying); + return ctnetlink_dump_list(skb, cb, true); } static int @@ -1215,9 +1241,7 @@ ctnetlink_get_ct_dying(struct sock *ctnl, struct sk_buff *skb, static int ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = sock_net(skb->sk); - - return ctnetlink_dump_list(skb, cb, &net->ct.unconfirmed); + return ctnetlink_dump_list(skb, cb, false); } static int @@ -1310,27 +1334,25 @@ ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[]) } static int -ctnetlink_change_nat(struct nf_conn 
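[Annotation, not part of the patch.] The reworked ctnetlink_dump_list() above must be restartable because a netlink dump can fill its skb mid-list: cb->args[0] remembers the CPU, cb->args[1] the last conntrack (pinned with atomic_inc_not_zero()), and cb->args[2] marks completion. Below is a stripped-down sketch of the same resume logic over per-cpu lists, with invented names, the netlink plumbing omitted, and no handling for a resume entry that vanished (the real code restarts that CPU's list):

#include <linux/types.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>
#include <linux/list.h>

struct example_item {
	struct list_head	node;
};

struct example_pcpu {
	spinlock_t		lock;
	struct list_head	items;
};

struct example_cursor {
	int			cpu;	/* CPU to resume on */
	struct example_item	*last;	/* entry that did not fit last time */
	bool			done;
};

/* emit() returns false when the output buffer is full. */
static void example_dump(struct example_pcpu __percpu *lists,
			 struct example_cursor *cur,
			 bool (*emit)(struct example_item *))
{
	int cpu;

	if (cur->done)
		return;

	for (cpu = cur->cpu; cpu < nr_cpu_ids; cpu++) {
		struct example_pcpu *pcpu;
		struct example_item *it;

		if (!cpu_possible(cpu))
			continue;
		pcpu = per_cpu_ptr(lists, cpu);

		spin_lock_bh(&pcpu->lock);
		list_for_each_entry(it, &pcpu->items, node) {
			if (cur->last) {
				if (it != cur->last)
					continue;
				cur->last = NULL; /* re-emit the one that didn't fit */
			}
			if (!emit(it)) {
				cur->cpu = cpu;
				cur->last = it;	/* real code pins it with a refcount */
				spin_unlock_bh(&pcpu->lock);
				return;
			}
		}
		spin_unlock_bh(&pcpu->lock);
	}
	cur->done = true;
}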
*ct, const struct nlattr * const cda[]) +ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[]) { #ifdef CONFIG_NF_NAT_NEEDED int ret; - if (cda[CTA_NAT_DST]) { - ret = ctnetlink_parse_nat_setup(ct, - NF_NAT_MANIP_DST, - cda[CTA_NAT_DST]); - if (ret < 0) - return ret; - } - if (cda[CTA_NAT_SRC]) { - ret = ctnetlink_parse_nat_setup(ct, - NF_NAT_MANIP_SRC, - cda[CTA_NAT_SRC]); - if (ret < 0) - return ret; - } - return 0; + if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC]) + return 0; + + ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_DST, + cda[CTA_NAT_DST]); + if (ret < 0) + return ret; + + ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_SRC, + cda[CTA_NAT_SRC]); + return ret; #else + if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC]) + return 0; return -EOPNOTSUPP; #endif } @@ -1366,14 +1388,14 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[]) nf_ct_protonum(ct)); if (helper == NULL) { #ifdef CONFIG_MODULES - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); if (request_module("nfct-helper-%s", helpname) < 0) { - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); return -EOPNOTSUPP; } - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), nf_ct_protonum(ct)); if (helper) @@ -1659,11 +1681,9 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, goto err2; } - if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST]) { - err = ctnetlink_change_nat(ct, cda); - if (err < 0) - goto err2; - } + err = ctnetlink_setup_nat(ct, cda); + if (err < 0) + goto err2; nf_ct_acct_ext_add(ct, GFP_ATOMIC); nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); @@ -1811,9 +1831,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, err = -EEXIST; ct = nf_ct_tuplehash_to_ctrack(h); if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); err = ctnetlink_change_conntrack(ct, cda); - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); if (err == 0) { nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | @@ -2023,6 +2043,9 @@ ctnetlink_nfqueue_build_size(const struct nf_conn *ct) #ifdef CONFIG_NF_CONNTRACK_MARK + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ #endif +#ifdef CONFIG_NF_CONNTRACK_ZONES + + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ +#endif + ctnetlink_proto_size(ct) ; } @@ -2118,8 +2141,16 @@ ctnetlink_nfqueue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct) return err; } #if defined(CONFIG_NF_CONNTRACK_MARK) - if (cda[CTA_MARK]) - ct->mark = ntohl(nla_get_be32(cda[CTA_MARK])); + if (cda[CTA_MARK]) { + u32 mask = 0, mark, newmark; + if (cda[CTA_MARK_MASK]) + mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + + mark = ntohl(nla_get_be32(cda[CTA_MARK])); + newmark = (ct->mark & mask) ^ mark; + if (newmark != ct->mark) + ct->mark = newmark; + } #endif return 0; } @@ -2134,9 +2165,9 @@ ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct) if (ret < 0) return ret; - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); ret = ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct); - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); return ret; } @@ -2691,13 +2722,13 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, } /* after list removal, usage count == 1 */ - spin_lock_bh(&nf_conntrack_lock); + 
spin_lock_bh(&nf_conntrack_expect_lock); if (del_timer(&exp->timeout)) { nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid, nlmsg_report(nlh)); nf_ct_expect_put(exp); } - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); /* have to put what we 'get' above. * after this line usage count == 0 */ nf_ct_expect_put(exp); @@ -2706,7 +2737,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, struct nf_conn_help *m_help; /* delete all expectations for this helper */ - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, &net->ct.expect_hash[i], @@ -2721,10 +2752,10 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, } } } - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } else { /* This basically means we have to flush everything*/ - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, &net->ct.expect_hash[i], @@ -2737,7 +2768,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, } } } - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } return 0; @@ -2963,11 +2994,11 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); exp = __nf_ct_expect_find(net, zone, &tuple); if (!exp) { - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); err = -ENOENT; if (nlh->nlmsg_flags & NLM_F_CREATE) { err = ctnetlink_create_expect(net, zone, cda, @@ -2981,7 +3012,7 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, err = -EEXIST; if (!(nlh->nlmsg_flags & NLM_F_EXCL)) err = ctnetlink_change_expect(exp, cda); - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); return err; } diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 7bd03decd36..825c3e3f830 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -605,32 +605,14 @@ static struct nf_conntrack_helper pptp __read_mostly = { .expect_policy = &pptp_exp_policy, }; -static void nf_conntrack_pptp_net_exit(struct net *net) -{ - nf_ct_gre_keymap_flush(net); -} - -static struct pernet_operations nf_conntrack_pptp_net_ops = { - .exit = nf_conntrack_pptp_net_exit, -}; - static int __init nf_conntrack_pptp_init(void) { - int rv; - - rv = nf_conntrack_helper_register(&pptp); - if (rv < 0) - return rv; - rv = register_pernet_subsys(&nf_conntrack_pptp_net_ops); - if (rv < 0) - nf_conntrack_helper_unregister(&pptp); - return rv; + return nf_conntrack_helper_register(&pptp); } static void __exit nf_conntrack_pptp_fini(void) { nf_conntrack_helper_unregister(&pptp); - unregister_pernet_subsys(&nf_conntrack_pptp_net_ops); } module_init(nf_conntrack_pptp_init); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index ce3004156ee..b65d5864b6d 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -92,12 +92,6 @@ nf_ct_l3proto_find_get(u_int16_t l3proto) } EXPORT_SYMBOL_GPL(nf_ct_l3proto_find_get); -void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p) -{ - module_put(p->me); -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_put); - int nf_ct_l3proto_try_module_get(unsigned short l3proto) { diff --git 
a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index a99b6c3427b..cb372f96f10 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -428,7 +428,7 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, const char *msg; u_int8_t state; - dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &dh); + dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); BUG_ON(dh == NULL); state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE]; @@ -457,7 +457,7 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, out_invalid: if (LOG_INVALID(net, IPPROTO_DCCP)) nf_log_packet(net, nf_ct_l3num(ct), 0, skb, NULL, NULL, - NULL, msg); + NULL, "%s", msg); return false; } @@ -486,7 +486,7 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, u_int8_t type, old_state, new_state; enum ct_dccp_roles role; - dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &dh); + dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); BUG_ON(dh == NULL); type = dh->dccph_type; @@ -577,7 +577,7 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl, unsigned int cscov; const char *msg; - dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &dh); + dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); if (dh == NULL) { msg = "nf_ct_dccp: short packet "; goto out_invalid; @@ -614,7 +614,7 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl, out_invalid: if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, msg); + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", msg); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 9d9c0dade60..d5665739e3b 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -66,7 +66,7 @@ static inline struct netns_proto_gre *gre_pernet(struct net *net) return net_generic(net, proto_gre_net_id); } -void nf_ct_gre_keymap_flush(struct net *net) +static void nf_ct_gre_keymap_flush(struct net *net) { struct netns_proto_gre *net_gre = gre_pernet(net); struct nf_ct_gre_keymap *km, *tmp; @@ -78,7 +78,6 @@ void nf_ct_gre_keymap_flush(struct net *net) } write_unlock_bh(&net_gre->keymap_lock); } -EXPORT_SYMBOL(nf_ct_gre_keymap_flush); static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km, const struct nf_conntrack_tuple *t) diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c index 17c1bcb182c..f6e2ae91a80 100644 --- a/net/netfilter/nf_conntrack_seqadj.c +++ b/net/netfilter/nf_conntrack_seqadj.c @@ -36,6 +36,11 @@ int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo, if (off == 0) return 0; + if (unlikely(!seqadj)) { + WARN_ONCE(1, "Missing nfct_seqadj_ext_add() setup call\n"); + return 0; + } + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); spin_lock_bh(&ct->lock); diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 466410eaa48..4c3ba1c8d68 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -800,7 +800,7 @@ static int refresh_signalling_expectation(struct nf_conn *ct, struct hlist_node *next; int found = 0; - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { if (exp->class != SIP_EXPECT_SIGNALLING || !nf_inet_addr_cmp(&exp->tuple.dst.u3, addr) || @@ -815,7 +815,7 @@ static 
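[Annotation, not part of the patch.] The DCCP hunks above contain two classic fixes: skb_header_pointer() must be handed the address of the on-stack _dh buffer (passing &dh gave it the pointer variable itself as scratch space), and nf_log_packet() must never receive caller-supplied text as its format argument. A short sketch of the corrected usage with an invented header type:

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/skbuff.h>

struct example_hdr {
	__be16	field;
};

static int example_parse(const struct sk_buff *skb, unsigned int dataoff)
{
	struct example_hdr _hdr;
	const struct example_hdr *hdr;

	/* If the header is not in the linear area it is copied into the
	 * on-stack buffer; the last argument must be that buffer, not the
	 * address of the result pointer.
	 */
	hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
	if (hdr == NULL)
		return -EINVAL;		/* truncated packet */

	/* When logging a message string, keep the format explicit:
	 *   nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", msg);
	 */
	return ntohs(hdr->field);
}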
int refresh_signalling_expectation(struct nf_conn *ct, found = 1; break; } - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); return found; } @@ -825,7 +825,7 @@ static void flush_expectations(struct nf_conn *ct, bool media) struct nf_conntrack_expect *exp; struct hlist_node *next; - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { if ((exp->class != SIP_EXPECT_SIGNALLING) ^ media) continue; @@ -836,7 +836,7 @@ static void flush_expectations(struct nf_conn *ct, bool media) if (!media) break; } - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff, diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c index 902fb0a6b38..7a394df0deb 100644 --- a/net/netfilter/nf_conntrack_timestamp.c +++ b/net/netfilter/nf_conntrack_timestamp.c @@ -97,7 +97,6 @@ int nf_conntrack_tstamp_pernet_init(struct net *net) void nf_conntrack_tstamp_pernet_fini(struct net *net) { nf_conntrack_tstamp_fini_sysctl(net); - nf_ct_extend_unregister(&tstamp_extend); } int nf_conntrack_tstamp_init(void) diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 63a81540221..a49907b1dab 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -315,7 +315,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, * manips not an issue. */ if (maniptype == NF_NAT_MANIP_SRC && - !(range->flags & NF_NAT_RANGE_PROTO_RANDOM)) { + !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { /* try the original tuple first */ if (in_range(l3proto, l4proto, orig_tuple, range)) { if (!nf_nat_used_tuple(orig_tuple, ct)) { @@ -339,7 +339,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, */ /* Only bother mapping if it's not already in range and unique */ - if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM)) { + if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { if (l4proto->in_range(tuple, maniptype, &range->min_proto, @@ -358,6 +358,19 @@ out: rcu_read_unlock(); } +struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct) +{ + struct nf_conn_nat *nat = nfct_nat(ct); + if (nat) + return nat; + + if (!nf_ct_is_confirmed(ct)) + nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); + + return nat; +} +EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); + unsigned int nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, @@ -368,14 +381,9 @@ nf_nat_setup_info(struct nf_conn *ct, struct nf_conn_nat *nat; /* nat helper or nfctnetlink also setup binding */ - nat = nfct_nat(ct); - if (!nat) { - nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); - if (nat == NULL) { - pr_debug("failed to add NAT extension\n"); - return NF_ACCEPT; - } - } + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC || maniptype == NF_NAT_MANIP_DST); @@ -432,15 +440,15 @@ nf_nat_setup_info(struct nf_conn *ct, } EXPORT_SYMBOL(nf_nat_setup_info); -unsigned int -nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) +static unsigned int +__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) { /* Force range to this IP; let proto decide mapping for * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). * Use reply in case it's already been mangled (eg local packet). 
*/ union nf_inet_addr ip = - (HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ? + (manip == NF_NAT_MANIP_SRC ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); struct nf_nat_range range = { @@ -448,7 +456,13 @@ nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) .min_addr = ip, .max_addr = ip, }; - return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); + return nf_nat_setup_info(ct, &range, manip); +} + +unsigned int +nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) +{ + return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum)); } EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding); @@ -511,6 +525,39 @@ static int nf_nat_proto_remove(struct nf_conn *i, void *data) return i->status & IPS_NAT_MASK ? 1 : 0; } +static int nf_nat_proto_clean(struct nf_conn *ct, void *data) +{ + struct nf_conn_nat *nat = nfct_nat(ct); + + if (nf_nat_proto_remove(ct, data)) + return 1; + + if (!nat || !nat->ct) + return 0; + + /* This netns is being destroyed, and conntrack has nat null binding. + * Remove it from bysource hash, as the table will be freed soon. + * + * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() + * will delete entry from already-freed table. + */ + if (!del_timer(&ct->timeout)) + return 1; + + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&nat->bysource); + ct->status &= ~IPS_NAT_DONE_MASK; + nat->ct = NULL; + spin_unlock_bh(&nf_nat_lock); + + add_timer(&ct->timeout); + + /* don't delete conntrack. Although that would make things a lot + * simpler, we'd end up flushing all conntracks on nat rmmod. + */ + return 0; +} + static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto) { struct nf_nat_proto_clean clean = { @@ -702,9 +749,9 @@ static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { static int nfnetlink_parse_nat(const struct nlattr *nat, - const struct nf_conn *ct, struct nf_nat_range *range) + const struct nf_conn *ct, struct nf_nat_range *range, + const struct nf_nat_l3proto *l3proto) { - const struct nf_nat_l3proto *l3proto; struct nlattr *tb[CTA_NAT_MAX+1]; int err; @@ -714,38 +761,46 @@ nfnetlink_parse_nat(const struct nlattr *nat, if (err < 0) return err; - rcu_read_lock(); - l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct)); - if (l3proto == NULL) { - err = -EAGAIN; - goto out; - } err = l3proto->nlattr_to_range(tb, range); if (err < 0) - goto out; + return err; if (!tb[CTA_NAT_PROTO]) - goto out; + return 0; - err = nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); -out: - rcu_read_unlock(); - return err; + return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); } +/* This function is called under rcu_read_lock() */ static int nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { struct nf_nat_range range; + const struct nf_nat_l3proto *l3proto; int err; - err = nfnetlink_parse_nat(attr, ct, &range); + /* Should not happen, restricted to creating new conntracks + * via ctnetlink. + */ + if (WARN_ON_ONCE(nf_nat_initialized(ct, manip))) + return -EEXIST; + + /* Make sure that L3 NAT is there by when we call nf_nat_setup_info to + * attach the null binding, otherwise this may oops. 
+ */ + l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct)); + if (l3proto == NULL) + return -EAGAIN; + + /* No NAT information has been passed, allocate the null-binding */ + if (attr == NULL) + return __nf_nat_alloc_null_binding(ct, manip); + + err = nfnetlink_parse_nat(attr, ct, &range, l3proto); if (err < 0) return err; - if (nf_nat_initialized(ct, manip)) - return -EEXIST; return nf_nat_setup_info(ct, &range, manip); } @@ -773,7 +828,7 @@ static void __net_exit nf_nat_net_exit(struct net *net) { struct nf_nat_proto_clean clean = {}; - nf_ct_iterate_cleanup(net, &nf_nat_proto_remove, &clean, 0, 0); + nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0); synchronize_rcu(); nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size); } diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c index f02b3605823..1fb2258c353 100644 --- a/net/netfilter/nf_nat_irc.c +++ b/net/netfilter/nf_nat_irc.c @@ -34,10 +34,14 @@ static unsigned int help(struct sk_buff *skb, struct nf_conntrack_expect *exp) { char buffer[sizeof("4294967296 65635")]; + struct nf_conn *ct = exp->master; + union nf_inet_addr newaddr; u_int16_t port; unsigned int ret; /* Reply comes from server. */ + newaddr = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3; + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; exp->dir = IP_CT_DIR_REPLY; exp->expectfn = nf_nat_follow_master; @@ -57,17 +61,35 @@ static unsigned int help(struct sk_buff *skb, } if (port == 0) { - nf_ct_helper_log(skb, exp->master, "all ports in use"); + nf_ct_helper_log(skb, ct, "all ports in use"); return NF_DROP; } - ret = nf_nat_mangle_tcp_packet(skb, exp->master, ctinfo, - protoff, matchoff, matchlen, buffer, - strlen(buffer)); + /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27 + * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28 + * strlen("\1DCC SEND F AAAAAAAA P S\1\n")=26 + * strlen("\1DCC MOVE F AAAAAAAA P S\1\n")=26 + * strlen("\1DCC TSEND F AAAAAAAA P S\1\n")=27 + * + * AAAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits, + * 255.255.255.255==4294967296, 10 digits) + * P: bound port (min 1 d, max 5d (65635)) + * F: filename (min 1 d ) + * S: size (min 1 d ) + * 0x01, \n: terminators + */ + /* AAA = "us", ie. where server normally talks to. */ + snprintf(buffer, sizeof(buffer), "%u %u", ntohl(newaddr.ip), port); + pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n", + buffer, &newaddr.ip, port); + + ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, matchoff, + matchlen, buffer, strlen(buffer)); if (ret != NF_ACCEPT) { - nf_ct_helper_log(skb, exp->master, "cannot mangle packet"); + nf_ct_helper_log(skb, ct, "cannot mangle packet"); nf_ct_unexpect_related(exp); } + return ret; } diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c index 9baaf734c14..83a72a235ca 100644 --- a/net/netfilter/nf_nat_proto_common.c +++ b/net/netfilter/nf_nat_proto_common.c @@ -74,22 +74,24 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, range_size = ntohs(range->max_proto.all) - min + 1; } - if (range->flags & NF_NAT_RANGE_PROTO_RANDOM) + if (range->flags & NF_NAT_RANGE_PROTO_RANDOM) { off = l3proto->secure_port(tuple, maniptype == NF_NAT_MANIP_SRC ? 
tuple->dst.u.all : tuple->src.u.all); - else + } else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) { + off = prandom_u32(); + } else { off = *rover; + } for (i = 0; ; ++off) { *portptr = htons(min + off % range_size); if (++i != range_size && nf_nat_used_tuple(tuple, ct)) continue; - if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM)) + if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) *rover = off; return; } - return; } EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple); diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 9858e3e51a3..52e20c9a46a 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -363,9 +363,8 @@ static int __net_init synproxy_net_init(struct net *net) goto err2; if (!nfct_synproxy_ext_add(ct)) goto err2; - __set_bit(IPS_TEMPLATE_BIT, &ct->status); - __set_bit(IPS_CONFIRMED_BIT, &ct->status); + nf_conntrack_tmpl_insert(net, ct); snet->tmpl = ct; snet->stats = alloc_percpu(struct synproxy_stats); @@ -390,7 +389,7 @@ static void __net_exit synproxy_net_exit(struct net *net) { struct synproxy_net *snet = synproxy_pernet(net); - nf_conntrack_free(snet->tmpl); + nf_ct_put(snet->tmpl); synproxy_proc_exit(net); free_percpu(snet->stats); } diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index dcddc49c0e0..8746ff9a835 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -35,7 +35,7 @@ int nft_register_afinfo(struct net *net, struct nft_af_info *afi) { INIT_LIST_HEAD(&afi->tables); nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_add_tail(&afi->list, &net->nft.af_info); + list_add_tail_rcu(&afi->list, &net->nft.af_info); nfnl_unlock(NFNL_SUBSYS_NFTABLES); return 0; } @@ -51,7 +51,7 @@ EXPORT_SYMBOL_GPL(nft_register_afinfo); void nft_unregister_afinfo(struct nft_af_info *afi) { nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_del(&afi->list); + list_del_rcu(&afi->list); nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_unregister_afinfo); @@ -88,6 +88,45 @@ nf_tables_afinfo_lookup(struct net *net, int family, bool autoload) return ERR_PTR(-EAFNOSUPPORT); } +static void nft_ctx_init(struct nft_ctx *ctx, + const struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nft_af_info *afi, + struct nft_table *table, + struct nft_chain *chain, + const struct nlattr * const *nla) +{ + ctx->net = sock_net(skb->sk); + ctx->afi = afi; + ctx->table = table; + ctx->chain = chain; + ctx->nla = nla; + ctx->portid = NETLINK_CB(skb).portid; + ctx->report = nlmsg_report(nlh); + ctx->seq = nlh->nlmsg_seq; +} + +static struct nft_trans *nft_trans_alloc(struct nft_ctx *ctx, int msg_type, + u32 size) +{ + struct nft_trans *trans; + + trans = kzalloc(sizeof(struct nft_trans) + size, GFP_KERNEL); + if (trans == NULL) + return NULL; + + trans->msg_type = msg_type; + trans->ctx = *ctx; + + return trans; +} + +static void nft_trans_destroy(struct nft_trans *trans) +{ + list_del(&trans->list); + kfree(trans); +} + /* * Tables */ @@ -124,37 +163,43 @@ static inline u64 nf_tables_alloc_handle(struct nft_table *table) return ++table->hgenerator; } -static struct nf_chain_type *chain_type[AF_MAX][NFT_CHAIN_T_MAX]; +static const struct nf_chain_type *chain_type[AF_MAX][NFT_CHAIN_T_MAX]; -static int __nf_tables_chain_type_lookup(int family, const struct nlattr *nla) +static const struct nf_chain_type * +__nf_tables_chain_type_lookup(int family, const struct nlattr *nla) { int i; - for (i=0; i<NFT_CHAIN_T_MAX; i++) { + for (i = 0; i < NFT_CHAIN_T_MAX; i++) { if (chain_type[family][i] != NULL && 
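[Annotation, not part of the patch.] The nf_nat_l4proto_unique_tuple() hunk above keeps the shared rover for the sequential case, seeds the starting offset from prandom_u32() when NF_NAT_RANGE_PROTO_RANDOM_FULLY is set, and no longer writes the rover back for either random mode. A stand-alone sketch of that search loop with invented names; in_use() stands in for nf_nat_used_tuple(), and the hash-seeded NF_NAT_RANGE_PROTO_RANDOM case is folded into the caller's choice of starting offset:

#include <linux/types.h>
#include <linux/random.h>

static u16 example_pick_port(u16 min, u16 range_size, bool fully_random,
			     u32 *rover, bool (*in_use)(u16 port))
{
	u32 off = fully_random ? prandom_u32() : *rover;
	unsigned int i;

	for (i = 0; i < range_size; i++, off++) {
		u16 port = min + (off % range_size);

		if (in_use(port))
			continue;
		if (!fully_random)
			*rover = off;	/* next search continues after us */
		return port;
	}
	return 0;	/* nothing free in the range */
}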
!nla_strcmp(nla, chain_type[family][i]->name)) - return i; + return chain_type[family][i]; } - return -1; + return NULL; } -static int nf_tables_chain_type_lookup(const struct nft_af_info *afi, - const struct nlattr *nla, - bool autoload) +static const struct nf_chain_type * +nf_tables_chain_type_lookup(const struct nft_af_info *afi, + const struct nlattr *nla, + bool autoload) { - int type; + const struct nf_chain_type *type; type = __nf_tables_chain_type_lookup(afi->family, nla); + if (type != NULL) + return type; #ifdef CONFIG_MODULES - if (type < 0 && autoload) { + if (autoload) { nfnl_unlock(NFNL_SUBSYS_NFTABLES); - request_module("nft-chain-%u-%*.s", afi->family, - nla_len(nla)-1, (const char *)nla_data(nla)); + request_module("nft-chain-%u-%.*s", afi->family, + nla_len(nla), (const char *)nla_data(nla)); nfnl_lock(NFNL_SUBSYS_NFTABLES); type = __nf_tables_chain_type_lookup(afi->family, nla); + if (type != NULL) + return ERR_PTR(-EAGAIN); } #endif - return type; + return ERR_PTR(-ENOENT); } static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { @@ -180,7 +225,8 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, u32 portid, u32 seq, nfmsg->res_id = 0; if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) || - nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags))) + nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) || + nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use))) goto nla_put_failure; return nlmsg_end(skb, nlh); @@ -190,20 +236,13 @@ nla_put_failure: return -1; } -static int nf_tables_table_notify(const struct sk_buff *oskb, - const struct nlmsghdr *nlh, - const struct nft_table *table, - int event, int family) +static int nf_tables_table_notify(const struct nft_ctx *ctx, int event) { struct sk_buff *skb; - u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; - u32 seq = nlh ? nlh->nlmsg_seq : 0; - struct net *net = oskb ? sock_net(oskb->sk) : &init_net; - bool report; int err; - report = nlh ? 
nlmsg_report(nlh) : false; - if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) + if (!ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return 0; err = -ENOBUFS; @@ -211,18 +250,20 @@ static int nf_tables_table_notify(const struct sk_buff *oskb, if (skb == NULL) goto err; - err = nf_tables_fill_table_info(skb, portid, seq, event, 0, - family, table); + err = nf_tables_fill_table_info(skb, ctx->portid, ctx->seq, event, 0, + ctx->afi->family, ctx->table); if (err < 0) { kfree_skb(skb); goto err; } - err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, - GFP_KERNEL); + err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); err: - if (err < 0) - nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); + if (err < 0) { + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, + err); + } return err; } @@ -236,11 +277,14 @@ static int nf_tables_dump_tables(struct sk_buff *skb, struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; - list_for_each_entry(afi, &net->nft.af_info, list) { + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { if (family != NFPROTO_UNSPEC && family != afi->family) continue; - list_for_each_entry(table, &afi->tables, list) { + list_for_each_entry_rcu(table, &afi->tables, list) { if (idx < s_idx) goto cont; if (idx > s_idx) @@ -253,15 +297,21 @@ static int nf_tables_dump_tables(struct sk_buff *skb, NLM_F_MULTI, afi->family, table) < 0) goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } } done: + rcu_read_unlock(); cb->args[0] = idx; return skb->len; } +/* Internal table flags */ +#define NFT_TABLE_INACTIVE (1 << 15) + static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -288,6 +338,8 @@ static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb, table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]); if (IS_ERR(table)) return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -306,13 +358,17 @@ err: return err; } -static int nf_tables_table_enable(struct nft_table *table) +static int nf_tables_table_enable(const struct nft_af_info *afi, + struct nft_table *table) { struct nft_chain *chain; int err, i = 0; list_for_each_entry(chain, &table->chains, list) { - err = nf_register_hook(&nft_base_chain(chain)->ops); + if (!(chain->flags & NFT_BASE_CHAIN)) + continue; + + err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops); if (err < 0) goto err; @@ -321,59 +377,87 @@ static int nf_tables_table_enable(struct nft_table *table) return 0; err: list_for_each_entry(chain, &table->chains, list) { + if (!(chain->flags & NFT_BASE_CHAIN)) + continue; + if (i-- <= 0) break; - nf_unregister_hook(&nft_base_chain(chain)->ops); + nf_unregister_hooks(nft_base_chain(chain)->ops, afi->nops); } return err; } -static int nf_tables_table_disable(struct nft_table *table) +static void nf_tables_table_disable(const struct nft_af_info *afi, + struct nft_table *table) { struct nft_chain *chain; - list_for_each_entry(chain, &table->chains, list) - nf_unregister_hook(&nft_base_chain(chain)->ops); - - return 0; + list_for_each_entry(chain, &table->chains, list) { + if (chain->flags & NFT_BASE_CHAIN) + nf_unregister_hooks(nft_base_chain(chain)->ops, + afi->nops); + } } -static int nf_tables_updtable(struct sock *nlsk, struct sk_buff *skb, 
- const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct nft_af_info *afi, struct nft_table *table) +static int nf_tables_updtable(struct nft_ctx *ctx) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - int family = nfmsg->nfgen_family, ret = 0; + struct nft_trans *trans; + u32 flags; + int ret = 0; - if (nla[NFTA_TABLE_FLAGS]) { - __be32 flags; + if (!ctx->nla[NFTA_TABLE_FLAGS]) + return 0; - flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS])); - if (flags & ~NFT_TABLE_F_DORMANT) - return -EINVAL; + flags = ntohl(nla_get_be32(ctx->nla[NFTA_TABLE_FLAGS])); + if (flags & ~NFT_TABLE_F_DORMANT) + return -EINVAL; + + if (flags == ctx->table->flags) + return 0; + + trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, + sizeof(struct nft_trans_table)); + if (trans == NULL) + return -ENOMEM; - if ((flags & NFT_TABLE_F_DORMANT) && - !(table->flags & NFT_TABLE_F_DORMANT)) { - ret = nf_tables_table_disable(table); - if (ret >= 0) - table->flags |= NFT_TABLE_F_DORMANT; - } else if (!(flags & NFT_TABLE_F_DORMANT) && - table->flags & NFT_TABLE_F_DORMANT) { - ret = nf_tables_table_enable(table); - if (ret >= 0) - table->flags &= ~NFT_TABLE_F_DORMANT; + if ((flags & NFT_TABLE_F_DORMANT) && + !(ctx->table->flags & NFT_TABLE_F_DORMANT)) { + nft_trans_table_enable(trans) = false; + } else if (!(flags & NFT_TABLE_F_DORMANT) && + ctx->table->flags & NFT_TABLE_F_DORMANT) { + ret = nf_tables_table_enable(ctx->afi, ctx->table); + if (ret >= 0) { + ctx->table->flags &= ~NFT_TABLE_F_DORMANT; + nft_trans_table_enable(trans) = true; } - if (ret < 0) - goto err; } + if (ret < 0) + goto err; - nf_tables_table_notify(skb, nlh, table, NFT_MSG_NEWTABLE, family); + nft_trans_table_update(trans) = true; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + return 0; err: + nft_trans_destroy(trans); return ret; } +static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_table)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWTABLE) + ctx->table->flags |= NFT_TABLE_INACTIVE; + + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + return 0; +} + static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -384,6 +468,9 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, struct nft_table *table; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + u32 flags = 0; + struct nft_ctx ctx; + int err; afi = nf_tables_afinfo_lookup(net, family, true); if (IS_ERR(afi)) @@ -398,35 +485,45 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, } if (table != NULL) { + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - return nf_tables_updtable(nlsk, skb, nlh, nla, afi, table); + + nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + return nf_tables_updtable(&ctx); } + if (nla[NFTA_TABLE_FLAGS]) { + flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS])); + if (flags & ~NFT_TABLE_F_DORMANT) + return -EINVAL; + } + + if (!try_module_get(afi->owner)) + return -EAFNOSUPPORT; + table = kzalloc(sizeof(*table) + nla_len(name), GFP_KERNEL); - if (table == NULL) + if (table == NULL) { + module_put(afi->owner); return -ENOMEM; + } nla_strlcpy(table->name, name, nla_len(name)); INIT_LIST_HEAD(&table->chains); INIT_LIST_HEAD(&table->sets); + 
table->flags = flags; - if (nla[NFTA_TABLE_FLAGS]) { - __be32 flags; - - flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS])); - if (flags & ~NFT_TABLE_F_DORMANT) { - kfree(table); - return -EINVAL; - } - - table->flags |= flags; + nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); + if (err < 0) { + kfree(table); + module_put(afi->owner); + return err; } - - list_add_tail(&table->list, &afi->tables); - nf_tables_table_notify(skb, nlh, table, NFT_MSG_NEWTABLE, family); + list_add_tail_rcu(&table->list, &afi->tables); return 0; } @@ -438,7 +535,8 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, struct nft_af_info *afi; struct nft_table *table; struct net *net = sock_net(skb->sk); - int family = nfmsg->nfgen_family; + int family = nfmsg->nfgen_family, err; + struct nft_ctx ctx; afi = nf_tables_afinfo_lookup(net, family, false); if (IS_ERR(afi)) @@ -447,17 +545,29 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]); if (IS_ERR(table)) return PTR_ERR(table); - - if (table->use) + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; + if (table->use > 0) return -EBUSY; - list_del(&table->list); - nf_tables_table_notify(skb, nlh, table, NFT_MSG_DELTABLE, family); - kfree(table); + nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + err = nft_trans_table_add(&ctx, NFT_MSG_DELTABLE); + if (err < 0) + return err; + + list_del_rcu(&table->list); return 0; } -int nft_register_chain_type(struct nf_chain_type *ctype) +static void nf_tables_table_destroy(struct nft_ctx *ctx) +{ + BUG_ON(ctx->table->use > 0); + + kfree(ctx->table); + module_put(ctx->afi->owner); +} + +int nft_register_chain_type(const struct nf_chain_type *ctype) { int err = 0; @@ -466,10 +576,6 @@ int nft_register_chain_type(struct nf_chain_type *ctype) err = -EBUSY; goto out; } - - if (!try_module_get(ctype->me)) - goto out; - chain_type[ctype->family][ctype->type] = ctype; out: nfnl_unlock(NFNL_SUBSYS_NFTABLES); @@ -477,11 +583,10 @@ out: } EXPORT_SYMBOL_GPL(nft_register_chain_type); -void nft_unregister_chain_type(struct nf_chain_type *ctype) +void nft_unregister_chain_type(const struct nf_chain_type *ctype) { nfnl_lock(NFNL_SUBSYS_NFTABLES); chain_type[ctype->family][ctype->type] = NULL; - module_put(ctype->me); nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_unregister_chain_type); @@ -526,7 +631,7 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { .len = NFT_CHAIN_MAXNAMELEN - 1 }, [NFTA_CHAIN_HOOK] = { .type = NLA_NESTED }, [NFTA_CHAIN_POLICY] = { .type = NLA_U32 }, - [NFTA_CHAIN_TYPE] = { .type = NLA_NUL_STRING }, + [NFTA_CHAIN_TYPE] = { .type = NLA_STRING }, [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, }; @@ -539,13 +644,20 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) { struct nft_stats *cpu_stats, total; struct nlattr *nest; + unsigned int seq; + u64 pkts, bytes; int cpu; memset(&total, 0, sizeof(total)); for_each_possible_cpu(cpu) { cpu_stats = per_cpu_ptr(stats, cpu); - total.pkts += cpu_stats->pkts; - total.bytes += cpu_stats->bytes; + do { + seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + pkts = cpu_stats->pkts; + bytes = cpu_stats->bytes; + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq)); + total.pkts += pkts; + total.bytes += bytes; } nest = nla_nest_start(skb, NFTA_CHAIN_COUNTERS); if (nest == NULL) @@ -589,7 +701,7 @@ static int nf_tables_fill_chain_info(struct 
sk_buff *skb, u32 portid, u32 seq, if (chain->flags & NFT_BASE_CHAIN) { const struct nft_base_chain *basechain = nft_base_chain(chain); - const struct nf_hook_ops *ops = &basechain->ops; + const struct nf_hook_ops *ops = &basechain->ops[0]; struct nlattr *nest; nest = nla_nest_start(skb, NFTA_CHAIN_HOOK); @@ -605,9 +717,8 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq, htonl(basechain->policy))) goto nla_put_failure; - if (nla_put_string(skb, NFTA_CHAIN_TYPE, - chain_type[ops->pf][nft_base_chain(chain)->type]->name)) - goto nla_put_failure; + if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name)) + goto nla_put_failure; if (nft_dump_stats(skb, nft_base_chain(chain)->stats)) goto nla_put_failure; @@ -623,21 +734,13 @@ nla_put_failure: return -1; } -static int nf_tables_chain_notify(const struct sk_buff *oskb, - const struct nlmsghdr *nlh, - const struct nft_table *table, - const struct nft_chain *chain, - int event, int family) +static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event) { struct sk_buff *skb; - u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; - struct net *net = oskb ? sock_net(oskb->sk) : &init_net; - u32 seq = nlh ? nlh->nlmsg_seq : 0; - bool report; int err; - report = nlh ? nlmsg_report(nlh) : false; - if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) + if (!ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return 0; err = -ENOBUFS; @@ -645,18 +748,21 @@ static int nf_tables_chain_notify(const struct sk_buff *oskb, if (skb == NULL) goto err; - err = nf_tables_fill_chain_info(skb, portid, seq, event, 0, family, - table, chain); + err = nf_tables_fill_chain_info(skb, ctx->portid, ctx->seq, event, 0, + ctx->afi->family, ctx->table, + ctx->chain); if (err < 0) { kfree_skb(skb); goto err; } - err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, - GFP_KERNEL); + err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); err: - if (err < 0) - nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); + if (err < 0) { + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, + err); + } return err; } @@ -671,12 +777,15 @@ static int nf_tables_dump_chains(struct sk_buff *skb, struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; - list_for_each_entry(afi, &net->nft.af_info, list) { + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { if (family != NFPROTO_UNSPEC && family != afi->family) continue; - list_for_each_entry(table, &afi->tables, list) { - list_for_each_entry(chain, &table->chains, list) { + list_for_each_entry_rcu(table, &afi->tables, list) { + list_for_each_entry_rcu(chain, &table->chains, list) { if (idx < s_idx) goto cont; if (idx > s_idx) @@ -688,17 +797,19 @@ static int nf_tables_dump_chains(struct sk_buff *skb, NLM_F_MULTI, afi->family, table, chain) < 0) goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } } } done: + rcu_read_unlock(); cb->args[0] = idx; return skb->len; } - static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -726,10 +837,14 @@ static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb, table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]); if 
(IS_ERR(chain)) return PTR_ERR(chain); + if (chain->flags & NFT_CHAIN_INACTIVE) + return -ENOENT; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -748,29 +863,12 @@ err: return err; } -static int -nf_tables_chain_policy(struct nft_base_chain *chain, const struct nlattr *attr) -{ - switch (ntohl(nla_get_be32(attr))) { - case NF_DROP: - chain->policy = NF_DROP; - break; - case NF_ACCEPT: - chain->policy = NF_ACCEPT; - break; - default: - return -EINVAL; - } - return 0; -} - static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = { [NFTA_COUNTER_PACKETS] = { .type = NLA_U64 }, [NFTA_COUNTER_BYTES] = { .type = NLA_U64 }, }; -static int -nf_tables_counters(struct nft_base_chain *chain, const struct nlattr *attr) +static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) { struct nlattr *tb[NFTA_COUNTER_MAX+1]; struct nft_stats __percpu *newstats; @@ -779,14 +877,14 @@ nf_tables_counters(struct nft_base_chain *chain, const struct nlattr *attr) err = nla_parse_nested(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy); if (err < 0) - return err; + return ERR_PTR(err); if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS]) - return -EINVAL; + return ERR_PTR(-EINVAL); - newstats = alloc_percpu(struct nft_stats); + newstats = netdev_alloc_pcpu_stats(struct nft_stats); if (newstats == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* Restore old counters on this cpu, no problem. Per-cpu statistics * are not exposed to userspace. @@ -795,36 +893,71 @@ nf_tables_counters(struct nft_base_chain *chain, const struct nlattr *attr) stats->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); stats->pkts = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); + return newstats; +} + +static void nft_chain_stats_replace(struct nft_base_chain *chain, + struct nft_stats __percpu *newstats) +{ if (chain->stats) { - /* nfnl_lock is held, add some nfnl function for this, later */ struct nft_stats __percpu *oldstats = - rcu_dereference_protected(chain->stats, 1); + nft_dereference(chain->stats); rcu_assign_pointer(chain->stats, newstats); synchronize_rcu(); free_percpu(oldstats); } else rcu_assign_pointer(chain->stats, newstats); +} +static int nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWCHAIN) + ctx->chain->flags |= NFT_CHAIN_INACTIVE; + + list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; } +static void nf_tables_chain_destroy(struct nft_chain *chain) +{ + BUG_ON(chain->use > 0); + + if (chain->flags & NFT_BASE_CHAIN) { + module_put(nft_base_chain(chain)->type->owner); + free_percpu(nft_base_chain(chain)->stats); + kfree(nft_base_chain(chain)); + } else { + kfree(chain); + } +} + static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nlattr * uninitialized_var(name); - const struct nft_af_info *afi; + struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; struct nft_base_chain *basechain = NULL; struct nlattr *ha[NFTA_HOOK_MAX + 1]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + u8 policy = NF_ACCEPT; u64 handle = 0; + unsigned int i; + struct nft_stats __percpu *stats; int err; bool create; + struct nft_ctx ctx; create = nlh->nlmsg_flags & NLM_F_CREATE ? 
true : false; @@ -836,9 +969,6 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, if (IS_ERR(table)) return PTR_ERR(table); - if (table->use == UINT_MAX) - return -EOVERFLOW; - chain = NULL; name = nla[NFTA_CHAIN_NAME]; @@ -856,7 +986,28 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, } } + if (nla[NFTA_CHAIN_POLICY]) { + if ((chain != NULL && + !(chain->flags & NFT_BASE_CHAIN)) || + nla[NFTA_CHAIN_HOOK] == NULL) + return -EOPNOTSUPP; + + policy = ntohl(nla_get_be32(nla[NFTA_CHAIN_POLICY])); + switch (policy) { + case NF_DROP: + case NF_ACCEPT: + break; + default: + return -EINVAL; + } + } + if (chain != NULL) { + struct nft_stats *stats = NULL; + struct nft_trans *trans; + + if (chain->flags & NFT_CHAIN_INACTIVE) + return -ENOENT; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (nlh->nlmsg_flags & NLM_F_REPLACE) @@ -866,44 +1017,53 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, !IS_ERR(nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]))) return -EEXIST; - if (nla[NFTA_CHAIN_POLICY]) { + if (nla[NFTA_CHAIN_COUNTERS]) { if (!(chain->flags & NFT_BASE_CHAIN)) return -EOPNOTSUPP; - err = nf_tables_chain_policy(nft_base_chain(chain), - nla[NFTA_CHAIN_POLICY]); - if (err < 0) - return err; + stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); + if (IS_ERR(stats)) + return PTR_ERR(stats); } - if (nla[NFTA_CHAIN_COUNTERS]) { - if (!(chain->flags & NFT_BASE_CHAIN)) - return -EOPNOTSUPP; + nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + trans = nft_trans_alloc(&ctx, NFT_MSG_NEWCHAIN, + sizeof(struct nft_trans_chain)); + if (trans == NULL) + return -ENOMEM; - err = nf_tables_counters(nft_base_chain(chain), - nla[NFTA_CHAIN_COUNTERS]); - if (err < 0) - return err; - } + nft_trans_chain_stats(trans) = stats; + nft_trans_chain_update(trans) = true; - if (nla[NFTA_CHAIN_HANDLE] && name) - nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN); + if (nla[NFTA_CHAIN_POLICY]) + nft_trans_chain_policy(trans) = policy; + else + nft_trans_chain_policy(trans) = -1; - goto notify; + if (nla[NFTA_CHAIN_HANDLE] && name) { + nla_strlcpy(nft_trans_chain_name(trans), name, + NFT_CHAIN_MAXNAMELEN); + } + list_add_tail(&trans->list, &net->nft.commit_list); + return 0; } + if (table->use == UINT_MAX) + return -EOVERFLOW; + if (nla[NFTA_CHAIN_HOOK]) { + const struct nf_chain_type *type; struct nf_hook_ops *ops; nf_hookfn *hookfn; - u32 hooknum; - int type = NFT_CHAIN_T_DEFAULT; + u32 hooknum, priority; + type = chain_type[family][NFT_CHAIN_T_DEFAULT]; if (nla[NFTA_CHAIN_TYPE]) { type = nf_tables_chain_type_lookup(afi, nla[NFTA_CHAIN_TYPE], create); - if (type < 0) - return -ENOENT; + if (IS_ERR(type)) + return PTR_ERR(type); } err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK], @@ -917,59 +1077,55 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, hooknum = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); if (hooknum >= afi->nhooks) return -EINVAL; + priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); - hookfn = chain_type[family][type]->fn[hooknum]; - if (hookfn == NULL) + if (!(type->hook_mask & (1 << hooknum))) return -EOPNOTSUPP; + if (!try_module_get(type->owner)) + return -ENOENT; + hookfn = type->hooks[hooknum]; basechain = kzalloc(sizeof(*basechain), GFP_KERNEL); if (basechain == NULL) return -ENOMEM; - basechain->type = type; - chain = &basechain->chain; - - ops = &basechain->ops; - ops->pf = family; - ops->owner = afi->owner; - ops->hooknum = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); - 
ops->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); - ops->priv = chain; - ops->hook = hookfn; - if (afi->hooks[ops->hooknum]) - ops->hook = afi->hooks[ops->hooknum]; - - chain->flags |= NFT_BASE_CHAIN; - - if (nla[NFTA_CHAIN_POLICY]) { - err = nf_tables_chain_policy(basechain, - nla[NFTA_CHAIN_POLICY]); - if (err < 0) { - free_percpu(basechain->stats); - kfree(basechain); - return err; - } - } else - basechain->policy = NF_ACCEPT; - if (nla[NFTA_CHAIN_COUNTERS]) { - err = nf_tables_counters(basechain, - nla[NFTA_CHAIN_COUNTERS]); - if (err < 0) { - free_percpu(basechain->stats); + stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); + if (IS_ERR(stats)) { + module_put(type->owner); kfree(basechain); - return err; + return PTR_ERR(stats); } + basechain->stats = stats; } else { - struct nft_stats __percpu *newstats; + stats = netdev_alloc_pcpu_stats(struct nft_stats); + if (IS_ERR(stats)) { + module_put(type->owner); + kfree(basechain); + return PTR_ERR(stats); + } + rcu_assign_pointer(basechain->stats, stats); + } - newstats = alloc_percpu(struct nft_stats); - if (newstats == NULL) - return -ENOMEM; + basechain->type = type; + chain = &basechain->chain; - rcu_assign_pointer(nft_base_chain(chain)->stats, - newstats); + for (i = 0; i < afi->nops; i++) { + ops = &basechain->ops[i]; + ops->pf = family; + ops->owner = afi->owner; + ops->hooknum = hooknum; + ops->priority = priority; + ops->priv = chain; + ops->hook = afi->hooks[ops->hooknum]; + if (hookfn) + ops->hook = hookfn; + if (afi->hook_ops_init) + afi->hook_ops_init(ops, i); } + + chain->flags |= NFT_BASE_CHAIN; + basechain->policy = policy; } else { chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (chain == NULL) @@ -984,32 +1140,28 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, if (!(table->flags & NFT_TABLE_F_DORMANT) && chain->flags & NFT_BASE_CHAIN) { - err = nf_register_hook(&nft_base_chain(chain)->ops); - if (err < 0) { - free_percpu(basechain->stats); - kfree(basechain); - return err; - } + err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops); + if (err < 0) + goto err1; } - list_add_tail(&chain->list, &table->chains); - table->use++; -notify: - nf_tables_chain_notify(skb, nlh, table, chain, NFT_MSG_NEWCHAIN, - family); - return 0; -} - -static void nf_tables_rcu_chain_destroy(struct rcu_head *head) -{ - struct nft_chain *chain = container_of(head, struct nft_chain, rcu_head); - BUG_ON(chain->use > 0); + nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + err = nft_trans_chain_add(&ctx, NFT_MSG_NEWCHAIN); + if (err < 0) + goto err2; - if (chain->flags & NFT_BASE_CHAIN) { - free_percpu(nft_base_chain(chain)->stats); - kfree(nft_base_chain(chain)); - } else - kfree(chain); + table->use++; + list_add_tail_rcu(&chain->list, &table->chains); + return 0; +err2: + if (!(table->flags & NFT_TABLE_F_DORMANT) && + chain->flags & NFT_BASE_CHAIN) { + nf_unregister_hooks(nft_base_chain(chain)->ops, + afi->nops); + } +err1: + nf_tables_chain_destroy(chain); + return err; } static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, @@ -1017,11 +1169,13 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - const struct nft_af_info *afi; + struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct nft_ctx ctx; + int err; afi = nf_tables_afinfo_lookup(net, family, false); if (IS_ERR(afi)) @@ 
-1030,46 +1184,27 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]); if (IS_ERR(chain)) return PTR_ERR(chain); - - if (!list_empty(&chain->rules)) + if (chain->flags & NFT_CHAIN_INACTIVE) + return -ENOENT; + if (chain->use > 0) return -EBUSY; - list_del(&chain->list); - table->use--; - - if (!(table->flags & NFT_TABLE_F_DORMANT) && - chain->flags & NFT_BASE_CHAIN) - nf_unregister_hook(&nft_base_chain(chain)->ops); - - nf_tables_chain_notify(skb, nlh, table, chain, NFT_MSG_DELCHAIN, - family); + nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + err = nft_trans_chain_add(&ctx, NFT_MSG_DELCHAIN); + if (err < 0) + return err; - /* Make sure all rule references are gone before this is released */ - call_rcu(&chain->rcu_head, nf_tables_rcu_chain_destroy); + table->use--; + list_del_rcu(&chain->list); return 0; } -static void nft_ctx_init(struct nft_ctx *ctx, - const struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nft_af_info *afi, - const struct nft_table *table, - const struct nft_chain *chain, - const struct nlattr * const *nla) -{ - ctx->net = sock_net(skb->sk); - ctx->skb = skb; - ctx->nlh = nlh; - ctx->afi = afi; - ctx->table = table; - ctx->chain = chain; - ctx->nla = nla; -} - /* * Expressions */ @@ -1084,7 +1219,10 @@ static void nft_ctx_init(struct nft_ctx *ctx, int nft_register_expr(struct nft_expr_type *type) { nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_add_tail(&type->list, &nf_tables_expressions); + if (type->family == NFPROTO_UNSPEC) + list_add_tail_rcu(&type->list, &nf_tables_expressions); + else + list_add_rcu(&type->list, &nf_tables_expressions); nfnl_unlock(NFNL_SUBSYS_NFTABLES); return 0; } @@ -1099,40 +1237,50 @@ EXPORT_SYMBOL_GPL(nft_register_expr); void nft_unregister_expr(struct nft_expr_type *type) { nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_del(&type->list); + list_del_rcu(&type->list); nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_unregister_expr); -static const struct nft_expr_type *__nft_expr_type_get(struct nlattr *nla) +static const struct nft_expr_type *__nft_expr_type_get(u8 family, + struct nlattr *nla) { const struct nft_expr_type *type; list_for_each_entry(type, &nf_tables_expressions, list) { - if (!nla_strcmp(nla, type->name)) + if (!nla_strcmp(nla, type->name) && + (!type->family || type->family == family)) return type; } return NULL; } -static const struct nft_expr_type *nft_expr_type_get(struct nlattr *nla) +static const struct nft_expr_type *nft_expr_type_get(u8 family, + struct nlattr *nla) { const struct nft_expr_type *type; if (nla == NULL) return ERR_PTR(-EINVAL); - type = __nft_expr_type_get(nla); + type = __nft_expr_type_get(family, nla); if (type != NULL && try_module_get(type->owner)) return type; #ifdef CONFIG_MODULES if (type == NULL) { nfnl_unlock(NFNL_SUBSYS_NFTABLES); + request_module("nft-expr-%u-%.*s", family, + nla_len(nla), (char *)nla_data(nla)); + nfnl_lock(NFNL_SUBSYS_NFTABLES); + if (__nft_expr_type_get(family, nla)) + return ERR_PTR(-EAGAIN); + + nfnl_unlock(NFNL_SUBSYS_NFTABLES); request_module("nft-expr-%.*s", nla_len(nla), (char *)nla_data(nla)); nfnl_lock(NFNL_SUBSYS_NFTABLES); - if (__nft_expr_type_get(nla)) + if (__nft_expr_type_get(family, nla)) return ERR_PTR(-EAGAIN); } #endif @@ -1183,7 +1331,7 @@ static int nf_tables_expr_parse(const struct nft_ctx 
*ctx, if (err < 0) return err; - type = nft_expr_type_get(tb[NFTA_EXPR_NAME]); + type = nft_expr_type_get(ctx->afi->family, tb[NFTA_EXPR_NAME]); if (IS_ERR(type)) return PTR_ERR(type); @@ -1234,10 +1382,11 @@ err1: return err; } -static void nf_tables_expr_destroy(struct nft_expr *expr) +static void nf_tables_expr_destroy(const struct nft_ctx *ctx, + struct nft_expr *expr) { if (expr->ops->destroy) - expr->ops->destroy(expr); + expr->ops->destroy(ctx, expr); module_put(expr->ops->type->owner); } @@ -1276,6 +1425,8 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { [NFTA_RULE_EXPRESSIONS] = { .type = NLA_NESTED }, [NFTA_RULE_COMPAT] = { .type = NLA_NESTED }, [NFTA_RULE_POSITION] = { .type = NLA_U64 }, + [NFTA_RULE_USERDATA] = { .type = NLA_BINARY, + .len = NFT_USERDATA_MAXLEN }, }; static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq, @@ -1328,6 +1479,10 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq, } nla_nest_end(skb, list); + if (rule->ulen && + nla_put(skb, NFTA_RULE_USERDATA, rule->ulen, nft_userdata(rule))) + goto nla_put_failure; + return nlmsg_end(skb, nlh); nla_put_failure: @@ -1335,22 +1490,15 @@ nla_put_failure: return -1; } -static int nf_tables_rule_notify(const struct sk_buff *oskb, - const struct nlmsghdr *nlh, - const struct nft_table *table, - const struct nft_chain *chain, +static int nf_tables_rule_notify(const struct nft_ctx *ctx, const struct nft_rule *rule, - int event, u32 flags, int family) + int event) { struct sk_buff *skb; - u32 portid = NETLINK_CB(oskb).portid; - struct net *net = oskb ? sock_net(oskb->sk) : &init_net; - u32 seq = nlh->nlmsg_seq; - bool report; int err; - report = nlmsg_report(nlh); - if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) + if (!ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return 0; err = -ENOBUFS; @@ -1358,18 +1506,21 @@ static int nf_tables_rule_notify(const struct sk_buff *oskb, if (skb == NULL) goto err; - err = nf_tables_fill_rule_info(skb, portid, seq, event, flags, - family, table, chain, rule); + err = nf_tables_fill_rule_info(skb, ctx->portid, ctx->seq, event, 0, + ctx->afi->family, ctx->table, + ctx->chain, rule); if (err < 0) { kfree_skb(skb); goto err; } - err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, - GFP_KERNEL); + err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); err: - if (err < 0) - nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); + if (err < 0) { + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, + err); + } return err; } @@ -1419,16 +1570,17 @@ static int nf_tables_dump_rules(struct sk_buff *skb, unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; - u8 genctr = ACCESS_ONCE(net->nft.genctr); - u8 gencursor = ACCESS_ONCE(net->nft.gencursor); - list_for_each_entry(afi, &net->nft.af_info, list) { + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { if (family != NFPROTO_UNSPEC && family != afi->family) continue; - list_for_each_entry(table, &afi->tables, list) { - list_for_each_entry(chain, &table->chains, list) { - list_for_each_entry(rule, &chain->rules, list) { + list_for_each_entry_rcu(table, &afi->tables, list) { + list_for_each_entry_rcu(chain, &table->chains, list) { + list_for_each_entry_rcu(rule, &chain->rules, list) { if (!nft_rule_is_active(net, rule)) goto cont; if (idx < s_idx) @@ -1442,6 
+1594,8 @@ static int nf_tables_dump_rules(struct sk_buff *skb, NLM_F_MULTI | NLM_F_APPEND, afi->family, table, chain, rule) < 0) goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } @@ -1449,9 +1603,7 @@ cont: } } done: - /* Invalidate this dump, a transition to the new generation happened */ - if (gencursor != net->nft.gencursor || genctr != net->nft.genctr) - return -EBUSY; + rcu_read_unlock(); cb->args[0] = idx; return skb->len; @@ -1485,10 +1637,14 @@ static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb, table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); if (IS_ERR(chain)) return PTR_ERR(chain); + if (chain->flags & NFT_CHAIN_INACTIVE) + return -ENOENT; rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) @@ -1511,9 +1667,9 @@ err: return err; } -static void nf_tables_rcu_rule_destroy(struct rcu_head *head) +static void nf_tables_rule_destroy(const struct nft_ctx *ctx, + struct nft_rule *rule) { - struct nft_rule *rule = container_of(head, struct nft_rule, rcu_head); struct nft_expr *expr; /* @@ -1522,55 +1678,46 @@ static void nf_tables_rcu_rule_destroy(struct rcu_head *head) */ expr = nft_expr_first(rule); while (expr->ops && expr != nft_expr_last(rule)) { - nf_tables_expr_destroy(expr); + nf_tables_expr_destroy(ctx, expr); expr = nft_expr_next(expr); } kfree(rule); } -static void nf_tables_rule_destroy(struct nft_rule *rule) +static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, + struct nft_rule *rule) { - call_rcu(&rule->rcu_head, nf_tables_rcu_rule_destroy); -} - -#define NFT_RULE_MAXEXPRS 128 + struct nft_trans *trans; -static struct nft_expr_info *info; + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_rule)); + if (trans == NULL) + return NULL; -static struct nft_rule_trans * -nf_tables_trans_add(struct nft_rule *rule, const struct nft_ctx *ctx) -{ - struct nft_rule_trans *rupd; + nft_trans_rule(trans) = rule; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); - rupd = kmalloc(sizeof(struct nft_rule_trans), GFP_KERNEL); - if (rupd == NULL) - return NULL; + return trans; +} - rupd->chain = ctx->chain; - rupd->table = ctx->table; - rupd->rule = rule; - rupd->family = ctx->afi->family; - rupd->nlh = ctx->nlh; - list_add_tail(&rupd->list, &ctx->net->nft.commit_list); +#define NFT_RULE_MAXEXPRS 128 - return rupd; -} +static struct nft_expr_info *info; static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - const struct nft_af_info *afi; + struct nft_af_info *afi; struct net *net = sock_net(skb->sk); struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule, *old_rule = NULL; - struct nft_rule_trans *repl = NULL; + struct nft_trans *trans = NULL; struct nft_expr *expr; struct nft_ctx ctx; struct nlattr *tmp; - unsigned int size, i, n; + unsigned int size, i, n, ulen = 0; int err, rem; bool create; u64 handle, pos_handle; @@ -1605,6 +1752,9 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, if (!create || nlh->nlmsg_flags & NLM_F_REPLACE) return -EINVAL; handle = nf_tables_alloc_handle(table); + + if (chain->use == UINT_MAX) + return -EOVERFLOW; } if (nla[NFTA_RULE_POSITION]) { @@ -1636,8 +1786,11 @@ static int nf_tables_newrule(struct sock *nlsk, 
struct sk_buff *skb, } } + if (nla[NFTA_RULE_USERDATA]) + ulen = nla_len(nla[NFTA_RULE_USERDATA]); + err = -ENOMEM; - rule = kzalloc(sizeof(*rule) + size, GFP_KERNEL); + rule = kzalloc(sizeof(*rule) + size + ulen, GFP_KERNEL); if (rule == NULL) goto err1; @@ -1645,6 +1798,10 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, rule->handle = handle; rule->dlen = size; + rule->ulen = ulen; + + if (ulen) + nla_memcpy(nft_userdata(rule), nla[NFTA_RULE_USERDATA], ulen); expr = nft_expr_first(rule); for (i = 0; i < n; i++) { @@ -1657,13 +1814,15 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, if (nlh->nlmsg_flags & NLM_F_REPLACE) { if (nft_rule_is_active_next(net, old_rule)) { - repl = nf_tables_trans_add(old_rule, &ctx); - if (repl == NULL) { + trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE, + old_rule); + if (trans == NULL) { err = -ENOMEM; goto err2; } nft_rule_disactivate_next(net, old_rule); - list_add_tail(&rule->list, &old_rule->list); + chain->use--; + list_add_tail_rcu(&rule->list, &old_rule->list); } else { err = -ENOENT; goto err2; @@ -1680,22 +1839,23 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, list_add_rcu(&rule->list, &chain->rules); } - if (nf_tables_trans_add(rule, &ctx) == NULL) { + if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) { err = -ENOMEM; goto err3; } + chain->use++; return 0; err3: list_del_rcu(&rule->list); - if (repl) { - list_del_rcu(&repl->rule->list); - list_del(&repl->list); - nft_rule_clear(net, repl->rule); - kfree(repl); + if (trans) { + list_del_rcu(&nft_trans_rule(trans)->list); + nft_rule_clear(net, nft_trans_rule(trans)); + nft_trans_destroy(trans); + chain->use++; } err2: - nf_tables_rule_destroy(rule); + nf_tables_rule_destroy(&ctx, rule); err1: for (i = 0; i < n; i++) { if (info[i].ops != NULL) @@ -1709,24 +1869,38 @@ nf_tables_delrule_one(struct nft_ctx *ctx, struct nft_rule *rule) { /* You cannot delete the same rule twice */ if (nft_rule_is_active_next(ctx->net, rule)) { - if (nf_tables_trans_add(rule, ctx) == NULL) + if (nft_trans_rule_add(ctx, NFT_MSG_DELRULE, rule) == NULL) return -ENOMEM; nft_rule_disactivate_next(ctx->net, rule); + ctx->chain->use--; return 0; } return -ENOENT; } +static int nf_table_delrule_by_chain(struct nft_ctx *ctx) +{ + struct nft_rule *rule; + int err; + + list_for_each_entry(rule, &ctx->chain->rules, list) { + err = nf_tables_delrule_one(ctx, rule); + if (err < 0) + return err; + } + return 0; +} + static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - const struct nft_af_info *afi; + struct nft_af_info *afi; struct net *net = sock_net(skb->sk); - const struct nft_table *table; - struct nft_chain *chain; - struct nft_rule *rule, *tmp; + struct nft_table *table; + struct nft_chain *chain = NULL; + struct nft_rule *rule; int family = nfmsg->nfgen_family, err = 0; struct nft_ctx ctx; @@ -1737,23 +1911,32 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); - if (IS_ERR(chain)) - return PTR_ERR(chain); + if (nla[NFTA_RULE_CHAIN]) { + chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); + if (IS_ERR(chain)) + return PTR_ERR(chain); + } nft_ctx_init(&ctx, skb, nlh, afi, 
table, chain, nla); - if (nla[NFTA_RULE_HANDLE]) { - rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); - if (IS_ERR(rule)) - return PTR_ERR(rule); + if (chain) { + if (nla[NFTA_RULE_HANDLE]) { + rule = nf_tables_rule_lookup(chain, + nla[NFTA_RULE_HANDLE]); + if (IS_ERR(rule)) + return PTR_ERR(rule); - err = nf_tables_delrule_one(&ctx, rule); - } else { - /* Remove all rules in this chain */ - list_for_each_entry_safe(rule, tmp, &chain->rules, list) { err = nf_tables_delrule_one(&ctx, rule); + } else { + err = nf_table_delrule_by_chain(&ctx); + } + } else { + list_for_each_entry(chain, &table->chains, list) { + ctx.chain = chain; + err = nf_table_delrule_by_chain(&ctx); if (err < 0) break; } @@ -1762,75 +1945,6 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, return err; } -static int nf_tables_commit(struct sk_buff *skb) -{ - struct net *net = sock_net(skb->sk); - struct nft_rule_trans *rupd, *tmp; - - /* Bump generation counter, invalidate any dump in progress */ - net->nft.genctr++; - - /* A new generation has just started */ - net->nft.gencursor = gencursor_next(net); - - /* Make sure all packets have left the previous generation before - * purging old rules. - */ - synchronize_rcu(); - - list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) { - /* Delete this rule from the dirty list */ - list_del(&rupd->list); - - /* This rule was inactive in the past and just became active. - * Clear the next bit of the genmask since its meaning has - * changed, now it is the future. - */ - if (nft_rule_is_active(net, rupd->rule)) { - nft_rule_clear(net, rupd->rule); - nf_tables_rule_notify(skb, rupd->nlh, rupd->table, - rupd->chain, rupd->rule, - NFT_MSG_NEWRULE, 0, - rupd->family); - kfree(rupd); - continue; - } - - /* This rule is in the past, get rid of it */ - list_del_rcu(&rupd->rule->list); - nf_tables_rule_notify(skb, rupd->nlh, rupd->table, rupd->chain, - rupd->rule, NFT_MSG_DELRULE, 0, - rupd->family); - nf_tables_rule_destroy(rupd->rule); - kfree(rupd); - } - - return 0; -} - -static int nf_tables_abort(struct sk_buff *skb) -{ - struct net *net = sock_net(skb->sk); - struct nft_rule_trans *rupd, *tmp; - - list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) { - /* Delete all rules from the dirty list */ - list_del(&rupd->list); - - if (!nft_rule_is_active_next(net, rupd->rule)) { - nft_rule_clear(net, rupd->rule); - kfree(rupd); - continue; - } - - /* This rule is inactive, get rid of it */ - list_del_rcu(&rupd->rule->list); - nf_tables_rule_destroy(rupd->rule); - kfree(rupd); - } - return 0; -} - /* * Sets */ @@ -1840,7 +1954,7 @@ static LIST_HEAD(nf_tables_set_ops); int nft_register_set(struct nft_set_ops *ops) { nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_add_tail(&ops->list, &nf_tables_set_ops); + list_add_tail_rcu(&ops->list, &nf_tables_set_ops); nfnl_unlock(NFNL_SUBSYS_NFTABLES); return 0; } @@ -1849,14 +1963,23 @@ EXPORT_SYMBOL_GPL(nft_register_set); void nft_unregister_set(struct nft_set_ops *ops) { nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_del(&ops->list); + list_del_rcu(&ops->list); nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_unregister_set); -static const struct nft_set_ops *nft_select_set_ops(const struct nlattr * const nla[]) +/* + * Select a set implementation based on the data characteristics and the + * given policy. The total memory use might not be known if no size is + * given, in that case the amount of memory per element is used. 
+ */ +static const struct nft_set_ops * +nft_select_set_ops(const struct nlattr * const nla[], + const struct nft_set_desc *desc, + enum nft_set_policies policy) { - const struct nft_set_ops *ops; + const struct nft_set_ops *ops, *bops; + struct nft_set_estimate est, best; u32 features; #ifdef CONFIG_MODULES @@ -1874,26 +1997,64 @@ static const struct nft_set_ops *nft_select_set_ops(const struct nlattr * const features &= NFT_SET_INTERVAL | NFT_SET_MAP; } - // FIXME: implement selection properly + bops = NULL; + best.size = ~0; + best.class = ~0; + list_for_each_entry(ops, &nf_tables_set_ops, list) { if ((ops->features & features) != features) continue; + if (!ops->estimate(desc, features, &est)) + continue; + + switch (policy) { + case NFT_SET_POL_PERFORMANCE: + if (est.class < best.class) + break; + if (est.class == best.class && est.size < best.size) + break; + continue; + case NFT_SET_POL_MEMORY: + if (est.size < best.size) + break; + if (est.size == best.size && est.class < best.class) + break; + continue; + default: + break; + } + if (!try_module_get(ops->owner)) continue; - return ops; + if (bops != NULL) + module_put(bops->owner); + + bops = ops; + best = est; } + if (bops != NULL) + return bops; + return ERR_PTR(-EOPNOTSUPP); } static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_TABLE] = { .type = NLA_STRING }, - [NFTA_SET_NAME] = { .type = NLA_STRING }, + [NFTA_SET_NAME] = { .type = NLA_STRING, + .len = IFNAMSIZ - 1 }, [NFTA_SET_FLAGS] = { .type = NLA_U32 }, [NFTA_SET_KEY_TYPE] = { .type = NLA_U32 }, [NFTA_SET_KEY_LEN] = { .type = NLA_U32 }, [NFTA_SET_DATA_TYPE] = { .type = NLA_U32 }, [NFTA_SET_DATA_LEN] = { .type = NLA_U32 }, + [NFTA_SET_POLICY] = { .type = NLA_U32 }, + [NFTA_SET_DESC] = { .type = NLA_NESTED }, + [NFTA_SET_ID] = { .type = NLA_U32 }, +}; + +static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { + [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 }, }; static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, @@ -1903,17 +2064,24 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, { struct net *net = sock_net(skb->sk); const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - const struct nft_af_info *afi; - const struct nft_table *table = NULL; + struct nft_af_info *afi = NULL; + struct nft_table *table = NULL; - afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false); - if (IS_ERR(afi)) - return PTR_ERR(afi); + if (nfmsg->nfgen_family != NFPROTO_UNSPEC) { + afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false); + if (IS_ERR(afi)) + return PTR_ERR(afi); + } if (nla[NFTA_SET_TABLE] != NULL) { + if (afi == NULL) + return -EAFNOSUPPORT; + table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; } nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla); @@ -1935,13 +2103,27 @@ struct nft_set *nf_tables_set_lookup(const struct nft_table *table, return ERR_PTR(-ENOENT); } +struct nft_set *nf_tables_set_lookup_byid(const struct net *net, + const struct nlattr *nla) +{ + struct nft_trans *trans; + u32 id = ntohl(nla_get_be32(nla)); + + list_for_each_entry(trans, &net->nft.commit_list, list) { + if (trans->msg_type == NFT_MSG_NEWSET && + id == nft_trans_set_id(trans)) + return nft_trans_set(trans); + } + return ERR_PTR(-ENOENT); +} + static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, const char *name) { const struct nft_set *i; const char *p; unsigned long *inuse; - unsigned int n = 0; + 
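[Editorial illustration, not part of the patch] The nft_select_set_ops() hunk above replaces the old "FIXME: implement selection properly" with a real selection step: every registered set backend reports an estimate through ops->estimate(), and the requested policy (NFT_SET_POL_PERFORMANCE or NFT_SET_POL_MEMORY) decides which estimate wins. A rough stand-alone sketch of just that comparison, with simplified types and invented numbers rather than the kernel API, could look like:

/*
 * Stand-alone model of the estimate comparison in nft_select_set_ops().
 * "class" stands for lookup complexity (lower is faster), "size" for the
 * estimated memory footprint. struct est, better() and the sample numbers
 * are illustrative stand-ins, not kernel definitions.
 */
#include <stdio.h>

struct est { unsigned int size; unsigned int class; };

enum policy { POL_PERFORMANCE, POL_MEMORY };

/* Return nonzero if estimate 'e' beats the best estimate seen so far. */
static int better(const struct est *e, const struct est *best, enum policy p)
{
	if (p == POL_PERFORMANCE)
		return e->class < best->class ||
		       (e->class == best->class && e->size < best->size);
	/* POL_MEMORY */
	return e->size < best->size ||
	       (e->size == best->size && e->class < best->class);
}

int main(void)
{
	struct est hash   = { .size = 4096, .class = 1 };	/* O(1) lookup, more memory */
	struct est rbtree = { .size = 1024, .class = 2 };	/* O(log n) lookup, less memory */
	struct est best   = { .size = ~0u,  .class = ~0u };
	const char *pick = NULL;

	/* Same walk as the patch: keep whichever candidate wins under the policy. */
	if (better(&hash, &best, POL_MEMORY))   { best = hash;   pick = "hash";   }
	if (better(&rbtree, &best, POL_MEMORY)) { best = rbtree; pick = "rbtree"; }

	printf("NFT_SET_POL_MEMORY picks: %s\n", pick);	/* rbtree */
	return 0;
}

Under NFT_SET_POL_PERFORMANCE the same walk would keep the hash candidate instead, since its lookup class is lower.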
unsigned int n = 0, min = 0; p = strnchr(name, IFNAMSIZ, '%'); if (p != NULL) { @@ -1951,20 +2133,28 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (inuse == NULL) return -ENOMEM; - +cont: list_for_each_entry(i, &ctx->table->sets, list) { - if (!sscanf(i->name, name, &n)) + int tmp; + + if (!sscanf(i->name, name, &tmp)) continue; - if (n < 0 || n > BITS_PER_LONG * PAGE_SIZE) + if (tmp < min || tmp >= min + BITS_PER_BYTE * PAGE_SIZE) continue; - set_bit(n, inuse); + + set_bit(tmp - min, inuse); } - n = find_first_zero_bit(inuse, BITS_PER_LONG * PAGE_SIZE); + n = find_first_zero_bit(inuse, BITS_PER_BYTE * PAGE_SIZE); + if (n >= BITS_PER_BYTE * PAGE_SIZE) { + min += BITS_PER_BYTE * PAGE_SIZE; + memset(inuse, 0, PAGE_SIZE); + goto cont; + } free_page((unsigned long)inuse); } - snprintf(set->name, sizeof(set->name), name, n); + snprintf(set->name, sizeof(set->name), name, min + n); list_for_each_entry(i, &ctx->table->sets, list) { if (!strcmp(set->name, i->name)) return -ENFILE; @@ -1977,8 +2167,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, { struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; - u32 portid = NETLINK_CB(ctx->skb).portid; - u32 seq = ctx->nlh->nlmsg_seq; + struct nlattr *desc; + u32 portid = ctx->portid; + u32 seq = ctx->seq; event |= NFNL_SUBSYS_NFTABLES << 8; nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), @@ -2010,6 +2201,14 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; } + desc = nla_nest_start(skb, NFTA_SET_DESC); + if (desc == NULL) + goto nla_put_failure; + if (set->size && + nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size))) + goto nla_put_failure; + nla_nest_end(skb, desc); + return nlmsg_end(skb, nlh); nla_put_failure: @@ -2019,19 +2218,18 @@ nla_put_failure: static int nf_tables_set_notify(const struct nft_ctx *ctx, const struct nft_set *set, - int event) + int event, gfp_t gfp_flags) { struct sk_buff *skb; - u32 portid = NETLINK_CB(ctx->skb).portid; - bool report; + u32 portid = ctx->portid; int err; - report = nlmsg_report(ctx->nlh); - if (!report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) + if (!ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return 0; err = -ENOBUFS; - skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + skb = nlmsg_new(NLMSG_GOODSIZE, gfp_flags); if (skb == NULL) goto err; @@ -2041,8 +2239,8 @@ static int nf_tables_set_notify(const struct nft_ctx *ctx, goto err; } - err = nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, report, - GFP_KERNEL); + err = nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, + ctx->report, gfp_flags); err: if (err < 0) nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, err); @@ -2058,7 +2256,10 @@ static int nf_tables_dump_sets_table(struct nft_ctx *ctx, struct sk_buff *skb, if (cb->args[1]) return skb->len; - list_for_each_entry(set, &ctx->table->sets, list) { + rcu_read_lock(); + cb->seq = ctx->net->nft.base_seq; + + list_for_each_entry_rcu(set, &ctx->table->sets, list) { if (idx < s_idx) goto cont; if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET, @@ -2066,30 +2267,39 @@ static int nf_tables_dump_sets_table(struct nft_ctx *ctx, struct sk_buff *skb, cb->args[0] = idx; goto done; } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } cb->args[1] = 1; done: + rcu_read_unlock(); return skb->len; } -static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct 
sk_buff *skb, - struct netlink_callback *cb) +static int nf_tables_dump_sets_family(struct nft_ctx *ctx, struct sk_buff *skb, + struct netlink_callback *cb) { const struct nft_set *set; - unsigned int idx = 0, s_idx = cb->args[0]; + unsigned int idx, s_idx = cb->args[0]; struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; if (cb->args[1]) return skb->len; - list_for_each_entry(table, &ctx->afi->tables, list) { - if (cur_table && cur_table != table) - continue; + rcu_read_lock(); + cb->seq = ctx->net->nft.base_seq; + + list_for_each_entry_rcu(table, &ctx->afi->tables, list) { + if (cur_table) { + if (cur_table != table) + continue; + cur_table = NULL; + } ctx->table = table; - list_for_each_entry(set, &ctx->table->sets, list) { + idx = 0; + list_for_each_entry_rcu(set, &ctx->table->sets, list) { if (idx < s_idx) goto cont; if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET, @@ -2098,12 +2308,74 @@ static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb, cb->args[2] = (unsigned long) table; goto done; } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } } cb->args[1] = 1; done: + rcu_read_unlock(); + return skb->len; +} + +static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct nft_set *set; + unsigned int idx, s_idx = cb->args[0]; + struct nft_af_info *afi; + struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; + struct net *net = sock_net(skb->sk); + int cur_family = cb->args[3]; + + if (cb->args[1]) + return skb->len; + + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { + if (cur_family) { + if (afi->family != cur_family) + continue; + + cur_family = 0; + } + + list_for_each_entry_rcu(table, &afi->tables, list) { + if (cur_table) { + if (cur_table != table) + continue; + + cur_table = NULL; + } + + ctx->table = table; + ctx->afi = afi; + idx = 0; + list_for_each_entry_rcu(set, &ctx->table->sets, list) { + if (idx < s_idx) + goto cont; + if (nf_tables_fill_set(skb, ctx, set, + NFT_MSG_NEWSET, + NLM_F_MULTI) < 0) { + cb->args[0] = idx; + cb->args[2] = (unsigned long) table; + cb->args[3] = afi->family; + goto done; + } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + if (s_idx) + s_idx = 0; + } + } + cb->args[1] = 1; +done: + rcu_read_unlock(); return skb->len; } @@ -2123,14 +2395,19 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) if (err < 0) return err; - if (ctx.table == NULL) - ret = nf_tables_dump_sets_all(&ctx, skb, cb); - else + if (ctx.table == NULL) { + if (ctx.afi == NULL) + ret = nf_tables_dump_sets_all(&ctx, skb, cb); + else + ret = nf_tables_dump_sets_family(&ctx, skb, cb); + } else ret = nf_tables_dump_sets_table(&ctx, skb, cb); return ret; } +#define NFT_SET_INACTIVE (1 << 15) /* Internal set flag */ + static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -2138,6 +2415,7 @@ static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, const struct nft_set *set; struct nft_ctx ctx; struct sk_buff *skb2; + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); int err; /* Verify existance before starting dump */ @@ -2152,9 +2430,15 @@ static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, return netlink_dump_start(nlsk, skb, nlh, &c); } + /* Only accept unspec with dump */ + if (nfmsg->nfgen_family == NFPROTO_UNSPEC) + return 
-EAFNOSUPPORT; + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); if (IS_ERR(set)) return PTR_ERR(set); + if (set->flags & NFT_SET_INACTIVE) + return -ENOENT; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (skb2 == NULL) @@ -2171,13 +2455,50 @@ err: return err; } +static int nf_tables_set_desc_parse(const struct nft_ctx *ctx, + struct nft_set_desc *desc, + const struct nlattr *nla) +{ + struct nlattr *da[NFTA_SET_DESC_MAX + 1]; + int err; + + err = nla_parse_nested(da, NFTA_SET_DESC_MAX, nla, nft_set_desc_policy); + if (err < 0) + return err; + + if (da[NFTA_SET_DESC_SIZE] != NULL) + desc->size = ntohl(nla_get_be32(da[NFTA_SET_DESC_SIZE])); + + return 0; +} + +static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type, + struct nft_set *set) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) { + nft_trans_set_id(trans) = + ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID])); + set->flags |= NFT_SET_INACTIVE; + } + nft_trans_set(trans) = set; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return 0; +} + static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nft_set_ops *ops; - const struct nft_af_info *afi; + struct nft_af_info *afi; struct net *net = sock_net(skb->sk); struct nft_table *table; struct nft_set *set; @@ -2185,14 +2506,18 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, char name[IFNAMSIZ]; unsigned int size; bool create; - u32 ktype, klen, dlen, dtype, flags; + u32 ktype, dtype, flags, policy; + struct nft_set_desc desc; int err; if (nla[NFTA_SET_TABLE] == NULL || nla[NFTA_SET_NAME] == NULL || - nla[NFTA_SET_KEY_LEN] == NULL) + nla[NFTA_SET_KEY_LEN] == NULL || + nla[NFTA_SET_ID] == NULL) return -EINVAL; + memset(&desc, 0, sizeof(desc)); + ktype = NFT_DATA_VALUE; if (nla[NFTA_SET_KEY_TYPE] != NULL) { ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE])); @@ -2200,8 +2525,8 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, return -EINVAL; } - klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN])); - if (klen == 0 || klen > FIELD_SIZEOF(struct nft_data, data)) + desc.klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN])); + if (desc.klen == 0 || desc.klen > FIELD_SIZEOF(struct nft_data, data)) return -EINVAL; flags = 0; @@ -2213,7 +2538,6 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, } dtype = 0; - dlen = 0; if (nla[NFTA_SET_DATA_TYPE] != NULL) { if (!(flags & NFT_SET_MAP)) return -EINVAL; @@ -2226,15 +2550,25 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, if (dtype != NFT_DATA_VERDICT) { if (nla[NFTA_SET_DATA_LEN] == NULL) return -EINVAL; - dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN])); - if (dlen == 0 || - dlen > FIELD_SIZEOF(struct nft_data, data)) + desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN])); + if (desc.dlen == 0 || + desc.dlen > FIELD_SIZEOF(struct nft_data, data)) return -EINVAL; } else - dlen = sizeof(struct nft_data); + desc.dlen = sizeof(struct nft_data); } else if (flags & NFT_SET_MAP) return -EINVAL; + policy = NFT_SET_POL_PERFORMANCE; + if (nla[NFTA_SET_POLICY] != NULL) + policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); + + if (nla[NFTA_SET_DESC] != NULL) { + err = nf_tables_set_desc_parse(&ctx, &desc, nla[NFTA_SET_DESC]); + if (err < 0) + return err; 
+ } + create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create); @@ -2265,7 +2599,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, if (!(nlh->nlmsg_flags & NLM_F_CREATE)) return -ENOENT; - ops = nft_select_set_ops(nla); + ops = nft_select_set_ops(nla, &desc, policy); if (IS_ERR(ops)) return PTR_ERR(ops); @@ -2286,17 +2620,22 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, INIT_LIST_HEAD(&set->bindings); set->ops = ops; set->ktype = ktype; - set->klen = klen; + set->klen = desc.klen; set->dtype = dtype; - set->dlen = dlen; + set->dlen = desc.dlen; set->flags = flags; + set->size = desc.size; - err = ops->init(set, nla); + err = ops->init(set, &desc, nla); if (err < 0) goto err2; - list_add_tail(&set->list, &table->sets); - nf_tables_set_notify(&ctx, set, NFT_MSG_NEWSET); + err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); + if (err < 0) + goto err2; + + list_add_tail_rcu(&set->list, &table->sets); + table->use++; return 0; err2: @@ -2306,25 +2645,31 @@ err1: return err; } -static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) +static void nft_set_destroy(struct nft_set *set) { - list_del(&set->list); - if (!(set->flags & NFT_SET_ANONYMOUS)) - nf_tables_set_notify(ctx, set, NFT_MSG_DELSET); - set->ops->destroy(set); module_put(set->ops->owner); kfree(set); } +static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) +{ + list_del_rcu(&set->list); + nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC); + nft_set_destroy(set); +} + static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_set *set; struct nft_ctx ctx; int err; + if (nfmsg->nfgen_family == NFPROTO_UNSPEC) + return -EAFNOSUPPORT; if (nla[NFTA_SET_TABLE] == NULL) return -EINVAL; @@ -2335,10 +2680,17 @@ static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); if (IS_ERR(set)) return PTR_ERR(set); + if (set->flags & NFT_SET_INACTIVE) + return -ENOENT; if (!list_empty(&set->bindings)) return -EBUSY; - nf_tables_set_destroy(&ctx, set); + err = nft_trans_set_add(&ctx, NFT_MSG_DELSET, set); + if (err < 0) + return err; + + list_del_rcu(&set->list); + ctx.table->use--; return 0; } @@ -2350,7 +2702,9 @@ static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, enum nft_registers dreg; dreg = nft_type_to_reg(set->dtype); - return nft_validate_data_load(ctx, dreg, &elem->data, set->dtype); + return nft_validate_data_load(ctx, dreg, &elem->data, + set->dtype == NFT_DATA_VERDICT ? 
+ NFT_DATA_VERDICT : NFT_DATA_VALUE); } int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, @@ -2387,16 +2741,17 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, } bind: binding->chain = ctx->chain; - list_add_tail(&binding->list, &set->bindings); + list_add_tail_rcu(&binding->list, &set->bindings); return 0; } void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding) { - list_del(&binding->list); + list_del_rcu(&binding->list); - if (list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS) + if (list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS && + !(set->flags & NFT_SET_INACTIVE)) nf_tables_set_destroy(ctx, set); } @@ -2414,16 +2769,18 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + [NFTA_SET_ELEM_LIST_TABLE] = { .type = NLA_STRING }, [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING }, [NFTA_SET_ELEM_LIST_ELEMENTS] = { .type = NLA_NESTED }, + [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 }, }; static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, const struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + bool trans) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - const struct nft_af_info *afi; - const struct nft_table *table; + struct nft_af_info *afi; + struct nft_table *table; struct net *net = sock_net(skb->sk); afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false); @@ -2433,6 +2790,8 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); + if (!trans && (table->flags & NFT_TABLE_INACTIVE)) + return -ENOENT; nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla); return 0; @@ -2501,19 +2860,21 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) u32 portid, seq; int event, err; - nfmsg = nlmsg_data(cb->nlh); - err = nlmsg_parse(cb->nlh, sizeof(*nfmsg), nla, NFTA_SET_ELEM_LIST_MAX, - nft_set_elem_list_policy); + err = nlmsg_parse(cb->nlh, sizeof(struct nfgenmsg), nla, + NFTA_SET_ELEM_LIST_MAX, nft_set_elem_list_policy); if (err < 0) return err; - err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla); + err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla, + false); if (err < 0) return err; set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); if (IS_ERR(set)) return PTR_ERR(set); + if (set->flags & NFT_SET_INACTIVE) + return -ENOENT; event = NFT_MSG_NEWSETELEM; event |= NFNL_SUBSYS_NFTABLES << 8; @@ -2526,7 +2887,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) goto nla_put_failure; nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = NFPROTO_UNSPEC; + nfmsg->nfgen_family = ctx.afi->family; nfmsg->version = NFNETLINK_V0; nfmsg->res_id = 0; @@ -2570,13 +2931,15 @@ static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb, struct nft_ctx ctx; int err; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla); + err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); if (err < 0) return err; set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); if (IS_ERR(set)) return PTR_ERR(set); + if (set->flags & NFT_SET_INACTIVE) + return -ENOENT; if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -2587,7 +2950,98 @@ static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb, return 
-EOPNOTSUPP; } -static int nft_add_set_elem(const struct nft_ctx *ctx, struct nft_set *set, +static int nf_tables_fill_setelem_info(struct sk_buff *skb, + const struct nft_ctx *ctx, u32 seq, + u32 portid, int event, u16 flags, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nfgenmsg *nfmsg; + struct nlmsghdr *nlh; + struct nlattr *nest; + int err; + + event |= NFNL_SUBSYS_NFTABLES << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), + flags); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = ctx->afi->family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) + goto nla_put_failure; + if (nla_put_string(skb, NFTA_SET_NAME, set->name)) + goto nla_put_failure; + + nest = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS); + if (nest == NULL) + goto nla_put_failure; + + err = nf_tables_fill_setelem(skb, set, elem); + if (err < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -1; +} + +static int nf_tables_setelem_notify(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nft_set_elem *elem, + int event, u16 flags) +{ + struct net *net = ctx->net; + u32 portid = ctx->portid; + struct sk_buff *skb; + int err; + + if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) + return 0; + + err = -ENOBUFS; + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + goto err; + + err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags, + set, elem); + if (err < 0) { + kfree_skb(skb); + goto err; + } + + err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report, + GFP_KERNEL); +err: + if (err < 0) + nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); + return err; +} + +static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, + int msg_type, + struct nft_set *set) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_elem)); + if (trans == NULL) + return NULL; + + nft_trans_elem_set(trans) = set; + return trans; +} + +static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; @@ -2595,8 +3049,12 @@ static int nft_add_set_elem(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_elem elem; struct nft_set_binding *binding; enum nft_registers dreg; + struct nft_trans *trans; int err; + if (set->size && set->nelems == set->size) + return -ENFILE; + err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, nft_set_elem_policy); if (err < 0) @@ -2616,6 +3074,9 @@ static int nft_add_set_elem(const struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_DATA] == NULL && !(elem.flags & NFT_SET_ELEM_INTERVAL_END)) return -EINVAL; + if (nla[NFTA_SET_ELEM_DATA] != NULL && + elem.flags & NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; } else { if (nla[NFTA_SET_ELEM_DATA] != NULL) return -EINVAL; @@ -2646,7 +3107,7 @@ static int nft_add_set_elem(const struct nft_ctx *ctx, struct nft_set *set, struct nft_ctx bind_ctx = { .afi = ctx->afi, .table = ctx->table, - .chain = binding->chain, + .chain = (struct nft_chain *)binding->chain, }; err = nft_validate_data_load(&bind_ctx, dreg, @@ -2656,12 +3117,20 @@ static int nft_add_set_elem(const struct nft_ctx *ctx, struct nft_set *set, } } + trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); + if (trans == NULL) + 
goto err3; + err = set->ops->insert(set, &elem); if (err < 0) - goto err3; + goto err4; + nft_trans_elem(trans) = elem; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; +err4: + kfree(trans); err3: if (nla[NFTA_SET_ELEM_DATA] != NULL) nft_data_uninit(&elem.data, d2.type); @@ -2675,35 +3144,46 @@ static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + struct net *net = sock_net(skb->sk); const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; - int rem, err; + int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla); + err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, true); if (err < 0) return err; set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); - if (IS_ERR(set)) - return PTR_ERR(set); + if (IS_ERR(set)) { + if (nla[NFTA_SET_ELEM_LIST_SET_ID]) { + set = nf_tables_set_lookup_byid(net, + nla[NFTA_SET_ELEM_LIST_SET_ID]); + } + if (IS_ERR(set)) + return PTR_ERR(set); + } + if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) return -EBUSY; nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_add_set_elem(&ctx, set, attr); if (err < 0) - return err; + break; + + set->nelems++; } - return 0; + return err; } -static int nft_del_setelem(const struct nft_ctx *ctx, struct nft_set *set, +static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_data_desc desc; struct nft_set_elem elem; + struct nft_trans *trans; int err; err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, @@ -2727,7 +3207,12 @@ static int nft_del_setelem(const struct nft_ctx *ctx, struct nft_set *set, if (err < 0) goto err2; - set->ops->remove(set, &elem); + trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); + if (trans == NULL) + goto err2; + + nft_trans_elem(trans) = elem; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); nft_data_uninit(&elem.key, NFT_DATA_VALUE); if (set->flags & NFT_SET_MAP) @@ -2746,9 +3231,9 @@ static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; - int rem, err; + int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla); + err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); if (err < 0) return err; @@ -2761,14 +3246,16 @@ static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_del_setelem(&ctx, set, attr); if (err < 0) - return err; + break; + + set->nelems--; } - return 0; + return err; } static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { [NFT_MSG_NEWTABLE] = { - .call = nf_tables_newtable, + .call_batch = nf_tables_newtable, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, @@ -2778,12 +3265,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_table_policy, }, [NFT_MSG_DELTABLE] = { - .call = nf_tables_deltable, + .call_batch = nf_tables_deltable, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, [NFT_MSG_NEWCHAIN] = { - .call = nf_tables_newchain, + .call_batch = nf_tables_newchain, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, @@ -2793,7 +3280,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_chain_policy, }, [NFT_MSG_DELCHAIN] = { - .call = nf_tables_delchain, + .call_batch = 
nf_tables_delchain, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, @@ -2813,7 +3300,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_rule_policy, }, [NFT_MSG_NEWSET] = { - .call = nf_tables_newset, + .call_batch = nf_tables_newset, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, @@ -2823,12 +3310,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_set_policy, }, [NFT_MSG_DELSET] = { - .call = nf_tables_delset, + .call_batch = nf_tables_delset, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, [NFT_MSG_NEWSETELEM] = { - .call = nf_tables_newsetelem, + .call_batch = nf_tables_newsetelem, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, @@ -2838,12 +3325,282 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_set_elem_list_policy, }, [NFT_MSG_DELSETELEM] = { - .call = nf_tables_delsetelem, + .call_batch = nf_tables_delsetelem, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, }; +static void nft_chain_commit_update(struct nft_trans *trans) +{ + struct nft_base_chain *basechain; + + if (nft_trans_chain_name(trans)[0]) + strcpy(trans->ctx.chain->name, nft_trans_chain_name(trans)); + + if (!(trans->ctx.chain->flags & NFT_BASE_CHAIN)) + return; + + basechain = nft_base_chain(trans->ctx.chain); + nft_chain_stats_replace(basechain, nft_trans_chain_stats(trans)); + + switch (nft_trans_chain_policy(trans)) { + case NF_DROP: + case NF_ACCEPT: + basechain->policy = nft_trans_chain_policy(trans); + break; + } +} + +/* Schedule objects for release via rcu to make sure no packets are accesing + * removed rules. + */ +static void nf_tables_commit_release_rcu(struct rcu_head *rt) +{ + struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head); + + switch (trans->msg_type) { + case NFT_MSG_DELTABLE: + nf_tables_table_destroy(&trans->ctx); + break; + case NFT_MSG_DELCHAIN: + nf_tables_chain_destroy(trans->ctx.chain); + break; + case NFT_MSG_DELRULE: + nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); + break; + case NFT_MSG_DELSET: + nft_set_destroy(nft_trans_set(trans)); + break; + } + kfree(trans); +} + +static int nf_tables_commit(struct sk_buff *skb) +{ + struct net *net = sock_net(skb->sk); + struct nft_trans *trans, *next; + struct nft_set *set; + + /* Bump generation counter, invalidate any dump in progress */ + while (++net->nft.base_seq == 0); + + /* A new generation has just started */ + net->nft.gencursor = gencursor_next(net); + + /* Make sure all packets have left the previous generation before + * purging old rules. 
+ */ + synchronize_rcu(); + + list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + switch (trans->msg_type) { + case NFT_MSG_NEWTABLE: + if (nft_trans_table_update(trans)) { + if (!nft_trans_table_enable(trans)) { + nf_tables_table_disable(trans->ctx.afi, + trans->ctx.table); + trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; + } + } else { + trans->ctx.table->flags &= ~NFT_TABLE_INACTIVE; + } + nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELTABLE: + nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE); + break; + case NFT_MSG_NEWCHAIN: + if (nft_trans_chain_update(trans)) + nft_chain_commit_update(trans); + else + trans->ctx.chain->flags &= ~NFT_CHAIN_INACTIVE; + + nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELCHAIN: + nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN); + if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) && + trans->ctx.chain->flags & NFT_BASE_CHAIN) { + nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops, + trans->ctx.afi->nops); + } + break; + case NFT_MSG_NEWRULE: + nft_rule_clear(trans->ctx.net, nft_trans_rule(trans)); + nf_tables_rule_notify(&trans->ctx, + nft_trans_rule(trans), + NFT_MSG_NEWRULE); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELRULE: + list_del_rcu(&nft_trans_rule(trans)->list); + nf_tables_rule_notify(&trans->ctx, + nft_trans_rule(trans), + NFT_MSG_DELRULE); + break; + case NFT_MSG_NEWSET: + nft_trans_set(trans)->flags &= ~NFT_SET_INACTIVE; + /* This avoids hitting -EBUSY when deleting the table + * from the transaction. + */ + if (nft_trans_set(trans)->flags & NFT_SET_ANONYMOUS && + !list_empty(&nft_trans_set(trans)->bindings)) + trans->ctx.table->use--; + + nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), + NFT_MSG_NEWSET, GFP_KERNEL); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELSET: + nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), + NFT_MSG_DELSET, GFP_KERNEL); + break; + case NFT_MSG_NEWSETELEM: + nf_tables_setelem_notify(&trans->ctx, + nft_trans_elem_set(trans), + &nft_trans_elem(trans), + NFT_MSG_NEWSETELEM, 0); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELSETELEM: + nf_tables_setelem_notify(&trans->ctx, + nft_trans_elem_set(trans), + &nft_trans_elem(trans), + NFT_MSG_DELSETELEM, 0); + set = nft_trans_elem_set(trans); + set->ops->get(set, &nft_trans_elem(trans)); + set->ops->remove(set, &nft_trans_elem(trans)); + nft_trans_destroy(trans); + break; + } + } + + list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_del(&trans->list); + trans->ctx.nla = NULL; + call_rcu(&trans->rcu_head, nf_tables_commit_release_rcu); + } + + return 0; +} + +/* Schedule objects for release via rcu to make sure no packets are accesing + * aborted rules. 
+ */ +static void nf_tables_abort_release_rcu(struct rcu_head *rt) +{ + struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head); + + switch (trans->msg_type) { + case NFT_MSG_NEWTABLE: + nf_tables_table_destroy(&trans->ctx); + break; + case NFT_MSG_NEWCHAIN: + nf_tables_chain_destroy(trans->ctx.chain); + break; + case NFT_MSG_NEWRULE: + nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); + break; + case NFT_MSG_NEWSET: + nft_set_destroy(nft_trans_set(trans)); + break; + } + kfree(trans); +} + +static int nf_tables_abort(struct sk_buff *skb) +{ + struct net *net = sock_net(skb->sk); + struct nft_trans *trans, *next; + struct nft_set *set; + + list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + switch (trans->msg_type) { + case NFT_MSG_NEWTABLE: + if (nft_trans_table_update(trans)) { + if (nft_trans_table_enable(trans)) { + nf_tables_table_disable(trans->ctx.afi, + trans->ctx.table); + trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; + } + nft_trans_destroy(trans); + } else { + list_del_rcu(&trans->ctx.table->list); + } + break; + case NFT_MSG_DELTABLE: + list_add_tail_rcu(&trans->ctx.table->list, + &trans->ctx.afi->tables); + nft_trans_destroy(trans); + break; + case NFT_MSG_NEWCHAIN: + if (nft_trans_chain_update(trans)) { + if (nft_trans_chain_stats(trans)) + free_percpu(nft_trans_chain_stats(trans)); + + nft_trans_destroy(trans); + } else { + trans->ctx.table->use--; + list_del_rcu(&trans->ctx.chain->list); + if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) && + trans->ctx.chain->flags & NFT_BASE_CHAIN) { + nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops, + trans->ctx.afi->nops); + } + } + break; + case NFT_MSG_DELCHAIN: + trans->ctx.table->use++; + list_add_tail_rcu(&trans->ctx.chain->list, + &trans->ctx.table->chains); + nft_trans_destroy(trans); + break; + case NFT_MSG_NEWRULE: + trans->ctx.chain->use--; + list_del_rcu(&nft_trans_rule(trans)->list); + break; + case NFT_MSG_DELRULE: + trans->ctx.chain->use++; + nft_rule_clear(trans->ctx.net, nft_trans_rule(trans)); + nft_trans_destroy(trans); + break; + case NFT_MSG_NEWSET: + trans->ctx.table->use--; + list_del_rcu(&nft_trans_set(trans)->list); + break; + case NFT_MSG_DELSET: + trans->ctx.table->use++; + list_add_tail_rcu(&nft_trans_set(trans)->list, + &trans->ctx.table->sets); + nft_trans_destroy(trans); + break; + case NFT_MSG_NEWSETELEM: + nft_trans_elem_set(trans)->nelems--; + set = nft_trans_elem_set(trans); + set->ops->get(set, &nft_trans_elem(trans)); + set->ops->remove(set, &nft_trans_elem(trans)); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELSETELEM: + nft_trans_elem_set(trans)->nelems++; + nft_trans_destroy(trans); + break; + } + } + + list_for_each_entry_safe_reverse(trans, next, + &net->nft.commit_list, list) { + list_del(&trans->list); + trans->ctx.nla = NULL; + call_rcu(&trans->rcu_head, nf_tables_abort_release_rcu); + } + + return 0; +} + static const struct nfnetlink_subsystem nf_tables_subsys = { .name = "nf_tables", .subsys_id = NFNL_SUBSYS_NFTABLES, @@ -2870,6 +3627,9 @@ static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, const struct nft_set_iter *iter, const struct nft_set_elem *elem) { + if (elem->flags & NFT_SET_ELEM_INTERVAL_END) + return 0; + switch (elem->data.verdict) { case NFT_JUMP: case NFT_GOTO: @@ -3044,9 +3804,16 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, data->verdict = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE])); switch (data->verdict) { - case NF_ACCEPT: - case NF_DROP: - case NF_QUEUE: 
+ default: + switch (data->verdict & NF_VERDICT_MASK) { + case NF_ACCEPT: + case NF_DROP: + case NF_QUEUE: + break; + default: + return -EINVAL; + } + /* fall through */ case NFT_CONTINUE: case NFT_BREAK: case NFT_RETURN: @@ -3067,8 +3834,6 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, data->chain = chain; desc->len = sizeof(data); break; - default: - return -EINVAL; } desc->type = NFT_DATA_VERDICT; @@ -3223,6 +3988,7 @@ static int nf_tables_init_net(struct net *net) { INIT_LIST_HEAD(&net->nft.af_info); INIT_LIST_HEAD(&net->nft.commit_list); + net->nft.base_seq = 1; return 0; } diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index cb9e685caae..3b90eb2b2c5 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -25,9 +25,8 @@ static void nft_cmp_fast_eval(const struct nft_expr *expr, struct nft_data data[NFT_REG_MAX + 1]) { const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); - u32 mask; + u32 mask = nft_cmp_fast_mask(priv->len); - mask = ~0U >> (sizeof(priv->data) * BITS_PER_BYTE - priv->len); if ((data[priv->sreg].data[0] & mask) == priv->data) return; data[NFT_REG_VERDICT].verdict = NFT_BREAK; @@ -67,20 +66,6 @@ struct nft_jumpstack { int rulenum; }; -static inline void -nft_chain_stats(const struct nft_chain *this, const struct nft_pktinfo *pkt, - struct nft_jumpstack *jumpstack, unsigned int stackptr) -{ - struct nft_stats __percpu *stats; - const struct nft_chain *chain = stackptr ? jumpstack[0].chain : this; - - rcu_read_lock_bh(); - stats = rcu_dereference(nft_base_chain(chain)->stats); - __this_cpu_inc(stats->pkts); - __this_cpu_add(stats->bytes, pkt->skb->len); - rcu_read_unlock_bh(); -} - enum nft_trace { NFT_TRACE_RULE, NFT_TRACE_RETURN, @@ -103,28 +88,29 @@ static struct nf_loginfo trace_loginfo = { }, }; -static inline void nft_trace_packet(const struct nft_pktinfo *pkt, - const struct nft_chain *chain, - int rulenum, enum nft_trace type) +static void nft_trace_packet(const struct nft_pktinfo *pkt, + const struct nft_chain *chain, + int rulenum, enum nft_trace type) { struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); - nf_log_packet(net, pkt->xt.family, pkt->hooknum, pkt->skb, pkt->in, + nf_log_packet(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in, pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", chain->table->name, chain->name, comments[type], rulenum); } unsigned int -nft_do_chain_pktinfo(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops) +nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops) { - const struct nft_chain *chain = ops->priv; + const struct nft_chain *chain = ops->priv, *basechain = chain; const struct nft_rule *rule; const struct nft_expr *expr, *last; struct nft_data data[NFT_REG_MAX + 1]; unsigned int stackptr = 0; struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE]; - int rulenum = 0; + struct nft_stats *stats; + int rulenum; /* * Cache cursor to avoid problems in case that the cursor is updated * while traversing the ruleset. 
@@ -132,6 +118,7 @@ nft_do_chain_pktinfo(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops) unsigned int gencursor = ACCESS_ONCE(chain->net->nft.gencursor); do_chain: + rulenum = 0; rule = list_entry(&chain->rules, struct nft_rule, list); next_rule: data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; @@ -157,14 +144,16 @@ next_rule: switch (data[NFT_REG_VERDICT].verdict) { case NFT_BREAK: data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; - /* fall through */ + continue; case NFT_CONTINUE: + if (unlikely(pkt->skb->nf_trace)) + nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); continue; } break; } - switch (data[NFT_REG_VERDICT].verdict) { + switch (data[NFT_REG_VERDICT].verdict & NF_VERDICT_MASK) { case NF_ACCEPT: case NF_DROP: case NF_QUEUE: @@ -172,6 +161,9 @@ next_rule: nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); return data[NFT_REG_VERDICT].verdict; + } + + switch (data[NFT_REG_VERDICT].verdict) { case NFT_JUMP: if (unlikely(pkt->skb->nf_trace)) nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); @@ -181,39 +173,48 @@ next_rule: jumpstack[stackptr].rule = rule; jumpstack[stackptr].rulenum = rulenum; stackptr++; - /* fall through */ + chain = data[NFT_REG_VERDICT].chain; + goto do_chain; case NFT_GOTO: + if (unlikely(pkt->skb->nf_trace)) + nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + chain = data[NFT_REG_VERDICT].chain; goto do_chain; case NFT_RETURN: if (unlikely(pkt->skb->nf_trace)) nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RETURN); - - /* fall through */ + break; case NFT_CONTINUE: + if (unlikely(pkt->skb->nf_trace && !(chain->flags & NFT_BASE_CHAIN))) + nft_trace_packet(pkt, chain, ++rulenum, NFT_TRACE_RETURN); break; default: WARN_ON(1); } if (stackptr > 0) { - if (unlikely(pkt->skb->nf_trace)) - nft_trace_packet(pkt, chain, ++rulenum, NFT_TRACE_RETURN); - stackptr--; chain = jumpstack[stackptr].chain; rule = jumpstack[stackptr].rule; rulenum = jumpstack[stackptr].rulenum; goto next_rule; } - nft_chain_stats(chain, pkt, jumpstack, stackptr); if (unlikely(pkt->skb->nf_trace)) - nft_trace_packet(pkt, chain, ++rulenum, NFT_TRACE_POLICY); + nft_trace_packet(pkt, basechain, -1, NFT_TRACE_POLICY); + + rcu_read_lock_bh(); + stats = this_cpu_ptr(rcu_dereference(nft_base_chain(basechain)->stats)); + u64_stats_update_begin(&stats->syncp); + stats->pkts++; + stats->bytes += pkt->skb->len; + u64_stats_update_end(&stats->syncp); + rcu_read_unlock_bh(); - return nft_base_chain(chain)->policy; + return nft_base_chain(basechain)->policy; } -EXPORT_SYMBOL_GPL(nft_do_chain_pktinfo); +EXPORT_SYMBOL_GPL(nft_do_chain); int __init nf_tables_core_module_init(void) { diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c new file mode 100644 index 00000000000..9dd2d216cfc --- /dev/null +++ b/net/netfilter/nf_tables_inet.c @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2012-2014 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_ipv4.h> +#include <net/netfilter/nf_tables_ipv6.h> +#include <net/ip.h> + +static void nft_inet_hook_ops_init(struct nf_hook_ops *ops, unsigned int n) +{ + struct nft_af_info *afi; + + if (n == 1) + afi = &nft_af_ipv4; + else + afi = &nft_af_ipv6; + + ops->pf = afi->family; + if (afi->hooks[ops->hooknum]) + ops->hook = afi->hooks[ops->hooknum]; +} + +static struct nft_af_info nft_af_inet __read_mostly = { + .family = NFPROTO_INET, + .nhooks = NF_INET_NUMHOOKS, + .owner = THIS_MODULE, + .nops = 2, + .hook_ops_init = nft_inet_hook_ops_init, +}; + +static int __net_init nf_tables_inet_init_net(struct net *net) +{ + net->nft.inet = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); + if (net->nft.inet == NULL) + return -ENOMEM; + memcpy(net->nft.inet, &nft_af_inet, sizeof(nft_af_inet)); + + if (nft_register_afinfo(net, net->nft.inet) < 0) + goto err; + + return 0; + +err: + kfree(net->nft.inet); + return -ENOMEM; +} + +static void __net_exit nf_tables_inet_exit_net(struct net *net) +{ + nft_unregister_afinfo(net->nft.inet); + kfree(net->nft.inet); +} + +static struct pernet_operations nf_tables_inet_net_ops = { + .init = nf_tables_inet_init_net, + .exit = nf_tables_inet_exit_net, +}; + +static const struct nf_chain_type filter_inet = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_INET, + .owner = THIS_MODULE, + .hook_mask = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING), +}; + +static int __init nf_tables_inet_init(void) +{ + int ret; + + nft_register_chain_type(&filter_inet); + ret = register_pernet_subsys(&nf_tables_inet_net_ops); + if (ret < 0) + nft_unregister_chain_type(&filter_inet); + + return ret; +} + +static void __exit nf_tables_inet_exit(void) +{ + unregister_pernet_subsys(&nf_tables_inet_net_ops); + nft_unregister_chain_type(&filter_inet); +} + +module_init(nf_tables_inet_init); +module_exit(nf_tables_inet_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_FAMILY(1); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 046aa13b4fe..c138b8fbe28 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -61,6 +61,14 @@ void nfnl_unlock(__u8 subsys_id) } EXPORT_SYMBOL_GPL(nfnl_unlock); +#ifdef CONFIG_PROVE_LOCKING +int lockdep_nfnl_is_held(u8 subsys_id) +{ + return lockdep_is_held(&table[subsys_id].mutex); +} +EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held); +#endif + int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n) { nfnl_lock(n->subsys_id); @@ -248,15 +256,15 @@ replay: #endif { nfnl_unlock(subsys_id); - kfree_skb(nskb); - return netlink_ack(skb, nlh, -EOPNOTSUPP); + netlink_ack(skb, nlh, -EOPNOTSUPP); + return kfree_skb(nskb); } } if (!ss->commit || !ss->abort) { nfnl_unlock(subsys_id); - kfree_skb(nskb); - return netlink_ack(skb, nlh, -EOPNOTSUPP); + netlink_ack(skb, nlh, -EOPNOTSUPP); + return kfree_skb(skb); } while (skb->len >= nlmsg_total_size(0)) { @@ -360,14 +368,13 @@ done: static void nfnetlink_rcv(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); - struct net *net = sock_net(skb->sk); int msglen; if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) return; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) { + if 
(!netlink_net_capable(skb, CAP_NET_ADMIN)) { netlink_ack(skb, nlh, -EPERM); return; } @@ -392,19 +399,17 @@ static void nfnetlink_rcv(struct sk_buff *skb) } #ifdef CONFIG_MODULES -static void nfnetlink_bind(int group) +static int nfnetlink_bind(int group) { const struct nfnetlink_subsystem *ss; int type = nfnl_group2type[group]; rcu_read_lock(); ss = nfnetlink_get_subsys(type); - if (!ss) { - rcu_read_unlock(); - request_module("nfnetlink-subsys-%d", type); - return; - } rcu_read_unlock(); + if (!ss) + request_module("nfnetlink-subsys-%d", type); + return 0; } #endif diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index c7b6d466a66..2baa125c2e8 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -32,18 +32,24 @@ static LIST_HEAD(nfnl_acct_list); struct nf_acct { atomic64_t pkts; atomic64_t bytes; + unsigned long flags; struct list_head head; atomic_t refcnt; char name[NFACCT_NAME_MAX]; struct rcu_head rcu_head; + char data[0]; }; +#define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES) + static int nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) { struct nf_acct *nfacct, *matching = NULL; char *acct_name; + unsigned int size = 0; + u32 flags = 0; if (!tb[NFACCT_NAME]) return -EINVAL; @@ -68,15 +74,38 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, /* reset counters if you request a replacement. */ atomic64_set(&matching->pkts, 0); atomic64_set(&matching->bytes, 0); + smp_mb__before_atomic(); + /* reset overquota flag if quota is enabled. */ + if ((matching->flags & NFACCT_F_QUOTA)) + clear_bit(NFACCT_F_OVERQUOTA, &matching->flags); return 0; } return -EBUSY; } - nfacct = kzalloc(sizeof(struct nf_acct), GFP_KERNEL); + if (tb[NFACCT_FLAGS]) { + flags = ntohl(nla_get_be32(tb[NFACCT_FLAGS])); + if (flags & ~NFACCT_F_QUOTA) + return -EOPNOTSUPP; + if ((flags & NFACCT_F_QUOTA) == NFACCT_F_QUOTA) + return -EINVAL; + if (flags & NFACCT_F_OVERQUOTA) + return -EINVAL; + + size += sizeof(u64); + } + + nfacct = kzalloc(sizeof(struct nf_acct) + size, GFP_KERNEL); if (nfacct == NULL) return -ENOMEM; + if (flags & NFACCT_F_QUOTA) { + u64 *quota = (u64 *)nfacct->data; + + *quota = be64_to_cpu(nla_get_be64(tb[NFACCT_QUOTA])); + nfacct->flags = flags; + } + strncpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX); if (tb[NFACCT_BYTES]) { @@ -117,6 +146,9 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, if (type == NFNL_MSG_ACCT_GET_CTRZERO) { pkts = atomic64_xchg(&acct->pkts, 0); bytes = atomic64_xchg(&acct->bytes, 0); + smp_mb__before_atomic(); + if (acct->flags & NFACCT_F_QUOTA) + clear_bit(NFACCT_F_OVERQUOTA, &acct->flags); } else { pkts = atomic64_read(&acct->pkts); bytes = atomic64_read(&acct->bytes); @@ -125,7 +157,13 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, nla_put_be64(skb, NFACCT_BYTES, cpu_to_be64(bytes)) || nla_put_be32(skb, NFACCT_USE, htonl(atomic_read(&acct->refcnt)))) goto nla_put_failure; + if (acct->flags & NFACCT_F_QUOTA) { + u64 *quota = (u64 *)acct->data; + if (nla_put_be32(skb, NFACCT_FLAGS, htonl(acct->flags)) || + nla_put_be64(skb, NFACCT_QUOTA, cpu_to_be64(*quota))) + goto nla_put_failure; + } nlmsg_end(skb, nlh); return skb->len; @@ -270,6 +308,8 @@ static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = { [NFACCT_NAME] = { .type = NLA_NUL_STRING, .len = NFACCT_NAME_MAX-1 }, [NFACCT_BYTES] = { .type = NLA_U64 }, [NFACCT_PKTS] = { .type = NLA_U64 }, + 
[NFACCT_FLAGS] = { .type = NLA_U32 }, + [NFACCT_QUOTA] = { .type = NLA_U64 }, }; static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = { @@ -336,6 +376,50 @@ void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct) } EXPORT_SYMBOL_GPL(nfnl_acct_update); +static void nfnl_overquota_report(struct nf_acct *nfacct) +{ + int ret; + struct sk_buff *skb; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (skb == NULL) + return; + + ret = nfnl_acct_fill_info(skb, 0, 0, NFNL_MSG_ACCT_OVERQUOTA, 0, + nfacct); + if (ret <= 0) { + kfree_skb(skb); + return; + } + netlink_broadcast(init_net.nfnl, skb, 0, NFNLGRP_ACCT_QUOTA, + GFP_ATOMIC); +} + +int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct) +{ + u64 now; + u64 *quota; + int ret = NFACCT_UNDERQUOTA; + + /* no place here if we don't have a quota */ + if (!(nfacct->flags & NFACCT_F_QUOTA)) + return NFACCT_NO_QUOTA; + + quota = (u64 *)nfacct->data; + now = (nfacct->flags & NFACCT_F_QUOTA_PKTS) ? + atomic64_read(&nfacct->pkts) : atomic64_read(&nfacct->bytes); + + ret = now > *quota; + + if (now >= *quota && + !test_and_set_bit(NFACCT_F_OVERQUOTA, &nfacct->flags)) { + nfnl_overquota_report(nfacct); + } + + return ret; +} +EXPORT_SYMBOL_GPL(nfnl_acct_overquota); + static int __init nfnl_acct_init(void) { int ret; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 3c4b69e5fe1..d292c8d286e 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -28,8 +28,6 @@ #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/list.h> -#include <linux/jhash.h> -#include <linux/random.h> #include <linux/slab.h> #include <net/sock.h> #include <net/netfilter/nf_log.h> @@ -75,7 +73,6 @@ struct nfulnl_instance { }; #define INSTANCE_BUCKETS 16 -static unsigned int hash_init; static int nfnl_log_net_id __read_mostly; @@ -1053,6 +1050,7 @@ static void __net_exit nfnl_log_net_exit(struct net *net) #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter); #endif + nf_log_unset(net, &nfulnl_logger); } static struct pernet_operations nfnl_log_net_ops = { @@ -1066,11 +1064,6 @@ static int __init nfnetlink_log_init(void) { int status = -ENOMEM; - /* it's not really all that important to have a random value, so - * we can do this from the init function, even if there hasn't - * been that much entropy yet */ - get_random_bytes(&hash_init, sizeof(hash_init)); - netlink_register_notifier(&nfulnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfulnl_subsys); if (status < 0) { diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 21258cf7009..108120f216b 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -29,6 +29,7 @@ #include <linux/netfilter/nfnetlink_queue.h> #include <linux/list.h> #include <net/sock.h> +#include <net/tcp_states.h> #include <net/netfilter/nf_queue.h> #include <net/netns/generic.h> #include <net/netfilter/nfnetlink_queue.h> @@ -235,51 +236,6 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) spin_unlock_bh(&queue->lock); } -static void -nfqnl_zcopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen) -{ - int i, j = 0; - int plen = 0; /* length of skb->head fragment */ - struct page *page; - unsigned int offset; - - /* dont bother with small payloads */ - if (len <= skb_tailroom(to)) { - skb_copy_bits(from, 0, skb_put(to, len), len); - return; - } - - if (hlen) { - 
skb_copy_bits(from, 0, skb_put(to, hlen), hlen); - len -= hlen; - } else { - plen = min_t(int, skb_headlen(from), len); - if (plen) { - page = virt_to_head_page(from->head); - offset = from->data - (unsigned char *)page_address(page); - __skb_fill_page_desc(to, 0, page, offset, plen); - get_page(page); - j = 1; - len -= plen; - } - } - - to->truesize += len + plen; - to->len += len + plen; - to->data_len += len + plen; - - for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { - if (!len) - break; - skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; - skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len); - len -= skb_shinfo(to)->frags[j].size; - skb_frag_ref(to, j); - j++; - } - skb_shinfo(to)->nr_frags = j; -} - static int nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet, bool csum_verify) @@ -297,6 +253,31 @@ nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet, return flags ? nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(flags)) : 0; } +static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk) +{ + const struct cred *cred; + + if (sk->sk_state == TCP_TIME_WAIT) + return 0; + + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket && sk->sk_socket->file) { + cred = sk->sk_socket->file->f_cred; + if (nla_put_be32(skb, NFQA_UID, + htonl(from_kuid_munged(&init_user_ns, cred->fsuid)))) + goto nla_put_failure; + if (nla_put_be32(skb, NFQA_GID, + htonl(from_kgid_munged(&init_user_ns, cred->fsgid)))) + goto nla_put_failure; + } + read_unlock_bh(&sk->sk_callback_lock); + return 0; + +nla_put_failure: + read_unlock_bh(&sk->sk_callback_lock); + return -1; +} + static struct sk_buff * nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct nf_queue_entry *entry, @@ -304,7 +285,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, { size_t size; size_t data_len = 0, cap_len = 0; - int hlen = 0; + unsigned int hlen = 0; struct sk_buff *skb; struct nlattr *nla; struct nfqnl_msg_packet_hdr *pmsg; @@ -356,14 +337,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, if (data_len > entskb->len) data_len = entskb->len; - if (!entskb->head_frag || - skb_headlen(entskb) < L1_CACHE_BYTES || - skb_shinfo(entskb)->nr_frags >= MAX_SKB_FRAGS) - hlen = skb_headlen(entskb); - - if (skb_has_frag_list(entskb)) - hlen = entskb->len; - hlen = min_t(int, data_len, hlen); + hlen = skb_zerocopy_headlen(entskb); + hlen = min_t(unsigned int, hlen, data_len); size += sizeof(struct nlattr) + hlen; cap_len = entskb->len; break; @@ -372,15 +347,23 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, if (queue->flags & NFQA_CFG_F_CONNTRACK) ct = nfqnl_ct_get(entskb, &size, &ctinfo); + if (queue->flags & NFQA_CFG_F_UID_GID) { + size += (nla_total_size(sizeof(u_int32_t)) /* uid */ + + nla_total_size(sizeof(u_int32_t))); /* gid */ + } + skb = nfnetlink_alloc_skb(net, size, queue->peer_portid, GFP_ATOMIC); - if (!skb) + if (!skb) { + skb_tx_error(entskb); return NULL; + } nlh = nlmsg_put(skb, 0, 0, NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, sizeof(struct nfgenmsg), 0); if (!nlh) { + skb_tx_error(entskb); kfree_skb(skb); return NULL; } @@ -484,6 +467,10 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, goto nla_put_failure; } + if ((queue->flags & NFQA_CFG_F_UID_GID) && entskb->sk && + nfqnl_put_sk_uidgid(skb, entskb->sk) < 0) + goto nla_put_failure; + if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0) goto nla_put_failure; @@ -504,13 +491,15 @@ 
nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, nla->nla_type = NFQA_PAYLOAD; nla->nla_len = nla_attr_size(data_len); - nfqnl_zcopy(skb, entskb, data_len, hlen); + if (skb_zerocopy(skb, entskb, data_len, hlen)) + goto nla_put_failure; } nlh->nlmsg_len = skb->len; return skb; nla_put_failure: + skb_tx_error(entskb); kfree_skb(skb); net_err_ratelimited("nf_queue: error creating packet message\n"); return NULL; diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 954925db414..e2b3f51c81f 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -128,7 +128,7 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx, BUG_ON(err < 0); desc.len *= BITS_PER_BYTE; - mask = ~0U >> (sizeof(priv->data) * BITS_PER_BYTE - desc.len); + mask = nft_cmp_fast_mask(desc.len); priv->data = data.data[0] & mask; priv->len = desc.len; return 0; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index da0c1f4ada1..1840989092e 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -92,7 +92,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, if (ctx->chain->flags & NFT_BASE_CHAIN) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); - const struct nf_hook_ops *ops = &basechain->ops; + const struct nf_hook_ops *ops = &basechain->ops[0]; par->hook_mask = 1 << ops->hooknum; } @@ -192,9 +192,18 @@ err: } static void -nft_target_destroy(const struct nft_expr *expr) +nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct xt_target *target = expr->ops->data; + void *info = nft_expr_priv(expr); + struct xt_tgdtor_param par; + + par.net = ctx->net; + par.target = target; + par.targinfo = info; + par.family = ctx->afi->family; + if (par.target->destroy != NULL) + par.target->destroy(&par); module_put(target->me); } @@ -253,7 +262,7 @@ static int nft_target_validate(const struct nft_ctx *ctx, if (ctx->chain->flags & NFT_BASE_CHAIN) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); - const struct nf_hook_ops *ops = &basechain->ops; + const struct nf_hook_ops *ops = &basechain->ops[0]; hook_mask = 1 << ops->hooknum; if (hook_mask & target->hooks) @@ -323,7 +332,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, if (ctx->chain->flags & NFT_BASE_CHAIN) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); - const struct nf_hook_ops *ops = &basechain->ops; + const struct nf_hook_ops *ops = &basechain->ops[0]; par->hook_mask = 1 << ops->hooknum; } @@ -379,9 +388,18 @@ err: } static void -nft_match_destroy(const struct nft_expr *expr) +nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct xt_match *match = expr->ops->data; + void *info = nft_expr_priv(expr); + struct xt_mtdtor_param par; + + par.net = ctx->net; + par.match = match; + par.matchinfo = info; + par.family = ctx->afi->family; + if (par.match->destroy != NULL) + par.match->destroy(&par); module_put(match->me); } @@ -449,7 +467,7 @@ static int nft_match_validate(const struct nft_ctx *ctx, if (ctx->chain->flags & NFT_BASE_CHAIN) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); - const struct nf_hook_ops *ops = &basechain->ops; + const struct nf_hook_ops *ops = &basechain->ops[0]; hook_mask = 1 << ops->hooknum; if (hook_mask & match->hooks) diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 955f4e6e708..cc560301624 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -18,17 
+18,21 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_labels.h> struct nft_ct { enum nft_ct_keys key:8; enum ip_conntrack_dir dir:8; - enum nft_registers dreg:8; - uint8_t family; + union { + enum nft_registers dreg:8; + enum nft_registers sreg:8; + }; }; -static void nft_ct_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) +static void nft_ct_get_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) { const struct nft_ct *priv = nft_expr_priv(expr); struct nft_data *dest = &data[priv->dreg]; @@ -93,6 +97,26 @@ static void nft_ct_eval(const struct nft_expr *expr, goto err; strncpy((char *)dest->data, helper->name, sizeof(dest->data)); return; +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: { + struct nf_conn_labels *labels = nf_ct_labels_find(ct); + unsigned int size; + + if (!labels) { + memset(dest->data, 0, sizeof(dest->data)); + return; + } + + BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > sizeof(dest->data)); + size = labels->words * sizeof(long); + + memcpy(dest->data, labels->bits, size); + if (size < sizeof(dest->data)) + memset(((char *) dest->data) + size, 0, + sizeof(dest->data) - size); + return; + } +#endif } tuple = &ct->tuplehash[priv->dir].tuple; @@ -123,35 +147,82 @@ err: data[NFT_REG_VERDICT].verdict = NFT_BREAK; } +static void nft_ct_set_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_ct *priv = nft_expr_priv(expr); + struct sk_buff *skb = pkt->skb; +#ifdef CONFIG_NF_CONNTRACK_MARK + u32 value = data[priv->sreg].data[0]; +#endif + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return; + + switch (priv->key) { +#ifdef CONFIG_NF_CONNTRACK_MARK + case NFT_CT_MARK: + if (ct->mark != value) { + ct->mark = value; + nf_conntrack_event_cache(IPCT_MARK, ct); + } + break; +#endif + } +} + static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = { [NFTA_CT_DREG] = { .type = NLA_U32 }, [NFTA_CT_KEY] = { .type = NLA_U32 }, [NFTA_CT_DIRECTION] = { .type = NLA_U8 }, + [NFTA_CT_SREG] = { .type = NLA_U32 }, }; -static int nft_ct_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_ct_l3proto_try_module_get(uint8_t family) { - struct nft_ct *priv = nft_expr_priv(expr); int err; - if (tb[NFTA_CT_DREG] == NULL || - tb[NFTA_CT_KEY] == NULL) - return -EINVAL; - - priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); - if (tb[NFTA_CT_DIRECTION] != NULL) { - priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]); - switch (priv->dir) { - case IP_CT_DIR_ORIGINAL: - case IP_CT_DIR_REPLY: - break; - default: - return -EINVAL; - } + if (family == NFPROTO_INET) { + err = nf_ct_l3proto_try_module_get(NFPROTO_IPV4); + if (err < 0) + goto err1; + err = nf_ct_l3proto_try_module_get(NFPROTO_IPV6); + if (err < 0) + goto err2; + } else { + err = nf_ct_l3proto_try_module_get(family); + if (err < 0) + goto err1; } + return 0; + +err2: + nf_ct_l3proto_module_put(NFPROTO_IPV4); +err1: + return err; +} + +static void nft_ct_l3proto_module_put(uint8_t family) +{ + if (family == NFPROTO_INET) { + nf_ct_l3proto_module_put(NFPROTO_IPV4); + nf_ct_l3proto_module_put(NFPROTO_IPV6); + } else + nf_ct_l3proto_module_put(family); +} + +static int 
nft_ct_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_ct *priv = nft_expr_priv(expr); + int err; + priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); switch (priv->key) { case NFT_CT_STATE: case NFT_CT_DIRECTION: @@ -162,11 +233,15 @@ static int nft_ct_init(const struct nft_ctx *ctx, #ifdef CONFIG_NF_CONNTRACK_SECMARK case NFT_CT_SECMARK: #endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: +#endif case NFT_CT_EXPIRATION: case NFT_CT_HELPER: if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; break; + case NFT_CT_L3PROTOCOL: case NFT_CT_PROTOCOL: case NFT_CT_SRC: case NFT_CT_DST: @@ -179,34 +254,69 @@ static int nft_ct_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - err = nf_ct_l3proto_try_module_get(ctx->afi->family); - if (err < 0) - return err; - priv->family = ctx->afi->family; + if (tb[NFTA_CT_DIRECTION] != NULL) { + priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]); + switch (priv->dir) { + case IP_CT_DIR_ORIGINAL: + case IP_CT_DIR_REPLY: + break; + default: + return -EINVAL; + } + } priv->dreg = ntohl(nla_get_be32(tb[NFTA_CT_DREG])); err = nft_validate_output_register(priv->dreg); if (err < 0) - goto err1; + return err; err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); if (err < 0) - goto err1; - return 0; + return err; -err1: - nf_ct_l3proto_module_put(ctx->afi->family); - return err; + err = nft_ct_l3proto_try_module_get(ctx->afi->family); + if (err < 0) + return err; + + return 0; } -static void nft_ct_destroy(const struct nft_expr *expr) +static int nft_ct_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_ct *priv = nft_expr_priv(expr); + int err; + + priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); + switch (priv->key) { +#ifdef CONFIG_NF_CONNTRACK_MARK + case NFT_CT_MARK: + break; +#endif + default: + return -EOPNOTSUPP; + } + + priv->sreg = ntohl(nla_get_be32(tb[NFTA_CT_SREG])); + err = nft_validate_input_register(priv->sreg); + if (err < 0) + return err; + + err = nft_ct_l3proto_try_module_get(ctx->afi->family); + if (err < 0) + return err; + + return 0; +} - nf_ct_l3proto_module_put(priv->family); +static void nft_ct_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + nft_ct_l3proto_module_put(ctx->afi->family); } -static int nft_ct_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_ct *priv = nft_expr_priv(expr); @@ -214,7 +324,32 @@ static int nft_ct_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) goto nla_put_failure; - if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) + + switch (priv->key) { + case NFT_CT_PROTOCOL: + case NFT_CT_SRC: + case NFT_CT_DST: + case NFT_CT_PROTO_SRC: + case NFT_CT_PROTO_DST: + if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) + goto nla_put_failure; + default: + break; + } + + return 0; + +nla_put_failure: + return -1; +} + +static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_ct *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_CT_SREG, htonl(priv->sreg))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) goto nla_put_failure; return 0; @@ -223,18 +358,46 @@ nla_put_failure: } static struct nft_expr_type nft_ct_type; -static const struct nft_expr_ops nft_ct_ops = { +static const struct 
nft_expr_ops nft_ct_get_ops = { + .type = &nft_ct_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), + .eval = nft_ct_get_eval, + .init = nft_ct_get_init, + .destroy = nft_ct_destroy, + .dump = nft_ct_get_dump, +}; + +static const struct nft_expr_ops nft_ct_set_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), - .eval = nft_ct_eval, - .init = nft_ct_init, + .eval = nft_ct_set_eval, + .init = nft_ct_set_init, .destroy = nft_ct_destroy, - .dump = nft_ct_dump, + .dump = nft_ct_set_dump, }; +static const struct nft_expr_ops * +nft_ct_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_CT_KEY] == NULL) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_CT_DREG] && tb[NFTA_CT_SREG]) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_CT_DREG]) + return &nft_ct_get_ops; + + if (tb[NFTA_CT_SREG]) + return &nft_ct_set_ops; + + return ERR_PTR(-EINVAL); +} + static struct nft_expr_type nft_ct_type __read_mostly = { .name = "ct", - .ops = &nft_ct_ops, + .select_ops = &nft_ct_select_ops, .policy = nft_ct_policy, .maxattr = NFTA_CT_MAX, .owner = THIS_MODULE, diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 8e0bb75e7c5..55c939f5371 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -31,7 +31,7 @@ static void nft_exthdr_eval(const struct nft_expr *expr, { struct nft_exthdr *priv = nft_expr_priv(expr); struct nft_data *dest = &data[priv->dreg]; - unsigned int offset; + unsigned int offset = 0; int err; err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL); diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 3d3f8fce10a..4080ed6a072 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -12,23 +12,36 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/list.h> +#include <linux/log2.h> #include <linux/jhash.h> #include <linux/netlink.h> +#include <linux/vmalloc.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> +#define NFT_HASH_MIN_SIZE 4UL + struct nft_hash { - struct hlist_head *hash; - unsigned int hsize; + struct nft_hash_table __rcu *tbl; +}; + +struct nft_hash_table { + unsigned int size; + struct nft_hash_elem __rcu *buckets[]; }; struct nft_hash_elem { - struct hlist_node hnode; - struct nft_data key; - struct nft_data data[]; + struct nft_hash_elem __rcu *next; + struct nft_data key; + struct nft_data data[]; }; +#define nft_hash_for_each_entry(i, head) \ + for (i = nft_dereference(head); i != NULL; i = nft_dereference(i->next)) +#define nft_hash_for_each_entry_rcu(i, head) \ + for (i = rcu_dereference(head); i != NULL; i = rcu_dereference(i->next)) + static u32 nft_hash_rnd __read_mostly; static bool nft_hash_rnd_initted __read_mostly; @@ -38,7 +51,7 @@ static unsigned int nft_hash_data(const struct nft_data *data, unsigned int h; h = jhash(data->data, len, nft_hash_rnd); - return ((u64)h * hsize) >> 32; + return h & (hsize - 1); } static bool nft_hash_lookup(const struct nft_set *set, @@ -46,11 +59,12 @@ static bool nft_hash_lookup(const struct nft_set *set, struct nft_data *data) { const struct nft_hash *priv = nft_set_priv(set); + const struct nft_hash_table *tbl = rcu_dereference(priv->tbl); const 
struct nft_hash_elem *he; unsigned int h; - h = nft_hash_data(key, priv->hsize, set->klen); - hlist_for_each_entry(he, &priv->hash[h], hnode) { + h = nft_hash_data(key, tbl->size, set->klen); + nft_hash_for_each_entry_rcu(he, tbl->buckets[h]) { if (nft_data_cmp(&he->key, key, set->klen)) continue; if (set->flags & NFT_SET_MAP) @@ -60,19 +74,148 @@ static bool nft_hash_lookup(const struct nft_set *set, return false; } -static void nft_hash_elem_destroy(const struct nft_set *set, - struct nft_hash_elem *he) +static void nft_hash_tbl_free(const struct nft_hash_table *tbl) { - nft_data_uninit(&he->key, NFT_DATA_VALUE); - if (set->flags & NFT_SET_MAP) - nft_data_uninit(he->data, set->dtype); - kfree(he); + kvfree(tbl); +} + +static unsigned int nft_hash_tbl_size(unsigned int nelem) +{ + return max(roundup_pow_of_two(nelem * 4 / 3), NFT_HASH_MIN_SIZE); +} + +static struct nft_hash_table *nft_hash_tbl_alloc(unsigned int nbuckets) +{ + struct nft_hash_table *tbl; + size_t size; + + size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); + tbl = kzalloc(size, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN); + if (tbl == NULL) + tbl = vzalloc(size); + if (tbl == NULL) + return NULL; + tbl->size = nbuckets; + + return tbl; +} + +static void nft_hash_chain_unzip(const struct nft_set *set, + const struct nft_hash_table *ntbl, + struct nft_hash_table *tbl, unsigned int n) +{ + struct nft_hash_elem *he, *last, *next; + unsigned int h; + + he = nft_dereference(tbl->buckets[n]); + if (he == NULL) + return; + h = nft_hash_data(&he->key, ntbl->size, set->klen); + + /* Find last element of first chain hashing to bucket h */ + last = he; + nft_hash_for_each_entry(he, he->next) { + if (nft_hash_data(&he->key, ntbl->size, set->klen) != h) + break; + last = he; + } + + /* Unlink first chain from the old table */ + RCU_INIT_POINTER(tbl->buckets[n], last->next); + + /* If end of chain reached, done */ + if (he == NULL) + return; + + /* Find first element of second chain hashing to bucket h */ + next = NULL; + nft_hash_for_each_entry(he, he->next) { + if (nft_hash_data(&he->key, ntbl->size, set->klen) != h) + continue; + next = he; + break; + } + + /* Link the two chains */ + RCU_INIT_POINTER(last->next, next); +} + +static int nft_hash_tbl_expand(const struct nft_set *set, struct nft_hash *priv) +{ + struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl; + struct nft_hash_elem *he; + unsigned int i, h; + bool complete; + + ntbl = nft_hash_tbl_alloc(tbl->size * 2); + if (ntbl == NULL) + return -ENOMEM; + + /* Link new table's buckets to first element in the old table + * hashing to the new bucket. + */ + for (i = 0; i < ntbl->size; i++) { + h = i < tbl->size ? 
i : i - tbl->size; + nft_hash_for_each_entry(he, tbl->buckets[h]) { + if (nft_hash_data(&he->key, ntbl->size, set->klen) != i) + continue; + RCU_INIT_POINTER(ntbl->buckets[i], he); + break; + } + } + + /* Publish new table */ + rcu_assign_pointer(priv->tbl, ntbl); + + /* Unzip interleaved hash chains */ + do { + /* Wait for readers to use new table/unzipped chains */ + synchronize_rcu(); + + complete = true; + for (i = 0; i < tbl->size; i++) { + nft_hash_chain_unzip(set, ntbl, tbl, i); + if (tbl->buckets[i] != NULL) + complete = false; + } + } while (!complete); + + nft_hash_tbl_free(tbl); + return 0; +} + +static int nft_hash_tbl_shrink(const struct nft_set *set, struct nft_hash *priv) +{ + struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl; + struct nft_hash_elem __rcu **pprev; + unsigned int i; + + ntbl = nft_hash_tbl_alloc(tbl->size / 2); + if (ntbl == NULL) + return -ENOMEM; + + for (i = 0; i < ntbl->size; i++) { + ntbl->buckets[i] = tbl->buckets[i]; + + for (pprev = &ntbl->buckets[i]; *pprev != NULL; + pprev = &nft_dereference(*pprev)->next) + ; + RCU_INIT_POINTER(*pprev, tbl->buckets[i + ntbl->size]); + } + + /* Publish new table */ + rcu_assign_pointer(priv->tbl, ntbl); + synchronize_rcu(); + + nft_hash_tbl_free(tbl); + return 0; } static int nft_hash_insert(const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_table *tbl = nft_dereference(priv->tbl); struct nft_hash_elem *he; unsigned int size, h; @@ -91,33 +234,64 @@ static int nft_hash_insert(const struct nft_set *set, if (set->flags & NFT_SET_MAP) nft_data_copy(he->data, &elem->data); - h = nft_hash_data(&he->key, priv->hsize, set->klen); - hlist_add_head_rcu(&he->hnode, &priv->hash[h]); + h = nft_hash_data(&he->key, tbl->size, set->klen); + RCU_INIT_POINTER(he->next, tbl->buckets[h]); + rcu_assign_pointer(tbl->buckets[h], he); + + /* Expand table when exceeding 75% load */ + if (set->nelems + 1 > tbl->size / 4 * 3) + nft_hash_tbl_expand(set, priv); + return 0; } +static void nft_hash_elem_destroy(const struct nft_set *set, + struct nft_hash_elem *he) +{ + nft_data_uninit(&he->key, NFT_DATA_VALUE); + if (set->flags & NFT_SET_MAP) + nft_data_uninit(he->data, set->dtype); + kfree(he); +} + static void nft_hash_remove(const struct nft_set *set, const struct nft_set_elem *elem) { - struct nft_hash_elem *he = elem->cookie; + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_table *tbl = nft_dereference(priv->tbl); + struct nft_hash_elem *he, __rcu **pprev; + + pprev = elem->cookie; + he = nft_dereference((*pprev)); - hlist_del_rcu(&he->hnode); + RCU_INIT_POINTER(*pprev, he->next); + synchronize_rcu(); kfree(he); + + /* Shrink table beneath 30% load */ + if (set->nelems - 1 < tbl->size * 3 / 10 && + tbl->size > NFT_HASH_MIN_SIZE) + nft_hash_tbl_shrink(set, priv); } static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem) { const struct nft_hash *priv = nft_set_priv(set); + const struct nft_hash_table *tbl = nft_dereference(priv->tbl); + struct nft_hash_elem __rcu * const *pprev; struct nft_hash_elem *he; unsigned int h; - h = nft_hash_data(&elem->key, priv->hsize, set->klen); - hlist_for_each_entry(he, &priv->hash[h], hnode) { - if (nft_data_cmp(&he->key, &elem->key, set->klen)) + h = nft_hash_data(&elem->key, tbl->size, set->klen); + pprev = &tbl->buckets[h]; + nft_hash_for_each_entry(he, tbl->buckets[h]) { + if (nft_data_cmp(&he->key, &elem->key, set->klen)) { + pprev = &he->next; continue; + } - elem->cookie = he; - 
elem->flags = 0; + elem->cookie = (void *)pprev; + elem->flags = 0; if (set->flags & NFT_SET_MAP) nft_data_copy(&elem->data, he->data); return 0; @@ -129,12 +303,13 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, struct nft_set_iter *iter) { const struct nft_hash *priv = nft_set_priv(set); + const struct nft_hash_table *tbl = nft_dereference(priv->tbl); const struct nft_hash_elem *he; struct nft_set_elem elem; unsigned int i; - for (i = 0; i < priv->hsize; i++) { - hlist_for_each_entry(he, &priv->hash[i], hnode) { + for (i = 0; i < tbl->size; i++) { + nft_hash_for_each_entry(he, tbl->buckets[i]) { if (iter->count < iter->skip) goto cont; @@ -158,50 +333,77 @@ static unsigned int nft_hash_privsize(const struct nlattr * const nla[]) } static int nft_hash_init(const struct nft_set *set, + const struct nft_set_desc *desc, const struct nlattr * const tb[]) { struct nft_hash *priv = nft_set_priv(set); - unsigned int cnt, i; + struct nft_hash_table *tbl; + unsigned int size; if (unlikely(!nft_hash_rnd_initted)) { get_random_bytes(&nft_hash_rnd, 4); nft_hash_rnd_initted = true; } - /* Aim for a load factor of 0.75 */ - // FIXME: temporarily broken until we have set descriptions - cnt = 100; - cnt = cnt * 4 / 3; + size = NFT_HASH_MIN_SIZE; + if (desc->size) + size = nft_hash_tbl_size(desc->size); - priv->hash = kcalloc(cnt, sizeof(struct hlist_head), GFP_KERNEL); - if (priv->hash == NULL) + tbl = nft_hash_tbl_alloc(size); + if (tbl == NULL) return -ENOMEM; - priv->hsize = cnt; - - for (i = 0; i < cnt; i++) - INIT_HLIST_HEAD(&priv->hash[i]); - + RCU_INIT_POINTER(priv->tbl, tbl); return 0; } static void nft_hash_destroy(const struct nft_set *set) { const struct nft_hash *priv = nft_set_priv(set); - const struct hlist_node *next; - struct nft_hash_elem *elem; + const struct nft_hash_table *tbl = nft_dereference(priv->tbl); + struct nft_hash_elem *he, *next; unsigned int i; - for (i = 0; i < priv->hsize; i++) { - hlist_for_each_entry_safe(elem, next, &priv->hash[i], hnode) { - hlist_del(&elem->hnode); - nft_hash_elem_destroy(set, elem); + for (i = 0; i < tbl->size; i++) { + for (he = nft_dereference(tbl->buckets[i]); he != NULL; + he = next) { + next = nft_dereference(he->next); + nft_hash_elem_destroy(set, he); } } - kfree(priv->hash); + kfree(tbl); +} + +static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + unsigned int esize; + + esize = sizeof(struct nft_hash_elem); + if (features & NFT_SET_MAP) + esize += FIELD_SIZEOF(struct nft_hash_elem, data[0]); + + if (desc->size) { + est->size = sizeof(struct nft_hash) + + nft_hash_tbl_size(desc->size) * + sizeof(struct nft_hash_elem *) + + desc->size * esize; + } else { + /* Resizing happens when the load drops below 30% or goes + * above 75%. The average of 52.5% load (approximated by 50%) + * is used for the size estimation of the hash buckets, + * meaning we calculate two buckets per element. 
+ */ + est->size = esize + 2 * sizeof(struct nft_hash_elem *); + } + + est->class = NFT_SET_CLASS_O_1; + + return true; } static struct nft_set_ops nft_hash_ops __read_mostly = { .privsize = nft_hash_privsize, + .estimate = nft_hash_estimate, .init = nft_hash_init, .destroy = nft_hash_destroy, .get = nft_hash_get, diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index f169501f1ad..810385eb724 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -70,7 +70,8 @@ err1: return err; } -static void nft_immediate_destroy(const struct nft_expr *expr) +static void nft_immediate_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); return nft_data_uninit(&priv->data, nft_dreg_to_type(priv->dreg)); diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 57cad072a13..10cfb156cdf 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -23,7 +23,6 @@ static const char *nft_log_null_prefix = ""; struct nft_log { struct nf_loginfo loginfo; char *prefix; - int family; }; static void nft_log_eval(const struct nft_expr *expr, @@ -33,7 +32,7 @@ static void nft_log_eval(const struct nft_expr *expr, const struct nft_log *priv = nft_expr_priv(expr); struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); - nf_log_packet(net, priv->family, pkt->hooknum, pkt->skb, pkt->in, + nf_log_packet(net, pkt->ops->pf, pkt->ops->hooknum, pkt->skb, pkt->in, pkt->out, &priv->loginfo, "%s", priv->prefix); } @@ -52,8 +51,6 @@ static int nft_log_init(const struct nft_ctx *ctx, struct nf_loginfo *li = &priv->loginfo; const struct nlattr *nla; - priv->family = ctx->afi->family; - nla = tb[NFTA_LOG_PREFIX]; if (nla != NULL) { priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL); @@ -77,7 +74,8 @@ static int nft_log_init(const struct nft_ctx *ctx, return 0; } -static void nft_log_destroy(const struct nft_expr *expr) +static void nft_log_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) { struct nft_log *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 8a6116b75b5..6404a726d17 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -16,6 +16,7 @@ #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> struct nft_lookup { struct nft_set *set; @@ -55,8 +56,14 @@ static int nft_lookup_init(const struct nft_ctx *ctx, return -EINVAL; set = nf_tables_set_lookup(ctx->table, tb[NFTA_LOOKUP_SET]); - if (IS_ERR(set)) - return PTR_ERR(set); + if (IS_ERR(set)) { + if (tb[NFTA_LOOKUP_SET_ID]) { + set = nf_tables_set_lookup_byid(ctx->net, + tb[NFTA_LOOKUP_SET_ID]); + } + if (IS_ERR(set)) + return PTR_ERR(set); + } priv->sreg = ntohl(nla_get_be32(tb[NFTA_LOOKUP_SREG])); err = nft_validate_input_register(priv->sreg); @@ -88,11 +95,12 @@ static int nft_lookup_init(const struct nft_ctx *ctx, return 0; } -static void nft_lookup_destroy(const struct nft_expr *expr) +static void nft_lookup_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) { struct nft_lookup *priv = nft_expr_priv(expr); - nf_tables_unbind_set(NULL, priv->set, &priv->binding); + nf_tables_unbind_set(ctx, priv->set, &priv->binding); } static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 8c28220a90b..852b178c6ae 100644 --- a/net/netfilter/nft_meta.c 
+++ b/net/netfilter/nft_meta.c @@ -18,15 +18,11 @@ #include <net/sock.h> #include <net/tcp_states.h> /* for TCP_TIME_WAIT */ #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_meta.h> -struct nft_meta { - enum nft_meta_keys key:8; - enum nft_registers dreg:8; -}; - -static void nft_meta_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) +void nft_meta_get_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) { const struct nft_meta *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; @@ -40,6 +36,12 @@ static void nft_meta_eval(const struct nft_expr *expr, case NFT_META_PROTOCOL: *(__be16 *)dest->data = skb->protocol; break; + case NFT_META_NFPROTO: + dest->data[0] = pkt->ops->pf; + break; + case NFT_META_L4PROTO: + dest->data[0] = pkt->tprot; + break; case NFT_META_PRIORITY: dest->data[0] = skb->priority; break; @@ -107,7 +109,7 @@ static void nft_meta_eval(const struct nft_expr *expr, skb->sk->sk_socket->file->f_cred->fsgid); read_unlock_bh(&skb->sk->sk_callback_lock); break; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID case NFT_META_RTCLASSID: { const struct dst_entry *dst = skb_dst(skb); @@ -131,26 +133,52 @@ static void nft_meta_eval(const struct nft_expr *expr, err: data[NFT_REG_VERDICT].verdict = NFT_BREAK; } +EXPORT_SYMBOL_GPL(nft_meta_get_eval); -static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { +void nft_meta_set_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_meta *meta = nft_expr_priv(expr); + struct sk_buff *skb = pkt->skb; + u32 value = data[meta->sreg].data[0]; + + switch (meta->key) { + case NFT_META_MARK: + skb->mark = value; + break; + case NFT_META_PRIORITY: + skb->priority = value; + break; + case NFT_META_NFTRACE: + skb->nf_trace = 1; + break; + default: + WARN_ON(1); + } +} +EXPORT_SYMBOL_GPL(nft_meta_set_eval); + +const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { [NFTA_META_DREG] = { .type = NLA_U32 }, [NFTA_META_KEY] = { .type = NLA_U32 }, + [NFTA_META_SREG] = { .type = NLA_U32 }, }; +EXPORT_SYMBOL_GPL(nft_meta_policy); -static int nft_meta_init(const struct nft_ctx *ctx, const struct nft_expr *expr, - const struct nlattr * const tb[]) +int nft_meta_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); int err; - if (tb[NFTA_META_DREG] == NULL || - tb[NFTA_META_KEY] == NULL) - return -EINVAL; - priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); switch (priv->key) { case NFT_META_LEN: case NFT_META_PROTOCOL: + case NFT_META_NFPROTO: + case NFT_META_L4PROTO: case NFT_META_PRIORITY: case NFT_META_MARK: case NFT_META_IIF: @@ -161,7 +189,7 @@ static int nft_meta_init(const struct nft_ctx *ctx, const struct nft_expr *expr, case NFT_META_OIFTYPE: case NFT_META_SKUID: case NFT_META_SKGID: -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID case NFT_META_RTCLASSID: #endif #ifdef CONFIG_NETWORK_SECMARK @@ -176,35 +204,113 @@ static int nft_meta_init(const struct nft_ctx *ctx, const struct nft_expr *expr, err = nft_validate_output_register(priv->dreg); if (err < 0) return err; - return nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); + + err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); + if (err < 0) + return err; + + return 0; } +EXPORT_SYMBOL_GPL(nft_meta_get_init); 
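
The nft_meta.c hunk above splits the meta expression into a "get" path (copy packet/socket metadata into a destination register) and a "set" path (copy a source register into writable skb fields such as mark and priority), with the choice made from which netlink attribute (DREG or SREG) is present. As a rough, self-contained illustration of that register dataflow only — not kernel code, and using made-up names throughout — a userspace sketch might look like:

/* Standalone sketch of the get/set dataflow; all names are illustrative
 * stand-ins, not the kernel's symbols. Build with any C compiler.
 */
#include <stdint.h>
#include <stdio.h>

enum meta_key { META_MARK, META_PRIORITY };

struct fake_skb { uint32_t mark; uint32_t priority; };

struct meta_expr {
	enum meta_key key;
	int reg;      /* destination register for get, source register for set */
	int is_set;   /* decided up front, the way select_ops() picks the ops */
};

static void meta_eval(const struct meta_expr *e, struct fake_skb *skb,
		      uint32_t regs[4])
{
	uint32_t *field = (e->key == META_MARK) ? &skb->mark : &skb->priority;

	if (e->is_set)
		*field = regs[e->reg];   /* set: register -> packet metadata */
	else
		regs[e->reg] = *field;   /* get: packet metadata -> register */
}

int main(void)
{
	struct fake_skb skb = { .mark = 0, .priority = 7 };
	uint32_t regs[4] = { 0 };
	struct meta_expr get_prio = { META_PRIORITY, 1, 0 };
	struct meta_expr set_mark = { META_MARK, 1, 1 };

	meta_eval(&get_prio, &skb, regs);  /* regs[1] becomes 7 */
	meta_eval(&set_mark, &skb, regs);  /* skb.mark becomes 7 */
	printf("mark=%u\n", (unsigned)skb.mark);
	return 0;
}

In the real patch a single struct nft_meta serves both directions, and nft_meta_select_ops() (shown further down in this hunk) returns nft_meta_get_ops or nft_meta_set_ops depending on whether NFTA_META_DREG or NFTA_META_SREG was supplied.
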
-static int nft_meta_dump(struct sk_buff *skb, const struct nft_expr *expr) +int nft_meta_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_meta *priv = nft_expr_priv(expr); + int err; + + priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); + switch (priv->key) { + case NFT_META_MARK: + case NFT_META_PRIORITY: + case NFT_META_NFTRACE: + break; + default: + return -EOPNOTSUPP; + } + + priv->sreg = ntohl(nla_get_be32(tb[NFTA_META_SREG])); + err = nft_validate_input_register(priv->sreg); + if (err < 0) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(nft_meta_set_init); + +int nft_meta_get_dump(struct sk_buff *skb, + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); + if (nla_put_be32(skb, NFTA_META_KEY, htonl(priv->key))) + goto nla_put_failure; if (nla_put_be32(skb, NFTA_META_DREG, htonl(priv->dreg))) goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} +EXPORT_SYMBOL_GPL(nft_meta_get_dump); + +int nft_meta_set_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + if (nla_put_be32(skb, NFTA_META_KEY, htonl(priv->key))) goto nla_put_failure; + if (nla_put_be32(skb, NFTA_META_SREG, htonl(priv->sreg))) + goto nla_put_failure; + return 0; nla_put_failure: return -1; } +EXPORT_SYMBOL_GPL(nft_meta_set_dump); static struct nft_expr_type nft_meta_type; -static const struct nft_expr_ops nft_meta_ops = { +static const struct nft_expr_ops nft_meta_get_ops = { + .type = &nft_meta_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), + .eval = nft_meta_get_eval, + .init = nft_meta_get_init, + .dump = nft_meta_get_dump, +}; + +static const struct nft_expr_ops nft_meta_set_ops = { .type = &nft_meta_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), - .eval = nft_meta_eval, - .init = nft_meta_init, - .dump = nft_meta_dump, + .eval = nft_meta_set_eval, + .init = nft_meta_set_init, + .dump = nft_meta_set_dump, }; +static const struct nft_expr_ops * +nft_meta_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_META_KEY] == NULL) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG]) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_META_DREG]) + return &nft_meta_get_ops; + + if (tb[NFTA_META_SREG]) + return &nft_meta_set_ops; + + return ERR_PTR(-EINVAL); +} + static struct nft_expr_type nft_meta_type __read_mostly = { .name = "meta", - .ops = &nft_meta_ops, + .select_ops = &nft_meta_select_ops, .policy = nft_meta_policy, .maxattr = NFTA_META_MAX, .owner = THIS_MODULE, diff --git a/net/netfilter/nft_meta_target.c b/net/netfilter/nft_meta_target.c deleted file mode 100644 index 71177df75ff..00000000000 --- a/net/netfilter/nft_meta_target.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/list.h> -#include <linux/rbtree.h> -#include <linux/netlink.h> -#include <linux/netfilter.h> -#include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> - -struct nft_meta { - enum nft_meta_keys key; -}; - -static void nft_meta_eval(const struct nft_expr *expr, - struct nft_data *nfres, - struct nft_data *data, - const struct nft_pktinfo *pkt) -{ - const struct nft_meta *meta = nft_expr_priv(expr); - struct sk_buff *skb = pkt->skb; - u32 val = data->data[0]; - - switch (meta->key) { - case NFT_META_MARK: - skb->mark = val; - break; - case NFT_META_PRIORITY: - skb->priority = val; - break; - case NFT_META_NFTRACE: - skb->nf_trace = val; - break; -#ifdef CONFIG_NETWORK_SECMARK - case NFT_META_SECMARK: - skb->secmark = val; - break; -#endif - default: - WARN_ON(1); - } -} - -static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { - [NFTA_META_KEY] = { .type = NLA_U32 }, -}; - -static int nft_meta_init(const struct nft_expr *expr, struct nlattr *tb[]) -{ - struct nft_meta *meta = nft_expr_priv(expr); - - if (tb[NFTA_META_KEY] == NULL) - return -EINVAL; - - meta->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); - switch (meta->key) { - case NFT_META_MARK: - case NFT_META_PRIORITY: - case NFT_META_NFTRACE: -#ifdef CONFIG_NETWORK_SECMARK - case NFT_META_SECMARK: -#endif - break; - default: - return -EINVAL; - } - - return 0; -} - -static int nft_meta_dump(struct sk_buff *skb, const struct nft_expr *expr) -{ - struct nft_meta *meta = nft_expr_priv(expr); - - NLA_PUT_BE32(skb, NFTA_META_KEY, htonl(meta->key)); - return 0; - -nla_put_failure: - return -1; -} - -static struct nft_expr_ops meta_target __read_mostly = { - .name = "meta", - .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), - .owner = THIS_MODULE, - .eval = nft_meta_eval, - .init = nft_meta_init, - .dump = nft_meta_dump, - .policy = nft_meta_policy, - .maxattr = NFTA_META_MAX, -}; - -static int __init nft_meta_target_init(void) -{ - return nft_register_expr(&meta_target); -} - -static void __exit nft_meta_target_exit(void) -{ - nft_unregister_expr(&meta_target); -} - -module_init(nft_meta_target_init); -module_exit(nft_meta_target_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_EXPR("meta"); diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index d3b1ffe2618..79ff58cd36d 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -31,8 +31,8 @@ struct nft_nat { enum nft_registers sreg_addr_max:8; enum nft_registers sreg_proto_min:8; enum nft_registers sreg_proto_max:8; - int family; - enum nf_nat_manip_type type; + enum nf_nat_manip_type type:8; + u8 family; }; static void nft_nat_eval(const struct nft_expr *expr, @@ -88,6 +88,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_nat *priv = nft_expr_priv(expr); + u32 family; int err; if (tb[NFTA_NAT_TYPE] == NULL) @@ -107,9 +108,12 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, if (tb[NFTA_NAT_FAMILY] == NULL) return -EINVAL; - priv->family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY])); - if (priv->family != AF_INET && priv->family != AF_INET6) - return -EINVAL; + family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY])); + if (family != AF_INET && family != AF_INET6) + return -EAFNOSUPPORT; + if (family != ctx->afi->family) + 
return -EOPNOTSUPP; + priv->family = family; if (tb[NFTA_NAT_REG_ADDR_MIN]) { priv->sreg_addr_min = ntohl(nla_get_be32( @@ -171,12 +175,14 @@ static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_be32(skb, NFTA_NAT_REG_ADDR_MAX, htonl(priv->sreg_addr_max))) goto nla_put_failure; - if (nla_put_be32(skb, - NFTA_NAT_REG_PROTO_MIN, htonl(priv->sreg_proto_min))) - goto nla_put_failure; - if (nla_put_be32(skb, - NFTA_NAT_REG_PROTO_MAX, htonl(priv->sreg_proto_max))) - goto nla_put_failure; + if (priv->sreg_proto_min) { + if (nla_put_be32(skb, NFTA_NAT_REG_PROTO_MIN, + htonl(priv->sreg_proto_min))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_NAT_REG_PROTO_MAX, + htonl(priv->sreg_proto_max))) + goto nla_put_failure; + } return 0; nla_put_failure: @@ -202,13 +208,7 @@ static struct nft_expr_type nft_nat_type __read_mostly = { static int __init nft_nat_module_init(void) { - int err; - - err = nft_register_expr(&nft_nat_type); - if (err < 0) - return err; - - return 0; + return nft_register_expr(&nft_nat_type); } static void __exit nft_nat_module_exit(void) diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index a2aeb318678..85daa84bfdf 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -135,7 +135,8 @@ nft_payload_select_ops(const struct nft_ctx *ctx, if (len == 0 || len > FIELD_SIZEOF(struct nft_data, data)) return ERR_PTR(-EINVAL); - if (len <= 4 && IS_ALIGNED(offset, len) && base != NFT_PAYLOAD_LL_HEADER) + if (len <= 4 && is_power_of_2(len) && IS_ALIGNED(offset, len) && + base != NFT_PAYLOAD_LL_HEADER) return &nft_payload_fast_ops; else return &nft_payload_ops; diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c new file mode 100644 index 00000000000..e8ae2f6bf23 --- /dev/null +++ b/net/netfilter/nft_queue.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2013 Eric Leblond <eric@regit.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Development of this code partly funded by OISF + * (http://www.openinfosecfoundation.org/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/jhash.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_queue.h> + +static u32 jhash_initval __read_mostly; + +struct nft_queue { + u16 queuenum; + u16 queues_total; + u16 flags; +}; + +static void nft_queue_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_queue *priv = nft_expr_priv(expr); + u32 queue = priv->queuenum; + u32 ret; + + if (priv->queues_total > 1) { + if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) { + int cpu = smp_processor_id(); + + queue = priv->queuenum + cpu % priv->queues_total; + } else { + queue = nfqueue_hash(pkt->skb, queue, + priv->queues_total, pkt->ops->pf, + jhash_initval); + } + } + + ret = NF_QUEUE_NR(queue); + if (priv->flags & NFT_QUEUE_FLAG_BYPASS) + ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; + + data[NFT_REG_VERDICT].verdict = ret; +} + +static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = { + [NFTA_QUEUE_NUM] = { .type = NLA_U16 }, + [NFTA_QUEUE_TOTAL] = { .type = NLA_U16 }, + [NFTA_QUEUE_FLAGS] = { .type = NLA_U16 }, +}; + +static int nft_queue_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_queue *priv = nft_expr_priv(expr); + + if (tb[NFTA_QUEUE_NUM] == NULL) + return -EINVAL; + + init_hashrandom(&jhash_initval); + priv->queuenum = ntohs(nla_get_be16(tb[NFTA_QUEUE_NUM])); + + if (tb[NFTA_QUEUE_TOTAL] != NULL) + priv->queues_total = ntohs(nla_get_be16(tb[NFTA_QUEUE_TOTAL])); + if (tb[NFTA_QUEUE_FLAGS] != NULL) { + priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS])); + if (priv->flags & ~NFT_QUEUE_FLAG_MASK) + return -EINVAL; + } + return 0; +} + +static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_queue *priv = nft_expr_priv(expr); + + if (nla_put_be16(skb, NFTA_QUEUE_NUM, htons(priv->queuenum)) || + nla_put_be16(skb, NFTA_QUEUE_TOTAL, htons(priv->queues_total)) || + nla_put_be16(skb, NFTA_QUEUE_FLAGS, htons(priv->flags))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_queue_type; +static const struct nft_expr_ops nft_queue_ops = { + .type = &nft_queue_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_queue)), + .eval = nft_queue_eval, + .init = nft_queue_init, + .dump = nft_queue_dump, +}; + +static struct nft_expr_type nft_queue_type __read_mostly = { + .name = "queue", + .ops = &nft_queue_ops, + .policy = nft_queue_policy, + .maxattr = NFTA_QUEUE_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_queue_module_init(void) +{ + return nft_register_expr(&nft_queue_type); +} + +static void __exit nft_queue_module_exit(void) +{ + nft_unregister_expr(&nft_queue_type); +} + +module_init(nft_queue_module_init); +module_exit(nft_queue_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Eric Leblond <eric@regit.org>"); +MODULE_ALIAS_NFT_EXPR("queue"); diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c index ca0c1b231bf..e1836ff8819 100644 --- a/net/netfilter/nft_rbtree.c +++ b/net/netfilter/nft_rbtree.c @@ -18,6 +18,8 @@ #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> +static DEFINE_SPINLOCK(nft_rbtree_lock); + struct nft_rbtree { 
struct rb_root root; }; @@ -38,6 +40,7 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const struct rb_node *parent = priv->root.rb_node; int d; + spin_lock_bh(&nft_rbtree_lock); while (parent != NULL) { rbe = rb_entry(parent, struct nft_rbtree_elem, node); @@ -53,6 +56,8 @@ found: goto out; if (set->flags & NFT_SET_MAP) nft_data_copy(data, rbe->data); + + spin_unlock_bh(&nft_rbtree_lock); return true; } } @@ -62,6 +67,7 @@ found: goto found; } out: + spin_unlock_bh(&nft_rbtree_lock); return false; } @@ -69,8 +75,10 @@ static void nft_rbtree_elem_destroy(const struct nft_set *set, struct nft_rbtree_elem *rbe) { nft_data_uninit(&rbe->key, NFT_DATA_VALUE); - if (set->flags & NFT_SET_MAP) + if (set->flags & NFT_SET_MAP && + !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) nft_data_uninit(rbe->data, set->dtype); + kfree(rbe); } @@ -108,7 +116,8 @@ static int nft_rbtree_insert(const struct nft_set *set, int err; size = sizeof(*rbe); - if (set->flags & NFT_SET_MAP) + if (set->flags & NFT_SET_MAP && + !(elem->flags & NFT_SET_ELEM_INTERVAL_END)) size += sizeof(rbe->data[0]); rbe = kzalloc(size, GFP_KERNEL); @@ -117,12 +126,16 @@ static int nft_rbtree_insert(const struct nft_set *set, rbe->flags = elem->flags; nft_data_copy(&rbe->key, &elem->key); - if (set->flags & NFT_SET_MAP) + if (set->flags & NFT_SET_MAP && + !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) nft_data_copy(rbe->data, &elem->data); + spin_lock_bh(&nft_rbtree_lock); err = __nft_rbtree_insert(set, rbe); if (err < 0) kfree(rbe); + + spin_unlock_bh(&nft_rbtree_lock); return err; } @@ -132,7 +145,9 @@ static void nft_rbtree_remove(const struct nft_set *set, struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe = elem->cookie; + spin_lock_bh(&nft_rbtree_lock); rb_erase(&rbe->node, &priv->root); + spin_unlock_bh(&nft_rbtree_lock); kfree(rbe); } @@ -143,6 +158,7 @@ static int nft_rbtree_get(const struct nft_set *set, struct nft_set_elem *elem) struct nft_rbtree_elem *rbe; int d; + spin_lock_bh(&nft_rbtree_lock); while (parent != NULL) { rbe = rb_entry(parent, struct nft_rbtree_elem, node); @@ -153,12 +169,15 @@ static int nft_rbtree_get(const struct nft_set *set, struct nft_set_elem *elem) parent = parent->rb_right; else { elem->cookie = rbe; - if (set->flags & NFT_SET_MAP) + if (set->flags & NFT_SET_MAP && + !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) nft_data_copy(&elem->data, rbe->data); elem->flags = rbe->flags; + spin_unlock_bh(&nft_rbtree_lock); return 0; } } + spin_unlock_bh(&nft_rbtree_lock); return -ENOENT; } @@ -171,22 +190,27 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, struct nft_set_elem elem; struct rb_node *node; + spin_lock_bh(&nft_rbtree_lock); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { if (iter->count < iter->skip) goto cont; rbe = rb_entry(node, struct nft_rbtree_elem, node); nft_data_copy(&elem.key, &rbe->key); - if (set->flags & NFT_SET_MAP) + if (set->flags & NFT_SET_MAP && + !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) nft_data_copy(&elem.data, rbe->data); elem.flags = rbe->flags; iter->err = iter->fn(ctx, set, iter, &elem); - if (iter->err < 0) + if (iter->err < 0) { + spin_unlock_bh(&nft_rbtree_lock); return; + } cont: iter->count++; } + spin_unlock_bh(&nft_rbtree_lock); } static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[]) @@ -195,6 +219,7 @@ static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[]) } static int nft_rbtree_init(const struct nft_set *set, + const struct nft_set_desc *desc, const struct nlattr * 
const nla[]) { struct nft_rbtree *priv = nft_set_priv(set); @@ -209,15 +234,37 @@ static void nft_rbtree_destroy(const struct nft_set *set) struct nft_rbtree_elem *rbe; struct rb_node *node; + spin_lock_bh(&nft_rbtree_lock); while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); nft_rbtree_elem_destroy(set, rbe); } + spin_unlock_bh(&nft_rbtree_lock); +} + +static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + unsigned int nsize; + + nsize = sizeof(struct nft_rbtree_elem); + if (features & NFT_SET_MAP) + nsize += FIELD_SIZEOF(struct nft_rbtree_elem, data[0]); + + if (desc->size) + est->size = sizeof(struct nft_rbtree) + desc->size * nsize; + else + est->size = nsize; + + est->class = NFT_SET_CLASS_O_LOG_N; + + return true; } static struct nft_set_ops nft_rbtree_ops __read_mostly = { .privsize = nft_rbtree_privsize, + .estimate = nft_rbtree_estimate, .init = nft_rbtree_init, .destroy = nft_rbtree_destroy, .insert = nft_rbtree_insert, diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c new file mode 100644 index 00000000000..f3448c29644 --- /dev/null +++ b/net/netfilter/nft_reject.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2013 Eric Leblond <eric@regit.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_reject.h> + +const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { + [NFTA_REJECT_TYPE] = { .type = NLA_U32 }, + [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 }, +}; +EXPORT_SYMBOL_GPL(nft_reject_policy); + +int nft_reject_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_reject *priv = nft_expr_priv(expr); + + if (tb[NFTA_REJECT_TYPE] == NULL) + return -EINVAL; + + priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + if (tb[NFTA_REJECT_ICMP_CODE] == NULL) + return -EINVAL; + priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); + case NFT_REJECT_TCP_RST: + break; + default: + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_GPL(nft_reject_init); + +int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_reject *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) + goto nla_put_failure; + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) + goto nla_put_failure; + break; + } + + return 0; + +nla_put_failure: + return -1; +} +EXPORT_SYMBOL_GPL(nft_reject_dump); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c new file mode 100644 index 00000000000..b718a52a465 --- /dev/null +++ b/net/netfilter/nft_reject_inet.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2014 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can 
redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_reject.h> + +static void nft_reject_inet_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + switch (pkt->ops->pf) { + case NFPROTO_IPV4: + return nft_reject_ipv4_eval(expr, data, pkt); + case NFPROTO_IPV6: + return nft_reject_ipv6_eval(expr, data, pkt); + } +} + +static struct nft_expr_type nft_reject_inet_type; +static const struct nft_expr_ops nft_reject_inet_ops = { + .type = &nft_reject_inet_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), + .eval = nft_reject_inet_eval, + .init = nft_reject_init, + .dump = nft_reject_dump, +}; + +static struct nft_expr_type nft_reject_inet_type __read_mostly = { + .family = NFPROTO_INET, + .name = "reject", + .ops = &nft_reject_inet_ops, + .policy = nft_reject_policy, + .maxattr = NFTA_REJECT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_reject_inet_module_init(void) +{ + return nft_register_expr(&nft_reject_inet_type); +} + +static void __exit nft_reject_inet_module_exit(void) +{ + nft_unregister_expr(&nft_reject_inet_type); +} + +module_init(nft_reject_inet_module_init); +module_exit(nft_reject_inet_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_AF_EXPR(1, "reject"); diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c index 3228d7f24eb..4973cbddc44 100644 --- a/net/netfilter/xt_AUDIT.c +++ b/net/netfilter/xt_AUDIT.c @@ -146,11 +146,11 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par) if (par->family == NFPROTO_BRIDGE) { switch (eth_hdr(skb)->h_proto) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): audit_ip4(ab, skb); break; - case __constant_htons(ETH_P_IPV6): + case htons(ETH_P_IPV6): audit_ip6(ab, skb); break; } diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index da35ac06a97..75747aecdeb 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -211,8 +211,10 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, ret = 0; if ((info->ct_events || info->exp_events) && !nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events, - GFP_KERNEL)) + GFP_KERNEL)) { + ret = -EINVAL; goto err3; + } if (info->helper[0]) { ret = xt_ct_set_helper(ct, info->helper, par); @@ -226,12 +228,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, goto err3; } - __set_bit(IPS_TEMPLATE_BIT, &ct->status); - __set_bit(IPS_CONFIRMED_BIT, &ct->status); - - /* Overload tuple linked list to put us in template list. 
*/ - hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &par->net->ct.tmpl); + nf_conntrack_tmpl_insert(par->net, ct); out: info->ct = ct; return 0; diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c index ed00fef5899..8f1779ff7e3 100644 --- a/net/netfilter/xt_NFQUEUE.c +++ b/net/netfilter/xt_NFQUEUE.c @@ -11,15 +11,13 @@ #include <linux/module.h> #include <linux/skbuff.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <linux/jhash.h> - #include <linux/netfilter.h> #include <linux/netfilter_arp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_NFQUEUE.h> +#include <net/netfilter/nf_queue.h> + MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("Xtables: packet forwarding to netlink"); MODULE_LICENSE("GPL"); @@ -28,7 +26,6 @@ MODULE_ALIAS("ip6t_NFQUEUE"); MODULE_ALIAS("arpt_NFQUEUE"); static u32 jhash_initval __read_mostly; -static bool rnd_inited __read_mostly; static unsigned int nfqueue_tg(struct sk_buff *skb, const struct xt_action_param *par) @@ -38,69 +35,16 @@ nfqueue_tg(struct sk_buff *skb, const struct xt_action_param *par) return NF_QUEUE_NR(tinfo->queuenum); } -static u32 hash_v4(const struct sk_buff *skb) -{ - const struct iphdr *iph = ip_hdr(skb); - - /* packets in either direction go into same queue */ - if ((__force u32)iph->saddr < (__force u32)iph->daddr) - return jhash_3words((__force u32)iph->saddr, - (__force u32)iph->daddr, iph->protocol, jhash_initval); - - return jhash_3words((__force u32)iph->daddr, - (__force u32)iph->saddr, iph->protocol, jhash_initval); -} - -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) -static u32 hash_v6(const struct sk_buff *skb) -{ - const struct ipv6hdr *ip6h = ipv6_hdr(skb); - u32 a, b, c; - - if ((__force u32)ip6h->saddr.s6_addr32[3] < - (__force u32)ip6h->daddr.s6_addr32[3]) { - a = (__force u32) ip6h->saddr.s6_addr32[3]; - b = (__force u32) ip6h->daddr.s6_addr32[3]; - } else { - b = (__force u32) ip6h->saddr.s6_addr32[3]; - a = (__force u32) ip6h->daddr.s6_addr32[3]; - } - - if ((__force u32)ip6h->saddr.s6_addr32[1] < - (__force u32)ip6h->daddr.s6_addr32[1]) - c = (__force u32) ip6h->saddr.s6_addr32[1]; - else - c = (__force u32) ip6h->daddr.s6_addr32[1]; - - return jhash_3words(a, b, c, jhash_initval); -} -#endif - -static u32 -nfqueue_hash(const struct sk_buff *skb, const struct xt_action_param *par) -{ - const struct xt_NFQ_info_v1 *info = par->targinfo; - u32 queue = info->queuenum; - - if (par->family == NFPROTO_IPV4) - queue += ((u64) hash_v4(skb) * info->queues_total) >> 32; -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) - else if (par->family == NFPROTO_IPV6) - queue += ((u64) hash_v6(skb) * info->queues_total) >> 32; -#endif - - return queue; -} - static unsigned int nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_NFQ_info_v1 *info = par->targinfo; u32 queue = info->queuenum; - if (info->queues_total > 1) - queue = nfqueue_hash(skb, par); - + if (info->queues_total > 1) { + queue = nfqueue_hash(skb, queue, info->queues_total, + par->family, jhash_initval); + } return NF_QUEUE_NR(queue); } @@ -120,10 +64,8 @@ static int nfqueue_tg_check(const struct xt_tgchk_param *par) const struct xt_NFQ_info_v3 *info = par->targinfo; u32 maxid; - if (unlikely(!rnd_inited)) { - get_random_bytes(&jhash_initval, sizeof(jhash_initval)); - rnd_inited = true; - } + init_hashrandom(&jhash_initval); + if (info->queues_total == 0) { pr_err("NFQUEUE: number of total queues is 0\n"); return -EINVAL; @@ -154,8 +96,10 @@ nfqueue_tg_v3(struct 
sk_buff *skb, const struct xt_action_param *par) int cpu = smp_processor_id(); queue = info->queuenum + cpu % info->queues_total; - } else - queue = nfqueue_hash(skb, par); + } else { + queue = nfqueue_hash(skb, queue, info->queues_total, + par->family, jhash_initval); + } } ret = NF_QUEUE_NR(queue); diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 12d4da8e6c7..bbffdbdaf60 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -23,10 +23,11 @@ MODULE_ALIAS("ip6t_bpf"); static int bpf_mt_check(const struct xt_mtchk_param *par) { struct xt_bpf_info *info = par->matchinfo; - struct sock_fprog program; + struct sock_fprog_kern program; program.len = info->bpf_program_num_elem; - program.filter = (struct sock_filter __user *) info->bpf_program; + program.filter = info->bpf_program; + if (sk_unattached_filter_create(&info->filter, &program)) { pr_info("bpf: check failed: parse error\n"); return -EINVAL; diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c new file mode 100644 index 00000000000..f4e83300532 --- /dev/null +++ b/net/netfilter/xt_cgroup.c @@ -0,0 +1,72 @@ +/* + * Xtables module to match the process control group. + * + * Might be used to implement individual "per-application" firewall + * policies in contrast to global policies based on control groups. + * Matching is based upon processes tagged to net_cls' classid marker. + * + * (C) 2013 Daniel Borkmann <dborkman@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/skbuff.h> +#include <linux/module.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_cgroup.h> +#include <net/sock.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); +MODULE_DESCRIPTION("Xtables: process control group matching"); +MODULE_ALIAS("ipt_cgroup"); +MODULE_ALIAS("ip6t_cgroup"); + +static int cgroup_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_cgroup_info *info = par->matchinfo; + + if (info->invert & ~1) + return -EINVAL; + + return info->id ? 
0 : -EINVAL; +} + +static bool +cgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_cgroup_info *info = par->matchinfo; + + if (skb->sk == NULL) + return false; + + return (info->id == skb->sk->sk_classid) ^ info->invert; +} + +static struct xt_match cgroup_mt_reg __read_mostly = { + .name = "cgroup", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = cgroup_mt_check, + .match = cgroup_mt, + .matchsize = sizeof(struct xt_cgroup_info), + .me = THIS_MODULE, + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN), +}; + +static int __init cgroup_mt_init(void) +{ + return xt_register_match(&cgroup_mt_reg); +} + +static void __exit cgroup_mt_exit(void) +{ + xt_unregister_match(&cgroup_mt_reg); +} + +module_init(cgroup_mt_init); +module_exit(cgroup_mt_exit); diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index c40b2695633..fbc66bb250d 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -19,6 +19,7 @@ #include <linux/jhash.h> #include <linux/slab.h> #include <linux/list.h> +#include <linux/rbtree.h> #include <linux/module.h> #include <linux/random.h> #include <linux/skbuff.h> @@ -31,6 +32,16 @@ #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_zones.h> +#define CONNLIMIT_SLOTS 256U + +#ifdef CONFIG_LOCKDEP +#define CONNLIMIT_LOCK_SLOTS 8U +#else +#define CONNLIMIT_LOCK_SLOTS 256U +#endif + +#define CONNLIMIT_GC_MAX_NODES 8 + /* we will save the tuples of all connections we care about */ struct xt_connlimit_conn { struct hlist_node node; @@ -38,16 +49,27 @@ struct xt_connlimit_conn { union nf_inet_addr addr; }; +struct xt_connlimit_rb { + struct rb_node node; + struct hlist_head hhead; /* connections/hosts in same subnet */ + union nf_inet_addr addr; /* search key */ +}; + +static spinlock_t xt_connlimit_locks[CONNLIMIT_LOCK_SLOTS] __cacheline_aligned_in_smp; + struct xt_connlimit_data { - struct hlist_head iphash[256]; - spinlock_t lock; + struct rb_root climit_root4[CONNLIMIT_SLOTS]; + struct rb_root climit_root6[CONNLIMIT_SLOTS]; }; static u_int32_t connlimit_rnd __read_mostly; +static struct kmem_cache *connlimit_rb_cachep __read_mostly; +static struct kmem_cache *connlimit_conn_cachep __read_mostly; static inline unsigned int connlimit_iphash(__be32 addr) { - return jhash_1word((__force __u32)addr, connlimit_rnd) & 0xFF; + return jhash_1word((__force __u32)addr, + connlimit_rnd) % CONNLIMIT_SLOTS; } static inline unsigned int @@ -60,7 +82,8 @@ connlimit_iphash6(const union nf_inet_addr *addr, for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i) res.ip6[i] = addr->ip6[i] & mask->ip6[i]; - return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6), connlimit_rnd) & 0xFF; + return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6), + connlimit_rnd) % CONNLIMIT_SLOTS; } static inline bool already_closed(const struct nf_conn *conn) @@ -72,13 +95,14 @@ static inline bool already_closed(const struct nf_conn *conn) return 0; } -static inline unsigned int +static int same_source_net(const union nf_inet_addr *addr, const union nf_inet_addr *mask, const union nf_inet_addr *u3, u_int8_t family) { if (family == NFPROTO_IPV4) { - return (addr->ip & mask->ip) == (u3->ip & mask->ip); + return ntohl(addr->ip & mask->ip) - + ntohl(u3->ip & mask->ip); } else { union nf_inet_addr lh, rh; unsigned int i; @@ -88,89 +112,205 @@ same_source_net(const union nf_inet_addr *addr, rh.ip6[i] = u3->ip6[i] & mask->ip6[i]; } - return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6)) == 0; + 
return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6)); } } -static int count_them(struct net *net, - struct xt_connlimit_data *data, +static bool add_hlist(struct hlist_head *head, const struct nf_conntrack_tuple *tuple, - const union nf_inet_addr *addr, - const union nf_inet_addr *mask, - u_int8_t family) + const union nf_inet_addr *addr) +{ + struct xt_connlimit_conn *conn; + + conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC); + if (conn == NULL) + return false; + conn->tuple = *tuple; + conn->addr = *addr; + hlist_add_head(&conn->node, head); + return true; +} + +static unsigned int check_hlist(struct net *net, + struct hlist_head *head, + const struct nf_conntrack_tuple *tuple, + bool *addit) { const struct nf_conntrack_tuple_hash *found; struct xt_connlimit_conn *conn; struct hlist_node *n; struct nf_conn *found_ct; - struct hlist_head *hash; - bool addit = true; - int matches = 0; - - if (family == NFPROTO_IPV6) - hash = &data->iphash[connlimit_iphash6(addr, mask)]; - else - hash = &data->iphash[connlimit_iphash(addr->ip & mask->ip)]; + unsigned int length = 0; + *addit = true; rcu_read_lock(); /* check the saved connections */ - hlist_for_each_entry_safe(conn, n, hash, node) { + hlist_for_each_entry_safe(conn, n, head, node) { found = nf_conntrack_find_get(net, NF_CT_DEFAULT_ZONE, &conn->tuple); - found_ct = NULL; + if (found == NULL) { + hlist_del(&conn->node); + kmem_cache_free(connlimit_conn_cachep, conn); + continue; + } - if (found != NULL) - found_ct = nf_ct_tuplehash_to_ctrack(found); + found_ct = nf_ct_tuplehash_to_ctrack(found); - if (found_ct != NULL && - nf_ct_tuple_equal(&conn->tuple, tuple) && - !already_closed(found_ct)) + if (nf_ct_tuple_equal(&conn->tuple, tuple)) { /* * Just to be sure we have it only once in the list. * We should not see tuples twice unless someone hooks * this into a table without "-p tcp --syn". */ - addit = false; - - if (found == NULL) { - /* this one is gone */ - hlist_del(&conn->node); - kfree(conn); - continue; - } - - if (already_closed(found_ct)) { + *addit = false; + } else if (already_closed(found_ct)) { /* * we do not care about connections which are * closed already -> ditch it */ nf_ct_put(found_ct); hlist_del(&conn->node); - kfree(conn); + kmem_cache_free(connlimit_conn_cachep, conn); continue; } - if (same_source_net(addr, mask, &conn->addr, family)) - /* same source network -> be counted! 
*/ - ++matches; nf_ct_put(found_ct); + length++; } rcu_read_unlock(); - if (addit) { - /* save the new connection in our list */ - conn = kmalloc(sizeof(*conn), GFP_ATOMIC); - if (conn == NULL) - return -ENOMEM; - conn->tuple = *tuple; - conn->addr = *addr; - hlist_add_head(&conn->node, hash); - ++matches; + return length; +} + +static void tree_nodes_free(struct rb_root *root, + struct xt_connlimit_rb *gc_nodes[], + unsigned int gc_count) +{ + struct xt_connlimit_rb *rbconn; + + while (gc_count) { + rbconn = gc_nodes[--gc_count]; + rb_erase(&rbconn->node, root); + kmem_cache_free(connlimit_rb_cachep, rbconn); + } +} + +static unsigned int +count_tree(struct net *net, struct rb_root *root, + const struct nf_conntrack_tuple *tuple, + const union nf_inet_addr *addr, const union nf_inet_addr *mask, + u8 family) +{ + struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES]; + struct rb_node **rbnode, *parent; + struct xt_connlimit_rb *rbconn; + struct xt_connlimit_conn *conn; + unsigned int gc_count; + bool no_gc = false; + + restart: + gc_count = 0; + parent = NULL; + rbnode = &(root->rb_node); + while (*rbnode) { + int diff; + bool addit; + + rbconn = container_of(*rbnode, struct xt_connlimit_rb, node); + + parent = *rbnode; + diff = same_source_net(addr, mask, &rbconn->addr, family); + if (diff < 0) { + rbnode = &((*rbnode)->rb_left); + } else if (diff > 0) { + rbnode = &((*rbnode)->rb_right); + } else { + /* same source network -> be counted! */ + unsigned int count; + count = check_hlist(net, &rbconn->hhead, tuple, &addit); + + tree_nodes_free(root, gc_nodes, gc_count); + if (!addit) + return count; + + if (!add_hlist(&rbconn->hhead, tuple, addr)) + return 0; /* hotdrop */ + + return count + 1; + } + + if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes)) + continue; + + /* only used for GC on hhead, retval and 'addit' ignored */ + check_hlist(net, &rbconn->hhead, tuple, &addit); + if (hlist_empty(&rbconn->hhead)) + gc_nodes[gc_count++] = rbconn; + } + + if (gc_count) { + no_gc = true; + tree_nodes_free(root, gc_nodes, gc_count); + /* tree_node_free before new allocation permits + * allocator to re-use newly free'd object. + * + * This is a rare event; in most cases we will find + * existing node to re-use. (or gc_count is 0). 
+ */ + goto restart; + } + + /* no match, need to insert new node */ + rbconn = kmem_cache_alloc(connlimit_rb_cachep, GFP_ATOMIC); + if (rbconn == NULL) + return 0; + + conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC); + if (conn == NULL) { + kmem_cache_free(connlimit_rb_cachep, rbconn); + return 0; + } + + conn->tuple = *tuple; + conn->addr = *addr; + rbconn->addr = *addr; + + INIT_HLIST_HEAD(&rbconn->hhead); + hlist_add_head(&conn->node, &rbconn->hhead); + + rb_link_node(&rbconn->node, parent, rbnode); + rb_insert_color(&rbconn->node, root); + return 1; +} + +static int count_them(struct net *net, + struct xt_connlimit_data *data, + const struct nf_conntrack_tuple *tuple, + const union nf_inet_addr *addr, + const union nf_inet_addr *mask, + u_int8_t family) +{ + struct rb_root *root; + int count; + u32 hash; + + if (family == NFPROTO_IPV6) { + hash = connlimit_iphash6(addr, mask); + root = &data->climit_root6[hash]; + } else { + hash = connlimit_iphash(addr->ip & mask->ip); + root = &data->climit_root4[hash]; } - return matches; + spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); + + count = count_tree(net, root, tuple, addr, mask, family); + + spin_unlock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); + + return count; } static bool @@ -183,7 +323,7 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct nf_conntrack_tuple *tuple_ptr = &tuple; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; - int connections; + unsigned int connections; ct = nf_ct_get(skb, &ctinfo); if (ct != NULL) @@ -202,12 +342,9 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) iph->daddr : iph->saddr; } - spin_lock_bh(&info->data->lock); connections = count_them(net, info->data, tuple_ptr, &addr, &info->mask, par->family); - spin_unlock_bh(&info->data->lock); - - if (connections < 0) + if (connections == 0) /* kmalloc failed, drop it entirely */ goto hotdrop; @@ -247,29 +384,44 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par) return -ENOMEM; } - spin_lock_init(&info->data->lock); - for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) - INIT_HLIST_HEAD(&info->data->iphash[i]); + for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i) + info->data->climit_root4[i] = RB_ROOT; + for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i) + info->data->climit_root6[i] = RB_ROOT; return 0; } -static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) +static void destroy_tree(struct rb_root *r) { - const struct xt_connlimit_info *info = par->matchinfo; struct xt_connlimit_conn *conn; + struct xt_connlimit_rb *rbconn; struct hlist_node *n; - struct hlist_head *hash = info->data->iphash; + struct rb_node *node; + + while ((node = rb_first(r)) != NULL) { + rbconn = container_of(node, struct xt_connlimit_rb, node); + + rb_erase(node, r); + + hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node) + kmem_cache_free(connlimit_conn_cachep, conn); + + kmem_cache_free(connlimit_rb_cachep, rbconn); + } +} + +static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_connlimit_info *info = par->matchinfo; unsigned int i; nf_ct_l3proto_module_put(par->family); - for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) { - hlist_for_each_entry_safe(conn, n, &hash[i], node) { - hlist_del(&conn->node); - kfree(conn); - } - } + for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i) + destroy_tree(&info->data->climit_root4[i]); + for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i) + 
destroy_tree(&info->data->climit_root6[i]); kfree(info->data); } @@ -287,12 +439,40 @@ static struct xt_match connlimit_mt_reg __read_mostly = { static int __init connlimit_mt_init(void) { - return xt_register_match(&connlimit_mt_reg); + int ret, i; + + BUILD_BUG_ON(CONNLIMIT_LOCK_SLOTS > CONNLIMIT_SLOTS); + BUILD_BUG_ON((CONNLIMIT_SLOTS % CONNLIMIT_LOCK_SLOTS) != 0); + + for (i = 0; i < CONNLIMIT_LOCK_SLOTS; ++i) + spin_lock_init(&xt_connlimit_locks[i]); + + connlimit_conn_cachep = kmem_cache_create("xt_connlimit_conn", + sizeof(struct xt_connlimit_conn), + 0, 0, NULL); + if (!connlimit_conn_cachep) + return -ENOMEM; + + connlimit_rb_cachep = kmem_cache_create("xt_connlimit_rb", + sizeof(struct xt_connlimit_rb), + 0, 0, NULL); + if (!connlimit_rb_cachep) { + kmem_cache_destroy(connlimit_conn_cachep); + return -ENOMEM; + } + ret = xt_register_match(&connlimit_mt_reg); + if (ret != 0) { + kmem_cache_destroy(connlimit_conn_cachep); + kmem_cache_destroy(connlimit_rb_cachep); + } + return ret; } static void __exit connlimit_mt_exit(void) { xt_unregister_match(&connlimit_mt_reg); + kmem_cache_destroy(connlimit_conn_cachep); + kmem_cache_destroy(connlimit_rb_cachep); } module_init(connlimit_mt_init); diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 9ff035c7140..a3910fc2122 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -325,21 +325,24 @@ static void htable_gc(unsigned long htlong) add_timer(&ht->timer); } -static void htable_destroy(struct xt_hashlimit_htable *hinfo) +static void htable_remove_proc_entry(struct xt_hashlimit_htable *hinfo) { struct hashlimit_net *hashlimit_net = hashlimit_pernet(hinfo->net); struct proc_dir_entry *parent; - del_timer_sync(&hinfo->timer); - if (hinfo->family == NFPROTO_IPV4) parent = hashlimit_net->ipt_hashlimit; else parent = hashlimit_net->ip6t_hashlimit; - if(parent != NULL) + if (parent != NULL) remove_proc_entry(hinfo->name, parent); +} +static void htable_destroy(struct xt_hashlimit_htable *hinfo) +{ + del_timer_sync(&hinfo->timer); + htable_remove_proc_entry(hinfo); htable_selective_cleanup(hinfo, select_all); kfree(hinfo->name); vfree(hinfo); @@ -883,21 +886,15 @@ static int __net_init hashlimit_proc_net_init(struct net *net) static void __net_exit hashlimit_proc_net_exit(struct net *net) { struct xt_hashlimit_htable *hinfo; - struct proc_dir_entry *pde; struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); - /* recent_net_exit() is called before recent_mt_destroy(). Make sure - * that the parent xt_recent proc entry is is empty before trying to - * remove it. + /* hashlimit_net_exit() is called before hashlimit_mt_destroy(). + * Make sure that the parent ipt_hashlimit and ip6t_hashlimit proc + * entries is empty before trying to remove it. 
*/ mutex_lock(&hashlimit_mutex); - pde = hashlimit_net->ipt_hashlimit; - if (pde == NULL) - pde = hashlimit_net->ip6t_hashlimit; - hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) - remove_proc_entry(hinfo->name, pde); - + htable_remove_proc_entry(hinfo); hashlimit_net->ipt_hashlimit = NULL; hashlimit_net->ip6t_hashlimit = NULL; mutex_unlock(&hashlimit_mutex); diff --git a/net/netfilter/xt_ipcomp.c b/net/netfilter/xt_ipcomp.c new file mode 100644 index 00000000000..89d53104c6b --- /dev/null +++ b/net/netfilter/xt_ipcomp.c @@ -0,0 +1,111 @@ +/* Kernel module to match IPComp parameters for IPv4 and IPv6 + * + * Copyright (C) 2013 WindRiver + * + * Author: + * Fan Du <fan.du@windriver.com> + * + * Based on: + * net/netfilter/xt_esp.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/in.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> + +#include <linux/netfilter/xt_ipcomp.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Fan Du <fan.du@windriver.com>"); +MODULE_DESCRIPTION("Xtables: IPv4/6 IPsec-IPComp SPI match"); + +/* Returns 1 if the spi is matched by the range, 0 otherwise */ +static inline bool +spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) +{ + bool r; + pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n", + invert ? '!' : ' ', min, spi, max); + r = (spi >= min && spi <= max) ^ invert; + pr_debug(" result %s\n", r ? "PASS" : "FAILED"); + return r; +} + +static bool comp_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct ip_comp_hdr _comphdr; + const struct ip_comp_hdr *chdr; + const struct xt_ipcomp *compinfo = par->matchinfo; + + /* Must not be a fragment. */ + if (par->fragoff != 0) + return false; + + chdr = skb_header_pointer(skb, par->thoff, sizeof(_comphdr), &_comphdr); + if (chdr == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. 
+ */ + pr_debug("Dropping evil IPComp tinygram.\n"); + par->hotdrop = true; + return 0; + } + + return spi_match(compinfo->spis[0], compinfo->spis[1], + ntohs(chdr->cpi), + !!(compinfo->invflags & XT_IPCOMP_INV_SPI)); +} + +static int comp_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_ipcomp *compinfo = par->matchinfo; + + /* Must specify no unknown invflags */ + if (compinfo->invflags & ~XT_IPCOMP_INV_MASK) { + pr_err("unknown flags %X\n", compinfo->invflags); + return -EINVAL; + } + return 0; +} + +static struct xt_match comp_mt_reg[] __read_mostly = { + { + .name = "ipcomp", + .family = NFPROTO_IPV4, + .match = comp_mt, + .matchsize = sizeof(struct xt_ipcomp), + .proto = IPPROTO_COMP, + .checkentry = comp_mt_check, + .me = THIS_MODULE, + }, + { + .name = "ipcomp", + .family = NFPROTO_IPV6, + .match = comp_mt, + .matchsize = sizeof(struct xt_ipcomp), + .proto = IPPROTO_COMP, + .checkentry = comp_mt_check, + .me = THIS_MODULE, + }, +}; + +static int __init comp_mt_init(void) +{ + return xt_register_matches(comp_mt_reg, ARRAY_SIZE(comp_mt_reg)); +} + +static void __exit comp_mt_exit(void) +{ + xt_unregister_matches(comp_mt_reg, ARRAY_SIZE(comp_mt_reg)); +} + +module_init(comp_mt_init); +module_exit(comp_mt_exit); diff --git a/net/netfilter/xt_l2tp.c b/net/netfilter/xt_l2tp.c new file mode 100644 index 00000000000..8aee572771f --- /dev/null +++ b/net/netfilter/xt_l2tp.c @@ -0,0 +1,354 @@ +/* Kernel module to match L2TP header parameters. */ + +/* (C) 2013 James Chapman <jchapman@katalix.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/if_ether.h> +#include <net/ip.h> +#include <linux/ipv6.h> +#include <net/ipv6.h> +#include <net/udp.h> +#include <linux/l2tp.h> + +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_tcpudp.h> +#include <linux/netfilter/xt_l2tp.h> + +/* L2TP header masks */ +#define L2TP_HDR_T_BIT 0x8000 +#define L2TP_HDR_L_BIT 0x4000 +#define L2TP_HDR_VER 0x000f + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); +MODULE_DESCRIPTION("Xtables: L2TP header match"); +MODULE_ALIAS("ipt_l2tp"); +MODULE_ALIAS("ip6t_l2tp"); + +/* The L2TP fields that can be matched */ +struct l2tp_data { + u32 tid; + u32 sid; + u8 type; + u8 version; +}; + +union l2tp_val { + __be16 val16[2]; + __be32 val32; +}; + +static bool l2tp_match(const struct xt_l2tp_info *info, struct l2tp_data *data) +{ + if ((info->flags & XT_L2TP_TYPE) && (info->type != data->type)) + return false; + + if ((info->flags & XT_L2TP_VERSION) && (info->version != data->version)) + return false; + + /* Check tid only for L2TPv3 control or any L2TPv2 packets */ + if ((info->flags & XT_L2TP_TID) && + ((data->type == XT_L2TP_TYPE_CONTROL) || (data->version == 2)) && + (info->tid != data->tid)) + return false; + + /* Check sid only for L2TP data packets */ + if ((info->flags & XT_L2TP_SID) && (data->type == XT_L2TP_TYPE_DATA) && + (info->sid != data->sid)) + return false; + + return true; +} + +/* Parse L2TP header fields when UDP encapsulation is used. Handles + * L2TPv2 and L2TPv3. 
Note that L2TPv3 control and data packets have a + * different format. See + * RFC2661, Section 3.1, L2TPv2 Header Format + * RFC3931, Section 3.2.1, L2TPv3 Control Message Header + * RFC3931, Section 3.2.2, L2TPv3 Data Message Header + * RFC3931, Section 4.1.2.1, L2TPv3 Session Header over UDP + */ +static bool l2tp_udp_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 thoff) +{ + const struct xt_l2tp_info *info = par->matchinfo; + int uhlen = sizeof(struct udphdr); + int offs = thoff + uhlen; + union l2tp_val *lh; + union l2tp_val lhbuf; + u16 flags; + struct l2tp_data data = { 0, }; + + if (par->fragoff != 0) + return false; + + /* Extract L2TP header fields. The flags in the first 16 bits + * tell us where the other fields are. + */ + lh = skb_header_pointer(skb, offs, 2, &lhbuf); + if (lh == NULL) + return false; + + flags = ntohs(lh->val16[0]); + if (flags & L2TP_HDR_T_BIT) + data.type = XT_L2TP_TYPE_CONTROL; + else + data.type = XT_L2TP_TYPE_DATA; + data.version = (u8) flags & L2TP_HDR_VER; + + /* Now extract the L2TP tid/sid. These are in different places + * for L2TPv2 (rfc2661) and L2TPv3 (rfc3931). For L2TPv2, we + * must also check to see if the length field is present, + * since this affects the offsets into the packet of the + * tid/sid fields. + */ + if (data.version == 3) { + lh = skb_header_pointer(skb, offs + 4, 4, &lhbuf); + if (lh == NULL) + return false; + if (data.type == XT_L2TP_TYPE_CONTROL) + data.tid = ntohl(lh->val32); + else + data.sid = ntohl(lh->val32); + } else if (data.version == 2) { + if (flags & L2TP_HDR_L_BIT) + offs += 2; + lh = skb_header_pointer(skb, offs + 2, 4, &lhbuf); + if (lh == NULL) + return false; + data.tid = (u32) ntohs(lh->val16[0]); + data.sid = (u32) ntohs(lh->val16[1]); + } else + return false; + + return l2tp_match(info, &data); +} + +/* Parse L2TP header fields for IP encapsulation (no UDP header). + * L2TPv3 data packets have a different form with IP encap. See + * RFC3931, Section 4.1.1.1, L2TPv3 Session Header over IP. + * RFC3931, Section 4.1.1.2, L2TPv3 Control and Data Traffic over IP. + */ +static bool l2tp_ip_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 thoff) +{ + const struct xt_l2tp_info *info = par->matchinfo; + union l2tp_val *lh; + union l2tp_val lhbuf; + struct l2tp_data data = { 0, }; + + /* For IP encap, the L2TP sid is the first 32 bits. */ + lh = skb_header_pointer(skb, thoff, sizeof(lhbuf), &lhbuf); + if (lh == NULL) + return false; + if (lh->val32 == 0) { + /* Must be a control packet. The L2TP tid is further + * into the packet.
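The tid/sid extraction above is driven entirely by the first 16 bits of flags: the T bit selects control vs. data, the version nibble selects the v2 or v3 layout, and for v2 the optional Length field shifts the tid/sid offsets by two bytes. A minimal userspace parser over a raw byte buffer following the same logic (struct l2tp_fields, l2tp_parse() and the be16()/be32() helpers are invented for this sketch and have no relation to the in-kernel structures):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define L2TP_T_BIT  0x8000
#define L2TP_L_BIT  0x4000
#define L2TP_VER    0x000f

struct l2tp_fields {
        uint32_t tid, sid;
        uint8_t type;        /* 0 = data, 1 = control */
        uint8_t version;
};

static uint16_t be16(const uint8_t *p) { return (uint16_t)(p[0] << 8 | p[1]); }
static uint32_t be32(const uint8_t *p)
{
        return (uint32_t)p[0] << 24 | (uint32_t)p[1] << 16 |
               (uint32_t)p[2] << 8 | p[3];
}

/* Parse an L2TP-over-UDP header starting at buf[0]; returns false on
 * truncation or unknown version, mirroring l2tp_udp_mt() above. */
static bool l2tp_parse(const uint8_t *buf, size_t len, struct l2tp_fields *out)
{
        size_t offs = 0;
        uint16_t flags;

        if (len < 2)
                return false;
        flags = be16(buf);
        out->type = (flags & L2TP_T_BIT) ? 1 : 0;
        out->version = flags & L2TP_VER;
        out->tid = out->sid = 0;

        if (out->version == 3) {
                if (len < 8)
                        return false;
                if (out->type)
                        out->tid = be32(buf + 4); /* control connection ID */
                else
                        out->sid = be32(buf + 4); /* session ID */
                return true;
        }
        if (out->version == 2) {
                if (flags & L2TP_L_BIT)
                        offs += 2;               /* optional Length field */
                if (len < offs + 6)
                        return false;
                out->tid = be16(buf + offs + 2);
                out->sid = be16(buf + offs + 4);
                return true;
        }
        return false;
}

int main(void)
{
        /* L2TPv2 data packet: T=0, L=1, ver=2, length=14, tid=5, sid=7 */
        const uint8_t pkt[] = { 0x40, 0x02, 0x00, 0x0e, 0x00, 0x05, 0x00, 0x07 };
        struct l2tp_fields f;

        if (l2tp_parse(pkt, sizeof(pkt), &f))
                printf("v%u %s tid=%u sid=%u\n", (unsigned)f.version,
                       f.type ? "control" : "data",
                       (unsigned)f.tid, (unsigned)f.sid);
        return 0;
}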
+ */ + data.type = XT_L2TP_TYPE_CONTROL; + lh = skb_header_pointer(skb, thoff + 8, sizeof(lhbuf), + &lhbuf); + if (lh == NULL) + return false; + data.tid = ntohl(lh->val32); + } else { + data.sid = ntohl(lh->val32); + data.type = XT_L2TP_TYPE_DATA; + } + + data.version = 3; + + return l2tp_match(info, &data); +} + +static bool l2tp_mt4(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct iphdr *iph = ip_hdr(skb); + u8 ipproto = iph->protocol; + + /* l2tp_mt_check4 already restricts the transport protocol */ + switch (ipproto) { + case IPPROTO_UDP: + return l2tp_udp_mt(skb, par, par->thoff); + case IPPROTO_L2TP: + return l2tp_ip_mt(skb, par, par->thoff); + } + + return false; +} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static bool l2tp_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + unsigned int thoff = 0; + unsigned short fragoff = 0; + int ipproto; + + ipproto = ipv6_find_hdr(skb, &thoff, -1, &fragoff, NULL); + if (fragoff != 0) + return false; + + /* l2tp_mt_check6 already restricts the transport protocol */ + switch (ipproto) { + case IPPROTO_UDP: + return l2tp_udp_mt(skb, par, thoff); + case IPPROTO_L2TP: + return l2tp_ip_mt(skb, par, thoff); + } + + return false; +} +#endif + +static int l2tp_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_l2tp_info *info = par->matchinfo; + + /* Check for invalid flags */ + if (info->flags & ~(XT_L2TP_TID | XT_L2TP_SID | XT_L2TP_VERSION | + XT_L2TP_TYPE)) { + pr_info("unknown flags: %x\n", info->flags); + return -EINVAL; + } + + /* At least one of tid, sid or type=control must be specified */ + if ((!(info->flags & XT_L2TP_TID)) && + (!(info->flags & XT_L2TP_SID)) && + ((!(info->flags & XT_L2TP_TYPE)) || + (info->type != XT_L2TP_TYPE_CONTROL))) { + pr_info("invalid flags combination: %x\n", info->flags); + return -EINVAL; + } + + /* If version 2 is specified, check that incompatible params + * are not supplied + */ + if (info->flags & XT_L2TP_VERSION) { + if ((info->version < 2) || (info->version > 3)) { + pr_info("wrong L2TP version: %u\n", info->version); + return -EINVAL; + } + + if (info->version == 2) { + if ((info->flags & XT_L2TP_TID) && + (info->tid > 0xffff)) { + pr_info("v2 tid > 0xffff: %u\n", info->tid); + return -EINVAL; + } + if ((info->flags & XT_L2TP_SID) && + (info->sid > 0xffff)) { + pr_info("v2 sid > 0xffff: %u\n", info->sid); + return -EINVAL; + } + } + } + + return 0; +} + +static int l2tp_mt_check4(const struct xt_mtchk_param *par) +{ + const struct xt_l2tp_info *info = par->matchinfo; + const struct ipt_entry *e = par->entryinfo; + const struct ipt_ip *ip = &e->ip; + int ret; + + ret = l2tp_mt_check(par); + if (ret != 0) + return ret; + + if ((ip->proto != IPPROTO_UDP) && + (ip->proto != IPPROTO_L2TP)) { + pr_info("missing protocol rule (udp|l2tpip)\n"); + return -EINVAL; + } + + if ((ip->proto == IPPROTO_L2TP) && + (info->version == 2)) { + pr_info("v2 doesn't support IP mode\n"); + return -EINVAL; + } + + return 0; +} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static int l2tp_mt_check6(const struct xt_mtchk_param *par) +{ + const struct xt_l2tp_info *info = par->matchinfo; + const struct ip6t_entry *e = par->entryinfo; + const struct ip6t_ip6 *ip = &e->ipv6; + int ret; + + ret = l2tp_mt_check(par); + if (ret != 0) + return ret; + + if ((ip->proto != IPPROTO_UDP) && + (ip->proto != IPPROTO_L2TP)) { + pr_info("missing protocol rule (udp|l2tpip)\n"); + return -EINVAL; + } + + if ((ip->proto == IPPROTO_L2TP) && + (info->version == 2)) { + pr_info("v2 doesn't support IP mode\n"); + 
return -EINVAL; + } + + return 0; +} +#endif + +static struct xt_match l2tp_mt_reg[] __read_mostly = { + { + .name = "l2tp", + .revision = 0, + .family = NFPROTO_IPV4, + .match = l2tp_mt4, + .matchsize = XT_ALIGN(sizeof(struct xt_l2tp_info)), + .checkentry = l2tp_mt_check4, + .hooks = ((1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD)), + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "l2tp", + .revision = 0, + .family = NFPROTO_IPV6, + .match = l2tp_mt6, + .matchsize = XT_ALIGN(sizeof(struct xt_l2tp_info)), + .checkentry = l2tp_mt_check6, + .hooks = ((1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD)), + .me = THIS_MODULE, + }, +#endif +}; + +static int __init l2tp_mt_init(void) +{ + return xt_register_matches(&l2tp_mt_reg[0], ARRAY_SIZE(l2tp_mt_reg)); +} + +static void __exit l2tp_mt_exit(void) +{ + xt_unregister_matches(&l2tp_mt_reg[0], ARRAY_SIZE(l2tp_mt_reg)); +} + +module_init(l2tp_mt_init); +module_exit(l2tp_mt_exit); diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index b3be0ef21f1..8c646ed9c92 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -21,11 +21,14 @@ MODULE_ALIAS("ip6t_nfacct"); static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par) { + int overquota; const struct xt_nfacct_match_info *info = par->targinfo; nfnl_acct_update(skb, info->nfacct); - return true; + overquota = nfnl_acct_overquota(skb, info->nfacct); + + return overquota == NFACCT_UNDERQUOTA ? false : true; } static int diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index 7174611bd67..c529161cdbf 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -422,4 +422,6 @@ module_exit(xt_osf_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>"); MODULE_DESCRIPTION("Passive OS fingerprint matching."); +MODULE_ALIAS("ipt_osf"); +MODULE_ALIAS("ip6t_osf"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 1e657cf715c..a9faae89f95 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -313,10 +313,7 @@ out: static void recent_table_free(void *addr) { - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); + kvfree(addr); } static int recent_mt_check(const struct xt_mtchk_param *par, diff --git a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h index 6efe4e5a81c..8fd324116e6 100644 --- a/net/netfilter/xt_repldata.h +++ b/net/netfilter/xt_repldata.h @@ -5,23 +5,35 @@ * they serve as the hanging-off data accessed through repl.data[]. 
*/ +/* tbl has the following structure equivalent, but is C99 compliant: + * struct { + * struct type##_replace repl; + * struct type##_standard entries[nhooks]; + * struct type##_error term; + * } *tbl; + */ + #define xt_alloc_initial_table(type, typ2) ({ \ unsigned int hook_mask = info->valid_hooks; \ unsigned int nhooks = hweight32(hook_mask); \ unsigned int bytes = 0, hooknum = 0, i = 0; \ struct { \ struct type##_replace repl; \ - struct type##_standard entries[nhooks]; \ - struct type##_error term; \ - } *tbl = kzalloc(sizeof(*tbl), GFP_KERNEL); \ + struct type##_standard entries[]; \ + } *tbl; \ + struct type##_error *term; \ + size_t term_offset = (offsetof(typeof(*tbl), entries[nhooks]) + \ + __alignof__(*term) - 1) & ~(__alignof__(*term) - 1); \ + tbl = kzalloc(term_offset + sizeof(*term), GFP_KERNEL); \ if (tbl == NULL) \ return NULL; \ + term = (struct type##_error *)&(((char *)tbl)[term_offset]); \ strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \ - tbl->term = (struct type##_error)typ2##_ERROR_INIT; \ + *term = (struct type##_error)typ2##_ERROR_INIT; \ tbl->repl.valid_hooks = hook_mask; \ tbl->repl.num_entries = nhooks + 1; \ tbl->repl.size = nhooks * sizeof(struct type##_standard) + \ - sizeof(struct type##_error); \ + sizeof(struct type##_error); \ for (; hook_mask != 0; hook_mask >>= 1, ++hooknum) { \ if (!(hook_mask & 1)) \ continue; \ diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index 4fe4fb4276d..11de55e7a86 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -37,7 +37,7 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par) switch (info->mode) { case XT_STATISTIC_MODE_RANDOM: - if ((net_random() & 0x7FFFFFFF) < info->u.random.probability) + if ((prandom_u32() & 0x7FFFFFFF) < info->u.random.probability) ret = !ret; break; case XT_STATISTIC_MODE_NTH: diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index bca50b95c18..e6fac7e3db5 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -131,7 +131,7 @@ int netlink_add_tap(struct netlink_tap *nt) } EXPORT_SYMBOL_GPL(netlink_add_tap); -int __netlink_remove_tap(struct netlink_tap *nt) +static int __netlink_remove_tap(struct netlink_tap *nt) { bool found = false; struct netlink_tap *tmp; @@ -155,7 +155,6 @@ out: return found ? 0 : -ENODEV; } -EXPORT_SYMBOL_GPL(__netlink_remove_tap); int netlink_remove_tap(struct netlink_tap *nt) { @@ -204,6 +203,8 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb, if (nskb) { nskb->dev = dev; nskb->protocol = htons((u16) sk->sk_protocol); + nskb->pkt_type = netlink_is_kernel(sk) ? 
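The xt_repldata.h rework above exists because a variable-length array is not allowed as a non-final struct member in C99, so the terminator can no longer live inside the anonymous struct; instead its offset is computed by rounding the end of the flexible entries[] array up to the terminator's alignment and allocating one block. A standalone illustration of that layout computation (generic C; struct entry, struct terminator and struct table_head are invented for the sketch):

#include <stdalign.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct entry { int verdict; char pad[12]; };
struct terminator { long magic; };

struct table_head {
        int valid_hooks;
        struct entry entries[];          /* flexible array member, C99 */
};

int main(void)
{
        unsigned int nhooks = 3;
        struct table_head *tbl;
        struct terminator *term;
        size_t entries_end, term_offset;

        /* End of entries[nhooks], rounded up to the terminator's alignment,
         * gives the terminator's offset; allocate both in one block, the
         * same trick xt_alloc_initial_table() now uses. */
        entries_end = offsetof(struct table_head, entries) +
                      nhooks * sizeof(struct entry);
        term_offset = (entries_end + alignof(struct terminator) - 1) &
                      ~(alignof(struct terminator) - 1);

        tbl = calloc(1, term_offset + sizeof(*term));
        if (!tbl)
                return 1;

        term = (struct terminator *)((char *)tbl + term_offset);
        term->magic = 0x4c4f4c;
        tbl->valid_hooks = (1 << nhooks) - 1;

        printf("entries end at %zu, terminator placed at %zu\n",
               entries_end, term_offset);
        free(tbl);
        return 0;
}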
+ PACKET_KERNEL : PACKET_USER; ret = dev_queue_xmit(nskb); if (unlikely(ret > 0)) @@ -239,6 +240,13 @@ static void netlink_deliver_tap(struct sk_buff *skb) rcu_read_unlock(); } +static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src, + struct sk_buff *skb) +{ + if (!(netlink_is_kernel(dst) && netlink_is_kernel(src))) + netlink_deliver_tap(skb); +} + static void netlink_overrun(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); @@ -628,7 +636,7 @@ static unsigned int netlink_poll(struct file *file, struct socket *sock, while (nlk->cb_running && netlink_dump_space(nlk)) { err = netlink_dump(sk); if (err < 0) { - sk->sk_err = err; + sk->sk_err = -err; sk->sk_error_report(sk); break; } @@ -1198,7 +1206,8 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, struct module *module = NULL; struct mutex *cb_mutex; struct netlink_sock *nlk; - void (*bind)(int group); + int (*bind)(int group); + void (*unbind)(int group); int err = 0; sock->state = SS_UNCONNECTED; @@ -1224,6 +1233,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, err = -EPROTONOSUPPORT; cb_mutex = nl_table[protocol].cb_mutex; bind = nl_table[protocol].bind; + unbind = nl_table[protocol].unbind; netlink_unlock_table(); if (err < 0) @@ -1240,6 +1250,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, nlk = nlk_sk(sock->sk); nlk->module = module; nlk->netlink_bind = bind; + nlk->netlink_unbind = unbind; out: return err; @@ -1293,6 +1304,7 @@ static int netlink_release(struct socket *sock) kfree_rcu(old, rcu); nl_table[sk->sk_protocol].module = NULL; nl_table[sk->sk_protocol].bind = NULL; + nl_table[sk->sk_protocol].unbind = NULL; nl_table[sk->sk_protocol].flags = 0; nl_table[sk->sk_protocol].registered = 0; } @@ -1352,7 +1364,74 @@ retry: return err; } -static inline int netlink_capable(const struct socket *sock, unsigned int flag) +/** + * __netlink_ns_capable - General netlink message capability test + * @nsp: NETLINK_CB of the socket buffer holding a netlink command from userspace. + * @user_ns: The user namespace of the capability to use + * @cap: The capability to use + * + * Test to see if the opener of the socket we received the message + * from had the capability @cap when the netlink socket was created + * and the sender of the message has the capability @cap in the user + * namespace @user_ns. + */ +bool __netlink_ns_capable(const struct netlink_skb_parms *nsp, + struct user_namespace *user_ns, int cap) +{ + return ((nsp->flags & NETLINK_SKB_DST) || + file_ns_capable(nsp->sk->sk_socket->file, user_ns, cap)) && + ns_capable(user_ns, cap); +} +EXPORT_SYMBOL(__netlink_ns_capable); + +/** + * netlink_ns_capable - General netlink message capability test + * @skb: socket buffer holding a netlink command from userspace + * @user_ns: The user namespace of the capability to use + * @cap: The capability to use + * + * Test to see if the opener of the socket we received the message + * from had the capability @cap when the netlink socket was created + * and the sender of the message has the capability @cap in the user + * namespace @user_ns.
+ */ +bool netlink_ns_capable(const struct sk_buff *skb, + struct user_namespace *user_ns, int cap) +{ + return __netlink_ns_capable(&NETLINK_CB(skb), user_ns, cap); +} +EXPORT_SYMBOL(netlink_ns_capable); + +/** + * netlink_capable - Netlink global message capability test + * @skb: socket buffer holding a netlink command from userspace + * @cap: The capability to use + * + * Test to see if the opener of the socket we received the message + * from had the capability @cap when the netlink socket was created + * and the sender of the message has the capability @cap in all user + * namespaces. + */ +bool netlink_capable(const struct sk_buff *skb, int cap) +{ + return netlink_ns_capable(skb, &init_user_ns, cap); +} +EXPORT_SYMBOL(netlink_capable); + +/** + * netlink_net_capable - Netlink network namespace message capability test + * @skb: socket buffer holding a netlink command from userspace + * @cap: The capability to use + * + * Test to see if the opener of the socket we received the message + * from had the capability @cap when the netlink socket was created + * and the sender of the message has the capability @cap over the + * network namespace of the socket we received the message from. + */ +bool netlink_net_capable(const struct sk_buff *skb, int cap) +{ + return netlink_ns_capable(skb, sock_net(skb->sk)->user_ns, cap); +} +EXPORT_SYMBOL(netlink_net_capable); + +static inline int netlink_allowed(const struct socket *sock, unsigned int flag) { return (nl_table[sock->sk->sk_protocol].flags & flag) || ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN); @@ -1403,6 +1482,19 @@ static int netlink_realloc_groups(struct sock *sk) return err; } +static void netlink_unbind(int group, long unsigned int groups, + struct netlink_sock *nlk) +{ + int undo; + + if (!nlk->netlink_unbind) + return; + + for (undo = 0; undo < group; undo++) + if (test_bit(undo, &groups)) + nlk->netlink_unbind(undo); +} + static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { @@ -1411,6 +1503,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err; + long unsigned int groups = nladdr->nl_groups; if (addr_len < sizeof(struct sockaddr_nl)) return -EINVAL; @@ -1419,45 +1512,53 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, return -EINVAL; /* Only superuser is allowed to listen multicasts */ - if (nladdr->nl_groups) { - if (!netlink_capable(sock, NL_CFG_F_NONROOT_RECV)) + if (groups) { + if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) return err; } - if (nlk->portid) { + if (nlk->portid) if (nladdr->nl_pid != nlk->portid) return -EINVAL; - } else { + + if (nlk->netlink_bind && groups) { + int group; + + for (group = 0; group < nlk->ngroups; group++) { + if (!test_bit(group, &groups)) + continue; + err = nlk->netlink_bind(group); + if (!err) + continue; + netlink_unbind(group, groups, nlk); + return err; + } + } + + if (!nlk->portid) { + err = nladdr->nl_pid ?
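The new capability helpers attribute the permission check to the socket that actually sent the netlink request rather than to whichever task happens to process it. In a message handler the intended use is a single gate before acting on the request; the handler below is a hypothetical sketch (its name and body are illustrative, only the netlink_net_capable() call itself is the helper added above):

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/netlink.h>
#include <linux/skbuff.h>

/* Hypothetical handler; only the netlink_net_capable() check mirrors
 * the helper introduced above. */
static int foo_doit(struct sk_buff *skb, struct nlmsghdr *nlh)
{
        if (!netlink_net_capable(skb, CAP_NET_ADMIN))
                return -EPERM;

        /* ... parse attributes from nlh and apply the request ... */
        return 0;
}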
netlink_insert(sk, net, nladdr->nl_pid) : netlink_autobind(sock); - if (err) + if (err) { + netlink_unbind(nlk->ngroups - 1, groups, nlk); return err; + } } - if (!nladdr->nl_groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) + if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) return 0; netlink_table_grab(); netlink_update_subscriptions(sk, nlk->subscriptions + - hweight32(nladdr->nl_groups) - + hweight32(groups) - hweight32(nlk->groups[0])); - nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups; + nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups; netlink_update_listeners(sk); netlink_table_ungrab(); - if (nlk->netlink_bind && nlk->groups[0]) { - int i; - - for (i=0; i<nlk->ngroups; i++) { - if (test_bit(i, nlk->groups)) - nlk->netlink_bind(i); - } - } - return 0; } @@ -1481,8 +1582,8 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, if (addr->sa_family != AF_NETLINK) return -EINVAL; - /* Only superuser is allowed to send multicasts */ - if (nladdr->nl_groups && !netlink_capable(sock, NL_CFG_F_NONROOT_SEND)) + if ((nladdr->nl_groups || nladdr->nl_pid) && + !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) return -EPERM; if (!nlk->portid) @@ -1645,7 +1746,7 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) else #endif /* CONFIG_NETLINK_MMAP */ skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, len); + sk->sk_data_ready(sk); return len; } @@ -1697,14 +1798,10 @@ static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, ret = -ECONNREFUSED; if (nlk->netlink_rcv != NULL) { - /* We could do a netlink_deliver_tap(skb) here as well - * but since this is intended for the kernel only, we - * should rather let it stay under the hood. - */ - ret = skb->len; netlink_skb_set_owner_r(skb, sk); NETLINK_CB(skb).sk = ssk; + netlink_deliver_tap_kernel(sk, ssk, skb); nlk->netlink_rcv(skb); consume_skb(skb); } else { @@ -1769,6 +1866,9 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, if (ring->pg_vec == NULL) goto out_put; + if (ring->frame_size - NL_MMAP_HDRLEN < size) + goto out_put; + skb = alloc_skb_head(gfp_mask); if (skb == NULL) goto err1; @@ -1778,6 +1878,7 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, if (ring->pg_vec == NULL) goto out_free; + /* check again under lock */ maxlen = ring->frame_size - NL_MMAP_HDRLEN; if (maxlen < size) goto out_free; @@ -2088,20 +2189,24 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, break; case NETLINK_ADD_MEMBERSHIP: case NETLINK_DROP_MEMBERSHIP: { - if (!netlink_capable(sock, NL_CFG_F_NONROOT_RECV)) + if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) return err; if (!val || val - 1 >= nlk->ngroups) return -EINVAL; + if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) { + err = nlk->netlink_bind(val); + if (err) + return err; + } netlink_table_grab(); netlink_update_socket_mc(nlk, val, optname == NETLINK_ADD_MEMBERSHIP); netlink_table_ungrab(); - - if (nlk->netlink_bind) - nlk->netlink_bind(val); + if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind) + nlk->netlink_unbind(val); err = 0; break; @@ -2214,12 +2319,13 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, struct sock_iocb *siocb = kiocb_to_siocb(kiocb); struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); - struct sockaddr_nl *addr = msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_nl *, addr, 
msg->msg_name); u32 dst_portid; u32 dst_group; struct sk_buff *skb; int err; struct scm_cookie scm; + u32 netlink_skb_flags = 0; if (msg->msg_flags&MSG_OOB) return -EOPNOTSUPP; @@ -2239,8 +2345,9 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, dst_group = ffs(addr->nl_groups); err = -EPERM; if ((dst_group || dst_portid) && - !netlink_capable(sock, NL_CFG_F_NONROOT_SEND)) + !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) goto out; + netlink_skb_flags |= NETLINK_SKB_DST; } else { dst_portid = nlk->dst_portid; dst_group = nlk->dst_group; @@ -2270,6 +2377,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, NETLINK_CB(skb).portid = nlk->portid; NETLINK_CB(skb).dst_group = dst_group; NETLINK_CB(skb).creds = siocb->scm->creds; + NETLINK_CB(skb).flags = netlink_skb_flags; err = -EFAULT; if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { @@ -2335,6 +2443,11 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, } #endif + /* Record the max length of recvmsg() calls for future allocations */ + nlk->max_recvmsg_len = max(nlk->max_recvmsg_len, len); + nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len, + 16384); + copied = data_skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; @@ -2345,7 +2458,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied); if (msg->msg_name) { - struct sockaddr_nl *addr = (struct sockaddr_nl *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); addr->nl_family = AF_NETLINK; addr->nl_pad = 0; addr->nl_pid = NETLINK_CB(skb).portid; @@ -2370,7 +2483,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { ret = netlink_dump(sk); if (ret) { - sk->sk_err = ret; + sk->sk_err = -ret; sk->sk_error_report(sk); } } @@ -2381,7 +2494,7 @@ out: return err ? : copied; } -static void netlink_data_ready(struct sock *sk, int len) +static void netlink_data_ready(struct sock *sk) { BUG(); } @@ -2535,28 +2648,13 @@ void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group) netlink_update_socket_mc(nlk_sk(sk), group, 0); } -/** - * netlink_clear_multicast_users - kick off multicast listeners - * - * This function removes all listeners from the given group. - * @ksk: The kernel netlink socket, as returned by - * netlink_kernel_create(). - * @group: The multicast group to clear. - */ -void netlink_clear_multicast_users(struct sock *ksk, unsigned int group) -{ - netlink_table_grab(); - __netlink_clear_multicast_users(ksk, group); - netlink_table_ungrab(); -} - struct nlmsghdr * __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags) { struct nlmsghdr *nlh; int size = nlmsg_msg_size(len); - nlh = (struct nlmsghdr*)skb_put(skb, NLMSG_ALIGN(size)); + nlh = (struct nlmsghdr *)skb_put(skb, NLMSG_ALIGN(size)); nlh->nlmsg_type = type; nlh->nlmsg_len = size; nlh->nlmsg_flags = flags; @@ -2594,7 +2692,27 @@ static int netlink_dump(struct sock *sk) if (!netlink_rx_is_mmaped(sk) && atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) goto errout_skb; - skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, GFP_KERNEL); + + /* NLMSG_GOODSIZE is small to avoid high order allocations being + * required, but it makes sense to _attempt_ a 16K bytes allocation + * to reduce number of system calls on dump operations, if user + * ever provided a big enough buffer. 
+ */ + if (alloc_size < nlk->max_recvmsg_len) { + skb = netlink_alloc_skb(sk, + nlk->max_recvmsg_len, + nlk->portid, + GFP_KERNEL | + __GFP_NOWARN | + __GFP_NORETRY); + /* available room should be exact amount to avoid MSG_TRUNC */ + if (skb) + skb_reserve(skb, skb_tailroom(skb) - + nlk->max_recvmsg_len); + } + if (!skb) + skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, + GFP_KERNEL); if (!skb) goto errout_skb; netlink_skb_set_owner_r(skb, sk); diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index acbd774eeb7..0b59d441f5b 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -31,13 +31,15 @@ struct netlink_sock { u32 ngroups; unsigned long *groups; unsigned long state; + size_t max_recvmsg_len; wait_queue_head_t wait; bool cb_running; struct netlink_callback cb; struct mutex *cb_mutex; struct mutex cb_def_mutex; void (*netlink_rcv)(struct sk_buff *skb); - void (*netlink_bind)(int group); + int (*netlink_bind)(int group); + void (*netlink_unbind)(int group); struct module *module; #ifdef CONFIG_NETLINK_MMAP struct mutex pg_vec_lock; @@ -73,7 +75,8 @@ struct netlink_table { unsigned int groups; struct mutex *cb_mutex; struct module *module; - void (*bind)(int group); + int (*bind)(int group); + void (*unbind)(int group); bool (*compare)(struct net *net, struct sock *sock); int registered; }; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 713671ae45a..76393f2f4b2 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -317,7 +317,7 @@ static void genl_unregister_mc_groups(struct genl_family *family) } } -static int genl_validate_ops(struct genl_family *family) +static int genl_validate_ops(const struct genl_family *family) { const struct genl_ops *ops = family->ops; unsigned int n_ops = family->n_ops; @@ -337,10 +337,6 @@ static int genl_validate_ops(struct genl_family *family) return -EINVAL; } - /* family is not registered yet, so no locking needed */ - family->ops = ops; - family->n_ops = n_ops; - return 0; } @@ -461,6 +457,26 @@ int genl_unregister_family(struct genl_family *family) EXPORT_SYMBOL(genl_unregister_family); /** + * genlmsg_new_unicast - Allocate generic netlink message for unicast + * @payload: size of the message payload + * @info: information on destination + * @flags: the type of memory to allocate + * + * Allocates a new sk_buff large enough to cover the specified payload + * plus required Netlink headers. Will check receiving socket for + * memory mapped i/o capability and use it if enabled. Will fall back + * to non-mapped skb if message size exceeds the frame size of the ring. 
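The netlink_dump() change above is an instance of a common pattern: opportunistically try a large allocation with flags that forbid warnings and aggressive reclaim, then fall back to a safe small size. Outside the kernel the same shape looks roughly like this (illustrative userspace sketch; malloc has no direct equivalent of __GFP_NOWARN or __GFP_NORETRY, so the fallback simply triggers on failure):

#include <stdio.h>
#include <stdlib.h>

/* Try the preferred (large) size first and fall back to a conservative
 * default on failure, reporting which size was obtained. */
static void *alloc_with_fallback(size_t preferred, size_t fallback, size_t *got)
{
        void *p = malloc(preferred);

        if (p) {
                *got = preferred;
                return p;
        }
        p = malloc(fallback);
        *got = p ? fallback : 0;
        return p;
}

int main(void)
{
        size_t got;
        void *buf = alloc_with_fallback(16384, 4096, &got);

        if (!buf)
                return 1;
        printf("allocated %zu bytes\n", got);
        free(buf);
        return 0;
}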
+ */ +struct sk_buff *genlmsg_new_unicast(size_t payload, struct genl_info *info, + gfp_t flags) +{ + size_t len = nlmsg_total_size(genlmsg_total_size(payload)); + + return netlink_alloc_skb(info->dst_sk, len, info->snd_portid, flags); +} +EXPORT_SYMBOL_GPL(genlmsg_new_unicast); + +/** * genlmsg_put - Add generic netlink header to netlink message * @skb: socket buffer holding the message * @portid: netlink portid the message is addressed to @@ -541,7 +557,7 @@ static int genl_family_rcv_msg(struct genl_family *family, return -EOPNOTSUPP; if ((ops->flags & GENL_ADMIN_PERM) && - !capable(CAP_NET_ADMIN)) + !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) { @@ -600,6 +616,7 @@ static int genl_family_rcv_msg(struct genl_family *family, info.genlhdr = nlmsg_data(nlh); info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; info.attrs = attrbuf; + info.dst_sk = skb->sk; genl_info_net_set(&info, net); memset(&info.user_ptr, 0, sizeof(info.user_ptr)); diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 53c19a35fc6..ede50d197e1 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1011,7 +1011,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) skb_queue_head(&sk->sk_receive_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); bh_unlock_sock(sk); @@ -1028,7 +1028,7 @@ static int nr_sendmsg(struct kiocb *iocb, struct socket *sock, { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); - struct sockaddr_ax25 *usax = (struct sockaddr_ax25 *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_ax25 *, usax, msg->msg_name); int err; struct sockaddr_ax25 sax; struct sk_buff *skb; @@ -1137,7 +1137,7 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; - struct sockaddr_ax25 *sax = (struct sockaddr_ax25 *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_ax25 *, sax, msg->msg_name); size_t copied; struct sk_buff *skb; int er; diff --git a/net/nfc/af_nfc.c b/net/nfc/af_nfc.c index 9d68441e2a5..2277276f52b 100644 --- a/net/nfc/af_nfc.c +++ b/net/nfc/af_nfc.c @@ -16,9 +16,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include <linux/nfc.h> diff --git a/net/nfc/core.c b/net/nfc/core.c index 872529105ab..819b87702b7 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -16,9 +16,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ @@ -135,11 +133,8 @@ int nfc_dev_up(struct nfc_dev *dev) dev->dev_up = true; /* We have to enable the device before discovering SEs */ - if (dev->ops->discover_se) { - rc = dev->ops->discover_se(dev); - if (rc) - pr_warn("SE discovery failed\n"); - } + if (dev->ops->discover_se && dev->ops->discover_se(dev)) + pr_err("SE discovery failed\n"); error: device_unlock(&dev->dev); @@ -285,9 +280,6 @@ static struct nfc_target *nfc_find_target(struct nfc_dev *dev, u32 target_idx) { int i; - if (dev->n_targets == 0) - return NULL; - for (i = 0; i < dev->n_targets; i++) { if (dev->targets[i].idx == target_idx) return &dev->targets[i]; @@ -384,7 +376,7 @@ int nfc_dep_link_is_up(struct nfc_dev *dev, u32 target_idx, { dev->dep_link_up = true; - if (!dev->active_target) { + if (!dev->active_target && rf_mode == NFC_RF_INITIATOR) { struct nfc_target *target; target = nfc_find_target(dev, target_idx); @@ -551,9 +543,9 @@ error: struct nfc_se *nfc_find_se(struct nfc_dev *dev, u32 se_idx) { - struct nfc_se *se, *n; + struct nfc_se *se; - list_for_each_entry_safe(se, n, &dev->secure_elements, list) + list_for_each_entry(se, &dev->secure_elements, list) if (se->idx == se_idx) return se; @@ -660,9 +652,6 @@ int nfc_set_remote_general_bytes(struct nfc_dev *dev, u8 *gb, u8 gb_len) { pr_debug("dev_name=%s gb_len=%d\n", dev_name(&dev->dev), gb_len); - if (gb_len > NFC_MAX_GT_LEN) - return -EINVAL; - return nfc_llcp_set_remote_gb(dev, gb, gb_len); } EXPORT_SYMBOL(nfc_set_remote_general_bytes); diff --git a/net/nfc/digital.h b/net/nfc/digital.h index 08b29b55ea6..71ad7eefddd 100644 --- a/net/nfc/digital.h +++ b/net/nfc/digital.h @@ -71,7 +71,14 @@ static inline int digital_in_send_cmd(struct nfc_digital_dev *ddev, void digital_poll_next_tech(struct nfc_digital_dev *ddev); int digital_in_send_sens_req(struct nfc_digital_dev *ddev, u8 rf_tech); +int digital_in_send_sensb_req(struct nfc_digital_dev *ddev, u8 rf_tech); int digital_in_send_sensf_req(struct nfc_digital_dev *ddev, u8 rf_tech); +int digital_in_send_iso15693_inv_req(struct nfc_digital_dev *ddev, u8 rf_tech); + +int digital_in_iso_dep_pull_sod(struct nfc_digital_dev *ddev, + struct sk_buff *skb); +int digital_in_iso_dep_push_sod(struct nfc_digital_dev *ddev, + struct sk_buff *skb); int digital_target_found(struct nfc_digital_dev *ddev, struct nfc_target *target, u8 protocol); diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index 09fc9543995..a6ce3c627e4 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -22,9 +22,13 @@ #define DIGITAL_PROTO_NFCA_RF_TECH \ (NFC_PROTO_JEWEL_MASK | NFC_PROTO_MIFARE_MASK | NFC_PROTO_NFC_DEP_MASK) +#define DIGITAL_PROTO_NFCB_RF_TECH NFC_PROTO_ISO14443_B_MASK + #define DIGITAL_PROTO_NFCF_RF_TECH \ (NFC_PROTO_FELICA_MASK | NFC_PROTO_NFC_DEP_MASK) +#define DIGITAL_PROTO_ISO15693_RF_TECH NFC_PROTO_ISO15693_MASK + struct digital_cmd { struct list_head queue; @@ -331,6 +335,24 @@ int digital_target_found(struct nfc_digital_dev *ddev, } break; + case NFC_PROTO_ISO15693: + framing = NFC_DIGITAL_FRAMING_ISO15693_T5T; + check_crc = digital_skb_check_crc_b; + add_crc = digital_skb_add_crc_b; + break; + + case NFC_PROTO_ISO14443: + framing = NFC_DIGITAL_FRAMING_NFCA_T4T; + check_crc = digital_skb_check_crc_a; + add_crc = digital_skb_add_crc_a; + break; + + case NFC_PROTO_ISO14443_B: + framing = NFC_DIGITAL_FRAMING_NFCB_T4T; + check_crc = digital_skb_check_crc_b; + add_crc = digital_skb_add_crc_b; + break; + default: pr_err("Invalid protocol %d\n", 
protocol); return -EINVAL; @@ -339,7 +361,6 @@ int digital_target_found(struct nfc_digital_dev *ddev, pr_debug("rf_tech=%d, protocol=%d\n", rf_tech, protocol); ddev->curr_rf_tech = rf_tech; - ddev->curr_protocol = protocol; if (DIGITAL_DRV_CAPS_IN_CRC(ddev)) { ddev->skb_add_crc = digital_skb_add_crc_none; @@ -365,6 +386,8 @@ int digital_target_found(struct nfc_digital_dev *ddev, void digital_poll_next_tech(struct nfc_digital_dev *ddev) { + u8 rand_mod; + digital_switch_rf(ddev, 0); mutex_lock(&ddev->poll_lock); @@ -374,8 +397,8 @@ void digital_poll_next_tech(struct nfc_digital_dev *ddev) return; } - ddev->poll_tech_index = (ddev->poll_tech_index + 1) % - ddev->poll_tech_count; + get_random_bytes(&rand_mod, sizeof(rand_mod)); + ddev->poll_tech_index = rand_mod % ddev->poll_tech_count; mutex_unlock(&ddev->poll_lock); @@ -462,7 +485,11 @@ static int digital_start_poll(struct nfc_dev *nfc_dev, __u32 im_protocols, digital_add_poll_tech(ddev, NFC_DIGITAL_RF_TECH_106A, digital_in_send_sens_req); - if (im_protocols & DIGITAL_PROTO_NFCF_RF_TECH) { + if (matching_im_protocols & DIGITAL_PROTO_NFCB_RF_TECH) + digital_add_poll_tech(ddev, NFC_DIGITAL_RF_TECH_106B, + digital_in_send_sensb_req); + + if (matching_im_protocols & DIGITAL_PROTO_NFCF_RF_TECH) { digital_add_poll_tech(ddev, NFC_DIGITAL_RF_TECH_212F, digital_in_send_sensf_req); @@ -470,7 +497,11 @@ static int digital_start_poll(struct nfc_dev *nfc_dev, __u32 im_protocols, digital_in_send_sensf_req); } - if (tm_protocols & NFC_PROTO_NFC_DEP_MASK) { + if (matching_im_protocols & DIGITAL_PROTO_ISO15693_RF_TECH) + digital_add_poll_tech(ddev, NFC_DIGITAL_RF_TECH_ISO15693, + digital_in_send_iso15693_inv_req); + + if (matching_tm_protocols & NFC_PROTO_NFC_DEP_MASK) { if (ddev->ops->tg_listen_mdaa) { digital_add_poll_tech(ddev, 0, digital_tg_listen_mdaa); @@ -541,8 +572,14 @@ static int digital_dep_link_up(struct nfc_dev *nfc_dev, __u8 comm_mode, __u8 *gb, size_t gb_len) { struct nfc_digital_dev *ddev = nfc_get_drvdata(nfc_dev); + int rc; + + rc = digital_in_send_atr_req(ddev, target, comm_mode, gb, gb_len); + + if (!rc) + ddev->curr_protocol = NFC_PROTO_NFC_DEP; - return digital_in_send_atr_req(ddev, target, comm_mode, gb, gb_len); + return rc; } static int digital_dep_link_down(struct nfc_dev *nfc_dev) @@ -557,6 +594,20 @@ static int digital_dep_link_down(struct nfc_dev *nfc_dev) static int digital_activate_target(struct nfc_dev *nfc_dev, struct nfc_target *target, __u32 protocol) { + struct nfc_digital_dev *ddev = nfc_get_drvdata(nfc_dev); + + if (ddev->poll_tech_count) { + pr_err("Can't activate a target while polling\n"); + return -EBUSY; + } + + if (ddev->curr_protocol) { + pr_err("A target is already active\n"); + return -EBUSY; + } + + ddev->curr_protocol = protocol; + return 0; } @@ -565,6 +616,11 @@ static void digital_deactivate_target(struct nfc_dev *nfc_dev, { struct nfc_digital_dev *ddev = nfc_get_drvdata(nfc_dev); + if (!ddev->curr_protocol) { + pr_err("No active target\n"); + return; + } + ddev->curr_protocol = 0; } @@ -583,20 +639,31 @@ static void digital_in_send_complete(struct nfc_digital_dev *ddev, void *arg, if (IS_ERR(resp)) { rc = PTR_ERR(resp); + resp = NULL; goto done; } - if (ddev->curr_protocol == NFC_PROTO_MIFARE) + if (ddev->curr_protocol == NFC_PROTO_MIFARE) { rc = digital_in_recv_mifare_res(resp); - else - rc = ddev->skb_check_crc(resp); + /* crc check is done in digital_in_recv_mifare_res() */ + goto done; + } + if ((ddev->curr_protocol == NFC_PROTO_ISO14443) || + (ddev->curr_protocol == NFC_PROTO_ISO14443_B)) { + rc = 
digital_in_iso_dep_pull_sod(ddev, resp); + if (rc) + goto done; + } + + rc = ddev->skb_check_crc(resp); + +done: if (rc) { kfree_skb(resp); resp = NULL; } -done: data_exch->cb(data_exch->cb_context, resp, rc); kfree(data_exch); @@ -608,6 +675,7 @@ static int digital_in_send(struct nfc_dev *nfc_dev, struct nfc_target *target, { struct nfc_digital_dev *ddev = nfc_get_drvdata(nfc_dev); struct digital_data_exch *data_exch; + int rc; data_exch = kzalloc(sizeof(struct digital_data_exch), GFP_KERNEL); if (!data_exch) { @@ -618,13 +686,28 @@ static int digital_in_send(struct nfc_dev *nfc_dev, struct nfc_target *target, data_exch->cb = cb; data_exch->cb_context = cb_context; - if (ddev->curr_protocol == NFC_PROTO_NFC_DEP) - return digital_in_send_dep_req(ddev, target, skb, data_exch); + if (ddev->curr_protocol == NFC_PROTO_NFC_DEP) { + rc = digital_in_send_dep_req(ddev, target, skb, data_exch); + goto exit; + } + + if ((ddev->curr_protocol == NFC_PROTO_ISO14443) || + (ddev->curr_protocol == NFC_PROTO_ISO14443_B)) { + rc = digital_in_iso_dep_push_sod(ddev, skb); + if (rc) + goto exit; + } ddev->skb_add_crc(skb); - return digital_in_send_cmd(ddev, skb, 500, digital_in_send_complete, - data_exch); + rc = digital_in_send_cmd(ddev, skb, 500, digital_in_send_complete, + data_exch); + +exit: + if (rc) + kfree(data_exch); + + return rc; } static struct nfc_ops digital_nfc_ops = { @@ -676,6 +759,12 @@ struct nfc_digital_dev *nfc_digital_allocate_device(struct nfc_digital_ops *ops, ddev->protocols |= NFC_PROTO_FELICA_MASK; if (supported_protocols & NFC_PROTO_NFC_DEP_MASK) ddev->protocols |= NFC_PROTO_NFC_DEP_MASK; + if (supported_protocols & NFC_PROTO_ISO15693_MASK) + ddev->protocols |= NFC_PROTO_ISO15693_MASK; + if (supported_protocols & NFC_PROTO_ISO14443_MASK) + ddev->protocols |= NFC_PROTO_ISO14443_MASK; + if (supported_protocols & NFC_PROTO_ISO14443_B_MASK) + ddev->protocols |= NFC_PROTO_ISO14443_B_MASK; ddev->tx_headroom = tx_headroom + DIGITAL_MAX_HEADER_LEN; ddev->tx_tailroom = tx_tailroom + DIGITAL_CRC_LEN; diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c index 07bbc24fb4c..171cb9949ab 100644 --- a/net/nfc/digital_dep.c +++ b/net/nfc/digital_dep.c @@ -32,7 +32,6 @@ #define DIGITAL_ATR_REQ_MIN_SIZE 16 #define DIGITAL_ATR_REQ_MAX_SIZE 64 -#define DIGITAL_NFCID3_LEN ((u8)8) #define DIGITAL_LR_BITS_PAYLOAD_SIZE_254B 0x30 #define DIGITAL_GB_BIT 0x02 @@ -206,10 +205,9 @@ int digital_in_send_atr_req(struct nfc_digital_dev *ddev, atr_req->dir = DIGITAL_NFC_DEP_FRAME_DIR_OUT; atr_req->cmd = DIGITAL_CMD_ATR_REQ; if (target->nfcid2_len) - memcpy(atr_req->nfcid3, target->nfcid2, - max(target->nfcid2_len, DIGITAL_NFCID3_LEN)); + memcpy(atr_req->nfcid3, target->nfcid2, NFC_NFCID2_MAXSIZE); else - get_random_bytes(atr_req->nfcid3, DIGITAL_NFCID3_LEN); + get_random_bytes(atr_req->nfcid3, NFC_NFCID3_MAXSIZE); atr_req->did = 0; atr_req->bs = 0; @@ -226,9 +224,8 @@ int digital_in_send_atr_req(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - digital_in_send_cmd(ddev, skb, 500, digital_in_recv_atr_res, target); - - return 0; + return digital_in_send_cmd(ddev, skb, 500, digital_in_recv_atr_res, + target); } static int digital_in_send_rtox(struct nfc_digital_dev *ddev, @@ -382,6 +379,33 @@ int digital_in_send_dep_req(struct nfc_digital_dev *ddev, data_exch); } +static void digital_tg_set_rf_tech(struct nfc_digital_dev *ddev, u8 rf_tech) +{ + ddev->curr_rf_tech = rf_tech; + + ddev->skb_add_crc = digital_skb_add_crc_none; + ddev->skb_check_crc = digital_skb_check_crc_none; + + if 
(DIGITAL_DRV_CAPS_TG_CRC(ddev)) + return; + + switch (ddev->curr_rf_tech) { + case NFC_DIGITAL_RF_TECH_106A: + ddev->skb_add_crc = digital_skb_add_crc_a; + ddev->skb_check_crc = digital_skb_check_crc_a; + break; + + case NFC_DIGITAL_RF_TECH_212F: + case NFC_DIGITAL_RF_TECH_424F: + ddev->skb_add_crc = digital_skb_add_crc_f; + ddev->skb_check_crc = digital_skb_check_crc_f; + break; + + default: + break; + } +} + static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg, struct sk_buff *resp) { @@ -472,11 +496,13 @@ int digital_tg_send_dep_res(struct nfc_digital_dev *ddev, struct sk_buff *skb) static void digital_tg_send_psl_res_complete(struct nfc_digital_dev *ddev, void *arg, struct sk_buff *resp) { - u8 rf_tech = PTR_ERR(arg); + u8 rf_tech = (unsigned long)arg; if (IS_ERR(resp)) return; + digital_tg_set_rf_tech(ddev, rf_tech); + digital_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_RF_TECH, rf_tech); digital_tg_listen(ddev, 1500, digital_tg_recv_dep_req, NULL); @@ -508,7 +534,7 @@ static int digital_tg_send_psl_res(struct nfc_digital_dev *ddev, u8 did, ddev->skb_add_crc(skb); rc = digital_tg_send_cmd(ddev, skb, 0, digital_tg_send_psl_res_complete, - ERR_PTR(rf_tech)); + (void *)(unsigned long)rf_tech); if (rc) kfree_skb(skb); @@ -563,7 +589,7 @@ static void digital_tg_recv_psl_req(struct nfc_digital_dev *ddev, void *arg, rf_tech = NFC_DIGITAL_RF_TECH_424F; break; default: - pr_err("Unsuported dsi value %d\n", dsi); + pr_err("Unsupported dsi value %d\n", dsi); goto exit; } @@ -661,16 +687,10 @@ void digital_tg_recv_atr_req(struct nfc_digital_dev *ddev, void *arg, if (resp->data[0] == DIGITAL_NFC_DEP_NFCA_SOD_SB) { min_size = DIGITAL_ATR_REQ_MIN_SIZE + 2; - - ddev->curr_rf_tech = NFC_DIGITAL_RF_TECH_106A; - ddev->skb_add_crc = digital_skb_add_crc_a; - ddev->skb_check_crc = digital_skb_check_crc_a; + digital_tg_set_rf_tech(ddev, NFC_DIGITAL_RF_TECH_106A); } else { min_size = DIGITAL_ATR_REQ_MIN_SIZE + 1; - - ddev->curr_rf_tech = NFC_DIGITAL_RF_TECH_212F; - ddev->skb_add_crc = digital_skb_add_crc_f; - ddev->skb_check_crc = digital_skb_check_crc_f; + digital_tg_set_rf_tech(ddev, NFC_DIGITAL_RF_TECH_212F); } if (resp->len < min_size) { @@ -678,10 +698,7 @@ void digital_tg_recv_atr_req(struct nfc_digital_dev *ddev, void *arg, goto exit; } - if (DIGITAL_DRV_CAPS_TG_CRC(ddev)) { - ddev->skb_add_crc = digital_skb_add_crc_none; - ddev->skb_check_crc = digital_skb_check_crc_none; - } + ddev->curr_protocol = NFC_PROTO_NFC_DEP_MASK; rc = ddev->skb_check_crc(resp); if (rc) { diff --git a/net/nfc/digital_technology.c b/net/nfc/digital_technology.c index 251c8c753eb..c2c1c0189b7 100644 --- a/net/nfc/digital_technology.c +++ b/net/nfc/digital_technology.c @@ -30,6 +30,7 @@ #define DIGITAL_SEL_RES_NFCID1_COMPLETE(sel_res) (!((sel_res) & 0x04)) #define DIGITAL_SEL_RES_IS_T2T(sel_res) (!((sel_res) & 0x60)) +#define DIGITAL_SEL_RES_IS_T4T(sel_res) ((sel_res) & 0x20) #define DIGITAL_SEL_RES_IS_NFC_DEP(sel_res) ((sel_res) & 0x40) #define DIGITAL_SENS_RES_IS_T1T(sens_res) (((sens_res) & 0x0C00) == 0x0C00) @@ -40,6 +41,24 @@ #define DIGITAL_MIFARE_READ_RES_LEN 16 #define DIGITAL_MIFARE_ACK_RES 0x0A +#define DIGITAL_CMD_SENSB_REQ 0x05 +#define DIGITAL_SENSB_ADVANCED BIT(5) +#define DIGITAL_SENSB_EXTENDED BIT(4) +#define DIGITAL_SENSB_ALLB_REQ BIT(3) +#define DIGITAL_SENSB_N(n) ((n) & 0x7) + +#define DIGITAL_CMD_SENSB_RES 0x50 + +#define DIGITAL_CMD_ATTRIB_REQ 0x1D +#define DIGITAL_ATTRIB_P1_TR0_DEFAULT (0x0 << 6) +#define DIGITAL_ATTRIB_P1_TR1_DEFAULT (0x0 << 4) +#define DIGITAL_ATTRIB_P1_SUPRESS_EOS 
BIT(3) +#define DIGITAL_ATTRIB_P1_SUPRESS_SOS BIT(2) +#define DIGITAL_ATTRIB_P2_LISTEN_POLL_1 (0x0 << 6) +#define DIGITAL_ATTRIB_P2_POLL_LISTEN_1 (0x0 << 4) +#define DIGITAL_ATTRIB_P2_MAX_FRAME_256 0x8 +#define DIGITAL_ATTRIB_P4_DID(n) ((n) & 0xf) + #define DIGITAL_CMD_SENSF_REQ 0x00 #define DIGITAL_CMD_SENSF_RES 0x01 @@ -51,6 +70,35 @@ #define DIGITAL_SENSF_REQ_RC_SC 1 #define DIGITAL_SENSF_REQ_RC_AP 2 +#define DIGITAL_CMD_ISO15693_INVENTORY_REQ 0x01 + +#define DIGITAL_ISO15693_REQ_FLAG_DATA_RATE BIT(1) +#define DIGITAL_ISO15693_REQ_FLAG_INVENTORY BIT(2) +#define DIGITAL_ISO15693_REQ_FLAG_NB_SLOTS BIT(5) +#define DIGITAL_ISO15693_RES_FLAG_ERROR BIT(0) +#define DIGITAL_ISO15693_RES_IS_VALID(flags) \ + (!((flags) & DIGITAL_ISO15693_RES_FLAG_ERROR)) + +#define DIGITAL_ISO_DEP_I_PCB 0x02 +#define DIGITAL_ISO_DEP_PNI(pni) ((pni) & 0x01) + +#define DIGITAL_ISO_DEP_PCB_TYPE(pcb) ((pcb) & 0xC0) + +#define DIGITAL_ISO_DEP_I_BLOCK 0x00 + +#define DIGITAL_ISO_DEP_BLOCK_HAS_DID(pcb) ((pcb) & 0x08) + +static const u8 digital_ats_fsc[] = { + 16, 24, 32, 40, 48, 64, 96, 128, +}; + +#define DIGITAL_ATS_FSCI(t0) ((t0) & 0x0F) +#define DIGITAL_SENSB_FSCI(pi2) (((pi2) & 0xF0) >> 4) +#define DIGITAL_ATS_MAX_FSC 256 + +#define DIGITAL_RATS_BYTE1 0xE0 +#define DIGITAL_RATS_PARAM 0x80 + struct digital_sdd_res { u8 nfcid1[4]; u8 bcc; @@ -63,6 +111,32 @@ struct digital_sel_req { u8 bcc; } __packed; +struct digital_sensb_req { + u8 cmd; + u8 afi; + u8 param; +} __packed; + +struct digital_sensb_res { + u8 cmd; + u8 nfcid0[4]; + u8 app_data[4]; + u8 proto_info[3]; +} __packed; + +struct digital_attrib_req { + u8 cmd; + u8 nfcid0[4]; + u8 param1; + u8 param2; + u8 param3; + u8 param4; +} __packed; + +struct digital_attrib_res { + u8 mbli_did; +} __packed; + struct digital_sensf_req { u8 cmd; u8 sc1; @@ -82,9 +156,127 @@ struct digital_sensf_res { u8 rd[2]; } __packed; +struct digital_iso15693_inv_req { + u8 flags; + u8 cmd; + u8 mask_len; + u64 mask; +} __packed; + +struct digital_iso15693_inv_res { + u8 flags; + u8 dsfid; + u64 uid; +} __packed; + static int digital_in_send_sdd_req(struct nfc_digital_dev *ddev, struct nfc_target *target); +int digital_in_iso_dep_pull_sod(struct nfc_digital_dev *ddev, + struct sk_buff *skb) +{ + u8 pcb; + u8 block_type; + + if (skb->len < 1) + return -EIO; + + pcb = *skb->data; + block_type = DIGITAL_ISO_DEP_PCB_TYPE(pcb); + + /* No support fo R-block nor S-block */ + if (block_type != DIGITAL_ISO_DEP_I_BLOCK) { + pr_err("ISO_DEP R-block and S-block not supported\n"); + return -EIO; + } + + if (DIGITAL_ISO_DEP_BLOCK_HAS_DID(pcb)) { + pr_err("DID field in ISO_DEP PCB not supported\n"); + return -EIO; + } + + skb_pull(skb, 1); + + return 0; +} + +int digital_in_iso_dep_push_sod(struct nfc_digital_dev *ddev, + struct sk_buff *skb) +{ + /* + * Chaining not supported so skb->len + 1 PCB byte + 2 CRC bytes must + * not be greater than remote FSC + */ + if (skb->len + 3 > ddev->target_fsc) + return -EIO; + + skb_push(skb, 1); + + *skb->data = DIGITAL_ISO_DEP_I_PCB | ddev->curr_nfc_dep_pni; + + ddev->curr_nfc_dep_pni = + DIGITAL_ISO_DEP_PNI(ddev->curr_nfc_dep_pni + 1); + + return 0; +} + +static void digital_in_recv_ats(struct nfc_digital_dev *ddev, void *arg, + struct sk_buff *resp) +{ + struct nfc_target *target = arg; + u8 fsdi; + int rc; + + if (IS_ERR(resp)) { + rc = PTR_ERR(resp); + resp = NULL; + goto exit; + } + + if (resp->len < 2) { + rc = -EIO; + goto exit; + } + + fsdi = DIGITAL_ATS_FSCI(resp->data[1]); + if (fsdi >= 8) + ddev->target_fsc = DIGITAL_ATS_MAX_FSC; + else + 
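digital_in_iso_dep_push_sod()/pull_sod() above handle only the simplest ISO-DEP case: a single I-block, no DID, no chaining, with a one-bit packet number (PNI) that alternates on each exchange. A standalone sketch of that framing over a plain byte buffer (illustrative only; the macro values mirror the defines above, everything else is invented for the sketch):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ISO_DEP_I_PCB        0x02    /* I-block, PNI carried in bit 0 */
#define ISO_DEP_PCB_TYPE(p)  ((p) & 0xC0)
#define ISO_DEP_I_BLOCK      0x00
#define ISO_DEP_HAS_DID(p)   ((p) & 0x08)

static uint8_t pni;                  /* current packet number, 0 or 1 */

/* Prepend the PCB byte before sending; frame must have room for
 * len + 1 bytes. Returns the framed length. */
static size_t iso_dep_push(uint8_t *frame, const uint8_t *payload, size_t len)
{
        frame[0] = ISO_DEP_I_PCB | pni;
        memcpy(frame + 1, payload, len);
        pni ^= 1;                    /* PNI alternates on every I-block */
        return len + 1;
}

/* Strip and validate the PCB byte of a received frame; only plain
 * I-blocks without DID are accepted, as in the functions above. */
static bool iso_dep_pull(const uint8_t *frame, size_t len,
                         const uint8_t **payload, size_t *plen)
{
        if (len < 1)
                return false;
        if (ISO_DEP_PCB_TYPE(frame[0]) != ISO_DEP_I_BLOCK)
                return false;        /* no R-block or S-block support */
        if (ISO_DEP_HAS_DID(frame[0]))
                return false;
        *payload = frame + 1;
        *plen = len - 1;
        return true;
}

int main(void)
{
        uint8_t frame[16];
        const uint8_t apdu[] = { 0x00, 0xA4, 0x04, 0x00 };
        const uint8_t *payload;
        size_t flen, plen;

        flen = iso_dep_push(frame, apdu, sizeof(apdu));
        if (iso_dep_pull(frame, flen, &payload, &plen))
                printf("payload %zu bytes, PCB 0x%02x\n", plen, frame[0]);
        return 0;
}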
ddev->target_fsc = digital_ats_fsc[fsdi]; + + ddev->curr_nfc_dep_pni = 0; + + rc = digital_target_found(ddev, target, NFC_PROTO_ISO14443); + +exit: + dev_kfree_skb(resp); + kfree(target); + + if (rc) + digital_poll_next_tech(ddev); +} + +static int digital_in_send_rats(struct nfc_digital_dev *ddev, + struct nfc_target *target) +{ + int rc; + struct sk_buff *skb; + + skb = digital_skb_alloc(ddev, 2); + if (!skb) + return -ENOMEM; + + *skb_put(skb, 1) = DIGITAL_RATS_BYTE1; + *skb_put(skb, 1) = DIGITAL_RATS_PARAM; + + rc = digital_in_send_cmd(ddev, skb, 30, digital_in_recv_ats, + target); + if (rc) + kfree_skb(skb); + + return rc; +} + static void digital_in_recv_sel_res(struct nfc_digital_dev *ddev, void *arg, struct sk_buff *resp) { @@ -122,8 +314,19 @@ static void digital_in_recv_sel_res(struct nfc_digital_dev *ddev, void *arg, goto exit_free_skb; } + target->sel_res = sel_res; + if (DIGITAL_SEL_RES_IS_T2T(sel_res)) { nfc_proto = NFC_PROTO_MIFARE; + } else if (DIGITAL_SEL_RES_IS_T4T(sel_res)) { + rc = digital_in_send_rats(ddev, target); + if (rc) + goto exit; + /* + * Skip target_found and don't free it for now. This will be + * done when receiving the ATS + */ + goto exit_free_skb; } else if (DIGITAL_SEL_RES_IS_NFC_DEP(sel_res)) { nfc_proto = NFC_PROTO_NFC_DEP; } else { @@ -131,8 +334,6 @@ static void digital_in_recv_sel_res(struct nfc_digital_dev *ddev, void *arg, goto exit; } - target->sel_res = sel_res; - rc = digital_target_found(ddev, target, nfc_proto); exit: @@ -375,6 +576,175 @@ int digital_in_recv_mifare_res(struct sk_buff *resp) return -EIO; } +static void digital_in_recv_attrib_res(struct nfc_digital_dev *ddev, void *arg, + struct sk_buff *resp) +{ + struct nfc_target *target = arg; + struct digital_attrib_res *attrib_res; + int rc; + + if (IS_ERR(resp)) { + rc = PTR_ERR(resp); + resp = NULL; + goto exit; + } + + if (resp->len < sizeof(*attrib_res)) { + PROTOCOL_ERR("12.6.2"); + rc = -EIO; + goto exit; + } + + attrib_res = (struct digital_attrib_res *)resp->data; + + if (attrib_res->mbli_did & 0x0f) { + PROTOCOL_ERR("12.6.2.1"); + rc = -EIO; + goto exit; + } + + rc = digital_target_found(ddev, target, NFC_PROTO_ISO14443_B); + +exit: + dev_kfree_skb(resp); + kfree(target); + + if (rc) + digital_poll_next_tech(ddev); +} + +static int digital_in_send_attrib_req(struct nfc_digital_dev *ddev, + struct nfc_target *target, + struct digital_sensb_res *sensb_res) +{ + struct digital_attrib_req *attrib_req; + struct sk_buff *skb; + int rc; + + skb = digital_skb_alloc(ddev, sizeof(*attrib_req)); + if (!skb) + return -ENOMEM; + + attrib_req = (struct digital_attrib_req *)skb_put(skb, + sizeof(*attrib_req)); + + attrib_req->cmd = DIGITAL_CMD_ATTRIB_REQ; + memcpy(attrib_req->nfcid0, sensb_res->nfcid0, + sizeof(attrib_req->nfcid0)); + attrib_req->param1 = DIGITAL_ATTRIB_P1_TR0_DEFAULT | + DIGITAL_ATTRIB_P1_TR1_DEFAULT; + attrib_req->param2 = DIGITAL_ATTRIB_P2_LISTEN_POLL_1 | + DIGITAL_ATTRIB_P2_POLL_LISTEN_1 | + DIGITAL_ATTRIB_P2_MAX_FRAME_256; + attrib_req->param3 = sensb_res->proto_info[1] & 0x07; + attrib_req->param4 = DIGITAL_ATTRIB_P4_DID(0); + + rc = digital_in_send_cmd(ddev, skb, 30, digital_in_recv_attrib_res, + target); + if (rc) + kfree_skb(skb); + + return rc; +} + +static void digital_in_recv_sensb_res(struct nfc_digital_dev *ddev, void *arg, + struct sk_buff *resp) +{ + struct nfc_target *target = NULL; + struct digital_sensb_res *sensb_res; + u8 fsci; + int rc; + + if (IS_ERR(resp)) { + rc = PTR_ERR(resp); + resp = NULL; + goto exit; + } + + if (resp->len != sizeof(*sensb_res)) 
{ + PROTOCOL_ERR("5.6.2.1"); + rc = -EIO; + goto exit; + } + + sensb_res = (struct digital_sensb_res *)resp->data; + + if (sensb_res->cmd != DIGITAL_CMD_SENSB_RES) { + PROTOCOL_ERR("5.6.2"); + rc = -EIO; + goto exit; + } + + if (!(sensb_res->proto_info[1] & BIT(0))) { + PROTOCOL_ERR("5.6.2.12"); + rc = -EIO; + goto exit; + } + + if (sensb_res->proto_info[1] & BIT(3)) { + PROTOCOL_ERR("5.6.2.16"); + rc = -EIO; + goto exit; + } + + fsci = DIGITAL_SENSB_FSCI(sensb_res->proto_info[1]); + if (fsci >= 8) + ddev->target_fsc = DIGITAL_ATS_MAX_FSC; + else + ddev->target_fsc = digital_ats_fsc[fsci]; + + target = kzalloc(sizeof(struct nfc_target), GFP_KERNEL); + if (!target) { + rc = -ENOMEM; + goto exit; + } + + rc = digital_in_send_attrib_req(ddev, target, sensb_res); + +exit: + dev_kfree_skb(resp); + + if (rc) { + kfree(target); + digital_poll_next_tech(ddev); + } +} + +int digital_in_send_sensb_req(struct nfc_digital_dev *ddev, u8 rf_tech) +{ + struct digital_sensb_req *sensb_req; + struct sk_buff *skb; + int rc; + + rc = digital_in_configure_hw(ddev, NFC_DIGITAL_CONFIG_RF_TECH, + NFC_DIGITAL_RF_TECH_106B); + if (rc) + return rc; + + rc = digital_in_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING, + NFC_DIGITAL_FRAMING_NFCB); + if (rc) + return rc; + + skb = digital_skb_alloc(ddev, sizeof(*sensb_req)); + if (!skb) + return -ENOMEM; + + sensb_req = (struct digital_sensb_req *)skb_put(skb, + sizeof(*sensb_req)); + + sensb_req->cmd = DIGITAL_CMD_SENSB_REQ; + sensb_req->afi = 0x00; /* All families and sub-families */ + sensb_req->param = DIGITAL_SENSB_N(0); + + rc = digital_in_send_cmd(ddev, skb, 30, digital_in_recv_sensb_res, + NULL); + if (rc) + kfree_skb(skb); + + return rc; +} + static void digital_in_recv_sensf_res(struct nfc_digital_dev *ddev, void *arg, struct sk_buff *resp) { @@ -473,6 +843,93 @@ int digital_in_send_sensf_req(struct nfc_digital_dev *ddev, u8 rf_tech) return rc; } +static void digital_in_recv_iso15693_inv_res(struct nfc_digital_dev *ddev, + void *arg, struct sk_buff *resp) +{ + struct digital_iso15693_inv_res *res; + struct nfc_target *target = NULL; + int rc; + + if (IS_ERR(resp)) { + rc = PTR_ERR(resp); + resp = NULL; + goto out_free_skb; + } + + if (resp->len != sizeof(*res)) { + rc = -EIO; + goto out_free_skb; + } + + res = (struct digital_iso15693_inv_res *)resp->data; + + if (!DIGITAL_ISO15693_RES_IS_VALID(res->flags)) { + PROTOCOL_ERR("ISO15693 - 10.3.1"); + rc = -EINVAL; + goto out_free_skb; + } + + target = kzalloc(sizeof(*target), GFP_KERNEL); + if (!target) { + rc = -ENOMEM; + goto out_free_skb; + } + + target->is_iso15693 = 1; + target->iso15693_dsfid = res->dsfid; + memcpy(target->iso15693_uid, &res->uid, sizeof(target->iso15693_uid)); + + rc = digital_target_found(ddev, target, NFC_PROTO_ISO15693); + + kfree(target); + +out_free_skb: + dev_kfree_skb(resp); + + if (rc) + digital_poll_next_tech(ddev); +} + +int digital_in_send_iso15693_inv_req(struct nfc_digital_dev *ddev, u8 rf_tech) +{ + struct digital_iso15693_inv_req *req; + struct sk_buff *skb; + int rc; + + rc = digital_in_configure_hw(ddev, NFC_DIGITAL_CONFIG_RF_TECH, + NFC_DIGITAL_RF_TECH_ISO15693); + if (rc) + return rc; + + rc = digital_in_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING, + NFC_DIGITAL_FRAMING_ISO15693_INVENTORY); + if (rc) + return rc; + + skb = digital_skb_alloc(ddev, sizeof(*req)); + if (!skb) + return -ENOMEM; + + skb_put(skb, sizeof(*req) - sizeof(req->mask)); /* No mask */ + req = (struct digital_iso15693_inv_req *)skb->data; + + /* Single sub-carrier, high data rate, no AFI, single slot 
+ * Inventory command + */ + req->flags = DIGITAL_ISO15693_REQ_FLAG_DATA_RATE | + DIGITAL_ISO15693_REQ_FLAG_INVENTORY | + DIGITAL_ISO15693_REQ_FLAG_NB_SLOTS; + req->cmd = DIGITAL_CMD_ISO15693_INVENTORY_REQ; + req->mask_len = 0; + + rc = digital_in_send_cmd(ddev, skb, 30, + digital_in_recv_iso15693_inv_res, NULL); + if (rc) + kfree_skb(skb); + + return rc; +} + static int digital_tg_send_sel_res(struct nfc_digital_dev *ddev) { struct sk_buff *skb; @@ -634,6 +1091,18 @@ exit: dev_kfree_skb(resp); } +static void digital_tg_recv_atr_or_sensf_req(struct nfc_digital_dev *ddev, + void *arg, struct sk_buff *resp) +{ + if (!IS_ERR(resp) && (resp->len >= 2) && + (resp->data[1] == DIGITAL_CMD_SENSF_REQ)) + digital_tg_recv_sensf_req(ddev, arg, resp); + else + digital_tg_recv_atr_req(ddev, arg, resp); + + return; +} + static int digital_tg_send_sensf_res(struct nfc_digital_dev *ddev, struct digital_sensf_req *sensf_req) { @@ -644,7 +1113,7 @@ static int digital_tg_send_sensf_res(struct nfc_digital_dev *ddev, size = sizeof(struct digital_sensf_res); - if (sensf_req->rc != DIGITAL_SENSF_REQ_RC_NONE) + if (sensf_req->rc == DIGITAL_SENSF_REQ_RC_NONE) size -= sizeof(sensf_res->rd); skb = digital_skb_alloc(ddev, size); @@ -679,7 +1148,7 @@ static int digital_tg_send_sensf_res(struct nfc_digital_dev *ddev, digital_skb_add_crc_f(skb); rc = digital_tg_send_cmd(ddev, skb, 300, - digital_tg_recv_atr_req, NULL); + digital_tg_recv_atr_or_sensf_req, NULL); if (rc) kfree_skb(skb); diff --git a/net/nfc/hci/command.c b/net/nfc/hci/command.c index 64f922be928..677d24bb70f 100644 --- a/net/nfc/hci/command.c +++ b/net/nfc/hci/command.c @@ -12,9 +12,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) "hci: %s: " fmt, __func__ @@ -28,6 +26,8 @@ #include "hci.h" +#define MAX_FWI 4949 + static int nfc_hci_execute_cmd_async(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, const u8 *param, size_t param_len, data_exchange_cb_t cb, void *cb_context) @@ -39,7 +39,7 @@ static int nfc_hci_execute_cmd_async(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, * for all commands? */ return nfc_hci_hcp_message_tx(hdev, pipe, NFC_HCI_HCP_COMMAND, cmd, - param, param_len, cb, cb_context, 3000); + param, param_len, cb, cb_context, MAX_FWI); } /* @@ -84,7 +84,7 @@ static int nfc_hci_execute_cmd(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, NFC_HCI_HCP_COMMAND, cmd, param, param_len, nfc_hci_execute_cb, &hcp_ew, - 3000); + MAX_FWI); if (hcp_ew.exec_result < 0) return hcp_ew.exec_result; diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index d07ca4c5cf8..47403705197 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -12,9 +12,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
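The inventory request built above is only three bytes when no mask is sent: a flags byte requesting high data rate, inventory mode and a single slot, the INVENTORY command code, and a zero mask length. A tiny sketch constructing that frame into a plain buffer (illustrative only; the flag values mirror the defines above and CRC handling is left to the driver, as in the patch):

#include <stdint.h>
#include <stdio.h>

#define ISO15693_REQ_FLAG_DATA_RATE  (1u << 1)
#define ISO15693_REQ_FLAG_INVENTORY  (1u << 2)
#define ISO15693_REQ_FLAG_NB_SLOTS   (1u << 5)
#define ISO15693_CMD_INVENTORY       0x01

int main(void)
{
        /* Mask-less, single-slot, high-data-rate INVENTORY request:
         * flags, command code, zero mask length; three bytes total. */
        uint8_t req[3];

        req[0] = ISO15693_REQ_FLAG_DATA_RATE |
                 ISO15693_REQ_FLAG_INVENTORY |
                 ISO15693_REQ_FLAG_NB_SLOTS;
        req[1] = ISO15693_CMD_INVENTORY;
        req[2] = 0;

        printf("%02x %02x %02x\n", req[0], req[1], req[2]);
        return 0;
}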
*/ #define pr_fmt(fmt) "hci: %s: " fmt, __func__ @@ -227,7 +225,7 @@ int nfc_hci_target_discovered(struct nfc_hci_dev *hdev, u8 gate) goto exit; } - targets->sens_res = be16_to_cpu(*(u16 *)atqa_skb->data); + targets->sens_res = be16_to_cpu(*(__be16 *)atqa_skb->data); targets->sel_res = sak_skb->data[0]; r = nfc_hci_get_param(hdev, NFC_HCI_RF_READER_A_GATE, @@ -337,11 +335,8 @@ exit: kfree_skb(skb); exit_noskb: - if (r) { - /* TODO: There was an error dispatching the event, - * how to propagate up to nfc core? - */ - } + if (r) + nfc_hci_driver_failure(hdev, r); } static void nfc_hci_cmd_timeout(unsigned long data) @@ -385,34 +380,31 @@ static int hci_dev_session_init(struct nfc_hci_dev *hdev) if (r < 0) goto disconnect_all; - if (skb->len && skb->len == strlen(hdev->init_data.session_id)) - if (memcmp(hdev->init_data.session_id, skb->data, - skb->len) == 0) { - /* TODO ELa: restore gate<->pipe table from - * some TBD location. - * note: it doesn't seem possible to get the chip - * currently open gate/pipe table. - * It is only possible to obtain the supported - * gate list. - */ + if (skb->len && skb->len == strlen(hdev->init_data.session_id) && + (memcmp(hdev->init_data.session_id, skb->data, + skb->len) == 0) && hdev->ops->load_session) { + /* Restore gate<->pipe table from some proprietary location. */ - /* goto exit - * For now, always do a full initialization */ - } + r = hdev->ops->load_session(hdev); - r = nfc_hci_disconnect_all_gates(hdev); - if (r < 0) - goto exit; + if (r < 0) + goto disconnect_all; + } else { - r = hci_dev_connect_gates(hdev, hdev->init_data.gate_count, - hdev->init_data.gates); - if (r < 0) - goto disconnect_all; + r = nfc_hci_disconnect_all_gates(hdev); + if (r < 0) + goto exit; + + r = hci_dev_connect_gates(hdev, hdev->init_data.gate_count, + hdev->init_data.gates); + if (r < 0) + goto disconnect_all; - r = nfc_hci_set_param(hdev, NFC_HCI_ADMIN_GATE, - NFC_HCI_ADMIN_SESSION_IDENTITY, - hdev->init_data.session_id, - strlen(hdev->init_data.session_id)); + r = nfc_hci_set_param(hdev, NFC_HCI_ADMIN_GATE, + NFC_HCI_ADMIN_SESSION_IDENTITY, + hdev->init_data.session_id, + strlen(hdev->init_data.session_id)); + } if (r == 0) goto exit; diff --git a/net/nfc/hci/hci.h b/net/nfc/hci/hci.h index b274d12c18a..c3d2e2c1394 100644 --- a/net/nfc/hci/hci.h +++ b/net/nfc/hci/hci.h @@ -12,9 +12,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef __LOCAL_HCI_H diff --git a/net/nfc/hci/hcp.c b/net/nfc/hci/hcp.c index b6b4109f234..e9de1514656 100644 --- a/net/nfc/hci/hcp.c +++ b/net/nfc/hci/hcp.c @@ -12,9 +12,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) "hci: %s: " fmt, __func__ diff --git a/net/nfc/hci/llc.c b/net/nfc/hci/llc.c index fe5e966e5b8..1b90c053185 100644 --- a/net/nfc/hci/llc.c +++ b/net/nfc/hci/llc.c @@ -13,23 +13,19 @@ * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include <net/nfc/llc.h> #include "llc.h" -static struct list_head llc_engines; +static LIST_HEAD(llc_engines); int nfc_llc_init(void) { int r; - INIT_LIST_HEAD(&llc_engines); - r = nfc_llc_nop_register(); if (r) goto exit; diff --git a/net/nfc/hci/llc.h b/net/nfc/hci/llc.h index 7be0b7f3ceb..5dad4c57ffb 100644 --- a/net/nfc/hci/llc.h +++ b/net/nfc/hci/llc.h @@ -13,9 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef __LOCAL_LLC_H_ diff --git a/net/nfc/hci/llc_nop.c b/net/nfc/hci/llc_nop.c index 87b10291b40..d0435d5a197 100644 --- a/net/nfc/hci/llc_nop.c +++ b/net/nfc/hci/llc_nop.c @@ -13,9 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include <linux/types.h> diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c index 27b313befc3..401c7e25527 100644 --- a/net/nfc/hci/llc_shdlc.c +++ b/net/nfc/hci/llc_shdlc.c @@ -13,9 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) "shdlc: %s: " fmt, __func__ @@ -300,7 +298,7 @@ static void llc_shdlc_rcv_rej(struct llc_shdlc *shdlc, int y_nr) { struct sk_buff *skb; - pr_debug("remote asks retransmition from frame %d\n", y_nr); + pr_debug("remote asks retransmission from frame %d\n", y_nr); if (llc_shdlc_x_lteq_y_lt_z(shdlc->dnr, y_nr, shdlc->ns)) { if (shdlc->t2_active) { diff --git a/net/nfc/llcp.h b/net/nfc/llcp.h index f4d48b57ea1..de1789e3cc8 100644 --- a/net/nfc/llcp.h +++ b/net/nfc/llcp.h @@ -12,9 +12,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ enum llcp_state { diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c index 1017894807c..a3ad69a4c64 100644 --- a/net/nfc/llcp_commands.c +++ b/net/nfc/llcp_commands.c @@ -12,9 +12,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ #define pr_fmt(fmt) "llcp: %s: " fmt, __func__ @@ -389,7 +387,7 @@ int nfc_llcp_send_symm(struct nfc_dev *dev) __net_timestamp(skb); - nfc_llcp_send_to_raw_sock(local, skb, NFC_LLCP_DIRECTION_TX); + nfc_llcp_send_to_raw_sock(local, skb, NFC_DIRECTION_TX); return nfc_data_exchange(dev, local->target_idx, skb, nfc_llcp_recv, local); @@ -677,7 +675,7 @@ int nfc_llcp_send_i_frame(struct nfc_llcp_sock *sock, do { remote_miu = sock->remote_miu > LLCP_MAX_MIU ? - local->remote_miu : sock->remote_miu; + LLCP_DEFAULT_MIU : sock->remote_miu; frag_len = min_t(size_t, remote_miu, remaining_len); @@ -686,8 +684,10 @@ int nfc_llcp_send_i_frame(struct nfc_llcp_sock *sock, pdu = llcp_allocate_pdu(sock, LLCP_PDU_I, frag_len + LLCP_SEQUENCE_SIZE); - if (pdu == NULL) + if (pdu == NULL) { + kfree(msg_data); return -ENOMEM; + } skb_put(pdu, LLCP_SEQUENCE_SIZE); diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index 81cd3416c7d..51e78879731 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -12,9 +12,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) "llcp: %s: " fmt, __func__ @@ -29,7 +27,7 @@ static u8 llcp_magic[3] = {0x46, 0x66, 0x6d}; -static struct list_head llcp_devices; +static LIST_HEAD(llcp_devices); static void nfc_llcp_rx_skb(struct nfc_llcp_local *local, struct sk_buff *skb); @@ -295,9 +293,9 @@ static void nfc_llcp_sdreq_timer(unsigned long data) struct nfc_llcp_local *nfc_llcp_find_local(struct nfc_dev *dev) { - struct nfc_llcp_local *local, *n; + struct nfc_llcp_local *local; - list_for_each_entry_safe(local, n, &llcp_devices, list) + list_for_each_entry(local, &llcp_devices, list) if (local->dev == dev) return local; @@ -611,14 +609,16 @@ u8 *nfc_llcp_general_bytes(struct nfc_dev *dev, size_t *general_bytes_len) int nfc_llcp_set_remote_gb(struct nfc_dev *dev, u8 *gb, u8 gb_len) { - struct nfc_llcp_local *local = nfc_llcp_find_local(dev); + struct nfc_llcp_local *local; + + if (gb_len < 3 || gb_len > NFC_MAX_GT_LEN) + return -EINVAL; + local = nfc_llcp_find_local(dev); if (local == NULL) { pr_err("No LLCP device\n"); return -ENODEV; } - if (gb_len < 3) - return -EINVAL; memset(local->remote_gb, 0, NFC_MAX_GT_LEN); memcpy(local->remote_gb, gb, gb_len); @@ -680,16 +680,17 @@ void nfc_llcp_send_to_raw_sock(struct nfc_llcp_local *local, continue; if (skb_copy == NULL) { - skb_copy = __pskb_copy(skb, NFC_LLCP_RAW_HEADER_SIZE, - GFP_ATOMIC); + skb_copy = __pskb_copy_fclone(skb, NFC_RAW_HEADER_SIZE, + GFP_ATOMIC, true); if (skb_copy == NULL) continue; - data = skb_push(skb_copy, NFC_LLCP_RAW_HEADER_SIZE); + data = skb_push(skb_copy, NFC_RAW_HEADER_SIZE); data[0] = local->dev ? 
local->dev->idx : 0xFF; - data[1] = direction; + data[1] = direction & 0x01; + data[1] |= (RAW_PAYLOAD_LLCP << 1); } nskb = skb_clone(skb_copy, GFP_ATOMIC); @@ -747,7 +748,7 @@ static void nfc_llcp_tx_work(struct work_struct *work) __net_timestamp(skb); nfc_llcp_send_to_raw_sock(local, skb, - NFC_LLCP_DIRECTION_TX); + NFC_DIRECTION_TX); ret = nfc_data_exchange(local->dev, local->target_idx, skb, nfc_llcp_recv, local); @@ -945,7 +946,6 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, new_sock->local = nfc_llcp_local_get(local); new_sock->rw = sock->rw; new_sock->miux = sock->miux; - new_sock->remote_miu = local->remote_miu; new_sock->nfc_protocol = sock->nfc_protocol; new_sock->dsap = ssap; new_sock->target_idx = local->target_idx; @@ -977,7 +977,7 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, new_sk->sk_state = LLCP_CONNECTED; /* Wake the listening processes */ - parent->sk_data_ready(parent, 0); + parent->sk_data_ready(parent); /* Send CC */ nfc_llcp_send_cc(new_sock); @@ -1477,7 +1477,7 @@ static void nfc_llcp_rx_work(struct work_struct *work) __net_timestamp(skb); - nfc_llcp_send_to_raw_sock(local, skb, NFC_LLCP_DIRECTION_RX); + nfc_llcp_send_to_raw_sock(local, skb, NFC_DIRECTION_RX); nfc_llcp_rx_skb(local, skb); @@ -1625,8 +1625,6 @@ void nfc_llcp_unregister_device(struct nfc_dev *dev) int __init nfc_llcp_init(void) { - INIT_LIST_HEAD(&llcp_devices); - return nfc_llcp_sock_init(); } diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 824c6056bf8..51f077a92fa 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -12,9 +12,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) "llcp: %s: " fmt, __func__ @@ -702,7 +700,6 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, llcp_sock->dev = dev; llcp_sock->local = nfc_llcp_local_get(local); - llcp_sock->remote_miu = llcp_sock->local->remote_miu; llcp_sock->ssap = nfc_llcp_get_local_ssap(local); if (llcp_sock->ssap == LLCP_SAP_MAX) { ret = -ENOMEM; @@ -772,8 +769,8 @@ static int llcp_sock_sendmsg(struct kiocb *iocb, struct socket *sock, lock_sock(sk); if (sk->sk_type == SOCK_DGRAM) { - struct sockaddr_nfc_llcp *addr = - (struct sockaddr_nfc_llcp *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, addr, + msg->msg_name); if (msg->msg_namelen < sizeof(*addr)) { release_sock(sk); @@ -845,8 +842,8 @@ static int llcp_sock_recvmsg(struct kiocb *iocb, struct socket *sock, if (sk->sk_type == SOCK_DGRAM && msg->msg_name) { struct nfc_llcp_ui_cb *ui_cb = nfc_llcp_ui_skb_cb(skb); - struct sockaddr_nfc_llcp *sockaddr = - (struct sockaddr_nfc_llcp *) msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, sockaddr, + msg->msg_name); msg->msg_namelen = sizeof(struct sockaddr_nfc_llcp); diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index b943d46a164..2b400e1a869 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -20,8 +20,7 @@ * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. * */ @@ -75,7 +74,7 @@ static int __nci_request(struct nci_dev *ndev, ndev->req_status = NCI_REQ_PEND; - init_completion(&ndev->req_completion); + reinit_completion(&ndev->req_completion); req(ndev, opt); completion_rc = wait_for_completion_interruptible_timeout(&ndev->req_completion, @@ -302,6 +301,9 @@ static int nci_open_device(struct nci_dev *ndev) rc = __nci_request(ndev, nci_reset_req, 0, msecs_to_jiffies(NCI_RESET_TIMEOUT)); + if (ndev->ops->setup) + ndev->ops->setup(ndev); + if (!rc) { rc = __nci_request(ndev, nci_init_req, 0, msecs_to_jiffies(NCI_INIT_TIMEOUT)); @@ -362,6 +364,8 @@ static int nci_close_device(struct nci_dev *ndev) msecs_to_jiffies(NCI_RESET_TIMEOUT)); clear_bit(NCI_INIT, &ndev->flags); + del_timer_sync(&ndev->cmd_timer); + /* Flush cmd wq */ flush_workqueue(ndev->cmd_wq); @@ -409,12 +413,26 @@ static int nci_dev_down(struct nfc_dev *nfc_dev) return nci_close_device(ndev); } +int nci_set_config(struct nci_dev *ndev, __u8 id, size_t len, __u8 *val) +{ + struct nci_set_config_param param; + + if (!val || !len) + return 0; + + param.id = id; + param.len = len; + param.val = val; + + return __nci_request(ndev, nci_set_config_req, (unsigned long)&param, + msecs_to_jiffies(NCI_SET_CONFIG_TIMEOUT)); +} +EXPORT_SYMBOL(nci_set_config); + static int nci_set_local_general_bytes(struct nfc_dev *nfc_dev) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); struct nci_set_config_param param; - __u8 local_gb[NFC_MAX_GT_LEN]; - int i; param.val = nfc_get_local_general_bytes(nfc_dev, &param.len); if ((param.val == NULL) || (param.len == 0)) @@ -423,11 +441,7 @@ static int nci_set_local_general_bytes(struct nfc_dev *nfc_dev) if (param.len > NFC_MAX_GT_LEN) return -EINVAL; - for (i = 0; i < param.len; i++) - local_gb[param.len-1-i] = param.val[i]; - param.id = NCI_PN_ATR_REQ_GEN_BYTES; - param.val = local_gb; return nci_request(ndev, nci_set_config_req, (unsigned long)&param, msecs_to_jiffies(NCI_SET_CONFIG_TIMEOUT)); @@ -695,6 +709,7 @@ struct nci_dev *nci_allocate_device(struct nci_ops *ops, ndev->ops = ops; ndev->tx_headroom = tx_headroom; ndev->tx_tailroom = tx_tailroom; + init_completion(&ndev->req_completion); ndev->nfc_dev = nfc_allocate_device(&nci_nfc_ops, supported_protocols, @@ -846,6 +861,10 @@ static int nci_send_frame(struct nci_dev *ndev, struct sk_buff *skb) /* Get rid of skb owner, prior to sending to the driver. */ skb_orphan(skb); + /* Send copy to sniffer */ + nfc_send_to_raw_sock(ndev->nfc_dev, skb, + RAW_PAYLOAD_NCI, NFC_DIRECTION_TX); + return ndev->ops->send(ndev, skb); } @@ -920,6 +939,11 @@ static void nci_rx_work(struct work_struct *work) struct sk_buff *skb; while ((skb = skb_dequeue(&ndev->rx_q))) { + + /* Send copy to sniffer */ + nfc_send_to_raw_sock(ndev->nfc_dev, skb, + RAW_PAYLOAD_NCI, NFC_DIRECTION_RX); + /* Process frame */ switch (nci_mt(skb->data)) { case NCI_MT_RSP_PKT: diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c index 2a9399dd6c6..6c3aef85287 100644 --- a/net/nfc/nci/data.c +++ b/net/nfc/nci/data.c @@ -16,8 +16,7 @@ * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. * */ diff --git a/net/nfc/nci/lib.c b/net/nfc/nci/lib.c index 6b7fd26c68d..ed774a2e989 100644 --- a/net/nfc/nci/lib.c +++ b/net/nfc/nci/lib.c @@ -20,8 +20,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. * */ diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index b2aa98ef092..f8f6af23138 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -20,8 +20,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. * */ @@ -367,7 +366,6 @@ static int nci_extract_activation_params_nfc_dep(struct nci_dev *ndev, struct nci_rf_intf_activated_ntf *ntf, __u8 *data) { struct activation_params_poll_nfc_dep *poll; - int i; switch (ntf->activation_rf_tech_and_mode) { case NCI_NFC_A_PASSIVE_POLL_MODE: @@ -375,10 +373,8 @@ static int nci_extract_activation_params_nfc_dep(struct nci_dev *ndev, poll = &ntf->activation_params.poll_nfc_dep; poll->atr_res_len = min_t(__u8, *data++, 63); pr_debug("atr_res_len %d\n", poll->atr_res_len); - if (poll->atr_res_len > 0) { - for (i = 0; i < poll->atr_res_len; i++) - poll->atr_res[poll->atr_res_len-1-i] = data[i]; - } + if (poll->atr_res_len > 0) + memcpy(poll->atr_res, data, poll->atr_res_len); break; default: diff --git a/net/nfc/nci/rsp.c b/net/nfc/nci/rsp.c index dd072f38ad0..041de51ccdb 100644 --- a/net/nfc/nci/rsp.c +++ b/net/nfc/nci/rsp.c @@ -20,8 +20,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. * */ diff --git a/net/nfc/nci/spi.c b/net/nfc/nci/spi.c index f1d426f10cc..ec250e77763 100644 --- a/net/nfc/nci/spi.c +++ b/net/nfc/nci/spi.c @@ -105,7 +105,7 @@ int nci_spi_send(struct nci_spi *nspi, if (ret != 0 || nspi->acknowledge_mode == NCI_SPI_CRC_DISABLED) goto done; - init_completion(&nspi->req_completion); + reinit_completion(&nspi->req_completion); completion_rc = wait_for_completion_interruptible_timeout( &nspi->req_completion, NCI_SPI_SEND_TIMEOUT); @@ -145,6 +145,7 @@ struct nci_spi *nci_spi_allocate_spi(struct spi_device *spi, nspi->spi = spi; nspi->ndev = ndev; + init_completion(&nspi->req_completion); return nspi; } diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index a9b2342d525..43cb1c17e26 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -16,9 +16,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ @@ -96,6 +94,14 @@ static int nfc_genl_send_target(struct sk_buff *msg, struct nfc_target *target, target->sensf_res)) goto nla_put_failure; + if (target->is_iso15693) { + if (nla_put_u8(msg, NFC_ATTR_TARGET_ISO15693_DSFID, + target->iso15693_dsfid) || + nla_put(msg, NFC_ATTR_TARGET_ISO15693_UID, + sizeof(target->iso15693_uid), target->iso15693_uid)) + goto nla_put_failure; + } + return genlmsg_end(msg, hdr); nla_put_failure: diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h index aaf606fc1fa..88d60064890 100644 --- a/net/nfc/nfc.h +++ b/net/nfc/nfc.h @@ -16,9 +16,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef __LOCAL_NFC_H @@ -42,6 +40,12 @@ struct nfc_rawsock { struct work_struct tx_work; bool tx_work_scheduled; }; + +struct nfc_sock_list { + struct hlist_head head; + rwlock_t lock; +}; + #define nfc_rawsock(sk) ((struct nfc_rawsock *) sk) #define to_rawsock_sk(_tx_work) \ ((struct sock *) container_of(_tx_work, struct nfc_rawsock, tx_work)) diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 66bcd2eb577..11c3544ea54 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -16,9 +16,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ @@ -29,6 +27,24 @@ #include "nfc.h" +static struct nfc_sock_list raw_sk_list = { + .lock = __RW_LOCK_UNLOCKED(raw_sk_list.lock) +}; + +static void nfc_sock_link(struct nfc_sock_list *l, struct sock *sk) +{ + write_lock(&l->lock); + sk_add_node(sk, &l->head); + write_unlock(&l->lock); +} + +static void nfc_sock_unlink(struct nfc_sock_list *l, struct sock *sk) +{ + write_lock(&l->lock); + sk_del_node_init(sk); + write_unlock(&l->lock); +} + static void rawsock_write_queue_purge(struct sock *sk) { pr_debug("sk=%p\n", sk); @@ -59,6 +75,9 @@ static int rawsock_release(struct socket *sock) if (!sk) return 0; + if (sock->type == SOCK_RAW) + nfc_sock_unlink(&raw_sk_list, sk); + sock_orphan(sk); sock_put(sk); @@ -277,6 +296,26 @@ static const struct proto_ops rawsock_ops = { .mmap = sock_no_mmap, }; +static const struct proto_ops rawsock_raw_ops = { + .family = PF_NFC, + .owner = THIS_MODULE, + .release = rawsock_release, + .bind = sock_no_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = datagram_poll, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = sock_no_sendmsg, + .recvmsg = rawsock_recvmsg, + .mmap = sock_no_mmap, +}; + static void rawsock_destruct(struct sock *sk) { pr_debug("sk=%p\n", sk); @@ -302,10 +341,13 @@ static int rawsock_create(struct net *net, struct socket *sock, pr_debug("sock=%p\n", sock); - if (sock->type != SOCK_SEQPACKET) + if ((sock->type != SOCK_SEQPACKET) && (sock->type != SOCK_RAW)) return -ESOCKTNOSUPPORT; - sock->ops = &rawsock_ops; + if (sock->type == SOCK_RAW) + sock->ops = &rawsock_raw_ops; + else + sock->ops = &rawsock_ops; sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto); if (!sk) @@ -315,13 +357,53 @@ static int rawsock_create(struct net *net, struct socket *sock, sk->sk_protocol = nfc_proto->id; sk->sk_destruct = rawsock_destruct; sock->state = SS_UNCONNECTED; - - INIT_WORK(&nfc_rawsock(sk)->tx_work, rawsock_tx_work); - nfc_rawsock(sk)->tx_work_scheduled = false; + if (sock->type == SOCK_RAW) + nfc_sock_link(&raw_sk_list, sk); + else { + INIT_WORK(&nfc_rawsock(sk)->tx_work, rawsock_tx_work); + nfc_rawsock(sk)->tx_work_scheduled = false; + } return 0; } +void nfc_send_to_raw_sock(struct nfc_dev *dev, struct sk_buff *skb, + u8 payload_type, u8 direction) +{ + struct sk_buff *skb_copy = NULL, *nskb; + struct sock *sk; + u8 *data; + + read_lock(&raw_sk_list.lock); + + sk_for_each(sk, &raw_sk_list.head) { + if (!skb_copy) { + skb_copy = __pskb_copy_fclone(skb, NFC_RAW_HEADER_SIZE, + GFP_ATOMIC, true); + if (!skb_copy) + continue; + + data = skb_push(skb_copy, NFC_RAW_HEADER_SIZE); + + data[0] = dev ? 
dev->idx : 0xFF; + data[1] = direction & 0x01; + data[1] |= (payload_type << 1); + } + + nskb = skb_clone(skb_copy, GFP_ATOMIC); + if (!nskb) + continue; + + if (sock_queue_rcv_skb(sk, nskb)) + kfree_skb(nskb); + } + + read_unlock(&raw_sk_list.lock); + + kfree_skb(skb_copy); +} +EXPORT_SYMBOL(nfc_send_to_raw_sock); + static struct proto rawsock_proto = { .name = "NFC_RAW", .owner = THIS_MODULE, diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 65cfaa81607..e70d8b18e96 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -134,8 +134,8 @@ static int set_eth_addr(struct sk_buff *skb, skb_postpull_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); - memcpy(eth_hdr(skb)->h_source, eth_key->eth_src, ETH_ALEN); - memcpy(eth_hdr(skb)->h_dest, eth_key->eth_dst, ETH_ALEN); + ether_addr_copy(eth_hdr(skb)->h_source, eth_key->eth_src); + ether_addr_copy(eth_hdr(skb)->h_dest, eth_key->eth_dst); ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); @@ -165,7 +165,7 @@ static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, } csum_replace4(&nh->check, *addr, new_addr); - skb->rxhash = 0; + skb_clear_hash(skb); *addr = new_addr; } @@ -199,7 +199,7 @@ static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, if (recalculate_csum) update_ipv6_checksum(skb, l4_proto, addr, new_addr); - skb->rxhash = 0; + skb_clear_hash(skb); memcpy(addr, new_addr, sizeof(__be32[4])); } @@ -296,7 +296,7 @@ static void set_tp_port(struct sk_buff *skb, __be16 *port, { inet_proto_csum_replace2(check, skb, *port, new_port, 0); *port = new_port; - skb->rxhash = 0; + skb_clear_hash(skb); } static void set_udp_port(struct sk_buff *skb, __be16 *port, __be16 new_port) @@ -310,7 +310,7 @@ static void set_udp_port(struct sk_buff *skb, __be16 *port, __be16 new_port) uh->check = CSUM_MANGLED_0; } else { *port = new_port; - skb->rxhash = 0; + skb_clear_hash(skb); } } @@ -381,7 +381,7 @@ static int set_sctp(struct sk_buff *skb, /* Carry any checksum errors through. */ sh->checksum = old_csum ^ old_correct_csum ^ new_csum; - skb->rxhash = 0; + skb_clear_hash(skb); } return 0; @@ -445,7 +445,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb, a = nla_next(a, &rem)) { switch (nla_type(a)) { case OVS_SAMPLE_ATTR_PROBABILITY: - if (net_random() >= nla_get_u32(a)) + if (prandom_u32() >= nla_get_u32(a)) return 0; break; @@ -551,6 +551,8 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, case OVS_ACTION_ATTR_SAMPLE: err = sample(dp, skb, a); + if (unlikely(err)) /* skb already freed. */ + return err; break; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 6f5e1dd3be2..9db4bf6740d 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -44,23 +44,49 @@ #include <linux/netfilter_ipv4.h> #include <linux/inetdevice.h> #include <linux/list.h> -#include <linux/lockdep.h> #include <linux/openvswitch.h> #include <linux/rculist.h> #include <linux/dmi.h> -#include <linux/workqueue.h> +#include <linux/genetlink.h> +#include <net/genetlink.h> #include <net/genetlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include "datapath.h" #include "flow.h" +#include "flow_table.h" #include "flow_netlink.h" #include "vport-internal_dev.h" #include "vport-netdev.h" int ovs_net_id __read_mostly; +static struct genl_family dp_packet_genl_family; +static struct genl_family dp_flow_genl_family; +static struct genl_family dp_datapath_genl_family; + +static struct genl_multicast_group ovs_dp_flow_multicast_group = { + .name = OVS_FLOW_MCGROUP +}; + +static struct genl_multicast_group ovs_dp_datapath_multicast_group = { + .name = OVS_DATAPATH_MCGROUP +}; + +struct genl_multicast_group ovs_dp_vport_multicast_group = { + .name = OVS_VPORT_MCGROUP +}; + +/* Check if need to build a reply message. + * OVS userspace sets the NLM_F_ECHO flag if it needs the reply. */ +static bool ovs_must_notify(struct genl_info *info, + const struct genl_multicast_group *grp) +{ + return info->nlhdr->nlmsg_flags & NLM_F_ECHO || + netlink_has_listeners(genl_info_net(info)->genl_sock, 0); +} + static void ovs_notify(struct genl_family *family, struct sk_buff *skb, struct genl_info *info) { @@ -108,10 +134,9 @@ int lockdep_ovsl_is_held(void) #endif static struct vport *new_vport(const struct vport_parms *); -static int queue_gso_packets(struct net *, int dp_ifindex, struct sk_buff *, +static int queue_gso_packets(struct datapath *dp, struct sk_buff *, const struct dp_upcall_info *); -static int queue_userspace_packet(struct net *, int dp_ifindex, - struct sk_buff *, +static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, const struct dp_upcall_info *); /* Must be called with rcu_read_lock or ovs_mutex. */ @@ -133,7 +158,7 @@ static struct datapath *get_dp(struct net *net, int dp_ifindex) } /* Must be called with rcu_read_lock or ovs_mutex. */ -const char *ovs_dp_name(const struct datapath *dp) +static const char *ovs_dp_name(const struct datapath *dp) { struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); return vport->ops->get_name(vport); @@ -161,7 +186,6 @@ static void destroy_dp_rcu(struct rcu_head *rcu) { struct datapath *dp = container_of(rcu, struct datapath, rcu); - ovs_flow_tbl_destroy(&dp->table); free_percpu(dp->stats_percpu); release_net(ovs_dp_get_net(dp)); kfree(dp->ports); @@ -174,6 +198,7 @@ static struct hlist_head *vport_hash_bucket(const struct datapath *dp, return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)]; } +/* Called with ovs_mutex or RCU read lock. */ struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no) { struct vport *vport; @@ -234,7 +259,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) } /* Look up flow. 
*/ - flow = ovs_flow_tbl_lookup(&dp->table, &key, &n_mask_hit); + flow = ovs_flow_tbl_lookup_stats(&dp->table, &key, &n_mask_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; @@ -251,33 +276,22 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) OVS_CB(skb)->flow = flow; OVS_CB(skb)->pkt_key = &key; - stats_counter = &stats->n_hit; - ovs_flow_used(OVS_CB(skb)->flow, skb); + ovs_flow_stats_update(OVS_CB(skb)->flow, key.tp.flags, skb); ovs_execute_actions(dp, skb); + stats_counter = &stats->n_hit; out: /* Update datapath statistics. */ - u64_stats_update_begin(&stats->sync); + u64_stats_update_begin(&stats->syncp); (*stats_counter)++; stats->n_mask_hit += n_mask_hit; - u64_stats_update_end(&stats->sync); + u64_stats_update_end(&stats->syncp); } -static struct genl_family dp_packet_genl_family = { - .id = GENL_ID_GENERATE, - .hdrsize = sizeof(struct ovs_header), - .name = OVS_PACKET_FAMILY, - .version = OVS_PACKET_VERSION, - .maxattr = OVS_PACKET_ATTR_MAX, - .netnsok = true, - .parallel_ops = true, -}; - int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, const struct dp_upcall_info *upcall_info) { struct dp_stats_percpu *stats; - int dp_ifindex; int err; if (upcall_info->portid == 0) { @@ -285,16 +299,10 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, goto err; } - dp_ifindex = get_dpifindex(dp); - if (!dp_ifindex) { - err = -ENODEV; - goto err; - } - if (!skb_is_gso(skb)) - err = queue_userspace_packet(ovs_dp_get_net(dp), dp_ifindex, skb, upcall_info); + err = queue_userspace_packet(dp, skb, upcall_info); else - err = queue_gso_packets(ovs_dp_get_net(dp), dp_ifindex, skb, upcall_info); + err = queue_gso_packets(dp, skb, upcall_info); if (err) goto err; @@ -303,15 +311,14 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, err: stats = this_cpu_ptr(dp->stats_percpu); - u64_stats_update_begin(&stats->sync); + u64_stats_update_begin(&stats->syncp); stats->n_lost++; - u64_stats_update_end(&stats->sync); + u64_stats_update_end(&stats->syncp); return err; } -static int queue_gso_packets(struct net *net, int dp_ifindex, - struct sk_buff *skb, +static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, const struct dp_upcall_info *upcall_info) { unsigned short gso_type = skb_shinfo(skb)->gso_type; @@ -320,14 +327,14 @@ static int queue_gso_packets(struct net *net, int dp_ifindex, struct sk_buff *segs, *nskb; int err; - segs = __skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM, false); + segs = __skb_gso_segment(skb, NETIF_F_SG, false); if (IS_ERR(segs)) return PTR_ERR(segs); /* Queue all of the segments. 
*/ skb = segs; do { - err = queue_userspace_packet(net, dp_ifindex, skb, upcall_info); + err = queue_userspace_packet(dp, skb, upcall_info); if (err) break; @@ -380,11 +387,11 @@ static size_t key_attr_size(void) + nla_total_size(28); /* OVS_KEY_ATTR_ND */ } -static size_t upcall_msg_size(const struct sk_buff *skb, - const struct nlattr *userdata) +static size_t upcall_msg_size(const struct nlattr *userdata, + unsigned int hdrlen) { size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) - + nla_total_size(skb->len) /* OVS_PACKET_ATTR_PACKET */ + + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ + nla_total_size(key_attr_size()); /* OVS_PACKET_ATTR_KEY */ /* OVS_PACKET_ATTR_USERDATA */ @@ -394,15 +401,24 @@ static size_t upcall_msg_size(const struct sk_buff *skb, return size; } -static int queue_userspace_packet(struct net *net, int dp_ifindex, - struct sk_buff *skb, +static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, const struct dp_upcall_info *upcall_info) { struct ovs_header *upcall; struct sk_buff *nskb = NULL; struct sk_buff *user_skb; /* to be queued to userspace */ struct nlattr *nla; - int err; + struct genl_info info = { + .dst_sk = ovs_dp_get_net(dp)->genl_sock, + .snd_portid = upcall_info->portid, + }; + size_t len; + unsigned int hlen; + int err, dp_ifindex; + + dp_ifindex = get_dpifindex(dp); + if (!dp_ifindex) + return -ENODEV; if (vlan_tx_tag_present(skb)) { nskb = skb_clone(skb, GFP_ATOMIC); @@ -422,7 +438,22 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex, goto out; } - user_skb = genlmsg_new(upcall_msg_size(skb, upcall_info->userdata), GFP_ATOMIC); + /* Complete checksum if needed */ + if (skb->ip_summed == CHECKSUM_PARTIAL && + (err = skb_checksum_help(skb))) + goto out; + + /* Older versions of OVS user space enforce alignment of the last + * Netlink attribute to NLA_ALIGNTO which would require extensive + * padding logic. Only perform zerocopy if padding is not required. 
+ */ + if (dp->user_features & OVS_DP_F_UNALIGNED) + hlen = skb_zerocopy_headlen(skb); + else + hlen = skb->len; + + len = upcall_msg_size(upcall_info->userdata, hlen); + user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; goto out; @@ -441,26 +472,36 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex, nla_len(upcall_info->userdata), nla_data(upcall_info->userdata)); - nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len); + /* Only reserve room for attribute header, packet data is added + * in skb_zerocopy() */ + if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { + err = -ENOBUFS; + goto out; + } + nla->nla_len = nla_attr_size(skb->len); - skb_copy_and_csum_dev(skb, nla_data(nla)); + err = skb_zerocopy(user_skb, skb, skb->len, hlen); + if (err) + goto out; - genlmsg_end(user_skb, upcall); - err = genlmsg_unicast(net, user_skb, upcall_info->portid); + /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */ + if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { + size_t plen = NLA_ALIGN(user_skb->len) - user_skb->len; + if (plen > 0) + memset(skb_put(user_skb, plen), 0, plen); + } + + ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len; + + err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid); out: + if (err) + skb_tx_error(skb); kfree_skb(nskb); return err; } -static void clear_stats(struct sw_flow *flow) -{ - flow->used = 0; - flow->tcp_flags = 0; - flow->packet_count = 0; - flow->byte_count = 0; -} - static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) { struct ovs_header *ovs_header = info->userhdr; @@ -565,6 +606,18 @@ static const struct genl_ops dp_packet_genl_ops[] = { } }; +static struct genl_family dp_packet_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = sizeof(struct ovs_header), + .name = OVS_PACKET_FAMILY, + .version = OVS_PACKET_VERSION, + .maxattr = OVS_PACKET_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, + .ops = dp_packet_genl_ops, + .n_ops = ARRAY_SIZE(dp_packet_genl_ops), +}; + static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats, struct ovs_dp_megaflow_stats *mega_stats) { @@ -585,9 +638,9 @@ static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats, percpu_stats = per_cpu_ptr(dp->stats_percpu, i); do { - start = u64_stats_fetch_begin_bh(&percpu_stats->sync); + start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); local_stats = *percpu_stats; - } while (u64_stats_fetch_retry_bh(&percpu_stats->sync, start)); + } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); stats->n_hit += local_stats.n_hit; stats->n_missed += local_stats.n_missed; @@ -596,26 +649,6 @@ static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats, } } -static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { - [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED }, - [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED }, - [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG }, -}; - -static struct genl_family dp_flow_genl_family = { - .id = GENL_ID_GENERATE, - .hdrsize = sizeof(struct ovs_header), - .name = OVS_FLOW_FAMILY, - .version = OVS_FLOW_VERSION, - .maxattr = OVS_FLOW_ATTR_MAX, - .netnsok = true, - .parallel_ops = true, -}; - -static struct genl_multicast_group ovs_dp_flow_multicast_group = { - .name = OVS_FLOW_MCGROUP -}; - static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts) { return NLMSG_ALIGN(sizeof(struct ovs_header)) @@ -627,25 +660,25 @@ static size_t 
ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts) + nla_total_size(acts->actions_len); /* OVS_FLOW_ATTR_ACTIONS */ } -/* Called with ovs_mutex. */ -static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, struct sk_buff *skb, u32 portid, u32 seq, u32 flags, u8 cmd) { const int skb_orig_len = skb->len; struct nlattr *start; struct ovs_flow_stats stats; + __be16 tcp_flags; + unsigned long used; struct ovs_header *ovs_header; struct nlattr *nla; - unsigned long used; - u8 tcp_flags; int err; ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, flags, cmd); if (!ovs_header) return -EMSGSIZE; - ovs_header->dp_ifindex = get_dpifindex(dp); + ovs_header->dp_ifindex = dp_ifindex; /* Fill flow key. */ nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY); @@ -667,24 +700,18 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, nla_nest_end(skb, nla); - spin_lock_bh(&flow->lock); - used = flow->used; - stats.n_packets = flow->packet_count; - stats.n_bytes = flow->byte_count; - tcp_flags = (u8)ntohs(flow->tcp_flags); - spin_unlock_bh(&flow->lock); + ovs_flow_stats_get(flow, &stats, &used, &tcp_flags); if (used && nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used))) goto nla_put_failure; if (stats.n_packets && - nla_put(skb, OVS_FLOW_ATTR_STATS, - sizeof(struct ovs_flow_stats), &stats)) + nla_put(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats)) goto nla_put_failure; - if (tcp_flags && - nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, tcp_flags)) + if ((u8)ntohs(tcp_flags) && + nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags))) goto nla_put_failure; /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if @@ -701,11 +728,10 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, if (start) { const struct sw_flow_actions *sf_acts; - sf_acts = rcu_dereference_check(flow->sf_acts, - lockdep_ovsl_is_held()); - + sf_acts = rcu_dereference_ovsl(flow->sf_acts); err = ovs_nla_put_actions(sf_acts->actions, sf_acts->actions_len, skb); + if (!err) nla_nest_end(skb, start); else { @@ -726,49 +752,192 @@ error: return err; } -static struct sk_buff *ovs_flow_cmd_alloc_info(struct sw_flow *flow) +/* May not be called with RCU read lock. */ +static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *acts, + struct genl_info *info, + bool always) { - const struct sw_flow_actions *sf_acts; + struct sk_buff *skb; - sf_acts = ovsl_dereference(flow->sf_acts); + if (!always && !ovs_must_notify(info, &ovs_dp_flow_multicast_group)) + return NULL; + + skb = genlmsg_new_unicast(ovs_flow_cmd_msg_size(acts), info, GFP_KERNEL); + if (!skb) + return ERR_PTR(-ENOMEM); - return genlmsg_new(ovs_flow_cmd_msg_size(sf_acts), GFP_KERNEL); + return skb; } -static struct sk_buff *ovs_flow_cmd_build_info(struct sw_flow *flow, - struct datapath *dp, - u32 portid, u32 seq, u8 cmd) +/* Called with ovs_mutex. 
*/ +static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, + int dp_ifindex, + struct genl_info *info, u8 cmd, + bool always) { struct sk_buff *skb; int retval; - skb = ovs_flow_cmd_alloc_info(flow); - if (!skb) - return ERR_PTR(-ENOMEM); + skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts), info, + always); + if (!skb || IS_ERR(skb)) + return skb; - retval = ovs_flow_cmd_fill_info(flow, dp, skb, portid, seq, 0, cmd); + retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb, + info->snd_portid, info->snd_seq, 0, + cmd); BUG_ON(retval < 0); return skb; } -static struct sw_flow *__ovs_flow_tbl_lookup(struct flow_table *tbl, - const struct sw_flow_key *key) +static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) { - u32 __always_unused n_mask_hit; + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct sw_flow *flow, *new_flow; + struct sw_flow_mask mask; + struct sk_buff *reply; + struct datapath *dp; + struct sw_flow_actions *acts; + struct sw_flow_match match; + int error; + + /* Must have key and actions. */ + error = -EINVAL; + if (!a[OVS_FLOW_ATTR_KEY]) + goto error; + if (!a[OVS_FLOW_ATTR_ACTIONS]) + goto error; + + /* Most of the time we need to allocate a new flow, do it before + * locking. + */ + new_flow = ovs_flow_alloc(); + if (IS_ERR(new_flow)) { + error = PTR_ERR(new_flow); + goto error; + } + + /* Extract key. */ + ovs_match_init(&match, &new_flow->unmasked_key, &mask); + error = ovs_nla_get_match(&match, + a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]); + if (error) + goto err_kfree_flow; + + ovs_flow_mask_key(&new_flow->key, &new_flow->unmasked_key, &mask); + + /* Validate actions. */ + acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_FLOW_ATTR_ACTIONS])); + error = PTR_ERR(acts); + if (IS_ERR(acts)) + goto err_kfree_flow; + + error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, + 0, &acts); + if (error) { + OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); + goto err_kfree_acts; + } + + reply = ovs_flow_cmd_alloc_info(acts, info, false); + if (IS_ERR(reply)) { + error = PTR_ERR(reply); + goto err_kfree_acts; + } + + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (unlikely(!dp)) { + error = -ENODEV; + goto err_unlock_ovs; + } + /* Check if this is a duplicate flow */ + flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->unmasked_key); + if (likely(!flow)) { + rcu_assign_pointer(new_flow->sf_acts, acts); + + /* Put flow in bucket. */ + error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask); + if (unlikely(error)) { + acts = NULL; + goto err_unlock_ovs; + } + + if (unlikely(reply)) { + error = ovs_flow_cmd_fill_info(new_flow, + ovs_header->dp_ifindex, + reply, info->snd_portid, + info->snd_seq, 0, + OVS_FLOW_CMD_NEW); + BUG_ON(error < 0); + } + ovs_unlock(); + } else { + struct sw_flow_actions *old_acts; + + /* Bail out if we're not allowed to modify an existing flow. + * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL + * because Generic Netlink treats the latter as a dump + * request. We also accept NLM_F_EXCL in case that bug ever + * gets fixed. + */ + if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE + | NLM_F_EXCL))) { + error = -EEXIST; + goto err_unlock_ovs; + } + /* The unmasked key has to be the same for flow updates. 
*/ + if (unlikely(!ovs_flow_cmp_unmasked_key(flow, &match))) { + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (!flow) { + error = -ENOENT; + goto err_unlock_ovs; + } + } + /* Update actions. */ + old_acts = ovsl_dereference(flow->sf_acts); + rcu_assign_pointer(flow->sf_acts, acts); + + if (unlikely(reply)) { + error = ovs_flow_cmd_fill_info(flow, + ovs_header->dp_ifindex, + reply, info->snd_portid, + info->snd_seq, 0, + OVS_FLOW_CMD_NEW); + BUG_ON(error < 0); + } + ovs_unlock(); + + ovs_nla_free_flow_actions(old_acts); + ovs_flow_free(new_flow, false); + } - return ovs_flow_tbl_lookup(tbl, key, &n_mask_hit); + if (reply) + ovs_notify(&dp_flow_genl_family, reply, info); + return 0; + +err_unlock_ovs: + ovs_unlock(); + kfree_skb(reply); +err_kfree_acts: + kfree(acts); +err_kfree_flow: + ovs_flow_free(new_flow, false); +error: + return error; } -static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) +static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; struct sw_flow_key key, masked_key; - struct sw_flow *flow = NULL; + struct sw_flow *flow; struct sw_flow_mask mask; - struct sk_buff *reply; + struct sk_buff *reply = NULL; struct datapath *dp; - struct sw_flow_actions *acts = NULL; + struct sw_flow_actions *old_acts = NULL, *acts = NULL; struct sw_flow_match match; int error; @@ -795,99 +964,71 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) &masked_key, 0, &acts); if (error) { OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); - goto err_kfree; + goto err_kfree_acts; + } + } + + /* Can allocate before locking if have acts. */ + if (acts) { + reply = ovs_flow_cmd_alloc_info(acts, info, false); + if (IS_ERR(reply)) { + error = PTR_ERR(reply); + goto err_kfree_acts; } - } else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) { - error = -EINVAL; - goto error; } ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); - error = -ENODEV; - if (!dp) + if (unlikely(!dp)) { + error = -ENODEV; goto err_unlock_ovs; - - /* Check if this is a duplicate flow */ - flow = __ovs_flow_tbl_lookup(&dp->table, &key); - if (!flow) { - /* Bail out if we're not allowed to create a new flow. */ + } + /* Check that the flow exists. */ + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (unlikely(!flow)) { error = -ENOENT; - if (info->genlhdr->cmd == OVS_FLOW_CMD_SET) - goto err_unlock_ovs; - - /* Allocate flow. */ - flow = ovs_flow_alloc(); - if (IS_ERR(flow)) { - error = PTR_ERR(flow); - goto err_unlock_ovs; - } - clear_stats(flow); + goto err_unlock_ovs; + } - flow->key = masked_key; - flow->unmasked_key = key; + /* Update actions, if present. */ + if (likely(acts)) { + old_acts = ovsl_dereference(flow->sf_acts); rcu_assign_pointer(flow->sf_acts, acts); - /* Put flow in bucket. */ - error = ovs_flow_tbl_insert(&dp->table, flow, &mask); - if (error) { - acts = NULL; - goto err_flow_free; + if (unlikely(reply)) { + error = ovs_flow_cmd_fill_info(flow, + ovs_header->dp_ifindex, + reply, info->snd_portid, + info->snd_seq, 0, + OVS_FLOW_CMD_NEW); + BUG_ON(error < 0); } - - reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, - info->snd_seq, OVS_FLOW_CMD_NEW); } else { - /* We found a matching flow. */ - struct sw_flow_actions *old_acts; - - /* Bail out if we're not allowed to modify an existing flow. 
- * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL - * because Generic Netlink treats the latter as a dump - * request. We also accept NLM_F_EXCL in case that bug ever - * gets fixed. - */ - error = -EEXIST; - if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW && - info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) - goto err_unlock_ovs; - - /* The unmasked key has to be the same for flow updates. */ - error = -EINVAL; - if (!ovs_flow_cmp_unmasked_key(flow, &match)) { - OVS_NLERR("Flow modification message rejected, unmasked key does not match.\n"); + /* Could not alloc without acts before locking. */ + reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, + info, OVS_FLOW_CMD_NEW, false); + if (unlikely(IS_ERR(reply))) { + error = PTR_ERR(reply); goto err_unlock_ovs; } - - /* Update actions. */ - old_acts = ovsl_dereference(flow->sf_acts); - rcu_assign_pointer(flow->sf_acts, acts); - ovs_nla_free_flow_actions(old_acts); - - reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, - info->snd_seq, OVS_FLOW_CMD_NEW); - - /* Clear stats. */ - if (a[OVS_FLOW_ATTR_CLEAR]) { - spin_lock_bh(&flow->lock); - clear_stats(flow); - spin_unlock_bh(&flow->lock); - } } + + /* Clear stats. */ + if (a[OVS_FLOW_ATTR_CLEAR]) + ovs_flow_stats_clear(flow); ovs_unlock(); - if (!IS_ERR(reply)) + if (reply) ovs_notify(&dp_flow_genl_family, reply, info); - else - genl_set_err(&dp_flow_genl_family, sock_net(skb->sk), 0, - 0, PTR_ERR(reply)); + if (old_acts) + ovs_nla_free_flow_actions(old_acts); + return 0; -err_flow_free: - ovs_flow_free(flow, false); err_unlock_ovs: ovs_unlock(); -err_kfree: + kfree_skb(reply); +err_kfree_acts: kfree(acts); error: return error; @@ -921,14 +1062,14 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) goto unlock; } - flow = __ovs_flow_tbl_lookup(&dp->table, &key); - if (!flow || !ovs_flow_cmp_unmasked_key(flow, &match)) { + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (!flow) { err = -ENOENT; goto unlock; } - reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, - info->snd_seq, OVS_FLOW_CMD_NEW); + reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info, + OVS_FLOW_CMD_NEW, true); if (IS_ERR(reply)) { err = PTR_ERR(reply); goto unlock; @@ -952,45 +1093,53 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) struct sw_flow_match match; int err; + if (likely(a[OVS_FLOW_ATTR_KEY])) { + ovs_match_init(&match, &key, NULL); + err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL); + if (unlikely(err)) + return err; + } + ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); - if (!dp) { + if (unlikely(!dp)) { err = -ENODEV; goto unlock; } - if (!a[OVS_FLOW_ATTR_KEY]) { + if (unlikely(!a[OVS_FLOW_ATTR_KEY])) { err = ovs_flow_tbl_flush(&dp->table); goto unlock; } - ovs_match_init(&match, &key, NULL); - err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL); - if (err) - goto unlock; - - flow = __ovs_flow_tbl_lookup(&dp->table, &key); - if (!flow || !ovs_flow_cmp_unmasked_key(flow, &match)) { + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (unlikely(!flow)) { err = -ENOENT; goto unlock; } - reply = ovs_flow_cmd_alloc_info(flow); - if (!reply) { - err = -ENOMEM; - goto unlock; - } - ovs_flow_tbl_remove(&dp->table, flow); + ovs_unlock(); - err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_portid, - info->snd_seq, 0, OVS_FLOW_CMD_DEL); - BUG_ON(err < 0); + reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *) flow->sf_acts, + info, 
false); + if (likely(reply)) { + if (likely(!IS_ERR(reply))) { + rcu_read_lock(); /*To keep RCU checker happy. */ + err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, + reply, info->snd_portid, + info->snd_seq, 0, + OVS_FLOW_CMD_DEL); + rcu_read_unlock(); + BUG_ON(err < 0); + + ovs_notify(&dp_flow_genl_family, reply, info); + } else { + netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0, PTR_ERR(reply)); + } + } ovs_flow_free(flow, true); - ovs_unlock(); - - ovs_notify(&dp_flow_genl_family, reply, info); return 0; unlock: ovs_unlock(); @@ -1021,7 +1170,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) if (!flow) break; - if (ovs_flow_cmd_fill_info(flow, dp, skb, + if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, OVS_FLOW_CMD_NEW) < 0) @@ -1034,11 +1183,17 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static const struct genl_ops dp_flow_genl_ops[] = { +static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { + [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED }, + [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED }, + [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG }, +}; + +static struct genl_ops dp_flow_genl_ops[] = { { .cmd = OVS_FLOW_CMD_NEW, .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = flow_policy, - .doit = ovs_flow_cmd_new_or_set + .doit = ovs_flow_cmd_new }, { .cmd = OVS_FLOW_CMD_DEL, .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ @@ -1054,27 +1209,22 @@ static const struct genl_ops dp_flow_genl_ops[] = { { .cmd = OVS_FLOW_CMD_SET, .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = flow_policy, - .doit = ovs_flow_cmd_new_or_set, + .doit = ovs_flow_cmd_set, }, }; -static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { - [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, - [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, -}; - -static struct genl_family dp_datapath_genl_family = { +static struct genl_family dp_flow_genl_family = { .id = GENL_ID_GENERATE, .hdrsize = sizeof(struct ovs_header), - .name = OVS_DATAPATH_FAMILY, - .version = OVS_DATAPATH_VERSION, - .maxattr = OVS_DP_ATTR_MAX, + .name = OVS_FLOW_FAMILY, + .version = OVS_FLOW_VERSION, + .maxattr = OVS_FLOW_ATTR_MAX, .netnsok = true, .parallel_ops = true, -}; - -static struct genl_multicast_group ovs_dp_datapath_multicast_group = { - .name = OVS_DATAPATH_MCGROUP + .ops = dp_flow_genl_ops, + .n_ops = ARRAY_SIZE(dp_flow_genl_ops), + .mcgrps = &ovs_dp_flow_multicast_group, + .n_mcgrps = 1, }; static size_t ovs_dp_cmd_msg_size(void) @@ -1084,10 +1234,12 @@ static size_t ovs_dp_cmd_msg_size(void) msgsize += nla_total_size(IFNAMSIZ); msgsize += nla_total_size(sizeof(struct ovs_dp_stats)); msgsize += nla_total_size(sizeof(struct ovs_dp_megaflow_stats)); + msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */ return msgsize; } +/* Called with ovs_mutex or RCU read lock. 
*/ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, u32 portid, u32 seq, u32 flags, u8 cmd) { @@ -1103,9 +1255,7 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, ovs_header->dp_ifindex = get_dpifindex(dp); - rcu_read_lock(); err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp)); - rcu_read_unlock(); if (err) goto nla_put_failure; @@ -1119,6 +1269,9 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, &dp_megaflow_stats)) goto nla_put_failure; + if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) + goto nla_put_failure; + return genlmsg_end(skb, ovs_header); nla_put_failure: @@ -1127,25 +1280,12 @@ error: return -EMSGSIZE; } -static struct sk_buff *ovs_dp_cmd_build_info(struct datapath *dp, u32 portid, - u32 seq, u8 cmd) +static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info) { - struct sk_buff *skb; - int retval; - - skb = genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL); - if (!skb) - return ERR_PTR(-ENOMEM); - - retval = ovs_dp_cmd_fill_info(dp, skb, portid, seq, 0, cmd); - if (retval < 0) { - kfree_skb(skb); - return ERR_PTR(retval); - } - return skb; + return genlmsg_new_unicast(ovs_dp_cmd_msg_size(), info, GFP_KERNEL); } -/* Called with ovs_mutex. */ +/* Called with rcu_read_lock or ovs_mutex. */ static struct datapath *lookup_datapath(struct net *net, struct ovs_header *ovs_header, struct nlattr *a[OVS_DP_ATTR_MAX + 1]) @@ -1157,14 +1297,30 @@ static struct datapath *lookup_datapath(struct net *net, else { struct vport *vport; - rcu_read_lock(); vport = ovs_vport_locate(net, nla_data(a[OVS_DP_ATTR_NAME])); dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL; - rcu_read_unlock(); } return dp ? dp : ERR_PTR(-ENODEV); } +static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info) +{ + struct datapath *dp; + + dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + if (IS_ERR(dp)) + return; + + WARN(dp->user_features, "Dropping previously announced user features\n"); + dp->user_features = 0; +} + +static void ovs_dp_change(struct datapath *dp, struct nlattr **a) +{ + if (a[OVS_DP_ATTR_USER_FEATURES]) + dp->user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); +} + static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; @@ -1179,12 +1335,14 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) goto err; - ovs_lock(); + reply = ovs_dp_cmd_alloc_info(info); + if (!reply) + return -ENOMEM; err = -ENOMEM; dp = kzalloc(sizeof(*dp), GFP_KERNEL); if (dp == NULL) - goto err_unlock_ovs; + goto err_free_reply; ovs_dp_set_net(dp, hold_net(sock_net(skb->sk))); @@ -1193,18 +1351,12 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) if (err) goto err_free_dp; - dp->stats_percpu = alloc_percpu(struct dp_stats_percpu); + dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu); if (!dp->stats_percpu) { err = -ENOMEM; goto err_destroy_table; } - for_each_possible_cpu(i) { - struct dp_stats_percpu *dpath_stats; - dpath_stats = per_cpu_ptr(dp->stats_percpu, i); - u64_stats_init(&dpath_stats->sync); - } - dp->ports = kmalloc(DP_VPORT_HASH_BUCKETS * sizeof(struct hlist_head), GFP_KERNEL); if (!dp->ports) { @@ -1223,20 +1375,32 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) parms.port_no = OVSP_LOCAL; parms.upcall_portid = nla_get_u32(a[OVS_DP_ATTR_UPCALL_PID]); + 
ovs_dp_change(dp, a); + + /* So far only local changes have been made, now need the lock. */ + ovs_lock(); + vport = new_vport(&parms); if (IS_ERR(vport)) { err = PTR_ERR(vport); if (err == -EBUSY) err = -EEXIST; + if (err == -EEXIST) { + /* An outdated user space instance that does not understand + * the concept of user_features has attempted to create a new + * datapath and is likely to reuse it. Drop all user features. + */ + if (info->genlhdr->version < OVS_DP_VER_FEATURES) + ovs_dp_reset_user_features(skb, info); + } + goto err_destroy_ports_array; } - reply = ovs_dp_cmd_build_info(dp, info->snd_portid, - info->snd_seq, OVS_DP_CMD_NEW); - err = PTR_ERR(reply); - if (IS_ERR(reply)) - goto err_destroy_local_port; + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, + info->snd_seq, 0, OVS_DP_CMD_NEW); + BUG_ON(err < 0); ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id); list_add_tail_rcu(&dp->list_node, &ovs_net->dps); @@ -1246,19 +1410,18 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_notify(&dp_datapath_genl_family, reply, info); return 0; -err_destroy_local_port: - ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); err_destroy_ports_array: + ovs_unlock(); kfree(dp->ports); err_destroy_percpu: free_percpu(dp->stats_percpu); err_destroy_table: - ovs_flow_tbl_destroy(&dp->table); + ovs_flow_tbl_destroy(&dp->table, false); err_free_dp: release_net(ovs_dp_get_net(dp)); kfree(dp); -err_unlock_ovs: - ovs_unlock(); +err_free_reply: + kfree_skb(reply); err: return err; } @@ -1280,10 +1443,13 @@ static void __dp_destroy(struct datapath *dp) list_del_rcu(&dp->list_node); /* OVSP_LOCAL is datapath internal port. We need to make sure that - * all port in datapath are destroyed first before freeing datapath. + * all ports in datapath are destroyed first before freeing datapath. 
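
The datapath stats allocation above switches to netdev_alloc_pcpu_stats(), which folds the removed alloc_percpu()/for_each_possible_cpu()/u64_stats_init() sequence into one macro. A minimal sketch of its use follows (my_pcpu_stats is an illustrative type, not an OVS one); note that the macro initializes a member that must be named syncp, which is also why the datapath.h hunk below renames dp_stats_percpu's seqcount from 'sync' to 'syncp'.

/* Sketch: per-CPU stats allocated with netdev_alloc_pcpu_stats().
 * The macro runs u64_stats_init() on each CPU's instance and expects
 * the seqcount member to be called 'syncp'.
 */
struct my_pcpu_stats {
    u64                     packets;
    u64                     bytes;
    struct u64_stats_sync   syncp;
};

static struct my_pcpu_stats __percpu *my_stats_alloc(void)
{
    return netdev_alloc_pcpu_stats(struct my_pcpu_stats); /* NULL on failure */
}
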
*/ ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); + /* RCU destroy the flow table */ + ovs_flow_tbl_destroy(&dp->table, true); + call_rcu(&dp->rcu, destroy_dp_rcu); } @@ -1293,17 +1459,19 @@ static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; + reply = ovs_dp_cmd_alloc_info(info); + if (!reply) + return -ENOMEM; + ovs_lock(); dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); err = PTR_ERR(dp); if (IS_ERR(dp)) - goto unlock; + goto err_unlock_free; - reply = ovs_dp_cmd_build_info(dp, info->snd_portid, - info->snd_seq, OVS_DP_CMD_DEL); - err = PTR_ERR(reply); - if (IS_ERR(reply)) - goto unlock; + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, + info->snd_seq, 0, OVS_DP_CMD_DEL); + BUG_ON(err < 0); __dp_destroy(dp); ovs_unlock(); @@ -1311,8 +1479,10 @@ static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) ovs_notify(&dp_datapath_genl_family, reply, info); return 0; -unlock: + +err_unlock_free: ovs_unlock(); + kfree_skb(reply); return err; } @@ -1322,28 +1492,30 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; + reply = ovs_dp_cmd_alloc_info(info); + if (!reply) + return -ENOMEM; + ovs_lock(); dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); err = PTR_ERR(dp); if (IS_ERR(dp)) - goto unlock; + goto err_unlock_free; - reply = ovs_dp_cmd_build_info(dp, info->snd_portid, - info->snd_seq, OVS_DP_CMD_NEW); - if (IS_ERR(reply)) { - err = PTR_ERR(reply); - genl_set_err(&dp_datapath_genl_family, sock_net(skb->sk), 0, - 0, err); - err = 0; - goto unlock; - } + ovs_dp_change(dp, info->attrs); + + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, + info->snd_seq, 0, OVS_DP_CMD_NEW); + BUG_ON(err < 0); ovs_unlock(); ovs_notify(&dp_datapath_genl_family, reply, info); return 0; -unlock: + +err_unlock_free: ovs_unlock(); + kfree_skb(reply); return err; } @@ -1353,25 +1525,26 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; - ovs_lock(); + reply = ovs_dp_cmd_alloc_info(info); + if (!reply) + return -ENOMEM; + + rcu_read_lock(); dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); if (IS_ERR(dp)) { err = PTR_ERR(dp); - goto unlock; - } - - reply = ovs_dp_cmd_build_info(dp, info->snd_portid, - info->snd_seq, OVS_DP_CMD_NEW); - if (IS_ERR(reply)) { - err = PTR_ERR(reply); - goto unlock; + goto err_unlock_free; } + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, + info->snd_seq, 0, OVS_DP_CMD_NEW); + BUG_ON(err < 0); + rcu_read_unlock(); - ovs_unlock(); return genlmsg_reply(reply, info); -unlock: - ovs_unlock(); +err_unlock_free: + rcu_read_unlock(); + kfree_skb(reply); return err; } @@ -1398,7 +1571,13 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static const struct genl_ops dp_datapath_genl_ops[] = { +static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { + [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, + [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, + [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 }, +}; + +static struct genl_ops dp_datapath_genl_ops[] = { { .cmd = OVS_DP_CMD_NEW, .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. 
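
The reworked OVS_DP_CMD_DEL/SET/GET handlers above share one shape: allocate the reply before taking ovs_lock (or rcu_read_lock), fill it while the datapath is pinned, and treat a fill failure as a bug because the buffer was sized for a worst-case reply up front. A condensed sketch of that shape, with hypothetical my_* helpers standing in for the OVS ones:

/* Sketch only: my_family, my_cmd_msg_size(), my_lookup() and
 * my_fill_info() are illustrative stand-ins, not OVS symbols.
 */
static int my_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
    struct sk_buff *reply;
    struct my_obj *obj;
    int err;

    /* Allocate before taking any lock; this may sleep and may fail. */
    reply = genlmsg_new(my_cmd_msg_size(), GFP_KERNEL);
    if (!reply)
        return -ENOMEM;

    rcu_read_lock();
    obj = my_lookup(info);
    if (IS_ERR(obj)) {
        err = PTR_ERR(obj);
        goto err_unlock_free;
    }

    /* Cannot fail: the skb was sized for a full reply above. */
    err = my_fill_info(obj, reply, info->snd_portid, info->snd_seq,
                       0, MY_CMD_GET);
    BUG_ON(err < 0);
    rcu_read_unlock();

    return genlmsg_reply(reply, info);

err_unlock_free:
    rcu_read_unlock();
    kfree_skb(reply);
    return err;
}
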
*/ .policy = datapath_policy, @@ -1422,27 +1601,18 @@ static const struct genl_ops dp_datapath_genl_ops[] = { }, }; -static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { - [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, - [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, - [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 }, - [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, - [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 }, - [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, -}; - -struct genl_family dp_vport_genl_family = { +static struct genl_family dp_datapath_genl_family = { .id = GENL_ID_GENERATE, .hdrsize = sizeof(struct ovs_header), - .name = OVS_VPORT_FAMILY, - .version = OVS_VPORT_VERSION, - .maxattr = OVS_VPORT_ATTR_MAX, + .name = OVS_DATAPATH_FAMILY, + .version = OVS_DATAPATH_VERSION, + .maxattr = OVS_DP_ATTR_MAX, .netnsok = true, .parallel_ops = true, -}; - -struct genl_multicast_group ovs_dp_vport_multicast_group = { - .name = OVS_VPORT_MCGROUP + .ops = dp_datapath_genl_ops, + .n_ops = ARRAY_SIZE(dp_datapath_genl_ops), + .mcgrps = &ovs_dp_datapath_multicast_group, + .n_mcgrps = 1, }; /* Called with ovs_mutex or RCU read lock. */ @@ -1484,7 +1654,12 @@ error: return err; } -/* Called with ovs_mutex or RCU read lock. */ +static struct sk_buff *ovs_vport_cmd_alloc_info(void) +{ + return nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +} + +/* Called with ovs_mutex, only via ovs_dp_notify_wq(). */ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid, u32 seq, u8 cmd) { @@ -1546,33 +1721,35 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) u32 port_no; int err; - err = -EINVAL; if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] || !a[OVS_VPORT_ATTR_UPCALL_PID]) - goto exit; + return -EINVAL; + + port_no = a[OVS_VPORT_ATTR_PORT_NO] + ? 
nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]) : 0; + if (port_no >= DP_MAX_PORTS) + return -EFBIG; + + reply = ovs_vport_cmd_alloc_info(); + if (!reply) + return -ENOMEM; ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); err = -ENODEV; if (!dp) - goto exit_unlock; - - if (a[OVS_VPORT_ATTR_PORT_NO]) { - port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]); - - err = -EFBIG; - if (port_no >= DP_MAX_PORTS) - goto exit_unlock; + goto exit_unlock_free; + if (port_no) { vport = ovs_vport_ovsl(dp, port_no); err = -EBUSY; if (vport) - goto exit_unlock; + goto exit_unlock_free; } else { for (port_no = 1; ; port_no++) { if (port_no >= DP_MAX_PORTS) { err = -EFBIG; - goto exit_unlock; + goto exit_unlock_free; } vport = ovs_vport_ovsl(dp, port_no); if (!vport) @@ -1590,22 +1767,19 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) vport = new_vport(&parms); err = PTR_ERR(vport); if (IS_ERR(vport)) - goto exit_unlock; + goto exit_unlock_free; - err = 0; - reply = ovs_vport_cmd_build_info(vport, info->snd_portid, info->snd_seq, - OVS_VPORT_CMD_NEW); - if (IS_ERR(reply)) { - err = PTR_ERR(reply); - ovs_dp_detach_port(vport); - goto exit_unlock; - } + err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, + info->snd_seq, 0, OVS_VPORT_CMD_NEW); + BUG_ON(err < 0); + ovs_unlock(); ovs_notify(&dp_vport_genl_family, reply, info); + return 0; -exit_unlock: +exit_unlock_free: ovs_unlock(); -exit: + kfree_skb(reply); return err; } @@ -1616,28 +1790,26 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) struct vport *vport; int err; + reply = ovs_vport_cmd_alloc_info(); + if (!reply) + return -ENOMEM; + ovs_lock(); vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); err = PTR_ERR(vport); if (IS_ERR(vport)) - goto exit_unlock; + goto exit_unlock_free; if (a[OVS_VPORT_ATTR_TYPE] && nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type) { err = -EINVAL; - goto exit_unlock; - } - - reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!reply) { - err = -ENOMEM; - goto exit_unlock; + goto exit_unlock_free; } if (a[OVS_VPORT_ATTR_OPTIONS]) { err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]); if (err) - goto exit_free; + goto exit_unlock_free; } if (a[OVS_VPORT_ATTR_UPCALL_PID]) @@ -1651,10 +1823,9 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) ovs_notify(&dp_vport_genl_family, reply, info); return 0; -exit_free: - kfree_skb(reply); -exit_unlock: +exit_unlock_free: ovs_unlock(); + kfree_skb(reply); return err; } @@ -1665,30 +1836,33 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) struct vport *vport; int err; + reply = ovs_vport_cmd_alloc_info(); + if (!reply) + return -ENOMEM; + ovs_lock(); vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); err = PTR_ERR(vport); if (IS_ERR(vport)) - goto exit_unlock; + goto exit_unlock_free; if (vport->port_no == OVSP_LOCAL) { err = -EINVAL; - goto exit_unlock; + goto exit_unlock_free; } - reply = ovs_vport_cmd_build_info(vport, info->snd_portid, - info->snd_seq, OVS_VPORT_CMD_DEL); - err = PTR_ERR(reply); - if (IS_ERR(reply)) - goto exit_unlock; - - err = 0; + err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, + info->snd_seq, 0, OVS_VPORT_CMD_DEL); + BUG_ON(err < 0); ovs_dp_detach_port(vport); + ovs_unlock(); ovs_notify(&dp_vport_genl_family, reply, info); + return 0; -exit_unlock: +exit_unlock_free: ovs_unlock(); + kfree_skb(reply); return err; } @@ -1700,24 +1874,25 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, 
struct genl_info *info) struct vport *vport; int err; + reply = ovs_vport_cmd_alloc_info(); + if (!reply) + return -ENOMEM; + rcu_read_lock(); vport = lookup_vport(sock_net(skb->sk), ovs_header, a); err = PTR_ERR(vport); if (IS_ERR(vport)) - goto exit_unlock; - - reply = ovs_vport_cmd_build_info(vport, info->snd_portid, - info->snd_seq, OVS_VPORT_CMD_NEW); - err = PTR_ERR(reply); - if (IS_ERR(reply)) - goto exit_unlock; - + goto exit_unlock_free; + err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, + info->snd_seq, 0, OVS_VPORT_CMD_NEW); + BUG_ON(err < 0); rcu_read_unlock(); return genlmsg_reply(reply, info); -exit_unlock: +exit_unlock_free: rcu_read_unlock(); + kfree_skb(reply); return err; } @@ -1728,11 +1903,12 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) int bucket = cb->args[0], skip = cb->args[1]; int i, j = 0; + rcu_read_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); - if (!dp) + if (!dp) { + rcu_read_unlock(); return -ENODEV; - - rcu_read_lock(); + } for (i = bucket; i < DP_VPORT_HASH_BUCKETS; i++) { struct vport *vport; @@ -1759,7 +1935,16 @@ out: return skb->len; } -static const struct genl_ops dp_vport_genl_ops[] = { +static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { + [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, + [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, + [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, +}; + +static struct genl_ops dp_vport_genl_ops[] = { { .cmd = OVS_VPORT_CMD_NEW, .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = vport_policy, @@ -1783,26 +1968,25 @@ static const struct genl_ops dp_vport_genl_ops[] = { }, }; -struct genl_family_and_ops { - struct genl_family *family; - const struct genl_ops *ops; - int n_ops; - const struct genl_multicast_group *group; +struct genl_family dp_vport_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = sizeof(struct ovs_header), + .name = OVS_VPORT_FAMILY, + .version = OVS_VPORT_VERSION, + .maxattr = OVS_VPORT_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, + .ops = dp_vport_genl_ops, + .n_ops = ARRAY_SIZE(dp_vport_genl_ops), + .mcgrps = &ovs_dp_vport_multicast_group, + .n_mcgrps = 1, }; -static const struct genl_family_and_ops dp_genl_families[] = { - { &dp_datapath_genl_family, - dp_datapath_genl_ops, ARRAY_SIZE(dp_datapath_genl_ops), - &ovs_dp_datapath_multicast_group }, - { &dp_vport_genl_family, - dp_vport_genl_ops, ARRAY_SIZE(dp_vport_genl_ops), - &ovs_dp_vport_multicast_group }, - { &dp_flow_genl_family, - dp_flow_genl_ops, ARRAY_SIZE(dp_flow_genl_ops), - &ovs_dp_flow_multicast_group }, - { &dp_packet_genl_family, - dp_packet_genl_ops, ARRAY_SIZE(dp_packet_genl_ops), - NULL }, +static struct genl_family * const dp_genl_families[] = { + &dp_datapath_genl_family, + &dp_vport_genl_family, + &dp_flow_genl_family, + &dp_packet_genl_family, }; static void dp_unregister_genl(int n_families) @@ -1810,33 +1994,25 @@ static void dp_unregister_genl(int n_families) int i; for (i = 0; i < n_families; i++) - genl_unregister_family(dp_genl_families[i].family); + genl_unregister_family(dp_genl_families[i]); } static int dp_register_genl(void) { - int n_registered; int err; int i; - n_registered = 0; for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) { - const struct genl_family_and_ops *f = &dp_genl_families[i]; - 
f->family->ops = f->ops; - f->family->n_ops = f->n_ops; - f->family->mcgrps = f->group; - f->family->n_mcgrps = f->group ? 1 : 0; - err = genl_register_family(f->family); + err = genl_register_family(dp_genl_families[i]); if (err) goto error; - n_registered++; } return 0; error: - dp_unregister_genl(n_registered); + dp_unregister_genl(i); return err; } diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 4067ea41be2..7ede507500d 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -55,7 +55,7 @@ struct dp_stats_percpu { u64 n_missed; u64 n_lost; u64 n_mask_hit; - struct u64_stats_sync sync; + struct u64_stats_sync syncp; }; /** @@ -88,6 +88,8 @@ struct datapath { /* Network namespace ref. */ struct net *net; #endif + + u32 user_features; }; /** @@ -145,6 +147,8 @@ int lockdep_ovsl_is_held(void); #define ASSERT_OVSL() WARN_ON(unlikely(!lockdep_ovsl_is_held())) #define ovsl_dereference(p) \ rcu_dereference_protected(p, lockdep_ovsl_is_held()) +#define rcu_dereference_ovsl(p) \ + rcu_dereference_check(p, lockdep_ovsl_is_held()) static inline struct net *ovs_dp_get_net(struct datapath *dp) { @@ -178,21 +182,21 @@ static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_n extern struct notifier_block ovs_dp_device_notifier; extern struct genl_family dp_vport_genl_family; -extern struct genl_multicast_group ovs_dp_vport_multicast_group; void ovs_dp_process_received_packet(struct vport *, struct sk_buff *); void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, const struct dp_upcall_info *); -const char *ovs_dp_name(const struct datapath *dp); struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, u8 cmd); int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb); void ovs_dp_notify_wq(struct work_struct *work); -#define OVS_NLERR(fmt, ...) \ - pr_info_once("netlink: " fmt, ##__VA_ARGS__) - +#define OVS_NLERR(fmt, ...) \ +do { \ + if (net_ratelimit()) \ + pr_info("netlink: " fmt, ##__VA_ARGS__); \ +} while (0) #endif /* datapath.h */ diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index b409f527960..d07ab538fc9 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -35,6 +35,7 @@ #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/sctp.h> +#include <linux/smp.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/icmp.h> @@ -60,23 +61,113 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies) #define TCP_FLAGS_BE16(tp) (*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF)) -void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb) +void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, + struct sk_buff *skb) { - __be16 tcp_flags = 0; + struct flow_stats *stats; + int node = numa_node_id(); + + stats = rcu_dereference(flow->stats[node]); + + /* Check if already have node-specific stats. */ + if (likely(stats)) { + spin_lock(&stats->lock); + /* Mark if we write on the pre-allocated stats. */ + if (node == 0 && unlikely(flow->stats_last_writer != node)) + flow->stats_last_writer = node; + } else { + stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */ + spin_lock(&stats->lock); + + /* If the current NUMA-node is the only writer on the + * pre-allocated stats keep using them. + */ + if (unlikely(flow->stats_last_writer != node)) { + /* A previous locker may have already allocated the + * stats, so we need to check again. 
If node-specific + * stats were already allocated, we update the pre- + * allocated stats as we have already locked them. + */ + if (likely(flow->stats_last_writer != NUMA_NO_NODE) + && likely(!rcu_dereference(flow->stats[node]))) { + /* Try to allocate node-specific stats. */ + struct flow_stats *new_stats; + + new_stats = + kmem_cache_alloc_node(flow_stats_cache, + GFP_THISNODE | + __GFP_NOMEMALLOC, + node); + if (likely(new_stats)) { + new_stats->used = jiffies; + new_stats->packet_count = 1; + new_stats->byte_count = skb->len; + new_stats->tcp_flags = tcp_flags; + spin_lock_init(&new_stats->lock); + + rcu_assign_pointer(flow->stats[node], + new_stats); + goto unlock; + } + } + flow->stats_last_writer = node; + } + } + + stats->used = jiffies; + stats->packet_count++; + stats->byte_count += skb->len; + stats->tcp_flags |= tcp_flags; +unlock: + spin_unlock(&stats->lock); +} + +/* Must be called with rcu_read_lock or ovs_mutex. */ +void ovs_flow_stats_get(const struct sw_flow *flow, + struct ovs_flow_stats *ovs_stats, + unsigned long *used, __be16 *tcp_flags) +{ + int node; - if ((flow->key.eth.type == htons(ETH_P_IP) || - flow->key.eth.type == htons(ETH_P_IPV6)) && - flow->key.ip.proto == IPPROTO_TCP && - likely(skb->len >= skb_transport_offset(skb) + sizeof(struct tcphdr))) { - tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb)); + *used = 0; + *tcp_flags = 0; + memset(ovs_stats, 0, sizeof(*ovs_stats)); + + for_each_node(node) { + struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[node]); + + if (stats) { + /* Local CPU may write on non-local stats, so we must + * block bottom-halves here. + */ + spin_lock_bh(&stats->lock); + if (!*used || time_after(stats->used, *used)) + *used = stats->used; + *tcp_flags |= stats->tcp_flags; + ovs_stats->n_packets += stats->packet_count; + ovs_stats->n_bytes += stats->byte_count; + spin_unlock_bh(&stats->lock); + } } +} - spin_lock(&flow->lock); - flow->used = jiffies; - flow->packet_count++; - flow->byte_count += skb->len; - flow->tcp_flags |= tcp_flags; - spin_unlock(&flow->lock); +/* Called with ovs_mutex. */ +void ovs_flow_stats_clear(struct sw_flow *flow) +{ + int node; + + for_each_node(node) { + struct flow_stats *stats = ovsl_dereference(flow->stats[node]); + + if (stats) { + spin_lock_bh(&stats->lock); + stats->used = 0; + stats->packet_count = 0; + stats->byte_count = 0; + stats->tcp_flags = 0; + spin_unlock_bh(&stats->lock); + } + } } static int check_header(struct sk_buff *skb, int len) @@ -263,8 +354,8 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, /* The ICMPv6 type and code fields use the 16-bit transport port * fields, so we need to store them in 16-bit network byte order. 
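
The flow accounting rework above keeps one flow_stats slot per NUMA node: writers on the packet path take only their own node's spinlock (falling back to the preallocated slot 0 until a second node starts writing), while readers such as ovs_flow_stats_get() walk every node with spin_lock_bh() and sum the counters. A stripped-down sketch of the writer side, using illustrative my_flow/my_stats types and assuming the caller holds rcu_read_lock():

struct my_stats {
    u64         packets;
    u64         bytes;
    spinlock_t  lock;
};

struct my_flow {
    int                     last_writer;  /* NUMA node id, or NUMA_NO_NODE */
    struct my_stats __rcu   *stats[];     /* one slot per possible node */
};

static void my_flow_hit(struct my_flow *flow, unsigned int len)
{
    int node = numa_node_id();
    struct my_stats *stats = rcu_dereference(flow->stats[node]);

    if (!stats)                                  /* no node-local slot yet */
        stats = rcu_dereference(flow->stats[0]); /* shared, preallocated */

    spin_lock(&stats->lock);                     /* packet path runs in BH */
    stats->packets++;
    stats->bytes += len;
    flow->last_writer = node;
    spin_unlock(&stats->lock);
}
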
*/ - key->ipv6.tp.src = htons(icmp->icmp6_type); - key->ipv6.tp.dst = htons(icmp->icmp6_code); + key->tp.src = htons(icmp->icmp6_type); + key->tp.dst = htons(icmp->icmp6_code); if (icmp->icmp6_code == 0 && (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION || @@ -303,14 +394,14 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, && opt_len == 8) { if (unlikely(!is_zero_ether_addr(key->ipv6.nd.sll))) goto invalid; - memcpy(key->ipv6.nd.sll, - &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN); + ether_addr_copy(key->ipv6.nd.sll, + &nd->opt[offset+sizeof(*nd_opt)]); } else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR && opt_len == 8) { if (unlikely(!is_zero_ether_addr(key->ipv6.nd.tll))) goto invalid; - memcpy(key->ipv6.nd.tll, - &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN); + ether_addr_copy(key->ipv6.nd.tll, + &nd->opt[offset+sizeof(*nd_opt)]); } icmp_len -= opt_len; @@ -370,8 +461,8 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) * header in the linear data area. */ eth = eth_hdr(skb); - memcpy(key->eth.src, eth->h_source, ETH_ALEN); - memcpy(key->eth.dst, eth->h_dest, ETH_ALEN); + ether_addr_copy(key->eth.src, eth->h_source); + ether_addr_copy(key->eth.dst, eth->h_dest); __skb_pull(skb, 2 * ETH_ALEN); /* We are going to push all headers that we pull, so no need to @@ -426,21 +517,21 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) if (key->ip.proto == IPPROTO_TCP) { if (tcphdr_ok(skb)) { struct tcphdr *tcp = tcp_hdr(skb); - key->ipv4.tp.src = tcp->source; - key->ipv4.tp.dst = tcp->dest; - key->ipv4.tp.flags = TCP_FLAGS_BE16(tcp); + key->tp.src = tcp->source; + key->tp.dst = tcp->dest; + key->tp.flags = TCP_FLAGS_BE16(tcp); } } else if (key->ip.proto == IPPROTO_UDP) { if (udphdr_ok(skb)) { struct udphdr *udp = udp_hdr(skb); - key->ipv4.tp.src = udp->source; - key->ipv4.tp.dst = udp->dest; + key->tp.src = udp->source; + key->tp.dst = udp->dest; } } else if (key->ip.proto == IPPROTO_SCTP) { if (sctphdr_ok(skb)) { struct sctphdr *sctp = sctp_hdr(skb); - key->ipv4.tp.src = sctp->source; - key->ipv4.tp.dst = sctp->dest; + key->tp.src = sctp->source; + key->tp.dst = sctp->dest; } } else if (key->ip.proto == IPPROTO_ICMP) { if (icmphdr_ok(skb)) { @@ -448,8 +539,8 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) /* The ICMP type and code fields use the 16-bit * transport port fields, so we need to store * them in 16-bit network byte order. 
*/ - key->ipv4.tp.src = htons(icmp->type); - key->ipv4.tp.dst = htons(icmp->code); + key->tp.src = htons(icmp->type); + key->tp.dst = htons(icmp->code); } } @@ -469,8 +560,8 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) key->ip.proto = ntohs(arp->ar_op); memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src)); memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst)); - memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN); - memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN); + ether_addr_copy(key->ipv4.arp.sha, arp->ar_sha); + ether_addr_copy(key->ipv4.arp.tha, arp->ar_tha); } } else if (key->eth.type == htons(ETH_P_IPV6)) { int nh_len; /* IPv6 Header + Extensions */ @@ -495,21 +586,21 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) if (key->ip.proto == NEXTHDR_TCP) { if (tcphdr_ok(skb)) { struct tcphdr *tcp = tcp_hdr(skb); - key->ipv6.tp.src = tcp->source; - key->ipv6.tp.dst = tcp->dest; - key->ipv6.tp.flags = TCP_FLAGS_BE16(tcp); + key->tp.src = tcp->source; + key->tp.dst = tcp->dest; + key->tp.flags = TCP_FLAGS_BE16(tcp); } } else if (key->ip.proto == NEXTHDR_UDP) { if (udphdr_ok(skb)) { struct udphdr *udp = udp_hdr(skb); - key->ipv6.tp.src = udp->source; - key->ipv6.tp.dst = udp->dest; + key->tp.src = udp->source; + key->tp.dst = udp->dest; } } else if (key->ip.proto == NEXTHDR_SCTP) { if (sctphdr_ok(skb)) { struct sctphdr *sctp = sctp_hdr(skb); - key->ipv6.tp.src = sctp->source; - key->ipv6.tp.dst = sctp->dest; + key->tp.src = sctp->source; + key->tp.dst = sctp->dest; } } else if (key->ip.proto == NEXTHDR_ICMP) { if (icmp6hdr_ok(skb)) { diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 1510f51dbf7..5e5aaed3a85 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -19,6 +19,7 @@ #ifndef FLOW_H #define FLOW_H 1 +#include <linux/cache.h> #include <linux/kernel.h> #include <linux/netlink.h> #include <linux/openvswitch.h> @@ -46,7 +47,7 @@ struct ovs_key_ipv4_tunnel { __be16 tun_flags; u8 ipv4_tos; u8 ipv4_ttl; -}; +} __packed __aligned(4); /* Minimize padding. */ static inline void ovs_flow_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key, const struct iphdr *iph, __be64 tun_id, @@ -70,7 +71,7 @@ struct sw_flow_key { u32 priority; /* Packet QoS priority. */ u32 skb_mark; /* SKB mark. */ u16 in_port; /* Input switch port (or DP_MAX_PORTS). */ - } phy; + } __packed phy; /* Safe when right after 'tun_key'. */ struct { u8 src[ETH_ALEN]; /* Ethernet source address. */ u8 dst[ETH_ALEN]; /* Ethernet destination address. */ @@ -83,23 +84,21 @@ struct sw_flow_key { u8 ttl; /* IP TTL/hop limit. */ u8 frag; /* One of OVS_FRAG_TYPE_*. */ } ip; + struct { + __be16 src; /* TCP/UDP/SCTP source port. */ + __be16 dst; /* TCP/UDP/SCTP destination port. */ + __be16 flags; /* TCP flags. */ + } tp; union { struct { struct { __be32 src; /* IP source address. */ __be32 dst; /* IP destination address. */ } addr; - union { - struct { - __be16 src; /* TCP/UDP/SCTP source port. */ - __be16 dst; /* TCP/UDP/SCTP destination port. */ - __be16 flags; /* TCP flags. */ - } tp; - struct { - u8 sha[ETH_ALEN]; /* ARP source hardware address. */ - u8 tha[ETH_ALEN]; /* ARP target hardware address. */ - } arp; - }; + struct { + u8 sha[ETH_ALEN]; /* ARP source hardware address. 
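
The memcpy(..., ETH_ALEN) calls above become ether_addr_copy(), which may copy in 16-bit or 32-bit chunks and therefore requires both MAC addresses to be at least two-byte aligned. A small illustration (my_hdr is a made-up struct, not from this patch):

#include <linux/etherdevice.h>

struct my_hdr {
    u8 dst[ETH_ALEN] __aligned(2); /* 2-byte alignment satisfies ether_addr_copy() */
    u8 src[ETH_ALEN] __aligned(2);
};

static void my_hdr_fill(struct my_hdr *hdr, const u8 *dst, const u8 *src)
{
    ether_addr_copy(hdr->dst, dst);
    ether_addr_copy(hdr->src, src);
}
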
*/ + u8 tha[ETH_ALEN]; /* ARP target hardware address. */ + } arp; } ipv4; struct { struct { @@ -108,11 +107,6 @@ struct sw_flow_key { } addr; __be32 label; /* IPv6 flow label. */ struct { - __be16 src; /* TCP/UDP/SCTP source port. */ - __be16 dst; /* TCP/UDP/SCTP destination port. */ - __be16 flags; /* TCP flags. */ - } tp; - struct { struct in6_addr target; /* ND target address. */ u8 sll[ETH_ALEN]; /* ND source link layer address. */ u8 tll[ETH_ALEN]; /* ND target link layer address. */ @@ -122,8 +116,8 @@ struct sw_flow_key { } __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */ struct sw_flow_key_range { - size_t start; - size_t end; + unsigned short int start; + unsigned short int end; }; struct sw_flow_mask { @@ -146,21 +140,30 @@ struct sw_flow_actions { struct nlattr actions[]; }; +struct flow_stats { + u64 packet_count; /* Number of packets matched. */ + u64 byte_count; /* Number of bytes matched. */ + unsigned long used; /* Last used time (in jiffies). */ + spinlock_t lock; /* Lock for atomic stats update. */ + __be16 tcp_flags; /* Union of seen TCP flags. */ +}; + struct sw_flow { struct rcu_head rcu; struct hlist_node hash_node[2]; u32 hash; - + int stats_last_writer; /* NUMA-node id of the last writer on + * 'stats[0]'. + */ struct sw_flow_key key; struct sw_flow_key unmasked_key; struct sw_flow_mask *mask; struct sw_flow_actions __rcu *sf_acts; - - spinlock_t lock; /* Lock for values below. */ - unsigned long used; /* Last used time (in jiffies). */ - u64 packet_count; /* Number of packets matched. */ - u64 byte_count; /* Number of bytes matched. */ - __be16 tcp_flags; /* Union of seen TCP flags. */ + struct flow_stats __rcu *stats[]; /* One for each NUMA node. First one + * is allocated at flow creation time, + * the rest are allocated on demand + * while holding the 'stats[0].lock'. 
+ */ }; struct arp_eth_header { @@ -177,7 +180,11 @@ struct arp_eth_header { unsigned char ar_tip[4]; /* target IP address */ } __packed; -void ovs_flow_used(struct sw_flow *, struct sk_buff *); +void ovs_flow_stats_update(struct sw_flow *, __be16 tcp_flags, + struct sk_buff *); +void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *, + unsigned long *used, __be16 *tcp_flags); +void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *); diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 2bc1bc1aca3..d757848da89 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -16,6 +16,8 @@ * 02110-1301, USA */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include "flow.h" #include "datapath.h" #include <linux/uaccess.h> @@ -202,11 +204,11 @@ static bool match_validate(const struct sw_flow_match *match, if (match->mask && (match->mask->key.ip.proto == 0xff)) mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6; - if (match->key->ipv6.tp.src == + if (match->key->tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) || - match->key->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { + match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { key_expected |= 1 << OVS_KEY_ATTR_ND; - if (match->mask && (match->mask->key.ipv6.tp.src == htons(0xffff))) + if (match->mask && (match->mask->key.tp.src == htons(0xffff))) mask_allowed |= 1 << OVS_KEY_ATTR_ND; } } @@ -216,14 +218,14 @@ static bool match_validate(const struct sw_flow_match *match, if ((key_attrs & key_expected) != key_expected) { /* Key attributes check failed. */ OVS_NLERR("Missing expected key attributes (key_attrs=%llx, expected=%llx).\n", - key_attrs, key_expected); + (unsigned long long)key_attrs, (unsigned long long)key_expected); return false; } if ((mask_attrs & mask_allowed) != mask_attrs) { /* Mask attributes check failed. 
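
The OVS_NLERR changes in match_validate() cast the u64 attribute bitmaps to unsigned long long before printing with %llx; u64 is a plain unsigned long on most 64-bit architectures, so the explicit cast keeps the format string portable and avoids -Wformat warnings. For example:

static void report_attrs(u64 key_attrs, u64 key_expected)
{
    /* u64 may be 'unsigned long' or 'unsigned long long' depending on
     * the architecture; cast so that %llx is always correct.
     */
    pr_info("key_attrs=%llx expected=%llx\n",
            (unsigned long long)key_attrs,
            (unsigned long long)key_expected);
}
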
*/ OVS_NLERR("Contain more than allowed mask fields (mask_attrs=%llx, mask_allowed=%llx).\n", - mask_attrs, mask_allowed); + (unsigned long long)mask_attrs, (unsigned long long)mask_allowed); return false; } @@ -487,7 +489,7 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, return 0; } -static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, +static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, const struct nlattr **a, bool is_mask) { int err; @@ -628,27 +630,18 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, const struct ovs_key_tcp *tcp_key; tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); - if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { - SW_FLOW_KEY_PUT(match, ipv4.tp.src, - tcp_key->tcp_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv4.tp.dst, - tcp_key->tcp_dst, is_mask); - } else { - SW_FLOW_KEY_PUT(match, ipv6.tp.src, - tcp_key->tcp_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv6.tp.dst, - tcp_key->tcp_dst, is_mask); - } + SW_FLOW_KEY_PUT(match, tp.src, tcp_key->tcp_src, is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, tcp_key->tcp_dst, is_mask); attrs &= ~(1 << OVS_KEY_ATTR_TCP); } if (attrs & (1 << OVS_KEY_ATTR_TCP_FLAGS)) { if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { - SW_FLOW_KEY_PUT(match, ipv4.tp.flags, + SW_FLOW_KEY_PUT(match, tp.flags, nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]), is_mask); } else { - SW_FLOW_KEY_PUT(match, ipv6.tp.flags, + SW_FLOW_KEY_PUT(match, tp.flags, nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]), is_mask); } @@ -659,17 +652,8 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, const struct ovs_key_udp *udp_key; udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); - if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { - SW_FLOW_KEY_PUT(match, ipv4.tp.src, - udp_key->udp_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv4.tp.dst, - udp_key->udp_dst, is_mask); - } else { - SW_FLOW_KEY_PUT(match, ipv6.tp.src, - udp_key->udp_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv6.tp.dst, - udp_key->udp_dst, is_mask); - } + SW_FLOW_KEY_PUT(match, tp.src, udp_key->udp_src, is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, udp_key->udp_dst, is_mask); attrs &= ~(1 << OVS_KEY_ATTR_UDP); } @@ -677,17 +661,8 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, const struct ovs_key_sctp *sctp_key; sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]); - if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { - SW_FLOW_KEY_PUT(match, ipv4.tp.src, - sctp_key->sctp_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv4.tp.dst, - sctp_key->sctp_dst, is_mask); - } else { - SW_FLOW_KEY_PUT(match, ipv6.tp.src, - sctp_key->sctp_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv6.tp.dst, - sctp_key->sctp_dst, is_mask); - } + SW_FLOW_KEY_PUT(match, tp.src, sctp_key->sctp_src, is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, sctp_key->sctp_dst, is_mask); attrs &= ~(1 << OVS_KEY_ATTR_SCTP); } @@ -695,9 +670,9 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, const struct ovs_key_icmp *icmp_key; icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]); - SW_FLOW_KEY_PUT(match, ipv4.tp.src, + SW_FLOW_KEY_PUT(match, tp.src, htons(icmp_key->icmp_type), is_mask); - SW_FLOW_KEY_PUT(match, ipv4.tp.dst, + SW_FLOW_KEY_PUT(match, tp.dst, htons(icmp_key->icmp_code), is_mask); attrs &= ~(1 << OVS_KEY_ATTR_ICMP); } @@ -706,9 +681,9 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, const struct ovs_key_icmpv6 *icmpv6_key; icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]); - SW_FLOW_KEY_PUT(match, ipv6.tp.src, + SW_FLOW_KEY_PUT(match, tp.src, 
htons(icmpv6_key->icmpv6_type), is_mask); - SW_FLOW_KEY_PUT(match, ipv6.tp.dst, + SW_FLOW_KEY_PUT(match, tp.dst, htons(icmpv6_key->icmpv6_code), is_mask); attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6); } @@ -934,8 +909,8 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, goto nla_put_failure; eth_key = nla_data(nla); - memcpy(eth_key->eth_src, output->eth.src, ETH_ALEN); - memcpy(eth_key->eth_dst, output->eth.dst, ETH_ALEN); + ether_addr_copy(eth_key->eth_src, output->eth.src); + ether_addr_copy(eth_key->eth_dst, output->eth.dst); if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) { __be16 eth_type; @@ -1007,8 +982,8 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, arp_key->arp_sip = output->ipv4.addr.src; arp_key->arp_tip = output->ipv4.addr.dst; arp_key->arp_op = htons(output->ip.proto); - memcpy(arp_key->arp_sha, output->ipv4.arp.sha, ETH_ALEN); - memcpy(arp_key->arp_tha, output->ipv4.arp.tha, ETH_ALEN); + ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha); + ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha); } if ((swkey->eth.type == htons(ETH_P_IP) || @@ -1022,19 +997,11 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, if (!nla) goto nla_put_failure; tcp_key = nla_data(nla); - if (swkey->eth.type == htons(ETH_P_IP)) { - tcp_key->tcp_src = output->ipv4.tp.src; - tcp_key->tcp_dst = output->ipv4.tp.dst; - if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS, - output->ipv4.tp.flags)) - goto nla_put_failure; - } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - tcp_key->tcp_src = output->ipv6.tp.src; - tcp_key->tcp_dst = output->ipv6.tp.dst; - if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS, - output->ipv6.tp.flags)) - goto nla_put_failure; - } + tcp_key->tcp_src = output->tp.src; + tcp_key->tcp_dst = output->tp.dst; + if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS, + output->tp.flags)) + goto nla_put_failure; } else if (swkey->ip.proto == IPPROTO_UDP) { struct ovs_key_udp *udp_key; @@ -1042,13 +1009,8 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, if (!nla) goto nla_put_failure; udp_key = nla_data(nla); - if (swkey->eth.type == htons(ETH_P_IP)) { - udp_key->udp_src = output->ipv4.tp.src; - udp_key->udp_dst = output->ipv4.tp.dst; - } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - udp_key->udp_src = output->ipv6.tp.src; - udp_key->udp_dst = output->ipv6.tp.dst; - } + udp_key->udp_src = output->tp.src; + udp_key->udp_dst = output->tp.dst; } else if (swkey->ip.proto == IPPROTO_SCTP) { struct ovs_key_sctp *sctp_key; @@ -1056,13 +1018,8 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, if (!nla) goto nla_put_failure; sctp_key = nla_data(nla); - if (swkey->eth.type == htons(ETH_P_IP)) { - sctp_key->sctp_src = swkey->ipv4.tp.src; - sctp_key->sctp_dst = swkey->ipv4.tp.dst; - } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - sctp_key->sctp_src = swkey->ipv6.tp.src; - sctp_key->sctp_dst = swkey->ipv6.tp.dst; - } + sctp_key->sctp_src = output->tp.src; + sctp_key->sctp_dst = output->tp.dst; } else if (swkey->eth.type == htons(ETH_P_IP) && swkey->ip.proto == IPPROTO_ICMP) { struct ovs_key_icmp *icmp_key; @@ -1071,8 +1028,8 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, if (!nla) goto nla_put_failure; icmp_key = nla_data(nla); - icmp_key->icmp_type = ntohs(output->ipv4.tp.src); - icmp_key->icmp_code = ntohs(output->ipv4.tp.dst); + icmp_key->icmp_type = ntohs(output->tp.src); + icmp_key->icmp_code = ntohs(output->tp.dst); } else if (swkey->eth.type == htons(ETH_P_IPV6) && swkey->ip.proto == IPPROTO_ICMPV6) { struct ovs_key_icmpv6 *icmpv6_key; @@ 
-1082,8 +1039,8 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, if (!nla) goto nla_put_failure; icmpv6_key = nla_data(nla); - icmpv6_key->icmpv6_type = ntohs(output->ipv6.tp.src); - icmpv6_key->icmpv6_code = ntohs(output->ipv6.tp.dst); + icmpv6_key->icmpv6_type = ntohs(output->tp.src); + icmpv6_key->icmpv6_code = ntohs(output->tp.dst); if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION || icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) { @@ -1095,8 +1052,8 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, nd_key = nla_data(nla); memcpy(nd_key->nd_target, &output->ipv6.nd.target, sizeof(nd_key->nd_target)); - memcpy(nd_key->nd_sll, output->ipv6.nd.sll, ETH_ALEN); - memcpy(nd_key->nd_tll, output->ipv6.nd.tll, ETH_ALEN); + ether_addr_copy(nd_key->nd_sll, output->ipv6.nd.sll); + ether_addr_copy(nd_key->nd_tll, output->ipv6.nd.tll); } } } @@ -1128,19 +1085,11 @@ struct sw_flow_actions *ovs_nla_alloc_flow_actions(int size) return sfa; } -/* RCU callback used by ovs_nla_free_flow_actions. */ -static void rcu_free_acts_callback(struct rcu_head *rcu) -{ - struct sw_flow_actions *sf_acts = container_of(rcu, - struct sw_flow_actions, rcu); - kfree(sf_acts); -} - /* Schedules 'sf_acts' to be freed after the next RCU grace period. * The caller must hold rcu_read_lock for this to be sensible. */ void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) { - call_rcu(&sf_acts->rcu, rcu_free_acts_callback); + kfree_rcu(sf_acts, rcu); } static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, @@ -1269,13 +1218,10 @@ static int validate_and_copy_sample(const struct nlattr *attr, static int validate_tp_port(const struct sw_flow_key *flow_key) { - if (flow_key->eth.type == htons(ETH_P_IP)) { - if (flow_key->ipv4.tp.src || flow_key->ipv4.tp.dst) - return 0; - } else if (flow_key->eth.type == htons(ETH_P_IPV6)) { - if (flow_key->ipv6.tp.src || flow_key->ipv6.tp.dst) - return 0; - } + if ((flow_key->eth.type == htons(ETH_P_IP) || + flow_key->eth.type == htons(ETH_P_IPV6)) && + (flow_key->tp.src || flow_key->tp.dst)) + return 0; return -EINVAL; } diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index e4254270608..cf2d853646f 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -25,7 +25,7 @@ #include <linux/if_vlan.h> #include <net/llc_pdu.h> #include <linux/kernel.h> -#include <linux/jhash.h> +#include <linux/hash.h> #include <linux/jiffies.h> #include <linux/llc.h> #include <linux/module.h> @@ -44,12 +44,11 @@ #include <net/ipv6.h> #include <net/ndisc.h> -#include "datapath.h" - #define TBL_MIN_BUCKETS 1024 #define REHASH_INTERVAL (10 * 60 * HZ) static struct kmem_cache *flow_cache; +struct kmem_cache *flow_stats_cache __read_mostly; static u16 range_n_bytes(const struct sw_flow_key_range *range) { @@ -59,8 +58,10 @@ static u16 range_n_bytes(const struct sw_flow_key_range *range) void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, const struct sw_flow_mask *mask) { - const long *m = (long *)((u8 *)&mask->key + mask->range.start); - const long *s = (long *)((u8 *)src + mask->range.start); + const long *m = (const long *)((const u8 *)&mask->key + + mask->range.start); + const long *s = (const long *)((const u8 *)src + + mask->range.start); long *d = (long *)((u8 *)dst + mask->range.start); int i; @@ -75,16 +76,35 @@ void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, struct sw_flow *ovs_flow_alloc(void) { struct sw_flow *flow; + struct flow_stats *stats; + 
int node; flow = kmem_cache_alloc(flow_cache, GFP_KERNEL); if (!flow) return ERR_PTR(-ENOMEM); - spin_lock_init(&flow->lock); flow->sf_acts = NULL; flow->mask = NULL; + flow->stats_last_writer = NUMA_NO_NODE; + + /* Initialize the default stat node. */ + stats = kmem_cache_alloc_node(flow_stats_cache, + GFP_KERNEL | __GFP_ZERO, 0); + if (!stats) + goto err; + + spin_lock_init(&stats->lock); + + RCU_INIT_POINTER(flow->stats[0], stats); + + for_each_node(node) + if (node != 0) + RCU_INIT_POINTER(flow->stats[node], NULL); return flow; +err: + kmem_cache_free(flow_cache, flow); + return ERR_PTR(-ENOMEM); } int ovs_flow_tbl_count(struct flow_table *table) @@ -117,7 +137,13 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets) static void flow_free(struct sw_flow *flow) { - kfree((struct sf_flow_acts __force *)flow->sf_acts); + int node; + + kfree((struct sw_flow_actions __force *)flow->sf_acts); + for_each_node(node) + if (flow->stats[node]) + kmem_cache_free(flow_stats_cache, + (struct flow_stats __force *)flow->stats[node]); kmem_cache_free(flow_cache, flow); } @@ -128,37 +154,11 @@ static void rcu_free_flow_callback(struct rcu_head *rcu) flow_free(flow); } -static void rcu_free_sw_flow_mask_cb(struct rcu_head *rcu) -{ - struct sw_flow_mask *mask = container_of(rcu, struct sw_flow_mask, rcu); - - kfree(mask); -} - -static void flow_mask_del_ref(struct sw_flow_mask *mask, bool deferred) -{ - if (!mask) - return; - - BUG_ON(!mask->ref_count); - mask->ref_count--; - - if (!mask->ref_count) { - list_del_rcu(&mask->list); - if (deferred) - call_rcu(&mask->rcu, rcu_free_sw_flow_mask_cb); - else - kfree(mask); - } -} - void ovs_flow_free(struct sw_flow *flow, bool deferred) { if (!flow) return; - flow_mask_del_ref(flow->mask, deferred); - if (deferred) call_rcu(&flow->rcu, rcu_free_flow_callback); else @@ -170,26 +170,9 @@ static void free_buckets(struct flex_array *buckets) flex_array_free(buckets); } + static void __table_instance_destroy(struct table_instance *ti) { - int i; - - if (ti->keep_flows) - goto skip_flows; - - for (i = 0; i < ti->n_buckets; i++) { - struct sw_flow *flow; - struct hlist_head *head = flex_array_get(ti->buckets, i); - struct hlist_node *n; - int ver = ti->node_ver; - - hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) { - hlist_del(&flow->hash_node[ver]); - ovs_flow_free(flow, false); - } - } - -skip_flows: free_buckets(ti->buckets); kfree(ti); } @@ -240,20 +223,38 @@ static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) static void table_instance_destroy(struct table_instance *ti, bool deferred) { + int i; + if (!ti) return; + if (ti->keep_flows) + goto skip_flows; + + for (i = 0; i < ti->n_buckets; i++) { + struct sw_flow *flow; + struct hlist_head *head = flex_array_get(ti->buckets, i); + struct hlist_node *n; + int ver = ti->node_ver; + + hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) { + hlist_del_rcu(&flow->hash_node[ver]); + ovs_flow_free(flow, deferred); + } + } + +skip_flows: if (deferred) call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); else __table_instance_destroy(ti); } -void ovs_flow_tbl_destroy(struct flow_table *table) +void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred) { struct table_instance *ti = ovsl_dereference(table->ti); - table_instance_destroy(ti, false); + table_instance_destroy(ti, deferred); } struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, @@ -356,13 +357,13 @@ int ovs_flow_tbl_flush(struct flow_table *flow_table) static u32 flow_hash(const struct sw_flow_key *key, int key_start, 
int key_end) { - u32 *hash_key = (u32 *)((u8 *)key + key_start); + const u32 *hash_key = (const u32 *)((const u8 *)key + key_start); int hash_u32s = (key_end - key_start) >> 2; /* Make sure number of hash bytes are multiple of u32. */ BUILD_BUG_ON(sizeof(long) % sizeof(u32)); - return jhash2(hash_key, hash_u32s, 0); + return arch_fast_hash2(hash_key, hash_u32s, 0); } static int flow_key_start(const struct sw_flow_key *key) @@ -378,8 +379,8 @@ static bool cmp_key(const struct sw_flow_key *key1, const struct sw_flow_key *key2, int key_start, int key_end) { - const long *cp1 = (long *)((u8 *)key1 + key_start); - const long *cp2 = (long *)((u8 *)key2 + key_start); + const long *cp1 = (const long *)((const u8 *)key1 + key_start); + const long *cp2 = (const long *)((const u8 *)key2 + key_start); long diffs = 0; int i; @@ -429,11 +430,11 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, return NULL; } -struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, +struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, const struct sw_flow_key *key, u32 *n_mask_hit) { - struct table_instance *ti = rcu_dereference(tbl->ti); + struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); struct sw_flow_mask *mask; struct sw_flow *flow; @@ -447,6 +448,30 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, return NULL; } +struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, + const struct sw_flow_key *key) +{ + u32 __always_unused n_mask_hit; + + return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit); +} + +struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, + struct sw_flow_match *match) +{ + struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + struct sw_flow_mask *mask; + struct sw_flow *flow; + + /* Always called under ovs-mutex. */ + list_for_each_entry(mask, &tbl->mask_list, list) { + flow = masked_flow_lookup(ti, match->key, mask); + if (flow && ovs_flow_cmp_unmasked_key(flow, match)) /* Found */ + return flow; + } + return NULL; +} + int ovs_flow_tbl_num_masks(const struct flow_table *table) { struct sw_flow_mask *mask; @@ -463,6 +488,25 @@ static struct table_instance *table_instance_expand(struct table_instance *ti) return table_instance_rehash(ti, ti->n_buckets * 2); } +/* Remove 'mask' from the mask list, if it is not needed any more. */ +static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) +{ + if (mask) { + /* ovs-lock is required to protect mask-refcount and + * mask list. + */ + ASSERT_OVSL(); + BUG_ON(!mask->ref_count); + mask->ref_count--; + + if (!mask->ref_count) { + list_del_rcu(&mask->list); + kfree_rcu(mask, rcu); + } + } +} + +/* Must be called with OVS mutex held. */ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) { struct table_instance *ti = ovsl_dereference(table->ti); @@ -470,6 +514,11 @@ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) BUG_ON(table->count == 0); hlist_del_rcu(&flow->hash_node[ti->node_ver]); table->count--; + + /* RCU delete the mask. 'flow->mask' is not NULLed, as it should be + * accessible as long as the RCU read lock is held. 
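
flow_mask_remove() above drops the mask's reference under ovs_mutex and, when the count reaches zero, unlinks the mask from the RCU-protected list and frees it after a grace period with kfree_rcu(); readers that found the mask under rcu_read_lock() can keep using it until then. A generic sketch of that pattern (my_mask is a stand-in type whose rcu_head member is named 'rcu'):

struct my_mask {
    struct list_head list;       /* on an RCU-protected list */
    int              ref_count;  /* protected by the writer-side mutex */
    struct rcu_head  rcu;
};

/* Caller holds the writer-side mutex protecting ref_count and the list. */
static void my_mask_put(struct my_mask *mask)
{
    if (!mask)
        return;

    BUG_ON(!mask->ref_count);
    if (--mask->ref_count)
        return;

    list_del_rcu(&mask->list);   /* readers may still hold a pointer */
    kfree_rcu(mask, rcu);        /* freed after the RCU grace period */
}
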
+ */ + flow_mask_remove(table, flow->mask); } static struct sw_flow_mask *mask_alloc(void) @@ -478,21 +527,16 @@ static struct sw_flow_mask *mask_alloc(void) mask = kmalloc(sizeof(*mask), GFP_KERNEL); if (mask) - mask->ref_count = 0; + mask->ref_count = 1; return mask; } -static void mask_add_ref(struct sw_flow_mask *mask) -{ - mask->ref_count++; -} - static bool mask_equal(const struct sw_flow_mask *a, const struct sw_flow_mask *b) { - u8 *a_ = (u8 *)&a->key + a->range.start; - u8 *b_ = (u8 *)&b->key + b->range.start; + const u8 *a_ = (const u8 *)&a->key + a->range.start; + const u8 *b_ = (const u8 *)&b->key + b->range.start; return (a->range.end == b->range.end) && (a->range.start == b->range.start) @@ -514,11 +558,7 @@ static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl, return NULL; } -/** - * add a new mask into the mask list. - * The caller needs to make sure that 'mask' is not the same - * as any masks that are already on the list. - */ +/* Add 'mask' into the mask list, if it is not already there. */ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, struct sw_flow_mask *new) { @@ -532,13 +572,16 @@ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, mask->key = new->key; mask->range = new->range; list_add_rcu(&mask->list, &tbl->mask_list); + } else { + BUG_ON(!mask->ref_count); + mask->ref_count++; } - mask_add_ref(mask); flow->mask = mask; return 0; } +/* Must be called with OVS mutex held. */ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, struct sw_flow_mask *mask) { @@ -577,16 +620,28 @@ int ovs_flow_init(void) BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long)); BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); - flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0, - 0, NULL); + flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow) + + (num_possible_nodes() + * sizeof(struct flow_stats *)), + 0, 0, NULL); if (flow_cache == NULL) return -ENOMEM; + flow_stats_cache + = kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (flow_stats_cache == NULL) { + kmem_cache_destroy(flow_cache); + flow_cache = NULL; + return -ENOMEM; + } + return 0; } /* Uninitializes the flow module. 
*/ void ovs_flow_exit(void) { + kmem_cache_destroy(flow_stats_cache); kmem_cache_destroy(flow_cache); } diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index fbe45d5ad07..5918bff7f3f 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -52,6 +52,8 @@ struct flow_table { unsigned int count; }; +extern struct kmem_cache *flow_stats_cache; + int ovs_flow_init(void); void ovs_flow_exit(void); @@ -60,7 +62,7 @@ void ovs_flow_free(struct sw_flow *, bool deferred); int ovs_flow_tbl_init(struct flow_table *); int ovs_flow_tbl_count(struct flow_table *table); -void ovs_flow_tbl_destroy(struct flow_table *table); +void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred); int ovs_flow_tbl_flush(struct flow_table *flow_table); int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, @@ -69,10 +71,13 @@ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); int ovs_flow_tbl_num_masks(const struct flow_table *table); struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, u32 *bucket, u32 *idx); -struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, +struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, const struct sw_flow_key *, u32 *n_mask_hit); - +struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, + const struct sw_flow_key *); +struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, + struct sw_flow_match *match); bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, struct sw_flow_match *match); diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index a3d6951602d..f49148a07da 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -110,6 +110,22 @@ static int gre_rcv(struct sk_buff *skb, return PACKET_RCVD; } +/* Called with rcu_read_lock and BH disabled. */ +static int gre_err(struct sk_buff *skb, u32 info, + const struct tnl_ptk_info *tpi) +{ + struct ovs_net *ovs_net; + struct vport *vport; + + ovs_net = net_generic(dev_net(skb->dev), ovs_net_id); + vport = rcu_dereference(ovs_net->vport_net.gre_vport); + + if (unlikely(!vport)) + return PACKET_REJECT; + else + return PACKET_RCVD; +} + static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) { struct net *net = ovs_dp_get_net(vport->dp); @@ -172,9 +188,9 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? 
htons(IP_DF) : 0; - skb->local_df = 1; + skb->ignore_df = 1; - return iptunnel_xmit(rt, skb, fl.saddr, + return iptunnel_xmit(skb->sk, rt, skb, fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst, IPPROTO_GRE, OVS_CB(skb)->tun_key->ipv4_tos, OVS_CB(skb)->tun_key->ipv4_ttl, df, false); @@ -186,6 +202,7 @@ error: static struct gre_cisco_protocol gre_protocol = { .handler = gre_rcv, + .err_handler = gre_err, .priority = 1, }; @@ -256,7 +273,7 @@ static void gre_tnl_destroy(struct vport *vport) ovs_net = net_generic(net, ovs_net_id); - rcu_assign_pointer(ovs_net->vport_net.gre_vport, NULL); + RCU_INIT_POINTER(ovs_net->vport_net.gre_vport, NULL); ovs_vport_deferred_free(vport); gre_exit(); } diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 729c68763fe..789af9280e7 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -130,7 +130,7 @@ static void do_setup(struct net_device *netdev) netdev->priv_flags &= ~IFF_TX_SKB_SHARING; netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netdev->destructor = internal_dev_destructor; - SET_ETHTOOL_OPS(netdev, &internal_dev_ethtool_ops); + netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->tx_queue_len = 0; netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST | diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index e797a50ac2b..0edbd95c60e 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -122,7 +122,7 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) vxlan_port = vxlan_vport(vport); strncpy(vxlan_port->name, parms->name, IFNAMSIZ); - vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, false); + vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, 0); if (IS_ERR(vs)) { ovs_vport_free(vport); return (void *)vs; @@ -170,7 +170,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; - skb->local_df = 1; + skb->ignore_df = 1; inet_get_local_port_range(net, &port_min, &port_max); src_port = vxlan_src_port(port_min, port_max, skb); @@ -180,7 +180,8 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) OVS_CB(skb)->tun_key->ipv4_tos, OVS_CB(skb)->tun_key->ipv4_ttl, df, src_port, dst_port, - htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << 8)); + htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << 8), + false); if (err < 0) ip_rt_put(rt); error: diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index d830a95f03a..42c0f4a0b78 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -33,6 +33,9 @@ #include "vport.h" #include "vport-internal_dev.h" +static void ovs_vport_record_error(struct vport *, + enum vport_err_type err_type); + /* List of statically compiled vport implementations. Don't forget to also * add yours to the list at the bottom of vport.h. 
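
gre_tnl_destroy() above now clears the cached gre_vport pointer with RCU_INIT_POINTER() instead of rcu_assign_pointer(): when the new value is NULL there is no newly published data for readers to observe, so the release ordering provided by rcu_assign_pointer() buys nothing. Sketch of the distinction ('gp' is an illustrative global, not an OVS symbol):

static struct my_obj __rcu *gp;

static void publish(struct my_obj *obj)
{
    /* Readers must see obj's fields initialized before the pointer:
     * rcu_assign_pointer() provides the required release ordering.
     */
    rcu_assign_pointer(gp, obj);
}

static void retract(void)
{
    /* NULL carries no data to order against, so the cheaper
     * RCU_INIT_POINTER() is sufficient here.
     */
    RCU_INIT_POINTER(gp, NULL);
}
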
*/ static const struct vport_ops *vport_ops_list[] = { @@ -118,7 +121,6 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, { struct vport *vport; size_t alloc_size; - int i; alloc_size = sizeof(struct vport); if (priv_size) { @@ -136,19 +138,12 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, vport->ops = ops; INIT_HLIST_NODE(&vport->dp_hash_node); - vport->percpu_stats = alloc_percpu(struct pcpu_tstats); + vport->percpu_stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!vport->percpu_stats) { kfree(vport); return ERR_PTR(-ENOMEM); } - for_each_possible_cpu(i) { - struct pcpu_tstats *vport_stats; - vport_stats = per_cpu_ptr(vport->percpu_stats, i); - u64_stats_init(&vport_stats->syncp); - } - - spin_lock_init(&vport->stats_lock); return vport; @@ -275,16 +270,16 @@ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) spin_unlock_bh(&vport->stats_lock); for_each_possible_cpu(i) { - const struct pcpu_tstats *percpu_stats; - struct pcpu_tstats local_stats; + const struct pcpu_sw_netstats *percpu_stats; + struct pcpu_sw_netstats local_stats; unsigned int start; percpu_stats = per_cpu_ptr(vport->percpu_stats, i); do { - start = u64_stats_fetch_begin_bh(&percpu_stats->syncp); + start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); local_stats = *percpu_stats; - } while (u64_stats_fetch_retry_bh(&percpu_stats->syncp, start)); + } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); stats->rx_bytes += local_stats.rx_bytes; stats->rx_packets += local_stats.rx_packets; @@ -344,7 +339,7 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, struct ovs_key_ipv4_tunnel *tun_key) { - struct pcpu_tstats *stats; + struct pcpu_sw_netstats *stats; stats = this_cpu_ptr(vport->percpu_stats); u64_stats_update_begin(&stats->syncp); @@ -370,7 +365,7 @@ int ovs_vport_send(struct vport *vport, struct sk_buff *skb) int sent = vport->ops->send(vport, skb); if (likely(sent > 0)) { - struct pcpu_tstats *stats; + struct pcpu_sw_netstats *stats; stats = this_cpu_ptr(vport->percpu_stats); @@ -396,7 +391,8 @@ int ovs_vport_send(struct vport *vport, struct sk_buff *skb) * If using the vport generic stats layer indicate that an error of the given * type has occurred. */ -void ovs_vport_record_error(struct vport *vport, enum vport_err_type err_type) +static void ovs_vport_record_error(struct vport *vport, + enum vport_err_type err_type) { spin_lock(&vport->stats_lock); diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 1a9fbcec6e1..8d721e62f38 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -87,7 +87,7 @@ struct vport { struct hlist_node dp_hash_node; const struct vport_ops *ops; - struct pcpu_tstats __percpu *percpu_stats; + struct pcpu_sw_netstats __percpu *percpu_stats; spinlock_t stats_lock; struct vport_err_stats err_stats; @@ -172,7 +172,7 @@ void ovs_vport_deferred_free(struct vport *vport); */ static inline void *vport_priv(const struct vport *vport) { - return (u8 *)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN); + return (u8 *)(uintptr_t)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN); } /** @@ -185,14 +185,13 @@ static inline void *vport_priv(const struct vport *vport) * the result of a hash table lookup. @priv must point to the start of the * private data area. 
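
ovs_vport_get_stats() above reads each CPU's pcpu_sw_netstats inside a u64_stats retry loop so that 64-bit counters stay consistent on 32-bit machines. A condensed reader-side sketch covering only the rx counters (my_fold_stats is a hypothetical helper, not an OVS function):

/* Sketch: summing per-CPU pcpu_sw_netstats into totals. */
static void my_fold_stats(struct pcpu_sw_netstats __percpu *pstats,
                          u64 *rx_packets, u64 *rx_bytes)
{
    int cpu;

    *rx_packets = 0;
    *rx_bytes = 0;

    for_each_possible_cpu(cpu) {
        const struct pcpu_sw_netstats *stats = per_cpu_ptr(pstats, cpu);
        unsigned int start;
        u64 packets, bytes;

        do {
            start = u64_stats_fetch_begin_irq(&stats->syncp);
            packets = stats->rx_packets;
            bytes = stats->rx_bytes;
        } while (u64_stats_fetch_retry_irq(&stats->syncp, start));

        *rx_packets += packets;
        *rx_bytes += bytes;
    }
}
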
*/ -static inline struct vport *vport_from_priv(const void *priv) +static inline struct vport *vport_from_priv(void *priv) { - return (struct vport *)(priv - ALIGN(sizeof(struct vport), VPORT_ALIGN)); + return (struct vport *)((u8 *)priv - ALIGN(sizeof(struct vport), VPORT_ALIGN)); } void ovs_vport_receive(struct vport *, struct sk_buff *, struct ovs_key_ipv4_tunnel *); -void ovs_vport_record_error(struct vport *, enum vport_err_type err_type); /* List of statically compiled vport implementations. Don't forget to also * add yours to the list at the top of vport.c. */ diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 9d70f134992..b85c67ccb79 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -88,7 +88,7 @@ #include <linux/virtio_net.h> #include <linux/errqueue.h> #include <linux/net_tstamp.h> -#include <linux/reciprocal_div.h> +#include <linux/percpu.h> #ifdef CONFIG_INET #include <net/inet_common.h> #endif @@ -243,40 +243,41 @@ static int packet_direct_xmit(struct sk_buff *skb) const struct net_device_ops *ops = dev->netdev_ops; netdev_features_t features; struct netdev_queue *txq; + int ret = NETDEV_TX_BUSY; u16 queue_map; - int ret; if (unlikely(!netif_running(dev) || - !netif_carrier_ok(dev))) { - kfree_skb(skb); - return NET_XMIT_DROP; - } + !netif_carrier_ok(dev))) + goto drop; features = netif_skb_features(skb); if (skb_needs_linearize(skb, features) && - __skb_linearize(skb)) { - kfree_skb(skb); - return NET_XMIT_DROP; - } + __skb_linearize(skb)) + goto drop; queue_map = skb_get_queue_mapping(skb); txq = netdev_get_tx_queue(dev, queue_map); - __netif_tx_lock_bh(txq); - if (unlikely(netif_xmit_frozen_or_stopped(txq))) { - ret = NETDEV_TX_BUSY; - kfree_skb(skb); - goto out; + local_bh_disable(); + + HARD_TX_LOCK(dev, txq, smp_processor_id()); + if (!netif_xmit_frozen_or_drv_stopped(txq)) { + ret = ops->ndo_start_xmit(skb, dev); + if (ret == NETDEV_TX_OK) + txq_trans_update(txq); } + HARD_TX_UNLOCK(dev, txq); - ret = ops->ndo_start_xmit(skb, dev); - if (likely(dev_xmit_complete(ret))) - txq_trans_update(txq); - else + local_bh_enable(); + + if (!dev_xmit_complete(ret)) kfree_skb(skb); -out: - __netif_tx_unlock_bh(txq); + return ret; +drop: + atomic_long_inc(&dev->tx_dropped); + kfree_skb(skb); + return NET_XMIT_DROP; } static struct net_device *packet_cached_dev_get(struct packet_sock *po) @@ -308,9 +309,25 @@ static bool packet_use_direct_xmit(const struct packet_sock *po) return po->xmit == packet_direct_xmit; } -static u16 packet_pick_tx_queue(struct net_device *dev) +static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) { - return (u16) smp_processor_id() % dev->real_num_tx_queues; + return (u16) raw_smp_processor_id() % dev->real_num_tx_queues; +} + +static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) +{ + const struct net_device_ops *ops = dev->netdev_ops; + u16 queue_index; + + if (ops->ndo_select_queue) { + queue_index = ops->ndo_select_queue(dev, skb, NULL, + __packet_pick_tx_queue); + queue_index = netdev_cap_txqueue(dev, queue_index); + } else { + queue_index = __packet_pick_tx_queue(dev, skb); + } + + skb_set_queue_mapping(skb, queue_index); } /* register_prot_hook must be invoked with the po->bind_lock held, @@ -963,7 +980,7 @@ static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb) static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { - ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb); + ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb); } static void 
prb_clear_rxhash(struct tpacket_kbdq_core *pkc, @@ -977,9 +994,11 @@ static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, { if (vlan_tx_tag_present(pkc->skb)) { ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb); - ppd->tp_status = TP_STATUS_VLAN_VALID; + ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto); + ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { ppd->hv1.tp_vlan_tci = 0; + ppd->hv1.tp_vlan_tpid = 0; ppd->tp_status = TP_STATUS_AVAILABLE; } } @@ -987,6 +1006,7 @@ static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { + ppd->hv1.tp_padding = 0; prb_fill_vlan_info(pkc, ppd); if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH) @@ -1165,6 +1185,47 @@ static void packet_increment_head(struct packet_ring_buffer *buff) buff->head = buff->head != buff->frame_max ? buff->head+1 : 0; } +static void packet_inc_pending(struct packet_ring_buffer *rb) +{ + this_cpu_inc(*rb->pending_refcnt); +} + +static void packet_dec_pending(struct packet_ring_buffer *rb) +{ + this_cpu_dec(*rb->pending_refcnt); +} + +static unsigned int packet_read_pending(const struct packet_ring_buffer *rb) +{ + unsigned int refcnt = 0; + int cpu; + + /* We don't use pending refcount in rx_ring. */ + if (rb->pending_refcnt == NULL) + return 0; + + for_each_possible_cpu(cpu) + refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu); + + return refcnt; +} + +static int packet_alloc_pending(struct packet_sock *po) +{ + po->rx_ring.pending_refcnt = NULL; + + po->tx_ring.pending_refcnt = alloc_percpu(unsigned int); + if (unlikely(po->tx_ring.pending_refcnt == NULL)) + return -ENOBUFS; + + return 0; +} + +static void packet_free_pending(struct packet_sock *po) +{ + free_percpu(po->tx_ring.pending_refcnt); +} + static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) { struct sock *sk = &po->sk; @@ -1217,7 +1278,7 @@ static unsigned int fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { - return reciprocal_divide(skb->rxhash, num); + return reciprocal_scale(skb_get_hash(skb), num); } static unsigned int fanout_demux_lb(struct packet_fanout *f, @@ -1244,7 +1305,7 @@ static unsigned int fanout_demux_rnd(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { - return reciprocal_divide(prandom_u32(), num); + return prandom_u32_max(num); } static unsigned int fanout_demux_rollover(struct packet_fanout *f, @@ -1268,6 +1329,13 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, return idx; } +static unsigned int fanout_demux_qm(struct packet_fanout *f, + struct sk_buff *skb, + unsigned int num) +{ + return skb_get_queue_mapping(skb) % num; +} + static bool fanout_has_flag(struct packet_fanout *f, u16 flag) { return f->flags & (flag >> 8); @@ -1295,7 +1363,6 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, if (!skb) return 0; } - skb_get_rxhash(skb); idx = fanout_demux_hash(f, skb, num); break; case PACKET_FANOUT_LB: @@ -1307,6 +1374,9 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, case PACKET_FANOUT_RND: idx = fanout_demux_rnd(f, skb, num); break; + case PACKET_FANOUT_QM: + idx = fanout_demux_qm(f, skb, num); + break; case PACKET_FANOUT_ROLLOVER: idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num); break; @@ -1353,9 +1423,9 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po) spin_unlock(&f->lock); } -static bool match_fanout_group(struct packet_type 
*ptype, struct sock * sk) +static bool match_fanout_group(struct packet_type *ptype, struct sock *sk) { - if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout) + if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout) return true; return false; @@ -1377,6 +1447,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) case PACKET_FANOUT_LB: case PACKET_FANOUT_CPU: case PACKET_FANOUT_RND: + case PACKET_FANOUT_QM: break; default: return -EINVAL; @@ -1539,7 +1610,7 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; - struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name); struct sk_buff *skb = NULL; struct net_device *dev; __be16 proto = 0; @@ -1777,7 +1848,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, skb->dropcount = atomic_read(&sk->sk_drops); __skb_queue_tail(&sk->sk_receive_queue, skb); spin_unlock(&sk->sk_receive_queue.lock); - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); return 0; drop_n_acct: @@ -1812,6 +1883,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct timespec ts; __u32 ts_status; + /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. + * We may add members to them until current aligned size without forcing + * userspace to call getsockopt(..., PACKET_HDRLEN, ...). + */ + BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32); + BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48); + if (skb->pkt_type == PACKET_LOOPBACK) goto drop; @@ -1918,11 +1996,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, h.h2->tp_nsec = ts.tv_nsec; if (vlan_tx_tag_present(skb)) { h.h2->tp_vlan_tci = vlan_tx_tag_get(skb); - status |= TP_STATUS_VLAN_VALID; + h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto); + status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { h.h2->tp_vlan_tci = 0; + h.h2->tp_vlan_tpid = 0; } - h.h2->tp_padding = 0; + memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding)); hdrlen = sizeof(*h.h2); break; case TPACKET_V3: @@ -1936,6 +2016,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, h.h3->tp_net = netoff; h.h3->tp_sec = ts.tv_sec; h.h3->tp_nsec = ts.tv_nsec; + memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding)); hdrlen = sizeof(*h.h3); break; default: @@ -1954,25 +2035,26 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, sll->sll_ifindex = dev->ifindex; smp_mb(); + #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 - { + if (po->tp_version <= TPACKET_V2) { u8 *start, *end; - if (po->tp_version <= TPACKET_V2) { - end = (u8 *)PAGE_ALIGN((unsigned long)h.raw - + macoff + snaplen); - for (start = h.raw; start < end; start += PAGE_SIZE) - flush_dcache_page(pgv_to_page(start)); - } - smp_wmb(); + end = (u8 *) PAGE_ALIGN((unsigned long) h.raw + + macoff + snaplen); + + for (start = h.raw; start < end; start += PAGE_SIZE) + flush_dcache_page(pgv_to_page(start)); } + smp_wmb(); #endif + if (po->tp_version <= TPACKET_V2) __packet_set_status(po, h.raw, status); else prb_clear_blk_fill_status(&po->rx_ring); - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { @@ -1987,7 +2069,7 @@ ring_is_full: po->stats.stats1.tp_drops++; spin_unlock(&sk->sk_receive_queue.lock); - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); kfree_skb(copy_skb); goto drop_n_restore; } @@ -1995,14 
+2077,13 @@ ring_is_full: static void tpacket_destruct_skb(struct sk_buff *skb) { struct packet_sock *po = pkt_sk(skb->sk); - void *ph; if (likely(po->tx_ring.pg_vec)) { + void *ph; __u32 ts; ph = skb_shinfo(skb)->destructor_arg; - BUG_ON(atomic_read(&po->tx_ring.pending) == 0); - atomic_dec(&po->tx_ring.pending); + packet_dec_pending(&po->tx_ring); ts = __packet_set_timestamp(po, ph, skb); __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts); @@ -2142,7 +2223,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) __be16 proto; int err, reserve = 0; void *ph; - struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); + bool need_wait = !(msg->msg_flags & MSG_DONTWAIT); int tp_len, size_max; unsigned char *addr; int len_sum = 0; @@ -2175,8 +2257,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) if (unlikely(!(dev->flags & IFF_UP))) goto out_put; - reserve = dev->hard_header_len; - + reserve = dev->hard_header_len + VLAN_HLEN; size_max = po->tx_ring.frame_size - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); @@ -2185,10 +2266,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) do { ph = packet_current_frame(po, &po->tx_ring, - TP_STATUS_SEND_REQUEST); - + TP_STATUS_SEND_REQUEST); if (unlikely(ph == NULL)) { - schedule(); + if (need_wait && need_resched()) + schedule(); continue; } @@ -2203,8 +2284,19 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) goto out_status; tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, - addr, hlen); + addr, hlen); + if (tp_len > dev->mtu + dev->hard_header_len) { + struct ethhdr *ehdr; + /* Earlier code assumed this would be a VLAN pkt, + * double-check this now that we have the actual + * packet in hand. + */ + skb_reset_mac_header(skb); + ehdr = eth_hdr(skb); + if (ehdr->h_proto != htons(ETH_P_8021Q)) + tp_len = -EMSGSIZE; + } if (unlikely(tp_len < 0)) { if (po->tp_loss) { __packet_set_status(po, ph, @@ -2219,10 +2311,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) } } - skb_set_queue_mapping(skb, packet_pick_tx_queue(dev)); + packet_pick_tx_queue(dev, skb); + skb->destructor = tpacket_destruct_skb; __packet_set_status(po, ph, TP_STATUS_SENDING); - atomic_inc(&po->tx_ring.pending); + packet_inc_pending(&po->tx_ring); status = TP_STATUS_SEND_REQUEST; err = po->xmit(skb); @@ -2243,9 +2336,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) packet_increment_head(&po->tx_ring); len_sum += tp_len; } while (likely((ph != NULL) || - ((!(msg->msg_flags & MSG_DONTWAIT)) && - (atomic_read(&po->tx_ring.pending)))) - ); + /* Note: packet_read_pending() might be slow if we have + * to call it as it's per_cpu variable, but in fast-path + * we already short-circuit the loop with the first + * condition, and luckily don't have to go that path + * anyway. 
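
The tx_ring pending counter above trades an atomic_t for a per-CPU refcount: writers only touch their local counter, and the comparatively rare reader sums all CPUs, which is exactly why packet_read_pending() is kept off the fast path. A rough stand-alone sketch of that pattern, with illustrative names rather than the af_packet ones:

#include <linux/errno.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>

struct example_ring {
	unsigned int __percpu *pending_refcnt;
};

static int example_alloc_pending(struct example_ring *rb)
{
	rb->pending_refcnt = alloc_percpu(unsigned int);
	return rb->pending_refcnt ? 0 : -ENOBUFS;
}

static void example_inc_pending(struct example_ring *rb)
{
	this_cpu_inc(*rb->pending_refcnt);	/* cheap, no cross-CPU traffic */
}

static void example_dec_pending(struct example_ring *rb)
{
	this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int example_read_pending(const struct example_ring *rb)
{
	unsigned int sum = 0;
	int cpu;

	/* Slow path: walk every possible CPU and add up the shards. */
	for_each_possible_cpu(cpu)
		sum += *per_cpu_ptr(rb->pending_refcnt, cpu);
	return sum;
}
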
+ */ + (need_wait && packet_read_pending(&po->tx_ring)))); err = len_sum; goto out_put; @@ -2287,7 +2384,7 @@ static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; - struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); struct sk_buff *skb; struct net_device *dev; __be16 proto; @@ -2429,7 +2526,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - skb_set_queue_mapping(skb, packet_pick_tx_queue(dev)); + + packet_pick_tx_queue(dev, skb); if (po->has_vnet_hdr) { if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { @@ -2544,6 +2642,7 @@ static int packet_release(struct socket *sock) /* Purge queues */ skb_queue_purge(&sk->sk_receive_queue); + packet_free_pending(po); sk_refcnt_debug_release(sk); sock_put(sk); @@ -2554,9 +2653,12 @@ static int packet_release(struct socket *sock) * Attach a packet hook. */ -static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol) +static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto) { struct packet_sock *po = pkt_sk(sk); + const struct net_device *dev_curr; + __be16 proto_curr; + bool need_rehook; if (po->fanout) { if (dev) @@ -2566,21 +2668,29 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc } lock_sock(sk); - spin_lock(&po->bind_lock); - unregister_prot_hook(sk, true); - po->num = protocol; - po->prot_hook.type = protocol; - if (po->prot_hook.dev) - dev_put(po->prot_hook.dev); + proto_curr = po->prot_hook.type; + dev_curr = po->prot_hook.dev; - po->prot_hook.dev = dev; - po->ifindex = dev ? dev->ifindex : 0; + need_rehook = proto_curr != proto || dev_curr != dev; + + if (need_rehook) { + unregister_prot_hook(sk, true); + + po->num = proto; + po->prot_hook.type = proto; - packet_cached_dev_assign(po, dev); + if (po->prot_hook.dev) + dev_put(po->prot_hook.dev); - if (protocol == 0) + po->prot_hook.dev = dev; + + po->ifindex = dev ? dev->ifindex : 0; + packet_cached_dev_assign(po, dev); + } + + if (proto == 0 || !need_rehook) goto out_unlock; if (!dev || (dev->flags & IFF_UP)) { @@ -2694,6 +2804,10 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, po->num = proto; po->xmit = dev_queue_xmit; + err = packet_alloc_pending(po); + if (err) + goto out2; + packet_cached_dev_reset(po); sk->sk_destruct = packet_sock_destruct; @@ -2726,6 +2840,8 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, preempt_enable(); return 0; +out2: + sk_free(sk); out: return err; } @@ -2845,6 +2961,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, * in, we fill it in now. 
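
Earlier in the af_packet.c diff, fanout_demux_hash() and fanout_demux_rnd() move to reciprocal_scale() and prandom_u32_max(). Both map a full-range 32-bit value into [0, num) with a multiply-and-shift instead of a modulo. The helper bodies below are shown purely for illustration (the real ones live in the kernel's common headers), assuming they behave as described:

#include <linux/types.h>
#include <linux/random.h>

/* Scale a full-range 32-bit value val into [0, ep_ro) without a division. */
static inline u32 example_reciprocal_scale(u32 val, u32 ep_ro)
{
	return (u32)(((u64)val * ep_ro) >> 32);
}

/* Pick a pseudo-random index below num, as a random fanout mode would. */
static inline u32 example_pick(u32 num)
{
	return example_reciprocal_scale(prandom_u32(), num);
}
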
*/ if (sock->type == SOCK_PACKET) { + __sockaddr_check_size(sizeof(struct sockaddr_pkt)); msg->msg_namelen = sizeof(struct sockaddr_pkt); } else { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; @@ -2867,11 +2984,12 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, aux.tp_net = skb_network_offset(skb); if (vlan_tx_tag_present(skb)) { aux.tp_vlan_tci = vlan_tx_tag_get(skb); - aux.tp_status |= TP_STATUS_VLAN_VALID; + aux.tp_vlan_tpid = ntohs(skb->vlan_proto); + aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { aux.tp_vlan_tci = 0; + aux.tp_vlan_tpid = 0; } - aux.tp_padding = 0; put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); } @@ -3570,34 +3688,26 @@ static void free_pg_vec(struct pgv *pg_vec, unsigned int order, static char *alloc_one_pg_vec_page(unsigned long order) { - char *buffer = NULL; + char *buffer; gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; buffer = (char *) __get_free_pages(gfp_flags, order); - if (buffer) return buffer; - /* - * __get_free_pages failed, fall back to vmalloc - */ + /* __get_free_pages failed, fall back to vmalloc */ buffer = vzalloc((1 << order) * PAGE_SIZE); - if (buffer) return buffer; - /* - * vmalloc failed, lets dig into swap here - */ + /* vmalloc failed, lets dig into swap here */ gfp_flags &= ~__GFP_NORETRY; - buffer = (char *)__get_free_pages(gfp_flags, order); + buffer = (char *) __get_free_pages(gfp_flags, order); if (buffer) return buffer; - /* - * complete and utter failure - */ + /* complete and utter failure */ return NULL; } @@ -3652,7 +3762,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, if (!closing) { if (atomic_read(&po->mapped)) goto out; - if (atomic_read(&rb->pending)) + if (packet_read_pending(rb)) goto out; } @@ -3704,7 +3814,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, */ if (!tx_ring) init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring); - break; + break; default: break; } diff --git a/net/packet/diag.c b/net/packet/diag.c index a9584a2f6d6..92f2c7107ee 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -3,6 +3,7 @@ #include <linux/net.h> #include <linux/netdevice.h> #include <linux/packet_diag.h> +#include <linux/percpu.h> #include <net/net_namespace.h> #include <net/sock.h> @@ -127,6 +128,7 @@ static int pdiag_put_fanout(struct packet_sock *po, struct sk_buff *nlskb) static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct packet_diag_req *req, + bool may_report_filterinfo, struct user_namespace *user_ns, u32 portid, u32 seq, u32 flags, int sk_ino) { @@ -171,7 +173,8 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, goto out_nlmsg_trim; if ((req->pdiag_show & PACKET_SHOW_FILTER) && - sock_diag_put_filterinfo(user_ns, sk, skb, PACKET_DIAG_FILTER)) + sock_diag_put_filterinfo(may_report_filterinfo, sk, skb, + PACKET_DIAG_FILTER)) goto out_nlmsg_trim; return nlmsg_end(skb, nlh); @@ -187,9 +190,11 @@ static int packet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) struct packet_diag_req *req; struct net *net; struct sock *sk; + bool may_report_filterinfo; net = sock_net(skb->sk); req = nlmsg_data(cb->nlh); + may_report_filterinfo = netlink_net_capable(cb->skb, CAP_NET_ADMIN); mutex_lock(&net->packet.sklist_lock); sk_for_each(sk, &net->packet.sklist) { @@ -199,6 +204,7 @@ static int packet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) goto next; if (sk_diag_fill(sk, skb, req, + may_report_filterinfo, 
sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, diff --git a/net/packet/internal.h b/net/packet/internal.h index 0a87d7b36c9..eb9580a6b25 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -64,7 +64,7 @@ struct packet_ring_buffer { unsigned int pg_vec_pages; unsigned int pg_vec_len; - atomic_t pending; + unsigned int __percpu *pending_refcnt; struct tpacket_kbdq_core prb_bdqc; }; diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c index 38946b26e47..290352c0e6b 100644 --- a/net/phonet/datagram.c +++ b/net/phonet/datagram.c @@ -86,7 +86,7 @@ static int pn_init(struct sock *sk) static int pn_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { - struct sockaddr_pn *target; + DECLARE_SOCKADDR(struct sockaddr_pn *, target, msg->msg_name); struct sk_buff *skb; int err; @@ -94,13 +94,12 @@ static int pn_sendmsg(struct kiocb *iocb, struct sock *sk, MSG_CMSG_COMPAT)) return -EOPNOTSUPP; - if (msg->msg_name == NULL) + if (target == NULL) return -EDESTADDRREQ; if (msg->msg_namelen < sizeof(struct sockaddr_pn)) return -EINVAL; - target = (struct sockaddr_pn *)msg->msg_name; if (target->spn_family != AF_PHONET) return -EAFNOSUPPORT; @@ -160,6 +159,7 @@ static int pn_recvmsg(struct kiocb *iocb, struct sock *sk, rval = (flags & MSG_TRUNC) ? skb->len : copylen; if (msg->msg_name != NULL) { + __sockaddr_check_size(sizeof(sa)); memcpy(msg->msg_name, &sa, sizeof(sa)); *addr_len = sizeof(sa); } diff --git a/net/phonet/pep-gprs.c b/net/phonet/pep-gprs.c index a2fba7edfd1..66dc65e7c6a 100644 --- a/net/phonet/pep-gprs.c +++ b/net/phonet/pep-gprs.c @@ -37,7 +37,7 @@ struct gprs_dev { struct sock *sk; void (*old_state_change)(struct sock *); - void (*old_data_ready)(struct sock *, int); + void (*old_data_ready)(struct sock *); void (*old_write_space)(struct sock *); struct net_device *dev; @@ -146,7 +146,7 @@ drop: return err; } -static void gprs_data_ready(struct sock *sk, int len) +static void gprs_data_ready(struct sock *sk) { struct gprs_dev *gp = sk->sk_user_data; struct sk_buff *skb; diff --git a/net/phonet/pep.c b/net/phonet/pep.c index e77411735de..70a547ea517 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -462,10 +462,9 @@ out: queue: skb->dev = NULL; skb_set_owner_r(skb, sk); - err = skb->len; skb_queue_tail(queue, skb); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, err); + sk->sk_data_ready(sk); return NET_RX_SUCCESS; } @@ -587,10 +586,9 @@ static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb) pn->rx_credits--; skb->dev = NULL; skb_set_owner_r(skb, sk); - err = skb->len; skb_queue_tail(&sk->sk_receive_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, err); + sk->sk_data_ready(sk); return NET_RX_SUCCESS; case PNS_PEP_CONNECT_RESP: @@ -698,7 +696,7 @@ static int pep_do_rcv(struct sock *sk, struct sk_buff *skb) skb_queue_head(&sk->sk_receive_queue, skb); sk_acceptq_added(sk); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); return NET_RX_SUCCESS; case PNS_PEP_DISCONNECT_REQ: diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index dc15f430080..b64151ade6b 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -70,10 +70,10 @@ static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh) int err; u8 pnaddr; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; - if (!capable(CAP_SYS_ADMIN)) + if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; ASSERT_RTNL(); 
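
The phonet and packet-diag hunks replace capable() with netlink_capable()/netlink_net_capable(), tying the permission check to the netlink socket that issued the request rather than only to the task currently processing the message. A hedged sketch of how a doit handler typically uses it; the handler name is illustrative:

#include <linux/errno.h>
#include <linux/capability.h>
#include <linux/netlink.h>
#include <net/sock.h>

static int example_doit(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	/* Require CAP_NET_ADMIN as judged from the request's originating
	 * socket, not just from whoever happens to run this code. */
	if (!netlink_capable(skb, CAP_NET_ADMIN))
		return -EPERM;

	/* ... perform the privileged configuration change ... */
	return 0;
}
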
@@ -233,10 +233,10 @@ static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh) int err; u8 dst; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; - if (!capable(CAP_SYS_ADMIN)) + if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; ASSERT_RTNL(); diff --git a/net/rds/bind.c b/net/rds/bind.c index b5ad65a0067..a2e6562da75 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -117,7 +117,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) rover = be16_to_cpu(*port); last = rover; } else { - rover = max_t(u16, net_random(), 2); + rover = max_t(u16, prandom_u32(), 2); last = rover - 1; } diff --git a/net/rds/ib.c b/net/rds/ib.c index b4c8b0022fe..ba2dffeff60 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -338,7 +338,8 @@ static int rds_ib_laddr_check(__be32 addr) ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); /* due to this, we will claim to support iWARP devices unless we check node_type. */ - if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA) + if (ret || !cm_id->device || + cm_id->device->node_type != RDMA_NODE_IB_CA) ret = -EADDRNOTAVAIL; rdsdebug("addr %pI4 ret %d node type %d\n", diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 8eb9501e3d6..d67de453c35 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -421,8 +421,7 @@ static void rds_ib_recv_cache_put(struct list_head *new_item, struct rds_ib_refill_cache *cache) { unsigned long flags; - struct list_head *old; - struct list_head __percpu *chpfirst; + struct list_head *old, *chpfirst; local_irq_save(flags); @@ -432,7 +431,7 @@ static void rds_ib_recv_cache_put(struct list_head *new_item, else /* put on front */ list_add_tail(new_item, chpfirst); - __this_cpu_write(chpfirst, new_item); + __this_cpu_write(cache->percpu->first, new_item); __this_cpu_inc(cache->percpu->count); if (__this_cpu_read(cache->percpu->count) < RDS_IB_RECYCLE_BATCH_COUNT) @@ -452,7 +451,7 @@ static void rds_ib_recv_cache_put(struct list_head *new_item, } while (old); - __this_cpu_write(chpfirst, NULL); + __this_cpu_write(cache->percpu->first, NULL); __this_cpu_write(cache->percpu->count, 0); end: local_irq_restore(flags); @@ -599,7 +598,7 @@ static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, { atomic64_set(&ic->i_ack_next, seq); if (ack_required) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); } } @@ -607,7 +606,7 @@ static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, static u64 rds_ib_get_ack(struct rds_ib_connection *ic) { clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return atomic64_read(&ic->i_ack_next); } diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 37be6e226d1..1dde91e3dc7 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -298,7 +298,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) rds_ib_stats_inc(s_ib_tx_cq_event); if (wc.wr_id == RDS_IB_ACK_WR_ID) { - if (ic->i_ack_queued + HZ/2 < jiffies) + if (time_after(jiffies, ic->i_ack_queued + HZ/2)) rds_ib_stats_inc(s_ib_tx_stalled); rds_ib_ack_send_complete(ic); continue; @@ -315,7 +315,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) rm = rds_ib_send_unmap_op(ic, send, wc.status); - if (send->s_queued + HZ/2 < jiffies) + if (time_after(jiffies, send->s_queued + HZ/2)) rds_ib_stats_inc(s_ib_tx_stalled); if (send->s_op) { diff --git a/net/rds/iw.c b/net/rds/iw.c index 7826d46baa7..589935661d6 100644 --- 
a/net/rds/iw.c +++ b/net/rds/iw.c @@ -239,7 +239,8 @@ static int rds_iw_laddr_check(__be32 addr) ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); /* due to this, we will claim to support IB devices unless we check node_type. */ - if (ret || cm_id->device->node_type != RDMA_NODE_RNIC) + if (ret || !cm_id->device || + cm_id->device->node_type != RDMA_NODE_RNIC) ret = -EADDRNOTAVAIL; rdsdebug("addr %pI4 ret %d node type %d\n", diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c index 45033358358..aa8bf678600 100644 --- a/net/rds/iw_recv.c +++ b/net/rds/iw_recv.c @@ -429,7 +429,7 @@ static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, { atomic64_set(&ic->i_ack_next, seq); if (ack_required) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); } } @@ -437,7 +437,7 @@ static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, static u64 rds_iw_get_ack(struct rds_iw_connection *ic) { clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return atomic64_read(&ic->i_ack_next); } diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c index e40c3c5db2c..9105ea03aec 100644 --- a/net/rds/iw_send.c +++ b/net/rds/iw_send.c @@ -232,7 +232,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) } if (wc.wr_id == RDS_IW_ACK_WR_ID) { - if (ic->i_ack_queued + HZ/2 < jiffies) + if (time_after(jiffies, ic->i_ack_queued + HZ/2)) rds_iw_stats_inc(s_iw_tx_stalled); rds_iw_ack_send_complete(ic); continue; @@ -267,7 +267,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) send->s_wr.opcode = 0xdead; send->s_wr.num_sge = 1; - if (send->s_queued + HZ/2 < jiffies) + if (time_after(jiffies, send->s_queued + HZ/2)) rds_iw_stats_inc(s_iw_tx_stalled); /* If a RDMA operation produced an error, signal this right diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c index 89c91515ed0..139239d2cb2 100644 --- a/net/rds/iw_sysctl.c +++ b/net/rds/iw_sysctl.c @@ -111,8 +111,7 @@ static struct ctl_table rds_iw_sysctl_table[] = { void rds_iw_sysctl_exit(void) { - if (rds_iw_sysctl_hdr) - unregister_net_sysctl_table(rds_iw_sysctl_hdr); + unregister_net_sysctl_table(rds_iw_sysctl_hdr); } int rds_iw_sysctl_init(void) diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index c2be901d19e..6cd9d1deafc 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -168,7 +168,7 @@ static int rds_rdma_listen_init(void) return ret; } - sin.sin_family = AF_INET, + sin.sin_family = AF_INET; sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); sin.sin_port = (__force u16)htons(RDS_PORT); diff --git a/net/rds/recv.c b/net/rds/recv.c index de339b24ca1..bd82522534f 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -402,7 +402,7 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, struct rds_sock *rs = rds_sk_to_rs(sk); long timeo; int ret = 0, nonblock = msg_flags & MSG_DONTWAIT; - struct sockaddr_in *sin; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct rds_incoming *inc = NULL; /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. 
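
The RDS completion-handler changes above swap open-coded jiffies comparisons of the form "a + HZ/2 < jiffies" for time_after(), which compares via signed subtraction and therefore stays correct when jiffies wraps around. A small self-contained illustration (context simplified, name invented):

#include <linux/types.h>
#include <linux/jiffies.h>

/* Count a send as stalled if it was queued more than half a second ago.
 * time_after(a, b) is true when a is later than b, even across wrap. */
static bool example_send_stalled(unsigned long queued_jiffies)
{
	return time_after(jiffies, queued_jiffies + HZ / 2);
}
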
*/ @@ -479,7 +479,6 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, rds_stats_inc(s_recv_delivered); - sin = (struct sockaddr_in *)msg->msg_name; if (sin) { sin->sin_family = AF_INET; sin->sin_port = inc->i_hdr.h_sport; diff --git a/net/rds/send.c b/net/rds/send.c index 88eace57dd6..23718160d71 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -107,7 +107,7 @@ static int acquire_in_xmit(struct rds_connection *conn) static void release_in_xmit(struct rds_connection *conn) { clear_bit(RDS_IN_XMIT, &conn->c_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); /* * We don't use wait_on_bit()/wake_up_bit() because our waking is in a * hot path and finding waiters is very rare. We don't want to walk @@ -661,7 +661,7 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack, /* order flag updates with spin locks */ if (!list_empty(&list)) - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); spin_unlock_irqrestore(&conn->c_lock, flags); @@ -691,7 +691,7 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) } /* order flag updates with the rs lock */ - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); spin_unlock_irqrestore(&rs->rs_lock, flags); @@ -922,7 +922,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); - struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); __be32 daddr; __be16 dport; struct rds_message *rm = NULL; diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c index b5cb2aa08f3..c3b0cd43eb5 100644 --- a/net/rds/sysctl.c +++ b/net/rds/sysctl.c @@ -94,8 +94,7 @@ static struct ctl_table rds_sysctl_rds_table[] = { void rds_sysctl_exit(void) { - if (rds_sysctl_reg_table) - unregister_net_sysctl_table(rds_sysctl_reg_table); + unregister_net_sysctl_table(rds_sysctl_reg_table); } int rds_sysctl_init(void) diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 9cf2927d002..65637491f72 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -61,12 +61,12 @@ void rds_tcp_state_change(struct sock *sk); /* tcp_listen.c */ int rds_tcp_listen_init(void); void rds_tcp_listen_stop(void); -void rds_tcp_listen_data_ready(struct sock *sk, int bytes); +void rds_tcp_listen_data_ready(struct sock *sk); /* tcp_recv.c */ int rds_tcp_recv_init(void); void rds_tcp_recv_exit(void); -void rds_tcp_data_ready(struct sock *sk, int bytes); +void rds_tcp_data_ready(struct sock *sk); int rds_tcp_recv(struct rds_connection *conn); void rds_tcp_inc_free(struct rds_incoming *inc); int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 7787537e9c2..23ab4dcd1d9 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -108,9 +108,9 @@ static void rds_tcp_accept_worker(struct work_struct *work) cond_resched(); } -void rds_tcp_listen_data_ready(struct sock *sk, int bytes) +void rds_tcp_listen_data_ready(struct sock *sk) { - void (*ready)(struct sock *sk, int bytes); + void (*ready)(struct sock *sk); rdsdebug("listen data ready sk %p\n", sk); @@ -132,7 +132,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes) out: read_unlock(&sk->sk_callback_lock); - ready(sk, bytes); + ready(sk); } int rds_tcp_listen_init(void) @@ -153,7 +153,7 @@ int rds_tcp_listen_init(void) sock->sk->sk_data_ready = rds_tcp_listen_data_ready; write_unlock_bh(&sock->sk->sk_callback_lock); - sin.sin_family = PF_INET, + 
sin.sin_family = PF_INET; sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); sin.sin_port = (__force u16)htons(RDS_TCP_PORT); diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index 4fac4f2bb9d..9ae6e0a264e 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -314,13 +314,13 @@ int rds_tcp_recv(struct rds_connection *conn) return ret; } -void rds_tcp_data_ready(struct sock *sk, int bytes) +void rds_tcp_data_ready(struct sock *sk) { - void (*ready)(struct sock *sk, int bytes); + void (*ready)(struct sock *sk); struct rds_connection *conn; struct rds_tcp_connection *tc; - rdsdebug("data ready sk %p bytes %d\n", sk, bytes); + rdsdebug("data ready sk %p\n", sk); read_lock(&sk->sk_callback_lock); conn = sk->sk_user_data; @@ -337,7 +337,7 @@ void rds_tcp_data_ready(struct sock *sk, int bytes) queue_delayed_work(rds_wq, &conn->c_recv_w, 0); out: read_unlock(&sk->sk_callback_lock); - ready(sk, bytes); + ready(sk); } int rds_tcp_recv_init(void) diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 81cf5a4c5e4..53b17ca0dff 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -93,7 +93,7 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, rm->m_ack_seq = tc->t_last_sent_nxt + sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1; - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags); tc->t_last_expected_una = rm->m_ack_seq + 1; diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 1bacc107994..b3b16c070a7 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -14,9 +14,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ #include <linux/kernel.h> @@ -791,7 +789,8 @@ void rfkill_resume_polling(struct rfkill *rfkill) if (!rfkill->ops->poll) return; - schedule_work(&rfkill->poll_work.work); + queue_delayed_work(system_power_efficient_wq, + &rfkill->poll_work, 0); } EXPORT_SYMBOL(rfkill_resume_polling); @@ -896,7 +895,8 @@ static void rfkill_poll(struct work_struct *work) */ rfkill->ops->poll(rfkill, rfkill->data); - schedule_delayed_work(&rfkill->poll_work, + queue_delayed_work(system_power_efficient_wq, + &rfkill->poll_work, round_jiffies_relative(POLL_INTERVAL)); } @@ -960,7 +960,8 @@ int __must_check rfkill_register(struct rfkill *rfkill) INIT_WORK(&rfkill->sync_work, rfkill_sync_work); if (rfkill->ops->poll) - schedule_delayed_work(&rfkill->poll_work, + queue_delayed_work(system_power_efficient_wq, + &rfkill->poll_work, round_jiffies_relative(POLL_INTERVAL)); if (!rfkill->persistent || rfkill_epo_lock_active) { diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index 5620d3c0747..14c98e48f26 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -25,19 +25,17 @@ #include <linux/clk.h> #include <linux/slab.h> #include <linux/acpi.h> -#include <linux/acpi_gpio.h> +#include <linux/gpio/consumer.h> #include <linux/rfkill-gpio.h> struct rfkill_gpio_data { const char *name; enum rfkill_type type; - int reset_gpio; - int shutdown_gpio; + struct gpio_desc *reset_gpio; + struct gpio_desc *shutdown_gpio; struct rfkill *rfkill_dev; - char *reset_name; - char *shutdown_name; struct clk *clk; bool clk_enabled; @@ -47,21 +45,14 @@ static int rfkill_gpio_set_power(void *data, bool blocked) { struct rfkill_gpio_data *rfkill = data; - if (blocked) { - if (gpio_is_valid(rfkill->shutdown_gpio)) - gpio_set_value(rfkill->shutdown_gpio, 0); - if (gpio_is_valid(rfkill->reset_gpio)) - gpio_set_value(rfkill->reset_gpio, 0); - if (!IS_ERR(rfkill->clk) && rfkill->clk_enabled) - clk_disable(rfkill->clk); - } else { - if (!IS_ERR(rfkill->clk) && !rfkill->clk_enabled) - clk_enable(rfkill->clk); - if (gpio_is_valid(rfkill->reset_gpio)) - gpio_set_value(rfkill->reset_gpio, 1); - if (gpio_is_valid(rfkill->shutdown_gpio)) - gpio_set_value(rfkill->shutdown_gpio, 1); - } + if (!blocked && !IS_ERR(rfkill->clk) && !rfkill->clk_enabled) + clk_enable(rfkill->clk); + + gpiod_set_value_cansleep(rfkill->shutdown_gpio, !blocked); + gpiod_set_value_cansleep(rfkill->reset_gpio, !blocked); + + if (blocked && !IS_ERR(rfkill->clk) && rfkill->clk_enabled) + clk_disable(rfkill->clk); rfkill->clk_enabled = blocked; @@ -83,8 +74,6 @@ static int rfkill_gpio_acpi_probe(struct device *dev, rfkill->name = dev_name(dev); rfkill->type = (unsigned)id->driver_data; - rfkill->reset_gpio = acpi_get_gpio_by_index(dev, 0, NULL); - rfkill->shutdown_gpio = acpi_get_gpio_by_index(dev, 1, NULL); return 0; } @@ -93,9 +82,8 @@ static int rfkill_gpio_probe(struct platform_device *pdev) { struct rfkill_gpio_platform_data *pdata = pdev->dev.platform_data; struct rfkill_gpio_data *rfkill; - const char *clk_name = NULL; - int ret = 0; - int len = 0; + struct gpio_desc *gpio; + int ret; rfkill = devm_kzalloc(&pdev->dev, sizeof(*rfkill), GFP_KERNEL); if (!rfkill) @@ -106,61 +94,36 @@ static int rfkill_gpio_probe(struct platform_device *pdev) if (ret) return ret; } else if (pdata) { - clk_name = pdata->power_clk_name; rfkill->name = pdata->name; rfkill->type = pdata->type; - rfkill->reset_gpio = pdata->reset_gpio; - rfkill->shutdown_gpio = pdata->shutdown_gpio; } else { return -ENODEV; } - /* make sure at-least one of the GPIO is defined and that - 
* a name is specified for this instance */ - if ((!gpio_is_valid(rfkill->reset_gpio) && - !gpio_is_valid(rfkill->shutdown_gpio)) || !rfkill->name) { - pr_warn("%s: invalid platform data\n", __func__); - return -EINVAL; - } + rfkill->clk = devm_clk_get(&pdev->dev, NULL); - if (pdata && pdata->gpio_runtime_setup) { - ret = pdata->gpio_runtime_setup(pdev); - if (ret) { - pr_warn("%s: can't set up gpio\n", __func__); + gpio = devm_gpiod_get_index(&pdev->dev, "reset", 0); + if (!IS_ERR(gpio)) { + ret = gpiod_direction_output(gpio, 0); + if (ret) return ret; - } + rfkill->reset_gpio = gpio; } - len = strlen(rfkill->name); - rfkill->reset_name = devm_kzalloc(&pdev->dev, len + 7, GFP_KERNEL); - if (!rfkill->reset_name) - return -ENOMEM; - - rfkill->shutdown_name = devm_kzalloc(&pdev->dev, len + 10, GFP_KERNEL); - if (!rfkill->shutdown_name) - return -ENOMEM; - - snprintf(rfkill->reset_name, len + 6 , "%s_reset", rfkill->name); - snprintf(rfkill->shutdown_name, len + 9, "%s_shutdown", rfkill->name); - - rfkill->clk = devm_clk_get(&pdev->dev, clk_name); - - if (gpio_is_valid(rfkill->reset_gpio)) { - ret = devm_gpio_request_one(&pdev->dev, rfkill->reset_gpio, - 0, rfkill->reset_name); - if (ret) { - pr_warn("%s: failed to get reset gpio.\n", __func__); + gpio = devm_gpiod_get_index(&pdev->dev, "shutdown", 1); + if (!IS_ERR(gpio)) { + ret = gpiod_direction_output(gpio, 0); + if (ret) return ret; - } + rfkill->shutdown_gpio = gpio; } - if (gpio_is_valid(rfkill->shutdown_gpio)) { - ret = devm_gpio_request_one(&pdev->dev, rfkill->shutdown_gpio, - 0, rfkill->shutdown_name); - if (ret) { - pr_warn("%s: failed to get shutdown gpio.\n", __func__); - return ret; - } + /* Make sure at-least one of the GPIO is defined and that + * a name is specified for this instance + */ + if ((!rfkill->reset_gpio && !rfkill->shutdown_gpio) || !rfkill->name) { + dev_err(&pdev->dev, "invalid platform data\n"); + return -EINVAL; } rfkill->rfkill_dev = rfkill_alloc(rfkill->name, &pdev->dev, @@ -183,20 +146,23 @@ static int rfkill_gpio_probe(struct platform_device *pdev) static int rfkill_gpio_remove(struct platform_device *pdev) { struct rfkill_gpio_data *rfkill = platform_get_drvdata(pdev); - struct rfkill_gpio_platform_data *pdata = pdev->dev.platform_data; - if (pdata && pdata->gpio_runtime_close) - pdata->gpio_runtime_close(pdev); rfkill_unregister(rfkill->rfkill_dev); rfkill_destroy(rfkill->rfkill_dev); return 0; } +#ifdef CONFIG_ACPI static const struct acpi_device_id rfkill_acpi_match[] = { + { "BCM2E1A", RFKILL_TYPE_BLUETOOTH }, + { "BCM2E39", RFKILL_TYPE_BLUETOOTH }, + { "BCM2E3D", RFKILL_TYPE_BLUETOOTH }, { "BCM4752", RFKILL_TYPE_GPS }, + { "LNV4752", RFKILL_TYPE_GPS }, { }, }; +#endif static struct platform_driver rfkill_gpio_driver = { .probe = rfkill_gpio_probe, diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 33af77246bf..8451c8cdc9d 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1012,7 +1012,7 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros make_rose->source_call = facilities.source_call; make_rose->source_ndigis = facilities.source_ndigis; for (n = 0 ; n < facilities.source_ndigis ; n++) - make_rose->source_digis[n]= facilities.source_digis[n]; + make_rose->source_digis[n] = facilities.source_digis[n]; make_rose->neighbour = neigh; make_rose->device = dev; make_rose->facilities = facilities; @@ -1041,7 +1041,7 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros rose_start_heartbeat(make); if (!sock_flag(sk, SOCK_DEAD)) - 
sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); return 1; } @@ -1051,7 +1051,7 @@ static int rose_sendmsg(struct kiocb *iocb, struct socket *sock, { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); - struct sockaddr_rose *usrose = (struct sockaddr_rose *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_rose *, usrose, msg->msg_name); int err; struct full_sockaddr_rose srose; struct sk_buff *skb; @@ -1253,6 +1253,8 @@ static int rose_recvmsg(struct kiocb *iocb, struct socket *sock, if (msg->msg_name) { struct sockaddr_rose *srose; + DECLARE_SOCKADDR(struct full_sockaddr_rose *, full_srose, + msg->msg_name); memset(msg->msg_name, 0, sizeof(struct full_sockaddr_rose)); srose = msg->msg_name; @@ -1260,18 +1262,9 @@ static int rose_recvmsg(struct kiocb *iocb, struct socket *sock, srose->srose_addr = rose->dest_addr; srose->srose_call = rose->dest_call; srose->srose_ndigis = rose->dest_ndigis; - if (msg->msg_namelen >= sizeof(struct full_sockaddr_rose)) { - struct full_sockaddr_rose *full_srose = (struct full_sockaddr_rose *)msg->msg_name; - for (n = 0 ; n < rose->dest_ndigis ; n++) - full_srose->srose_digis[n] = rose->dest_digis[n]; - msg->msg_namelen = sizeof(struct full_sockaddr_rose); - } else { - if (rose->dest_ndigis >= 1) { - srose->srose_ndigis = 1; - srose->srose_digi = rose->dest_digis[0]; - } - msg->msg_namelen = sizeof(struct sockaddr_rose); - } + for (n = 0 ; n < rose->dest_ndigis ; n++) + full_srose->srose_digis[n] = rose->dest_digis[n]; + msg->msg_namelen = sizeof(struct full_sockaddr_rose); } skb_free_datagram(sk, skb); diff --git a/net/rose/rose_dev.c b/net/rose/rose_dev.c index 28dbdb911b8..50005888be5 100644 --- a/net/rose/rose_dev.c +++ b/net/rose/rose_dev.c @@ -146,7 +146,7 @@ static netdev_tx_t rose_xmit(struct sk_buff *skb, struct net_device *dev) static const struct header_ops rose_header_ops = { .create = rose_header, - .rebuild= rose_rebuild_header, + .rebuild = rose_rebuild_header, }; static const struct net_device_ops rose_netdev_ops = { diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index d1c3429b69e..ec126f91276 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -20,9 +20,8 @@ af-rxrpc-y := \ ar-skbuff.o \ ar-transport.o -ifeq ($(CONFIG_PROC_FS),y) -af-rxrpc-y += ar-proc.o -endif +af-rxrpc-$(CONFIG_PROC_FS) += ar-proc.o +af-rxrpc-$(CONFIG_SYSCTL) += sysctl.o obj-$(CONFIG_AF_RXRPC) += af-rxrpc.o diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index e61aa6001c6..7b167048963 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -838,6 +838,12 @@ static int __init af_rxrpc_init(void) goto error_key_type_s; } + ret = rxrpc_sysctl_init(); + if (ret < 0) { + printk(KERN_CRIT "RxRPC: Cannot register sysctls\n"); + goto error_sysctls; + } + #ifdef CONFIG_PROC_FS proc_create("rxrpc_calls", 0, init_net.proc_net, &rxrpc_call_seq_fops); proc_create("rxrpc_conns", 0, init_net.proc_net, @@ -845,6 +851,8 @@ static int __init af_rxrpc_init(void) #endif return 0; +error_sysctls: + unregister_key_type(&key_type_rxrpc_s); error_key_type_s: unregister_key_type(&key_type_rxrpc); error_key_type: @@ -865,6 +873,7 @@ error_call_jar: static void __exit af_rxrpc_exit(void) { _enter(""); + rxrpc_sysctl_exit(); unregister_key_type(&key_type_rxrpc_s); unregister_key_type(&key_type_rxrpc); sock_unregister(PF_RXRPC); diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c index e4d9cbcff40..c6be17a959a 100644 --- a/net/rxrpc/ar-ack.c +++ b/net/rxrpc/ar-ack.c @@ -19,12 +19,61 @@ #include <net/af_rxrpc.h> #include "ar-internal.h" -static 
unsigned int rxrpc_ack_defer = 1; +/* + * How long to wait before scheduling ACK generation after seeing a + * packet with RXRPC_REQUEST_ACK set (in jiffies). + */ +unsigned rxrpc_requested_ack_delay = 1; -static const char *const rxrpc_acks[] = { - "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY", "IDL", - "-?-" -}; +/* + * How long to wait before scheduling an ACK with subtype DELAY (in jiffies). + * + * We use this when we've received new data packets. If those packets aren't + * all consumed within this time we will send a DELAY ACK if an ACK was not + * requested to let the sender know it doesn't need to resend. + */ +unsigned rxrpc_soft_ack_delay = 1 * HZ; + +/* + * How long to wait before scheduling an ACK with subtype IDLE (in jiffies). + * + * We use this when we've consumed some previously soft-ACK'd packets when + * further packets aren't immediately received to decide when to send an IDLE + * ACK let the other end know that it can free up its Tx buffer space. + */ +unsigned rxrpc_idle_ack_delay = 0.5 * HZ; + +/* + * Receive window size in packets. This indicates the maximum number of + * unconsumed received packets we're willing to retain in memory. Once this + * limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further + * packets. + */ +unsigned rxrpc_rx_window_size = 32; + +/* + * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet + * made by gluing normal packets together that we're willing to handle. + */ +unsigned rxrpc_rx_mtu = 5692; + +/* + * The maximum number of fragments in a received jumbo packet that we tell the + * sender that we're willing to handle. + */ +unsigned rxrpc_rx_jumbo_max = 4; + +static const char *rxrpc_acks(u8 reason) +{ + static const char *const str[] = { + "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY", + "IDL", "-?-" + }; + + if (reason >= ARRAY_SIZE(str)) + reason = ARRAY_SIZE(str) - 1; + return str[reason]; +} static const s8 rxrpc_ack_priority[] = { [0] = 0, @@ -50,7 +99,7 @@ void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, ASSERTCMP(prior, >, 0); _enter("{%d},%s,%%%x,%u", - call->debug_id, rxrpc_acks[ack_reason], ntohl(serial), + call->debug_id, rxrpc_acks(ack_reason), ntohl(serial), immediate); if (prior < rxrpc_ack_priority[call->ackr_reason]) { @@ -75,24 +124,23 @@ void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, switch (ack_reason) { case RXRPC_ACK_DELAY: _debug("run delay timer"); - call->ack_timer.expires = jiffies + rxrpc_ack_timeout * HZ; - add_timer(&call->ack_timer); - return; + expiry = rxrpc_soft_ack_delay; + goto run_timer; case RXRPC_ACK_IDLE: if (!immediate) { _debug("run defer timer"); - expiry = 1; + expiry = rxrpc_idle_ack_delay; goto run_timer; } goto cancel_timer; case RXRPC_ACK_REQUESTED: - if (!rxrpc_ack_defer) + expiry = rxrpc_requested_ack_delay; + if (!expiry) goto cancel_timer; if (!immediate || serial == cpu_to_be32(1)) { _debug("run defer timer"); - expiry = rxrpc_ack_defer; goto run_timer; } @@ -637,7 +685,7 @@ process_further: hard, ntohl(ack.previousPacket), ntohl(ack.serial), - rxrpc_acks[ack.reason], + rxrpc_acks(ack.reason), ack.nAcks); rxrpc_extract_ackinfo(call, skb, latest, ack.nAcks); @@ -1167,11 +1215,11 @@ send_ACK: mtu = call->conn->trans->peer->if_mtu; mtu -= call->conn->trans->peer->hdrsize; ackinfo.maxMTU = htonl(mtu); - ackinfo.rwind = htonl(32); + ackinfo.rwind = htonl(rxrpc_rx_window_size); /* permit the peer to send us jumbo packets if it wants to */ - ackinfo.rxMTU = htonl(5692); - 
ackinfo.jumbo_max = htonl(4); + ackinfo.rxMTU = htonl(rxrpc_rx_mtu); + ackinfo.jumbo_max = htonl(rxrpc_rx_jumbo_max); hdr.serial = htonl(atomic_inc_return(&call->conn->serial)); _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", @@ -1180,7 +1228,7 @@ send_ACK: ntohl(ack.firstPacket), ntohl(ack.previousPacket), ntohl(ack.serial), - rxrpc_acks[ack.reason], + rxrpc_acks(ack.reason), ack.nAcks); del_timer_sync(&call->ack_timer); diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/ar-call.c index a3bbb360a3f..a9e05db0f5d 100644 --- a/net/rxrpc/ar-call.c +++ b/net/rxrpc/ar-call.c @@ -12,10 +12,22 @@ #include <linux/slab.h> #include <linux/module.h> #include <linux/circ_buf.h> +#include <linux/hashtable.h> +#include <linux/spinlock_types.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "ar-internal.h" +/* + * Maximum lifetime of a call (in jiffies). + */ +unsigned rxrpc_max_call_lifetime = 60 * HZ; + +/* + * Time till dead call expires after last use (in jiffies). + */ +unsigned rxrpc_dead_call_expiry = 2 * HZ; + const char *const rxrpc_call_states[] = { [RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq", [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", @@ -38,8 +50,6 @@ const char *const rxrpc_call_states[] = { struct kmem_cache *rxrpc_call_jar; LIST_HEAD(rxrpc_calls); DEFINE_RWLOCK(rxrpc_call_lock); -static unsigned int rxrpc_call_max_lifetime = 60; -static unsigned int rxrpc_dead_call_timeout = 2; static void rxrpc_destroy_call(struct work_struct *work); static void rxrpc_call_life_expired(unsigned long _call); @@ -47,6 +57,145 @@ static void rxrpc_dead_call_expired(unsigned long _call); static void rxrpc_ack_time_expired(unsigned long _call); static void rxrpc_resend_time_expired(unsigned long _call); +static DEFINE_SPINLOCK(rxrpc_call_hash_lock); +static DEFINE_HASHTABLE(rxrpc_call_hash, 10); + +/* + * Hash function for rxrpc_call_hash + */ +static unsigned long rxrpc_call_hashfunc( + u8 clientflag, + __be32 cid, + __be32 call_id, + __be32 epoch, + __be16 service_id, + sa_family_t proto, + void *localptr, + unsigned int addr_size, + const u8 *peer_addr) +{ + const u16 *p; + unsigned int i; + unsigned long key; + u32 hcid = ntohl(cid); + + _enter(""); + + key = (unsigned long)localptr; + /* We just want to add up the __be32 values, so forcing the + * cast should be okay. 
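
The new call hashtable above is built on <linux/hashtable.h>: hash_add_rcu() under a spinlock on the write side, hash_for_each_possible_rcu() on the lookup side. A trimmed-down sketch of that usage; the struct and function names are illustrative, not the rxrpc ones:

#include <linux/types.h>
#include <linux/hashtable.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>

struct example_call {
	unsigned long hash_key;
	u32 call_id;
	struct hlist_node hash_node;
};

static DEFINE_HASHTABLE(example_call_hash, 10);	/* 2^10 buckets */
static DEFINE_SPINLOCK(example_call_hash_lock);

static void example_call_hash_add(struct example_call *call, unsigned long key)
{
	call->hash_key = key;
	spin_lock(&example_call_hash_lock);
	hash_add_rcu(example_call_hash, &call->hash_node, key);
	spin_unlock(&example_call_hash_lock);
}

/* Caller is expected to hold rcu_read_lock(). */
static struct example_call *example_call_hash_find(unsigned long key, u32 call_id)
{
	struct example_call *call;

	hash_for_each_possible_rcu(example_call_hash, call, hash_node, key) {
		if (call->hash_key == key && call->call_id == call_id)
			return call;
	}
	return NULL;
}
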
+ */ + key += (__force u32)epoch; + key += (__force u16)service_id; + key += (__force u32)call_id; + key += (hcid & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT; + key += hcid & RXRPC_CHANNELMASK; + key += clientflag; + key += proto; + /* Step through the peer address in 16-bit portions for speed */ + for (i = 0, p = (const u16 *)peer_addr; i < addr_size >> 1; i++, p++) + key += *p; + _leave(" key = 0x%lx", key); + return key; +} + +/* + * Add a call to the hashtable + */ +static void rxrpc_call_hash_add(struct rxrpc_call *call) +{ + unsigned long key; + unsigned int addr_size = 0; + + _enter(""); + switch (call->proto) { + case AF_INET: + addr_size = sizeof(call->peer_ip.ipv4_addr); + break; + case AF_INET6: + addr_size = sizeof(call->peer_ip.ipv6_addr); + break; + default: + break; + } + key = rxrpc_call_hashfunc(call->in_clientflag, call->cid, + call->call_id, call->epoch, + call->service_id, call->proto, + call->conn->trans->local, addr_size, + call->peer_ip.ipv6_addr); + /* Store the full key in the call */ + call->hash_key = key; + spin_lock(&rxrpc_call_hash_lock); + hash_add_rcu(rxrpc_call_hash, &call->hash_node, key); + spin_unlock(&rxrpc_call_hash_lock); + _leave(""); +} + +/* + * Remove a call from the hashtable + */ +static void rxrpc_call_hash_del(struct rxrpc_call *call) +{ + _enter(""); + spin_lock(&rxrpc_call_hash_lock); + hash_del_rcu(&call->hash_node); + spin_unlock(&rxrpc_call_hash_lock); + _leave(""); +} + +/* + * Find a call in the hashtable and return it, or NULL if it + * isn't there. + */ +struct rxrpc_call *rxrpc_find_call_hash( + u8 clientflag, + __be32 cid, + __be32 call_id, + __be32 epoch, + __be16 service_id, + void *localptr, + sa_family_t proto, + const u8 *peer_addr) +{ + unsigned long key; + unsigned int addr_size = 0; + struct rxrpc_call *call = NULL; + struct rxrpc_call *ret = NULL; + + _enter(""); + switch (proto) { + case AF_INET: + addr_size = sizeof(call->peer_ip.ipv4_addr); + break; + case AF_INET6: + addr_size = sizeof(call->peer_ip.ipv6_addr); + break; + default: + break; + } + + key = rxrpc_call_hashfunc(clientflag, cid, call_id, epoch, + service_id, proto, localptr, addr_size, + peer_addr); + hash_for_each_possible_rcu(rxrpc_call_hash, call, hash_node, key) { + if (call->hash_key == key && + call->call_id == call_id && + call->cid == cid && + call->in_clientflag == clientflag && + call->service_id == service_id && + call->proto == proto && + call->local == localptr && + memcmp(call->peer_ip.ipv6_addr, peer_addr, + addr_size) == 0 && + call->epoch == epoch) { + ret = call; + break; + } + } + _leave(" = %p", ret); + return ret; +} + /* * allocate a new call */ @@ -91,7 +240,7 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) call->rx_data_expect = 1; call->rx_data_eaten = 0; call->rx_first_oos = 0; - call->ackr_win_top = call->rx_data_eaten + 1 + RXRPC_MAXACKS; + call->ackr_win_top = call->rx_data_eaten + 1 + rxrpc_rx_window_size; call->creation_jif = jiffies; return call; } @@ -128,11 +277,31 @@ static struct rxrpc_call *rxrpc_alloc_client_call( return ERR_PTR(ret); } + /* Record copies of information for hashtable lookup */ + call->proto = rx->proto; + call->local = trans->local; + switch (call->proto) { + case AF_INET: + call->peer_ip.ipv4_addr = + trans->peer->srx.transport.sin.sin_addr.s_addr; + break; + case AF_INET6: + memcpy(call->peer_ip.ipv6_addr, + trans->peer->srx.transport.sin6.sin6_addr.in6_u.u6_addr8, + sizeof(call->peer_ip.ipv6_addr)); + break; + } + call->epoch = call->conn->epoch; + call->service_id = call->conn->service_id; + 
call->in_clientflag = call->conn->in_clientflag; + /* Add the new call to the hashtable */ + rxrpc_call_hash_add(call); + spin_lock(&call->conn->trans->peer->lock); list_add(&call->error_link, &call->conn->trans->peer->error_targets); spin_unlock(&call->conn->trans->peer->lock); - call->lifetimer.expires = jiffies + rxrpc_call_max_lifetime * HZ; + call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime; add_timer(&call->lifetimer); _leave(" = %p", call); @@ -320,9 +489,12 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, parent = *p; call = rb_entry(parent, struct rxrpc_call, conn_node); - if (call_id < call->call_id) + /* The tree is sorted in order of the __be32 value without + * turning it into host order. + */ + if ((__force u32)call_id < (__force u32)call->call_id) p = &(*p)->rb_left; - else if (call_id > call->call_id) + else if ((__force u32)call_id > (__force u32)call->call_id) p = &(*p)->rb_right; else goto old_call; @@ -347,9 +519,31 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, list_add_tail(&call->link, &rxrpc_calls); write_unlock_bh(&rxrpc_call_lock); + /* Record copies of information for hashtable lookup */ + call->proto = rx->proto; + call->local = conn->trans->local; + switch (call->proto) { + case AF_INET: + call->peer_ip.ipv4_addr = + conn->trans->peer->srx.transport.sin.sin_addr.s_addr; + break; + case AF_INET6: + memcpy(call->peer_ip.ipv6_addr, + conn->trans->peer->srx.transport.sin6.sin6_addr.in6_u.u6_addr8, + sizeof(call->peer_ip.ipv6_addr)); + break; + default: + break; + } + call->epoch = conn->epoch; + call->service_id = conn->service_id; + call->in_clientflag = conn->in_clientflag; + /* Add the new call to the hashtable */ + rxrpc_call_hash_add(call); + _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id); - call->lifetimer.expires = jiffies + rxrpc_call_max_lifetime * HZ; + call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime; add_timer(&call->lifetimer); _leave(" = %p {%d} [new]", call, call->debug_id); return call; @@ -533,7 +727,7 @@ void rxrpc_release_call(struct rxrpc_call *call) del_timer_sync(&call->resend_timer); del_timer_sync(&call->ack_timer); del_timer_sync(&call->lifetimer); - call->deadspan.expires = jiffies + rxrpc_dead_call_timeout * HZ; + call->deadspan.expires = jiffies + rxrpc_dead_call_expiry; add_timer(&call->deadspan); _leave(""); @@ -665,6 +859,9 @@ static void rxrpc_cleanup_call(struct rxrpc_call *call) rxrpc_put_connection(call->conn); } + /* Remove the call from the hash */ + rxrpc_call_hash_del(call); + if (call->acks_window) { _debug("kill Tx window %d", CIRC_CNT(call->acks_head, call->acks_tail, diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c index 4106ca95ec8..6631f4f1e39 100644 --- a/net/rxrpc/ar-connection.c +++ b/net/rxrpc/ar-connection.c @@ -18,11 +18,15 @@ #include <net/af_rxrpc.h> #include "ar-internal.h" +/* + * Time till a connection expires after last use (in seconds). 
+ */ +unsigned rxrpc_connection_expiry = 10 * 60; + static void rxrpc_connection_reaper(struct work_struct *work); LIST_HEAD(rxrpc_connections); DEFINE_RWLOCK(rxrpc_connection_lock); -static unsigned long rxrpc_connection_timeout = 10 * 60; static DECLARE_DELAYED_WORK(rxrpc_connection_reap, rxrpc_connection_reaper); /* @@ -381,6 +385,8 @@ static int rxrpc_connect_exclusive(struct rxrpc_sock *rx, rxrpc_assign_connection_id(conn); rx->conn = conn; + } else { + spin_lock(&trans->client_lock); } /* we've got a connection with a free channel and we can now attach the @@ -860,7 +866,7 @@ static void rxrpc_connection_reaper(struct work_struct *work) spin_lock(&conn->trans->client_lock); write_lock(&conn->trans->conn_lock); - reap_time = conn->put_time + rxrpc_connection_timeout; + reap_time = conn->put_time + rxrpc_connection_expiry; if (atomic_read(&conn->usage) > 0) { ; @@ -914,7 +920,7 @@ void __exit rxrpc_destroy_all_connections(void) { _enter(""); - rxrpc_connection_timeout = 0; + rxrpc_connection_expiry = 0; cancel_delayed_work(&rxrpc_connection_reap); rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c index a9206087b4d..db57458c824 100644 --- a/net/rxrpc/ar-error.c +++ b/net/rxrpc/ar-error.c @@ -83,6 +83,7 @@ void rxrpc_UDP_error_report(struct sock *sk) if (mtu == 0) { /* they didn't give us a size, estimate one */ + mtu = peer->if_mtu; if (mtu > 1500) { mtu >>= 1; if (mtu < 1500) diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c index 529572f18d1..63b21e580de 100644 --- a/net/rxrpc/ar-input.c +++ b/net/rxrpc/ar-input.c @@ -25,8 +25,6 @@ #include <net/net_namespace.h> #include "ar-internal.h" -unsigned long rxrpc_ack_timeout = 1; - const char *rxrpc_pkts[] = { "?00", "DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG", @@ -115,7 +113,7 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, spin_unlock_bh(&sk->sk_receive_queue.lock); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb_len); + sk->sk_data_ready(sk); } skb = NULL; } else { @@ -349,8 +347,7 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) * it */ if (sp->hdr.flags & RXRPC_REQUEST_ACK) { _proto("ACK Requested on %%%u", serial); - rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, sp->hdr.serial, - !(sp->hdr.flags & RXRPC_MORE_PACKETS)); + rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, sp->hdr.serial, false); } switch (sp->hdr.type) { @@ -526,36 +523,38 @@ protocol_error: * post an incoming packet to the appropriate call/socket to deal with * - must get rid of the sk_buff, either by freeing it or by queuing it */ -static void rxrpc_post_packet_to_call(struct rxrpc_connection *conn, +static void rxrpc_post_packet_to_call(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp; - struct rxrpc_call *call; - struct rb_node *p; - __be32 call_id; - - _enter("%p,%p", conn, skb); - read_lock_bh(&conn->lock); + _enter("%p,%p", call, skb); sp = rxrpc_skb(skb); - /* look at extant calls by channel number first */ - call = conn->channels[ntohl(sp->hdr.cid) & RXRPC_CHANNELMASK]; - if (!call || call->call_id != sp->hdr.callNumber) - goto call_not_extant; - _debug("extant call [%d]", call->state); - ASSERTCMP(call->conn, ==, conn); read_lock(&call->state_lock); switch (call->state) { case RXRPC_CALL_LOCALLY_ABORTED: - if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events)) + if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events)) { rxrpc_queue_call(call); + goto free_unlock; + } case 
RXRPC_CALL_REMOTELY_ABORTED: case RXRPC_CALL_NETWORK_ERROR: case RXRPC_CALL_DEAD: + goto dead_call; + case RXRPC_CALL_COMPLETE: + case RXRPC_CALL_CLIENT_FINAL_ACK: + /* complete server call */ + if (call->conn->in_clientflag) + goto dead_call; + /* resend last packet of a completed call */ + _debug("final ack again"); + rxrpc_get_call(call); + set_bit(RXRPC_CALL_ACK_FINAL, &call->events); + rxrpc_queue_call(call); goto free_unlock; default: break; @@ -563,7 +562,6 @@ static void rxrpc_post_packet_to_call(struct rxrpc_connection *conn, read_unlock(&call->state_lock); rxrpc_get_call(call); - read_unlock_bh(&conn->lock); if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && sp->hdr.flags & RXRPC_JUMBO_PACKET) @@ -574,78 +572,16 @@ static void rxrpc_post_packet_to_call(struct rxrpc_connection *conn, rxrpc_put_call(call); goto done; -call_not_extant: - /* search the completed calls in case what we're dealing with is - * there */ - _debug("call not extant"); - - call_id = sp->hdr.callNumber; - p = conn->calls.rb_node; - while (p) { - call = rb_entry(p, struct rxrpc_call, conn_node); - - if (call_id < call->call_id) - p = p->rb_left; - else if (call_id > call->call_id) - p = p->rb_right; - else - goto found_completed_call; - } - dead_call: - /* it's a either a really old call that we no longer remember or its a - * new incoming call */ - read_unlock_bh(&conn->lock); - - if (sp->hdr.flags & RXRPC_CLIENT_INITIATED && - sp->hdr.seq == cpu_to_be32(1)) { - _debug("incoming call"); - skb_queue_tail(&conn->trans->local->accept_queue, skb); - rxrpc_queue_work(&conn->trans->local->acceptor); - goto done; - } - - _debug("dead call"); - skb->priority = RX_CALL_DEAD; - rxrpc_reject_packet(conn->trans->local, skb); - goto done; - - /* resend last packet of a completed call - * - client calls may have been aborted or ACK'd - * - server calls may have been aborted - */ -found_completed_call: - _debug("completed call"); - - if (atomic_read(&call->usage) == 0) - goto dead_call; - - /* synchronise any state changes */ - read_lock(&call->state_lock); - ASSERTIFCMP(call->state != RXRPC_CALL_CLIENT_FINAL_ACK, - call->state, >=, RXRPC_CALL_COMPLETE); - - if (call->state == RXRPC_CALL_LOCALLY_ABORTED || - call->state == RXRPC_CALL_REMOTELY_ABORTED || - call->state == RXRPC_CALL_DEAD) { - read_unlock(&call->state_lock); - goto dead_call; - } - - if (call->conn->in_clientflag) { - read_unlock(&call->state_lock); - goto dead_call; /* complete server call */ + if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { + skb->priority = RX_CALL_DEAD; + rxrpc_reject_packet(call->conn->trans->local, skb); + goto unlock; } - - _debug("final ack again"); - rxrpc_get_call(call); - set_bit(RXRPC_CALL_ACK_FINAL, &call->events); - rxrpc_queue_call(call); - free_unlock: - read_unlock(&call->state_lock); - read_unlock_bh(&conn->lock); rxrpc_free_skb(skb); +unlock: + read_unlock(&call->state_lock); done: _leave(""); } @@ -664,21 +600,46 @@ static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, rxrpc_queue_conn(conn); } +static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local, + struct sk_buff *skb, + struct rxrpc_skb_priv *sp) +{ + struct rxrpc_peer *peer; + struct rxrpc_transport *trans; + struct rxrpc_connection *conn; + + peer = rxrpc_find_peer(local, ip_hdr(skb)->saddr, + udp_hdr(skb)->source); + if (IS_ERR(peer)) + goto cant_find_conn; + + trans = rxrpc_find_transport(local, peer); + rxrpc_put_peer(peer); + if (!trans) + goto cant_find_conn; + + conn = rxrpc_find_connection(trans, &sp->hdr); + 
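/* rxrpc_find_connection() takes its own usage reference on the connection it returns, so the transport reference obtained just above can be dropped immediately. */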
rxrpc_put_transport(trans); + if (!conn) + goto cant_find_conn; + + return conn; +cant_find_conn: + return NULL; +} + /* * handle data received on the local endpoint * - may be called in interrupt context */ -void rxrpc_data_ready(struct sock *sk, int count) +void rxrpc_data_ready(struct sock *sk) { - struct rxrpc_connection *conn; - struct rxrpc_transport *trans; struct rxrpc_skb_priv *sp; struct rxrpc_local *local; - struct rxrpc_peer *peer; struct sk_buff *skb; int ret; - _enter("%p, %d", sk, count); + _enter("%p", sk); ASSERT(!irqs_disabled()); @@ -749,27 +710,34 @@ void rxrpc_data_ready(struct sock *sk, int count) (sp->hdr.callNumber == 0 || sp->hdr.seq == 0)) goto bad_message; - peer = rxrpc_find_peer(local, ip_hdr(skb)->saddr, udp_hdr(skb)->source); - if (IS_ERR(peer)) - goto cant_route_call; + if (sp->hdr.callNumber == 0) { + /* This is a connection-level packet. These should be + * fairly rare, so the extra overhead of looking them up the + * old-fashioned way doesn't really hurt */ + struct rxrpc_connection *conn; - trans = rxrpc_find_transport(local, peer); - rxrpc_put_peer(peer); - if (!trans) - goto cant_route_call; + conn = rxrpc_conn_from_local(local, skb, sp); + if (!conn) + goto cant_route_call; - conn = rxrpc_find_connection(trans, &sp->hdr); - rxrpc_put_transport(trans); - if (!conn) - goto cant_route_call; - - _debug("CONN %p {%d}", conn, conn->debug_id); - - if (sp->hdr.callNumber == 0) + _debug("CONN %p {%d}", conn, conn->debug_id); rxrpc_post_packet_to_conn(conn, skb); - else - rxrpc_post_packet_to_call(conn, skb); - rxrpc_put_connection(conn); + rxrpc_put_connection(conn); + } else { + struct rxrpc_call *call; + u8 in_clientflag = 0; + + if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) + in_clientflag = RXRPC_CLIENT_INITIATED; + call = rxrpc_find_call_hash(in_clientflag, sp->hdr.cid, + sp->hdr.callNumber, sp->hdr.epoch, + sp->hdr.serviceId, local, AF_INET, + (u8 *)&ip_hdr(skb)->saddr); + if (call) + rxrpc_post_packet_to_call(call, skb); + else + goto cant_route_call; + } rxrpc_put_local(local); return; @@ -790,8 +758,10 @@ cant_route_call: skb->priority = RX_CALL_DEAD; } - _debug("reject"); - rxrpc_reject_packet(local, skb); + if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { + _debug("reject type %d",sp->hdr.type); + rxrpc_reject_packet(local, skb); + } rxrpc_put_local(local); _leave(" [no call]"); return; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 5f43675ee1d..ba9fd36d3f1 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -396,9 +396,20 @@ struct rxrpc_call { #define RXRPC_ACKR_WINDOW_ASZ DIV_ROUND_UP(RXRPC_MAXACKS, BITS_PER_LONG) unsigned long ackr_window[RXRPC_ACKR_WINDOW_ASZ + 1]; + struct hlist_node hash_node; + unsigned long hash_key; /* Full hash key */ + u8 in_clientflag; /* Copy of conn->in_clientflag for hashing */ + struct rxrpc_local *local; /* Local endpoint. Used for hashing. 
*/ + sa_family_t proto; /* Frame protocol */ /* the following should all be in net order */ __be32 cid; /* connection ID + channel index */ __be32 call_id; /* call ID on connection */ + __be32 epoch; /* epoch of this connection */ + __be16 service_id; /* service ID */ + union { /* Peer IP address for hashing */ + __be32 ipv4_addr; + __u8 ipv6_addr[16]; /* Anticipates eventual IPv6 support */ + } peer_ip; }; /* @@ -433,6 +444,13 @@ int rxrpc_reject_call(struct rxrpc_sock *); /* * ar-ack.c */ +extern unsigned rxrpc_requested_ack_delay; +extern unsigned rxrpc_soft_ack_delay; +extern unsigned rxrpc_idle_ack_delay; +extern unsigned rxrpc_rx_window_size; +extern unsigned rxrpc_rx_mtu; +extern unsigned rxrpc_rx_jumbo_max; + void __rxrpc_propose_ACK(struct rxrpc_call *, u8, __be32, bool); void rxrpc_propose_ACK(struct rxrpc_call *, u8, __be32, bool); void rxrpc_process_call(struct work_struct *); @@ -440,10 +458,14 @@ void rxrpc_process_call(struct work_struct *); /* * ar-call.c */ +extern unsigned rxrpc_max_call_lifetime; +extern unsigned rxrpc_dead_call_expiry; extern struct kmem_cache *rxrpc_call_jar; extern struct list_head rxrpc_calls; extern rwlock_t rxrpc_call_lock; +struct rxrpc_call *rxrpc_find_call_hash(u8, __be32, __be32, __be32, + __be16, void *, sa_family_t, const u8 *); struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *, struct rxrpc_transport *, struct rxrpc_conn_bundle *, @@ -460,6 +482,7 @@ void __exit rxrpc_destroy_all_calls(void); /* * ar-connection.c */ +extern unsigned rxrpc_connection_expiry; extern struct list_head rxrpc_connections; extern rwlock_t rxrpc_connection_lock; @@ -493,10 +516,9 @@ void rxrpc_UDP_error_handler(struct work_struct *); /* * ar-input.c */ -extern unsigned long rxrpc_ack_timeout; extern const char *rxrpc_pkts[]; -void rxrpc_data_ready(struct sock *, int); +void rxrpc_data_ready(struct sock *); int rxrpc_queue_rcv_skb(struct rxrpc_call *, struct sk_buff *, bool, bool); void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *); @@ -504,6 +526,7 @@ void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *); * ar-local.c */ extern rwlock_t rxrpc_local_lock; + struct rxrpc_local *rxrpc_lookup_local(struct sockaddr_rxrpc *); void rxrpc_put_local(struct rxrpc_local *); void __exit rxrpc_destroy_all_locals(void); @@ -522,7 +545,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time_t, /* * ar-output.c */ -extern int rxrpc_resend_timeout; +extern unsigned rxrpc_resend_timeout; int rxrpc_send_packet(struct rxrpc_transport *, struct sk_buff *); int rxrpc_client_sendmsg(struct kiocb *, struct rxrpc_sock *, @@ -572,6 +595,8 @@ void rxrpc_packet_destructor(struct sk_buff *); /* * ar-transport.c */ +extern unsigned rxrpc_transport_expiry; + struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *, struct rxrpc_peer *, gfp_t); void rxrpc_put_transport(struct rxrpc_transport *); @@ -580,6 +605,17 @@ struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *, struct rxrpc_peer *); /* + * sysctl.c + */ +#ifdef CONFIG_SYSCTL +extern int __init rxrpc_sysctl_init(void); +extern void rxrpc_sysctl_exit(void); +#else +static inline int __init rxrpc_sysctl_init(void) { return 0; } +static inline void rxrpc_sysctl_exit(void) {} +#endif + +/* * debug tracing */ extern unsigned int rxrpc_debug; diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c index 7633a752c65..0ad080790a3 100644 --- a/net/rxrpc/ar-key.c +++ b/net/rxrpc/ar-key.c @@ -99,7 +99,7 @@ static int rxrpc_instantiate_xdr_rxkad(struct key *key, 
const __be32 *xdr, _debug("tktlen: %x", tktlen); if (tktlen > AFSTOKEN_RK_TIX_MAX) return -EKEYREJECTED; - if (8 * 4 + tktlen != toklen) + if (toklen < 8 * 4 + tktlen) return -EKEYREJECTED; plen = sizeof(*token) + sizeof(*token->kad) + tktlen; diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index e1ac183d50b..0b4b9a79f5a 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -18,7 +18,10 @@ #include <net/af_rxrpc.h> #include "ar-internal.h" -int rxrpc_resend_timeout = 4; +/* + * Time till packet resend (in jiffies). + */ +unsigned rxrpc_resend_timeout = 4 * HZ; static int rxrpc_send_data(struct kiocb *iocb, struct rxrpc_sock *rx, @@ -152,8 +155,8 @@ int rxrpc_client_sendmsg(struct kiocb *iocb, struct rxrpc_sock *rx, if (trans) { service_id = rx->service_id; if (msg->msg_name) { - struct sockaddr_rxrpc *srx = - (struct sockaddr_rxrpc *) msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx, + msg->msg_name); service_id = htons(srx->srx_service); } key = rx->key; @@ -487,7 +490,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, ntohl(sp->hdr.serial), ntohl(sp->hdr.seq)); sp->need_resend = false; - sp->resend_at = jiffies + rxrpc_resend_timeout * HZ; + sp->resend_at = jiffies + rxrpc_resend_timeout; if (!test_and_set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags)) { _debug("run timer"); call->resend_timer.expires = sp->resend_at; @@ -666,6 +669,7 @@ static int rxrpc_send_data(struct kiocb *iocb, /* add the packet to the send queue if it's now full */ if (sp->remain <= 0 || (segment == 0 && !more)) { struct rxrpc_connection *conn = call->conn; + uint32_t seq; size_t pad; /* pad out if we're using security */ @@ -678,11 +682,12 @@ static int rxrpc_send_data(struct kiocb *iocb, memset(skb_put(skb, pad), 0, pad); } + seq = atomic_inc_return(&call->sequence); + sp->hdr.epoch = conn->epoch; sp->hdr.cid = call->cid; sp->hdr.callNumber = call->call_id; - sp->hdr.seq = - htonl(atomic_inc_return(&call->sequence)); + sp->hdr.seq = htonl(seq); sp->hdr.serial = htonl(atomic_inc_return(&conn->serial)); sp->hdr.type = RXRPC_PACKET_TYPE_DATA; @@ -697,6 +702,8 @@ static int rxrpc_send_data(struct kiocb *iocb, else if (CIRC_SPACE(call->acks_head, call->acks_tail, call->acks_winsz) > 1) sp->hdr.flags |= RXRPC_MORE_PACKETS; + if (more && seq & 1) + sp->hdr.flags |= RXRPC_REQUEST_ACK; ret = rxrpc_secure_packet( call, skb, skb->mark, diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c index 898492a8d61..e9aaa65c077 100644 --- a/net/rxrpc/ar-recvmsg.c +++ b/net/rxrpc/ar-recvmsg.c @@ -180,15 +180,7 @@ int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock, if (copy > len - copied) copy = len - copied; - if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - ret = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, copy); - } else { - ret = skb_copy_and_csum_datagram_iovec(skb, offset, - msg->msg_iov); - if (ret == -EINVAL) - goto csum_copy_error; - } + ret = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copy); if (ret < 0) goto copy_error; @@ -347,16 +339,6 @@ copy_error: _leave(" = %d", ret); return ret; -csum_copy_error: - _debug("csum error"); - release_sock(&rx->sk); - if (continue_call) - rxrpc_put_call(continue_call); - rxrpc_kill_skb(skb); - skb_kill_datagram(&rx->sk, skb, flags); - rxrpc_put_call(call); - return -EAGAIN; - wait_interrupted: ret = sock_intr_errno(timeo); wait_error: diff --git a/net/rxrpc/ar-skbuff.c b/net/rxrpc/ar-skbuff.c index de755e04d29..4cfab49e329 100644 --- a/net/rxrpc/ar-skbuff.c +++ b/net/rxrpc/ar-skbuff.c @@ -83,9 
+83,14 @@ static void rxrpc_hard_ACK_data(struct rxrpc_call *call, rxrpc_request_final_ACK(call); } else if (atomic_dec_and_test(&call->ackr_not_idle) && test_and_clear_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags)) { + /* We previously soft-ACK'd some received packets that have now + * been consumed, so send a hard-ACK if no more packets are + * immediately forthcoming to allow the transmitter to free up + * its Tx bufferage. + */ _debug("send Rx idle ACK"); __rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, sp->hdr.serial, - true); + false); } spin_unlock_bh(&call->lock); diff --git a/net/rxrpc/ar-transport.c b/net/rxrpc/ar-transport.c index 92df566930b..1976dec84f2 100644 --- a/net/rxrpc/ar-transport.c +++ b/net/rxrpc/ar-transport.c @@ -17,11 +17,15 @@ #include <net/af_rxrpc.h> #include "ar-internal.h" +/* + * Time after last use at which transport record is cleaned up. + */ +unsigned rxrpc_transport_expiry = 3600 * 24; + static void rxrpc_transport_reaper(struct work_struct *work); static LIST_HEAD(rxrpc_transports); static DEFINE_RWLOCK(rxrpc_transport_lock); -static unsigned long rxrpc_transport_timeout = 3600 * 24; static DECLARE_DELAYED_WORK(rxrpc_transport_reap, rxrpc_transport_reaper); /* @@ -235,7 +239,7 @@ static void rxrpc_transport_reaper(struct work_struct *work) if (likely(atomic_read(&trans->usage) > 0)) continue; - reap_time = trans->put_time + rxrpc_transport_timeout; + reap_time = trans->put_time + rxrpc_transport_expiry; if (reap_time <= now) list_move_tail(&trans->link, &graveyard); else if (reap_time < earliest) @@ -271,7 +275,7 @@ void __exit rxrpc_destroy_all_transports(void) { _enter(""); - rxrpc_transport_timeout = 0; + rxrpc_transport_expiry = 0; cancel_delayed_work(&rxrpc_transport_reap); rxrpc_queue_delayed_work(&rxrpc_transport_reap, 0); diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c new file mode 100644 index 00000000000..50a98a910eb --- /dev/null +++ b/net/rxrpc/sysctl.c @@ -0,0 +1,146 @@ +/* sysctls for configuring RxRPC operating parameters + * + * Copyright (C) 2014 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/sysctl.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include "ar-internal.h" + +static struct ctl_table_header *rxrpc_sysctl_reg_table; +static const unsigned zero = 0; +static const unsigned one = 1; +static const unsigned four = 4; +static const unsigned n_65535 = 65535; +static const unsigned n_max_acks = RXRPC_MAXACKS; + +/* + * RxRPC operating parameters. + * + * See Documentation/networking/rxrpc.txt and the variable definitions for more + * information on the individual parameters. 
+ */ +static struct ctl_table rxrpc_sysctl_table[] = { + /* Values measured in milliseconds */ + { + .procname = "req_ack_delay", + .data = &rxrpc_requested_ack_delay, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + .extra1 = (void *)&zero, + }, + { + .procname = "soft_ack_delay", + .data = &rxrpc_soft_ack_delay, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + .extra1 = (void *)&one, + }, + { + .procname = "idle_ack_delay", + .data = &rxrpc_idle_ack_delay, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + .extra1 = (void *)&one, + }, + { + .procname = "resend_timeout", + .data = &rxrpc_resend_timeout, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + .extra1 = (void *)&one, + }, + + /* Values measured in seconds but used in jiffies */ + { + .procname = "max_call_lifetime", + .data = &rxrpc_max_call_lifetime, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + .extra1 = (void *)&one, + }, + { + .procname = "dead_call_expiry", + .data = &rxrpc_dead_call_expiry, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + .extra1 = (void *)&one, + }, + + /* Values measured in seconds */ + { + .procname = "connection_expiry", + .data = &rxrpc_connection_expiry, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&one, + }, + { + .procname = "transport_expiry", + .data = &rxrpc_transport_expiry, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&one, + }, + + /* Non-time values */ + { + .procname = "rx_window_size", + .data = &rxrpc_rx_window_size, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&one, + .extra2 = (void *)&n_max_acks, + }, + { + .procname = "rx_mtu", + .data = &rxrpc_rx_mtu, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&one, + .extra2 = (void *)&n_65535, + }, + { + .procname = "rx_jumbo_max", + .data = &rxrpc_rx_jumbo_max, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&one, + .extra2 = (void *)&four, + }, + + { } +}; + +int __init rxrpc_sysctl_init(void) +{ + rxrpc_sysctl_reg_table = register_net_sysctl(&init_net, "net/rxrpc", + rxrpc_sysctl_table); + if (!rxrpc_sysctl_reg_table) + return -ENOMEM; + return 0; +} + +void rxrpc_sysctl_exit(void) +{ + if (rxrpc_sysctl_reg_table) + unregister_net_sysctl_table(rxrpc_sysctl_reg_table); +} diff --git a/net/sched/Kconfig b/net/sched/Kconfig index ad1f1d81920..a1a8e29e5fc 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -286,6 +286,28 @@ config NET_SCH_FQ If unsure, say N. +config NET_SCH_HHF + tristate "Heavy-Hitter Filter (HHF)" + help + Say Y here if you want to use the Heavy-Hitter Filter (HHF) + packet scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_hhf. + +config NET_SCH_PIE + tristate "Proportional Integral controller Enhanced (PIE) scheduler" + help + Say Y here if you want to use the Proportional Integral controller + Enhanced scheduler packet scheduling algorithm. + For more information, please see + http://tools.ietf.org/html/draft-pan-tsvwg-pie-00 + + To compile this driver as a module, choose M here: the module + will be called sch_pie. 
+ + If unsure, say N. + config NET_SCH_INGRESS tristate "Ingress Qdisc" depends on NET_CLS_ACT @@ -435,6 +457,7 @@ config NET_CLS_FLOW config NET_CLS_CGROUP tristate "Control Group Classifier" select NET_CLS + select CGROUP_NET_CLASSID depends on CGROUPS ---help--- Say Y here if you want to classify packets based on the control diff --git a/net/sched/Makefile b/net/sched/Makefile index 35fa47a494a..0a869a11f3e 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -40,6 +40,8 @@ obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o +obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o +obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 69cb848e834..648778aef1a 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -27,42 +27,40 @@ #include <net/act_api.h> #include <net/netlink.h> -void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo) +void tcf_hash_destroy(struct tc_action *a) { - unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); - struct tcf_common **p1p; - - for (p1p = &hinfo->htab[h]; *p1p; p1p = &(*p1p)->tcfc_next) { - if (*p1p == p) { - write_lock_bh(hinfo->lock); - *p1p = p->tcfc_next; - write_unlock_bh(hinfo->lock); - gen_kill_estimator(&p->tcfc_bstats, - &p->tcfc_rate_est); - /* - * gen_estimator est_timer() might access p->tcfc_lock - * or bstats, wait a RCU grace period before freeing p - */ - kfree_rcu(p, tcfc_rcu); - return; - } - } - WARN_ON(1); + struct tcf_common *p = a->priv; + struct tcf_hashinfo *hinfo = a->ops->hinfo; + + spin_lock_bh(&hinfo->lock); + hlist_del(&p->tcfc_head); + spin_unlock_bh(&hinfo->lock); + gen_kill_estimator(&p->tcfc_bstats, + &p->tcfc_rate_est); + /* + * gen_estimator est_timer() might access p->tcfc_lock + * or bstats, wait a RCU grace period before freeing p + */ + kfree_rcu(p, tcfc_rcu); } EXPORT_SYMBOL(tcf_hash_destroy); -int tcf_hash_release(struct tcf_common *p, int bind, - struct tcf_hashinfo *hinfo) +int tcf_hash_release(struct tc_action *a, int bind) { + struct tcf_common *p = a->priv; int ret = 0; if (p) { if (bind) p->tcfc_bindcnt--; + else if (p->tcfc_bindcnt > 0) + return -EPERM; p->tcfc_refcnt--; if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) { - tcf_hash_destroy(p, hinfo); + if (a->ops->cleanup) + a->ops->cleanup(a, bind); + tcf_hash_destroy(a); ret = 1; } } @@ -71,20 +69,22 @@ int tcf_hash_release(struct tcf_common *p, int bind, EXPORT_SYMBOL(tcf_hash_release); static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb, - struct tc_action *a, struct tcf_hashinfo *hinfo) + struct tc_action *a) { + struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct hlist_head *head; struct tcf_common *p; int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; struct nlattr *nest; - read_lock_bh(hinfo->lock); + spin_lock_bh(&hinfo->lock); s_i = cb->args[0]; for (i = 0; i < (hinfo->hmask + 1); i++) { - p = hinfo->htab[tcf_hash(i, hinfo->hmask)]; + head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; - for (; p; p = p->tcfc_next) { + hlist_for_each_entry_rcu(p, head, tcfc_head) { index++; if (index < s_i) continue; @@ -107,7 +107,7 @@ static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb, } } done: - read_unlock_bh(hinfo->lock); + spin_unlock_bh(&hinfo->lock); if (n_i) cb->args[0] += n_i; return n_i; @@ -117,12 +117,15 @@ nla_put_failure: goto done; } -static 
int tcf_del_walker(struct sk_buff *skb, struct tc_action *a, - struct tcf_hashinfo *hinfo) +static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a) { - struct tcf_common *p, *s_p; + struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct hlist_head *head; + struct hlist_node *n; + struct tcf_common *p; struct nlattr *nest; int i = 0, n_i = 0; + int ret = -EINVAL; nest = nla_nest_start(skb, a->order); if (nest == NULL) @@ -130,14 +133,15 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a, if (nla_put_string(skb, TCA_KIND, a->ops->kind)) goto nla_put_failure; for (i = 0; i < (hinfo->hmask + 1); i++) { - p = hinfo->htab[tcf_hash(i, hinfo->hmask)]; - - while (p != NULL) { - s_p = p->tcfc_next; - if (ACT_P_DELETED == tcf_hash_release(p, 0, hinfo)) + head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; + hlist_for_each_entry_safe(p, n, head, tcfc_head) { + a->priv = p; + ret = tcf_hash_release(a, 0); + if (ret == ACT_P_DELETED) { module_put(a->ops->owner); - n_i++; - p = s_p; + n_i++; + } else if (ret < 0) + goto nla_put_failure; } } if (nla_put_u32(skb, TCA_FCNT, n_i)) @@ -147,51 +151,48 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a, return n_i; nla_put_failure: nla_nest_cancel(skb, nest); - return -EINVAL; + return ret; } -int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb, - int type, struct tc_action *a) +static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb, + int type, struct tc_action *a) { - struct tcf_hashinfo *hinfo = a->ops->hinfo; - if (type == RTM_DELACTION) { - return tcf_del_walker(skb, a, hinfo); + return tcf_del_walker(skb, a); } else if (type == RTM_GETACTION) { - return tcf_dump_walker(skb, cb, a, hinfo); + return tcf_dump_walker(skb, cb, a); } else { WARN(1, "tcf_generic_walker: unknown action %d\n", type); return -EINVAL; } } -EXPORT_SYMBOL(tcf_generic_walker); -struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) +static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) { - struct tcf_common *p; + struct tcf_common *p = NULL; + struct hlist_head *head; - read_lock_bh(hinfo->lock); - for (p = hinfo->htab[tcf_hash(index, hinfo->hmask)]; p; - p = p->tcfc_next) { + spin_lock_bh(&hinfo->lock); + head = &hinfo->htab[tcf_hash(index, hinfo->hmask)]; + hlist_for_each_entry_rcu(p, head, tcfc_head) if (p->tcfc_index == index) break; - } - read_unlock_bh(hinfo->lock); + spin_unlock_bh(&hinfo->lock); return p; } -EXPORT_SYMBOL(tcf_hash_lookup); -u32 tcf_hash_new_index(u32 *idx_gen, struct tcf_hashinfo *hinfo) +u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo) { - u32 val = *idx_gen; + u32 val = hinfo->index; do { if (++val == 0) val = 1; } while (tcf_hash_lookup(val, hinfo)); - return (*idx_gen = val); + hinfo->index = val; + return val; } EXPORT_SYMBOL(tcf_hash_new_index); @@ -208,34 +209,46 @@ int tcf_hash_search(struct tc_action *a, u32 index) } EXPORT_SYMBOL(tcf_hash_search); -struct tcf_common *tcf_hash_check(u32 index, struct tc_action *a, int bind, - struct tcf_hashinfo *hinfo) +int tcf_hash_check(u32 index, struct tc_action *a, int bind) { + struct tcf_hashinfo *hinfo = a->ops->hinfo; struct tcf_common *p = NULL; if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) { if (bind) p->tcfc_bindcnt++; p->tcfc_refcnt++; a->priv = p; + return 1; } - return p; + return 0; } EXPORT_SYMBOL(tcf_hash_check); -struct tcf_common *tcf_hash_create(u32 index, struct nlattr *est, - struct tc_action *a, int size, int bind, - u32 *idx_gen, struct 
tcf_hashinfo *hinfo) +void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) { + struct tcf_common *pc = a->priv; + if (est) + gen_kill_estimator(&pc->tcfc_bstats, + &pc->tcfc_rate_est); + kfree_rcu(pc, tcfc_rcu); +} +EXPORT_SYMBOL(tcf_hash_cleanup); + +int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, + int size, int bind) +{ + struct tcf_hashinfo *hinfo = a->ops->hinfo; struct tcf_common *p = kzalloc(size, GFP_KERNEL); if (unlikely(!p)) - return ERR_PTR(-ENOMEM); + return -ENOMEM; p->tcfc_refcnt = 1; if (bind) p->tcfc_bindcnt = 1; spin_lock_init(&p->tcfc_lock); - p->tcfc_index = index ? index : tcf_hash_new_index(idx_gen, hinfo); + INIT_HLIST_NODE(&p->tcfc_head); + p->tcfc_index = index ? index : tcf_hash_new_index(hinfo); p->tcfc_tm.install = jiffies; p->tcfc_tm.lastuse = jiffies; if (est) { @@ -243,35 +256,37 @@ struct tcf_common *tcf_hash_create(u32 index, struct nlattr *est, &p->tcfc_lock, est); if (err) { kfree(p); - return ERR_PTR(err); + return err; } } a->priv = (void *) p; - return p; + return 0; } EXPORT_SYMBOL(tcf_hash_create); -void tcf_hash_insert(struct tcf_common *p, struct tcf_hashinfo *hinfo) +void tcf_hash_insert(struct tc_action *a) { + struct tcf_common *p = a->priv; + struct tcf_hashinfo *hinfo = a->ops->hinfo; unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); - write_lock_bh(hinfo->lock); - p->tcfc_next = hinfo->htab[h]; - hinfo->htab[h] = p; - write_unlock_bh(hinfo->lock); + spin_lock_bh(&hinfo->lock); + hlist_add_head(&p->tcfc_head, &hinfo->htab[h]); + spin_unlock_bh(&hinfo->lock); } EXPORT_SYMBOL(tcf_hash_insert); -static struct tc_action_ops *act_base = NULL; +static LIST_HEAD(act_base); static DEFINE_RWLOCK(act_mod_lock); -int tcf_register_action(struct tc_action_ops *act) +int tcf_register_action(struct tc_action_ops *act, unsigned int mask) { - struct tc_action_ops *a, **ap; + struct tc_action_ops *a; + int err; - /* Must supply act, dump, cleanup and init */ - if (!act->act || !act->dump || !act->cleanup || !act->init) + /* Must supply act, dump and init */ + if (!act->act || !act->dump || !act->init) return -EINVAL; /* Supply defaults */ @@ -280,15 +295,25 @@ int tcf_register_action(struct tc_action_ops *act) if (!act->walk) act->walk = tcf_generic_walker; + act->hinfo = kmalloc(sizeof(struct tcf_hashinfo), GFP_KERNEL); + if (!act->hinfo) + return -ENOMEM; + err = tcf_hashinfo_init(act->hinfo, mask); + if (err) { + kfree(act->hinfo); + return err; + } + write_lock(&act_mod_lock); - for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) { + list_for_each_entry(a, &act_base, head) { if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) { write_unlock(&act_mod_lock); + tcf_hashinfo_destroy(act->hinfo); + kfree(act->hinfo); return -EEXIST; } } - act->next = NULL; - *ap = act; + list_add_tail(&act->head, &act_base); write_unlock(&act_mod_lock); return 0; } @@ -296,17 +321,18 @@ EXPORT_SYMBOL(tcf_register_action); int tcf_unregister_action(struct tc_action_ops *act) { - struct tc_action_ops *a, **ap; + struct tc_action_ops *a; int err = -ENOENT; write_lock(&act_mod_lock); - for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) - if (a == act) + list_for_each_entry(a, &act_base, head) { + if (a == act) { + list_del(&act->head); + tcf_hashinfo_destroy(act->hinfo); + kfree(act->hinfo); + err = 0; break; - if (a) { - *ap = a->next; - a->next = NULL; - err = 0; + } } write_unlock(&act_mod_lock); return err; @@ -316,69 +342,42 @@ EXPORT_SYMBOL(tcf_unregister_action); /* lookup by name */ static struct tc_action_ops 
*tc_lookup_action_n(char *kind) { - struct tc_action_ops *a = NULL; + struct tc_action_ops *a, *res = NULL; if (kind) { read_lock(&act_mod_lock); - for (a = act_base; a; a = a->next) { + list_for_each_entry(a, &act_base, head) { if (strcmp(kind, a->kind) == 0) { - if (!try_module_get(a->owner)) { - read_unlock(&act_mod_lock); - return NULL; - } + if (try_module_get(a->owner)) + res = a; break; } } read_unlock(&act_mod_lock); } - return a; + return res; } /* lookup by nlattr */ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind) { - struct tc_action_ops *a = NULL; + struct tc_action_ops *a, *res = NULL; if (kind) { read_lock(&act_mod_lock); - for (a = act_base; a; a = a->next) { + list_for_each_entry(a, &act_base, head) { if (nla_strcmp(kind, a->kind) == 0) { - if (!try_module_get(a->owner)) { - read_unlock(&act_mod_lock); - return NULL; - } - break; - } - } - read_unlock(&act_mod_lock); - } - return a; -} - -#if 0 -/* lookup by id */ -static struct tc_action_ops *tc_lookup_action_id(u32 type) -{ - struct tc_action_ops *a = NULL; - - if (type) { - read_lock(&act_mod_lock); - for (a = act_base; a; a = a->next) { - if (a->type == type) { - if (!try_module_get(a->owner)) { - read_unlock(&act_mod_lock); - return NULL; - } + if (try_module_get(a->owner)) + res = a; break; } } read_unlock(&act_mod_lock); } - return a; + return res; } -#endif -int tcf_action_exec(struct sk_buff *skb, const struct tc_action *act, +int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions, struct tcf_result *res) { const struct tc_action *a; @@ -389,53 +388,44 @@ int tcf_action_exec(struct sk_buff *skb, const struct tc_action *act, ret = TC_ACT_OK; goto exec_done; } - while ((a = act) != NULL) { + list_for_each_entry(a, actions, list) { repeat: - if (a->ops) { - ret = a->ops->act(skb, a, res); - if (TC_MUNGED & skb->tc_verd) { - /* copied already, allow trampling */ - skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); - skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd); - } - if (ret == TC_ACT_REPEAT) - goto repeat; /* we need a ttl - JHS */ - if (ret != TC_ACT_PIPE) - goto exec_done; + ret = a->ops->act(skb, a, res); + if (TC_MUNGED & skb->tc_verd) { + /* copied already, allow trampling */ + skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); + skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd); } - act = a->next; + if (ret == TC_ACT_REPEAT) + goto repeat; /* we need a ttl - JHS */ + if (ret != TC_ACT_PIPE) + goto exec_done; } exec_done: return ret; } EXPORT_SYMBOL(tcf_action_exec); -void tcf_action_destroy(struct tc_action *act, int bind) +int tcf_action_destroy(struct list_head *actions, int bind) { - struct tc_action *a; + struct tc_action *a, *tmp; + int ret = 0; - for (a = act; a; a = act) { - if (a->ops) { - if (a->ops->cleanup(a, bind) == ACT_P_DELETED) - module_put(a->ops->owner); - act = act->next; - kfree(a); - } else { - /*FIXME: Remove later - catch insertion bugs*/ - WARN(1, "tcf_action_destroy: BUG? 
destroying NULL ops\n"); - act = act->next; - kfree(a); - } + list_for_each_entry_safe(a, tmp, actions, list) { + ret = tcf_hash_release(a, bind); + if (ret == ACT_P_DELETED) + module_put(a->ops->owner); + else if (ret < 0) + return ret; + list_del(&a->list); + kfree(a); } + return ret; } int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - int err = -EINVAL; - - if (a->ops == NULL) - return err; return a->ops->dump(skb, a, bind, ref); } @@ -446,9 +436,6 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; - if (a->ops == NULL) - return err; - if (nla_put_string(skb, TCA_KIND, a->ops->kind)) goto nla_put_failure; if (tcf_action_copy_stats(skb, a, 0)) @@ -469,14 +456,13 @@ nla_put_failure: EXPORT_SYMBOL(tcf_action_dump_1); int -tcf_action_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref) +tcf_action_dump(struct sk_buff *skb, struct list_head *actions, int bind, int ref) { struct tc_action *a; int err = -EINVAL; struct nlattr *nest; - while ((a = act) != NULL) { - act = a->next; + list_for_each_entry(a, actions, list) { nest = nla_nest_start(skb, a->order); if (nest == NULL) goto nla_put_failure; @@ -551,6 +537,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, if (a == NULL) goto err_mod; + a->ops = a_o; + INIT_LIST_HEAD(&a->list); /* backward compatibility for policer */ if (name == NULL) err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, a, ovr, bind); @@ -565,7 +553,6 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, */ if (err != ACT_P_CREATED) module_put(a_o->owner); - a->ops = a_o; return a; @@ -577,37 +564,33 @@ err_out: return ERR_PTR(err); } -struct tc_action *tcf_action_init(struct net *net, struct nlattr *nla, +int tcf_action_init(struct net *net, struct nlattr *nla, struct nlattr *est, char *name, int ovr, - int bind) + int bind, struct list_head *actions) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; - struct tc_action *head = NULL, *act, *act_prev = NULL; + struct tc_action *act; int err; int i; err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL); if (err < 0) - return ERR_PTR(err); + return err; for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { act = tcf_action_init_1(net, tb[i], est, name, ovr, bind); - if (IS_ERR(act)) + if (IS_ERR(act)) { + err = PTR_ERR(act); goto err; + } act->order = i; - - if (head == NULL) - head = act; - else - act_prev->next = act; - act_prev = act; + list_add_tail(&act->list, actions); } - return head; + return 0; err: - if (head != NULL) - tcf_action_destroy(head, bind); - return act; + tcf_action_destroy(actions, bind); + return err; } int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, @@ -615,9 +598,9 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, { int err = 0; struct gnet_dump d; - struct tcf_act_hdr *h = a->priv; + struct tcf_common *p = a->priv; - if (h == NULL) + if (p == NULL) goto errout; /* compat_mode being true specifies a call that is supposed @@ -626,24 +609,20 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, if (compat_mode) { if (a->type == TCA_OLD_COMPAT) err = gnet_stats_start_copy_compat(skb, 0, - TCA_STATS, TCA_XSTATS, &h->tcf_lock, &d); + TCA_STATS, TCA_XSTATS, &p->tcfc_lock, &d); else return 0; } else err = gnet_stats_start_copy(skb, TCA_ACT_STATS, - &h->tcf_lock, &d); + &p->tcfc_lock, &d); if (err < 0) goto errout; - if (a->ops != NULL && a->ops->get_stats != NULL) - if 
(a->ops->get_stats(skb, a) < 0) - goto errout; - - if (gnet_stats_copy_basic(&d, &h->tcf_bstats) < 0 || - gnet_stats_copy_rate_est(&d, &h->tcf_bstats, - &h->tcf_rate_est) < 0 || - gnet_stats_copy_queue(&d, &h->tcf_qstats) < 0) + if (gnet_stats_copy_basic(&d, &p->tcfc_bstats) < 0 || + gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, + &p->tcfc_rate_est) < 0 || + gnet_stats_copy_queue(&d, &p->tcfc_qstats) < 0) goto errout; if (gnet_stats_finish_copy(&d) < 0) @@ -656,7 +635,7 @@ errout: } static int -tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 portid, u32 seq, +tca_get_fill(struct sk_buff *skb, struct list_head *actions, u32 portid, u32 seq, u16 flags, int event, int bind, int ref) { struct tcamsg *t; @@ -676,7 +655,7 @@ tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 portid, u32 seq, if (nest == NULL) goto out_nlmsg_trim; - if (tcf_action_dump(skb, a, bind, ref) < 0) + if (tcf_action_dump(skb, actions, bind, ref) < 0) goto out_nlmsg_trim; nla_nest_end(skb, nest); @@ -691,14 +670,14 @@ out_nlmsg_trim: static int act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, - struct tc_action *a, int event) + struct list_head *actions, int event) { struct sk_buff *skb; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - if (tca_get_fill(skb, a, portid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { + if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -706,6 +685,20 @@ act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, return rtnl_unicast(skb, net, portid); } +static struct tc_action *create_a(int i) +{ + struct tc_action *act; + + act = kzalloc(sizeof(*act), GFP_KERNEL); + if (act == NULL) { + pr_debug("create_a: failed to alloc!\n"); + return NULL; + } + act->order = i; + INIT_LIST_HEAD(&act->list); + return act; +} + static struct tc_action * tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid) { @@ -725,13 +718,13 @@ tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid) index = nla_get_u32(tb[TCA_ACT_INDEX]); err = -ENOMEM; - a = kzalloc(sizeof(struct tc_action), GFP_KERNEL); + a = create_a(0); if (a == NULL) goto err_out; err = -EINVAL; a->ops = tc_lookup_action(tb[TCA_ACT_KIND]); - if (a->ops == NULL) + if (a->ops == NULL) /* could happen in batch of actions */ goto err_free; err = -ENOENT; if (a->ops->lookup(a, index) == 0) @@ -748,29 +741,16 @@ err_out: return ERR_PTR(err); } -static void cleanup_a(struct tc_action *act) +static void cleanup_a(struct list_head *actions) { - struct tc_action *a; + struct tc_action *a, *tmp; - for (a = act; a; a = act) { - act = a->next; + list_for_each_entry_safe(a, tmp, actions, list) { + list_del(&a->list); kfree(a); } } -static struct tc_action *create_a(int i) -{ - struct tc_action *act; - - act = kzalloc(sizeof(*act), GFP_KERNEL); - if (act == NULL) { - pr_debug("create_a: failed to alloc!\n"); - return NULL; - } - act->order = i; - return act; -} - static int tca_action_flush(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid) { @@ -782,18 +762,12 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, struct nlattr *nest; struct nlattr *tb[TCA_ACT_MAX + 1]; struct nlattr *kind; - struct tc_action *a = create_a(0); + struct tc_action a; int err = -ENOMEM; - if (a == NULL) { - pr_debug("tca_action_flush: couldnt create tc_action\n"); - return err; - } - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { pr_debug("tca_action_flush: failed skb alloc\n"); - kfree(a); return 
err; } @@ -805,8 +779,10 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, err = -EINVAL; kind = tb[TCA_ACT_KIND]; - a->ops = tc_lookup_action(kind); - if (a->ops == NULL) + memset(&a, 0, sizeof(struct tc_action)); + INIT_LIST_HEAD(&a.list); + a.ops = tc_lookup_action(kind); + if (a.ops == NULL) /*some idjot trying to flush unknown action */ goto err_out; nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0); @@ -821,7 +797,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, if (nest == NULL) goto out_module_put; - err = a->ops->walk(skb, &dcb, RTM_DELACTION, a); + err = a.ops->walk(skb, &dcb, RTM_DELACTION, &a); if (err < 0) goto out_module_put; if (err == 0) @@ -831,8 +807,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, nlh->nlmsg_len = skb_tail_pointer(skb) - b; nlh->nlmsg_flags |= NLM_F_ROOT; - module_put(a->ops->owner); - kfree(a); + module_put(a.ops->owner); err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); if (err > 0) @@ -841,21 +816,52 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, return err; out_module_put: - module_put(a->ops->owner); + module_put(a.ops->owner); err_out: noflush_out: kfree_skb(skb); - kfree(a); return err; } static int +tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, + u32 portid) +{ + int ret; + struct sk_buff *skb; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, + 0, 1) <= 0) { + kfree_skb(skb); + return -EINVAL; + } + + /* now do the delete */ + ret = tcf_action_destroy(actions, 0); + if (ret < 0) { + kfree_skb(skb); + return ret; + } + + ret = rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); + if (ret > 0) + return 0; + return ret; +} + +static int tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid, int event) { int i, ret; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; - struct tc_action *head = NULL, *act, *act_prev = NULL; + struct tc_action *act; + LIST_HEAD(actions); ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL); if (ret < 0) @@ -875,117 +881,62 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, goto err; } act->order = i; - - if (head == NULL) - head = act; - else - act_prev->next = act; - act_prev = act; + list_add_tail(&act->list, &actions); } if (event == RTM_GETACTION) - ret = act_get_notify(net, portid, n, head, event); + ret = act_get_notify(net, portid, n, &actions, event); else { /* delete */ - struct sk_buff *skb; - - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb) { - ret = -ENOBUFS; - goto err; - } - - if (tca_get_fill(skb, head, portid, n->nlmsg_seq, 0, event, - 0, 1) <= 0) { - kfree_skb(skb); - ret = -EINVAL; + ret = tcf_del_notify(net, n, &actions, portid); + if (ret) goto err; - } - - /* now do the delete */ - tcf_action_destroy(head, 0); - ret = rtnetlink_send(skb, net, portid, RTNLGRP_TC, - n->nlmsg_flags & NLM_F_ECHO); - if (ret > 0) - return 0; return ret; } err: - cleanup_a(head); + cleanup_a(&actions); return ret; } -static int tcf_add_notify(struct net *net, struct tc_action *a, - u32 portid, u32 seq, int event, u16 flags) +static int +tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, + u32 portid) { - struct tcamsg *t; - struct nlmsghdr *nlh; struct sk_buff *skb; - struct nlattr *nest; - unsigned char *b; int err = 0; skb = 
alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - b = skb_tail_pointer(skb); - - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*t), flags); - if (!nlh) - goto out_kfree_skb; - t = nlmsg_data(nlh); - t->tca_family = AF_UNSPEC; - t->tca__pad1 = 0; - t->tca__pad2 = 0; - - nest = nla_nest_start(skb, TCA_ACT_TAB); - if (nest == NULL) - goto out_kfree_skb; - - if (tcf_action_dump(skb, a, 0, 0) < 0) - goto out_kfree_skb; - - nla_nest_end(skb, nest); - - nlh->nlmsg_len = skb_tail_pointer(skb) - b; - NETLINK_CB(skb).dst_group = RTNLGRP_TC; + if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags, + RTM_NEWACTION, 0, 0) <= 0) { + kfree_skb(skb); + return -EINVAL; + } - err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); + err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); if (err > 0) err = 0; return err; - -out_kfree_skb: - kfree_skb(skb); - return -1; } - static int tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid, int ovr) { int ret = 0; - struct tc_action *act; - struct tc_action *a; - u32 seq = n->nlmsg_seq; + LIST_HEAD(actions); - act = tcf_action_init(net, nla, NULL, NULL, ovr, 0); - if (act == NULL) - goto done; - if (IS_ERR(act)) { - ret = PTR_ERR(act); + ret = tcf_action_init(net, nla, NULL, NULL, ovr, 0, &actions); + if (ret) goto done; - } /* dump then free all the actions after update; inserted policy * stays intact */ - ret = tcf_add_notify(net, act, portid, seq, RTM_NEWACTION, n->nlmsg_flags); - for (a = act; a; a = act) { - act = a->next; - kfree(a); - } + ret = tcf_add_notify(net, n, &actions, portid); + cleanup_a(&actions); done: return ret; } @@ -997,7 +948,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n) u32 portid = skb ? 
NETLINK_CB(skb).portid : 0; int ret = 0, ovr = 0; - if ((n->nlmsg_type != RTM_GETACTION) && !capable(CAP_NET_ADMIN)) + if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 5c5edf56adb..edbf40dac70 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -37,15 +37,6 @@ #include <net/tc_act/tc_csum.h> #define CSUM_TAB_MASK 15 -static struct tcf_common *tcf_csum_ht[CSUM_TAB_MASK + 1]; -static u32 csum_idx_gen; -static DEFINE_RWLOCK(csum_lock); - -static struct tcf_hashinfo csum_hash_info = { - .htab = tcf_csum_ht, - .hmask = CSUM_TAB_MASK, - .lock = &csum_lock, -}; static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, @@ -56,7 +47,6 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est, { struct nlattr *tb[TCA_CSUM_MAX + 1]; struct tc_csum *parm; - struct tcf_common *pc; struct tcf_csum *p; int ret = 0, err; @@ -71,39 +61,31 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est, return -EINVAL; parm = nla_data(tb[TCA_CSUM_PARMS]); - pc = tcf_hash_check(parm->index, a, bind, &csum_hash_info); - if (!pc) { - pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, - &csum_idx_gen, &csum_hash_info); - if (IS_ERR(pc)) - return PTR_ERR(pc); - p = to_tcf_csum(pc); + if (!tcf_hash_check(parm->index, a, bind)) { + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + if (ret) + return ret; ret = ACT_P_CREATED; } else { - p = to_tcf_csum(pc); - if (!ovr) { - tcf_hash_release(pc, bind, &csum_hash_info); + if (bind)/* dont override defaults */ + return 0; + tcf_hash_release(a, bind); + if (!ovr) return -EEXIST; - } } + p = to_tcf_csum(a); spin_lock_bh(&p->tcf_lock); p->tcf_action = parm->action; p->update_flags = parm->update_flags; spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(pc, &csum_hash_info); + tcf_hash_insert(a); return ret; } -static int tcf_csum_cleanup(struct tc_action *a, int bind) -{ - struct tcf_csum *p = a->priv; - return tcf_hash_release(&p->common, bind, &csum_hash_info); -} - /** * tcf_csum_skb_nextlayer - Get next layer pointer * @skb: sk_buff to use @@ -578,13 +560,10 @@ nla_put_failure: static struct tc_action_ops act_csum_ops = { .kind = "csum", - .hinfo = &csum_hash_info, .type = TCA_ACT_CSUM, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_csum, .dump = tcf_csum_dump, - .cleanup = tcf_csum_cleanup, .init = tcf_csum_init, }; @@ -593,7 +572,7 @@ MODULE_LICENSE("GPL"); static int __init csum_init_module(void) { - return tcf_register_action(&act_csum_ops); + return tcf_register_action(&act_csum_ops, CSUM_TAB_MASK); } static void __exit csum_cleanup_module(void) diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 5645a4d32ab..d6bcbd9f779 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -24,20 +24,11 @@ #include <net/tc_act/tc_gact.h> #define GACT_TAB_MASK 15 -static struct tcf_common *tcf_gact_ht[GACT_TAB_MASK + 1]; -static u32 gact_idx_gen; -static DEFINE_RWLOCK(gact_lock); - -static struct tcf_hashinfo gact_hash_info = { - .htab = tcf_gact_ht, - .hmask = GACT_TAB_MASK, - .lock = &gact_lock, -}; #ifdef CONFIG_GACT_PROB static int gact_net_rand(struct tcf_gact *gact) { - if (!gact->tcfg_pval || net_random() % gact->tcfg_pval) + if (!gact->tcfg_pval || prandom_u32() % gact->tcfg_pval) return gact->tcf_action; 
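/* With prandom_u32(), the non-default tcfg_paction below is still selected with probability 1/tcfg_pval; a tcfg_pval of 0 short-circuits to the default tcf_action every time. */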
return gact->tcfg_paction; } @@ -65,7 +56,6 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, struct nlattr *tb[TCA_GACT_MAX + 1]; struct tc_gact *parm; struct tcf_gact *gact; - struct tcf_common *pc; int ret = 0; int err; #ifdef CONFIG_GACT_PROB @@ -94,21 +84,20 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } #endif - pc = tcf_hash_check(parm->index, a, bind, &gact_hash_info); - if (!pc) { - pc = tcf_hash_create(parm->index, est, a, sizeof(*gact), - bind, &gact_idx_gen, &gact_hash_info); - if (IS_ERR(pc)) - return PTR_ERR(pc); + if (!tcf_hash_check(parm->index, a, bind)) { + ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), bind); + if (ret) + return ret; ret = ACT_P_CREATED; } else { - if (!ovr) { - tcf_hash_release(pc, bind, &gact_hash_info); + if (bind)/* dont override defaults */ + return 0; + tcf_hash_release(a, bind); + if (!ovr) return -EEXIST; - } } - gact = to_gact(pc); + gact = to_gact(a); spin_lock_bh(&gact->tcf_lock); gact->tcf_action = parm->action; @@ -121,19 +110,10 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, #endif spin_unlock_bh(&gact->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(pc, &gact_hash_info); + tcf_hash_insert(a); return ret; } -static int tcf_gact_cleanup(struct tc_action *a, int bind) -{ - struct tcf_gact *gact = a->priv; - - if (gact) - return tcf_hash_release(&gact->common, bind, &gact_hash_info); - return 0; -} - static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -199,13 +179,10 @@ nla_put_failure: static struct tc_action_ops act_gact_ops = { .kind = "gact", - .hinfo = &gact_hash_info, .type = TCA_ACT_GACT, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_gact, .dump = tcf_gact_dump, - .cleanup = tcf_gact_cleanup, .init = tcf_gact_init, }; @@ -220,7 +197,7 @@ static int __init gact_init_module(void) #else pr_info("GACT probability NOT on\n"); #endif - return tcf_register_action(&act_gact_ops); + return tcf_register_action(&act_gact_ops, GACT_TAB_MASK); } static void __exit gact_cleanup_module(void) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 882a89762f7..8a64a0734ae 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -29,15 +29,6 @@ #define IPT_TAB_MASK 15 -static struct tcf_common *tcf_ipt_ht[IPT_TAB_MASK + 1]; -static u32 ipt_idx_gen; -static DEFINE_RWLOCK(ipt_lock); - -static struct tcf_hashinfo ipt_hash_info = { - .htab = tcf_ipt_ht, - .hmask = IPT_TAB_MASK, - .lock = &ipt_lock, -}; static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook) { @@ -77,22 +68,12 @@ static void ipt_destroy_target(struct xt_entry_target *t) module_put(par.target->me); } -static int tcf_ipt_release(struct tcf_ipt *ipt, int bind) +static void tcf_ipt_release(struct tc_action *a, int bind) { - int ret = 0; - if (ipt) { - if (bind) - ipt->tcf_bindcnt--; - ipt->tcf_refcnt--; - if (ipt->tcf_bindcnt <= 0 && ipt->tcf_refcnt <= 0) { - ipt_destroy_target(ipt->tcfi_t); - kfree(ipt->tcfi_tname); - kfree(ipt->tcfi_t); - tcf_hash_destroy(&ipt->common, &ipt_hash_info); - ret = ACT_P_DELETED; - } - } - return ret; + struct tcf_ipt *ipt = to_ipt(a); + ipt_destroy_target(ipt->tcfi_t); + kfree(ipt->tcfi_tname); + kfree(ipt->tcfi_t); } static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { @@ -107,7 +88,6 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, { struct nlattr *tb[TCA_IPT_MAX + 1]; struct tcf_ipt *ipt; - struct tcf_common *pc; struct xt_entry_target *td, *t; char 
*tname; int ret = 0, err; @@ -133,20 +113,20 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, if (tb[TCA_IPT_INDEX] != NULL) index = nla_get_u32(tb[TCA_IPT_INDEX]); - pc = tcf_hash_check(index, a, bind, &ipt_hash_info); - if (!pc) { - pc = tcf_hash_create(index, est, a, sizeof(*ipt), bind, - &ipt_idx_gen, &ipt_hash_info); - if (IS_ERR(pc)) - return PTR_ERR(pc); + if (!tcf_hash_check(index, a, bind) ) { + ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind); + if (ret) + return ret; ret = ACT_P_CREATED; } else { - if (!ovr) { - tcf_ipt_release(to_ipt(pc), bind); + if (bind)/* dont override defaults */ + return 0; + tcf_hash_release(a, bind); + + if (!ovr) return -EEXIST; - } } - ipt = to_ipt(pc); + ipt = to_ipt(a); hook = nla_get_u32(tb[TCA_IPT_HOOK]); @@ -177,7 +157,7 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, ipt->tcfi_hook = hook; spin_unlock_bh(&ipt->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(pc, &ipt_hash_info); + tcf_hash_insert(a); return ret; err3: @@ -185,21 +165,11 @@ err3: err2: kfree(tname); err1: - if (ret == ACT_P_CREATED) { - if (est) - gen_kill_estimator(&pc->tcfc_bstats, - &pc->tcfc_rate_est); - kfree_rcu(pc, tcfc_rcu); - } + if (ret == ACT_P_CREATED) + tcf_hash_cleanup(a, est); return err; } -static int tcf_ipt_cleanup(struct tc_action *a, int bind) -{ - struct tcf_ipt *ipt = a->priv; - return tcf_ipt_release(ipt, bind); -} - static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -291,25 +261,21 @@ nla_put_failure: static struct tc_action_ops act_ipt_ops = { .kind = "ipt", - .hinfo = &ipt_hash_info, .type = TCA_ACT_IPT, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_ipt, .dump = tcf_ipt_dump, - .cleanup = tcf_ipt_cleanup, + .cleanup = tcf_ipt_release, .init = tcf_ipt_init, }; static struct tc_action_ops act_xt_ops = { .kind = "xt", - .hinfo = &ipt_hash_info, - .type = TCA_ACT_IPT, - .capab = TCA_CAP_NONE, + .type = TCA_ACT_XT, .owner = THIS_MODULE, .act = tcf_ipt, .dump = tcf_ipt_dump, - .cleanup = tcf_ipt_cleanup, + .cleanup = tcf_ipt_release, .init = tcf_ipt_init, }; @@ -321,16 +287,17 @@ MODULE_ALIAS("act_xt"); static int __init ipt_init_module(void) { int ret1, ret2; - ret1 = tcf_register_action(&act_xt_ops); + + ret1 = tcf_register_action(&act_xt_ops, IPT_TAB_MASK); if (ret1 < 0) printk("Failed to load xt action\n"); - ret2 = tcf_register_action(&act_ipt_ops); + ret2 = tcf_register_action(&act_ipt_ops, IPT_TAB_MASK); if (ret2 < 0) printk("Failed to load ipt action\n"); - if (ret1 < 0 && ret2 < 0) + if (ret1 < 0 && ret2 < 0) { return ret1; - else + } else return 0; } diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 252378121ce..4f912c0e225 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -30,32 +30,14 @@ #include <linux/if_arp.h> #define MIRRED_TAB_MASK 7 -static struct tcf_common *tcf_mirred_ht[MIRRED_TAB_MASK + 1]; -static u32 mirred_idx_gen; -static DEFINE_RWLOCK(mirred_lock); static LIST_HEAD(mirred_list); -static struct tcf_hashinfo mirred_hash_info = { - .htab = tcf_mirred_ht, - .hmask = MIRRED_TAB_MASK, - .lock = &mirred_lock, -}; - -static int tcf_mirred_release(struct tcf_mirred *m, int bind) +static void tcf_mirred_release(struct tc_action *a, int bind) { - if (m) { - if (bind) - m->tcf_bindcnt--; - m->tcf_refcnt--; - if (!m->tcf_bindcnt && m->tcf_refcnt <= 0) { - list_del(&m->tcfm_list); - if (m->tcfm_dev) - dev_put(m->tcfm_dev); - tcf_hash_destroy(&m->common, &mirred_hash_info); 
- return 1; - } - } - return 0; + struct tcf_mirred *m = to_mirred(a); + list_del(&m->tcfm_list); + if (m->tcfm_dev) + dev_put(m->tcfm_dev); } static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { @@ -69,7 +51,6 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct nlattr *tb[TCA_MIRRED_MAX + 1]; struct tc_mirred *parm; struct tcf_mirred *m; - struct tcf_common *pc; struct net_device *dev; int ret, ok_push = 0; @@ -109,22 +90,20 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, dev = NULL; } - pc = tcf_hash_check(parm->index, a, bind, &mirred_hash_info); - if (!pc) { + if (!tcf_hash_check(parm->index, a, bind)) { if (dev == NULL) return -EINVAL; - pc = tcf_hash_create(parm->index, est, a, sizeof(*m), bind, - &mirred_idx_gen, &mirred_hash_info); - if (IS_ERR(pc)) - return PTR_ERR(pc); + ret = tcf_hash_create(parm->index, est, a, sizeof(*m), bind); + if (ret) + return ret; ret = ACT_P_CREATED; } else { if (!ovr) { - tcf_mirred_release(to_mirred(pc), bind); + tcf_hash_release(a, bind); return -EEXIST; } } - m = to_mirred(pc); + m = to_mirred(a); spin_lock_bh(&m->tcf_lock); m->tcf_action = parm->action; @@ -140,21 +119,12 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, spin_unlock_bh(&m->tcf_lock); if (ret == ACT_P_CREATED) { list_add(&m->tcfm_list, &mirred_list); - tcf_hash_insert(pc, &mirred_hash_info); + tcf_hash_insert(a); } return ret; } -static int tcf_mirred_cleanup(struct tc_action *a, int bind) -{ - struct tcf_mirred *m = a->priv; - - if (m) - return tcf_mirred_release(m, bind); - return 0; -} - static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -261,16 +231,13 @@ static struct notifier_block mirred_device_notifier = { .notifier_call = mirred_device_event, }; - static struct tc_action_ops act_mirred_ops = { .kind = "mirred", - .hinfo = &mirred_hash_info, .type = TCA_ACT_MIRRED, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_mirred, .dump = tcf_mirred_dump, - .cleanup = tcf_mirred_cleanup, + .cleanup = tcf_mirred_release, .init = tcf_mirred_init, }; @@ -285,13 +252,13 @@ static int __init mirred_init_module(void) return err; pr_info("Mirror/redirect action on\n"); - return tcf_register_action(&act_mirred_ops); + return tcf_register_action(&act_mirred_ops, MIRRED_TAB_MASK); } static void __exit mirred_cleanup_module(void) { - unregister_netdevice_notifier(&mirred_device_notifier); tcf_unregister_action(&act_mirred_ops); + unregister_netdevice_notifier(&mirred_device_notifier); } module_init(mirred_init_module); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 6a15ace0024..270a030d5fd 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -30,15 +30,6 @@ #define NAT_TAB_MASK 15 -static struct tcf_common *tcf_nat_ht[NAT_TAB_MASK + 1]; -static u32 nat_idx_gen; -static DEFINE_RWLOCK(nat_lock); - -static struct tcf_hashinfo nat_hash_info = { - .htab = tcf_nat_ht, - .hmask = NAT_TAB_MASK, - .lock = &nat_lock, -}; static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { [TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) }, @@ -51,7 +42,6 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_nat *parm; int ret = 0, err; struct tcf_nat *p; - struct tcf_common *pc; if (nla == NULL) return -EINVAL; @@ -64,21 +54,19 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, return -EINVAL; parm = nla_data(tb[TCA_NAT_PARMS]); - pc = tcf_hash_check(parm->index, a, bind, &nat_hash_info); 
- if (!pc) { - pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, - &nat_idx_gen, &nat_hash_info); - if (IS_ERR(pc)) - return PTR_ERR(pc); - p = to_tcf_nat(pc); + if (!tcf_hash_check(parm->index, a, bind)) { + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + if (ret) + return ret; ret = ACT_P_CREATED; } else { - p = to_tcf_nat(pc); - if (!ovr) { - tcf_hash_release(pc, bind, &nat_hash_info); + if (bind) + return 0; + tcf_hash_release(a, bind); + if (!ovr) return -EEXIST; - } } + p = to_tcf_nat(a); spin_lock_bh(&p->tcf_lock); p->old_addr = parm->old_addr; @@ -90,18 +78,11 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(pc, &nat_hash_info); + tcf_hash_insert(a); return ret; } -static int tcf_nat_cleanup(struct tc_action *a, int bind) -{ - struct tcf_nat *p = a->priv; - - return tcf_hash_release(&p->common, bind, &nat_hash_info); -} - static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -301,13 +282,10 @@ nla_put_failure: static struct tc_action_ops act_nat_ops = { .kind = "nat", - .hinfo = &nat_hash_info, .type = TCA_ACT_NAT, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_nat, .dump = tcf_nat_dump, - .cleanup = tcf_nat_cleanup, .init = tcf_nat_init, }; @@ -316,7 +294,7 @@ MODULE_LICENSE("GPL"); static int __init nat_init_module(void) { - return tcf_register_action(&act_nat_ops); + return tcf_register_action(&act_nat_ops, NAT_TAB_MASK); } static void __exit nat_cleanup_module(void) diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 03b67674169..5f9bcb2e080 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -24,15 +24,6 @@ #include <net/tc_act/tc_pedit.h> #define PEDIT_TAB_MASK 15 -static struct tcf_common *tcf_pedit_ht[PEDIT_TAB_MASK + 1]; -static u32 pedit_idx_gen; -static DEFINE_RWLOCK(pedit_lock); - -static struct tcf_hashinfo pedit_hash_info = { - .htab = tcf_pedit_ht, - .hmask = PEDIT_TAB_MASK, - .lock = &pedit_lock, -}; static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, @@ -46,7 +37,6 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct tc_pedit *parm; int ret = 0, err; struct tcf_pedit *p; - struct tcf_common *pc; struct tc_pedit_key *keys = NULL; int ksize; @@ -64,30 +54,27 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize) return -EINVAL; - pc = tcf_hash_check(parm->index, a, bind, &pedit_hash_info); - if (!pc) { + if (!tcf_hash_check(parm->index, a, bind)) { if (!parm->nkeys) return -EINVAL; - pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, - &pedit_idx_gen, &pedit_hash_info); - if (IS_ERR(pc)) - return PTR_ERR(pc); - p = to_pedit(pc); + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + if (ret) + return ret; + p = to_pedit(a); keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) { - if (est) - gen_kill_estimator(&pc->tcfc_bstats, - &pc->tcfc_rate_est); - kfree_rcu(pc, tcfc_rcu); + tcf_hash_cleanup(a, est); return -ENOMEM; } ret = ACT_P_CREATED; } else { - p = to_pedit(pc); - if (!ovr) { - tcf_hash_release(pc, bind, &pedit_hash_info); + p = to_pedit(a); + tcf_hash_release(a, bind); + if (bind) + return 0; + if (!ovr) return -EEXIST; - } + if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) @@ -106,22 +93,15 @@ static int 
tcf_pedit_init(struct net *net, struct nlattr *nla, memcpy(p->tcfp_keys, parm->keys, ksize); spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(pc, &pedit_hash_info); + tcf_hash_insert(a); return ret; } -static int tcf_pedit_cleanup(struct tc_action *a, int bind) +static void tcf_pedit_cleanup(struct tc_action *a, int bind) { struct tcf_pedit *p = a->priv; - - if (p) { - struct tc_pedit_key *keys = p->tcfp_keys; - if (tcf_hash_release(&p->common, bind, &pedit_hash_info)) { - kfree(keys); - return 1; - } - } - return 0; + struct tc_pedit_key *keys = p->tcfp_keys; + kfree(keys); } static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, @@ -236,9 +216,7 @@ nla_put_failure: static struct tc_action_ops act_pedit_ops = { .kind = "pedit", - .hinfo = &pedit_hash_info, .type = TCA_ACT_PEDIT, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_pedit, .dump = tcf_pedit_dump, @@ -252,7 +230,7 @@ MODULE_LICENSE("GPL"); static int __init pedit_init_module(void) { - return tcf_register_action(&act_pedit_ops); + return tcf_register_action(&act_pedit_ops, PEDIT_TAB_MASK); } static void __exit pedit_cleanup_module(void) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 16a62c36928..0566e4606a4 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -41,15 +41,6 @@ struct tcf_police { container_of(pc, struct tcf_police, common) #define POL_TAB_MASK 15 -static struct tcf_common *tcf_police_ht[POL_TAB_MASK + 1]; -static u32 police_idx_gen; -static DEFINE_RWLOCK(police_lock); - -static struct tcf_hashinfo police_hash_info = { - .htab = tcf_police_ht, - .hmask = POL_TAB_MASK, - .lock = &police_lock, -}; /* old policer structure from before tc actions */ struct tc_police_compat { @@ -67,18 +58,20 @@ struct tc_police_compat { static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb, int type, struct tc_action *a) { + struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct hlist_head *head; struct tcf_common *p; int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; struct nlattr *nest; - read_lock_bh(&police_lock); + spin_lock_bh(&hinfo->lock); s_i = cb->args[0]; for (i = 0; i < (POL_TAB_MASK + 1); i++) { - p = tcf_police_ht[tcf_hash(i, POL_TAB_MASK)]; + head = &hinfo->htab[tcf_hash(i, POL_TAB_MASK)]; - for (; p; p = p->tcfc_next) { + hlist_for_each_entry_rcu(p, head, tcfc_head) { index++; if (index < s_i) continue; @@ -101,7 +94,7 @@ static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *c } } done: - read_unlock_bh(&police_lock); + spin_unlock_bh(&hinfo->lock); if (n_i) cb->args[0] += n_i; return n_i; @@ -111,29 +104,6 @@ nla_put_failure: goto done; } -static void tcf_police_destroy(struct tcf_police *p) -{ - unsigned int h = tcf_hash(p->tcf_index, POL_TAB_MASK); - struct tcf_common **p1p; - - for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->tcfc_next) { - if (*p1p == &p->common) { - write_lock_bh(&police_lock); - *p1p = p->tcf_next; - write_unlock_bh(&police_lock); - gen_kill_estimator(&p->tcf_bstats, - &p->tcf_rate_est); - /* - * gen_estimator est_timer() might access p->tcf_lock - * or bstats, wait a RCU grace period before freeing p - */ - kfree_rcu(p, tcf_rcu); - return; - } - } - WARN_ON(1); -} - static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { [TCA_POLICE_RATE] = { .len = TC_RTAB_SIZE }, [TCA_POLICE_PEAKRATE] = { .len = TC_RTAB_SIZE }, @@ -151,6 +121,7 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla, struct tc_police *parm; struct tcf_police *police; 
struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; + struct tcf_hashinfo *hinfo = a->ops->hinfo; int size; if (nla == NULL) @@ -168,19 +139,17 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_POLICE_TBF]); if (parm->index) { - struct tcf_common *pc; - - pc = tcf_hash_lookup(parm->index, &police_hash_info); - if (pc != NULL) { - a->priv = pc; - police = to_police(pc); + if (tcf_hash_search(a, parm->index)) { + police = to_police(a->priv); if (bind) { police->tcf_bindcnt += 1; police->tcf_refcnt += 1; + return 0; } if (ovr) goto override; - return ret; + /* not replacing */ + return -EEXIST; } } @@ -264,12 +233,11 @@ override: police->tcfp_t_c = ktime_to_ns(ktime_get()); police->tcf_index = parm->index ? parm->index : - tcf_hash_new_index(&police_idx_gen, &police_hash_info); + tcf_hash_new_index(hinfo); h = tcf_hash(police->tcf_index, POL_TAB_MASK); - write_lock_bh(&police_lock); - police->tcf_next = tcf_police_ht[h]; - tcf_police_ht[h] = &police->common; - write_unlock_bh(&police_lock); + spin_lock_bh(&hinfo->lock); + hlist_add_head(&police->tcf_head, &hinfo->htab[h]); + spin_unlock_bh(&hinfo->lock); a->priv = police; return ret; @@ -277,33 +245,13 @@ override: failure_unlock: spin_unlock_bh(&police->tcf_lock); failure: - if (P_tab) - qdisc_put_rtab(P_tab); - if (R_tab) - qdisc_put_rtab(R_tab); + qdisc_put_rtab(P_tab); + qdisc_put_rtab(R_tab); if (ret == ACT_P_CREATED) kfree(police); return err; } -static int tcf_act_police_cleanup(struct tc_action *a, int bind) -{ - struct tcf_police *p = a->priv; - int ret = 0; - - if (p != NULL) { - if (bind) - p->tcf_bindcnt--; - - p->tcf_refcnt--; - if (p->tcf_refcnt <= 0 && !p->tcf_bindcnt) { - tcf_police_destroy(p); - ret = 1; - } - } - return ret; -} - static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -400,13 +348,10 @@ MODULE_LICENSE("GPL"); static struct tc_action_ops act_police_ops = { .kind = "police", - .hinfo = &police_hash_info, .type = TCA_ID_POLICE, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_act_police, .dump = tcf_act_police_dump, - .cleanup = tcf_act_police_cleanup, .init = tcf_act_police_locate, .walk = tcf_act_police_walker }; @@ -414,7 +359,7 @@ static struct tc_action_ops act_police_ops = { static int __init police_init_module(void) { - return tcf_register_action(&act_police_ops); + return tcf_register_action(&act_police_ops, POL_TAB_MASK); } static void __exit diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 31157d3e729..992c2317ce8 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -25,15 +25,6 @@ #include <net/tc_act/tc_defact.h> #define SIMP_TAB_MASK 7 -static struct tcf_common *tcf_simp_ht[SIMP_TAB_MASK + 1]; -static u32 simp_idx_gen; -static DEFINE_RWLOCK(simp_lock); - -static struct tcf_hashinfo simp_hash_info = { - .htab = tcf_simp_ht, - .hmask = SIMP_TAB_MASK, - .lock = &simp_lock, -}; #define SIMP_MAX_DATA 32 static int tcf_simp(struct sk_buff *skb, const struct tc_action *a, @@ -55,20 +46,10 @@ static int tcf_simp(struct sk_buff *skb, const struct tc_action *a, return d->tcf_action; } -static int tcf_simp_release(struct tcf_defact *d, int bind) +static void tcf_simp_release(struct tc_action *a, int bind) { - int ret = 0; - if (d) { - if (bind) - d->tcf_bindcnt--; - d->tcf_refcnt--; - if (d->tcf_bindcnt <= 0 && d->tcf_refcnt <= 0) { - kfree(d->tcfd_defdata); - tcf_hash_destroy(&d->common, &simp_hash_info); - ret = 1; - } - } - return ret; + struct tcf_defact *d = 
to_defact(a); + kfree(d->tcfd_defdata); } static int alloc_defdata(struct tcf_defact *d, char *defdata) @@ -102,7 +83,6 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *tb[TCA_DEF_MAX + 1]; struct tc_defact *parm; struct tcf_defact *d; - struct tcf_common *pc; char *defdata; int ret = 0, err; @@ -122,47 +102,36 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_DEF_PARMS]); defdata = nla_data(tb[TCA_DEF_DATA]); - pc = tcf_hash_check(parm->index, a, bind, &simp_hash_info); - if (!pc) { - pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind, - &simp_idx_gen, &simp_hash_info); - if (IS_ERR(pc)) - return PTR_ERR(pc); + if (!tcf_hash_check(parm->index, a, bind)) { + ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind); + if (ret) + return ret; - d = to_defact(pc); + d = to_defact(a); ret = alloc_defdata(d, defdata); if (ret < 0) { - if (est) - gen_kill_estimator(&pc->tcfc_bstats, - &pc->tcfc_rate_est); - kfree_rcu(pc, tcfc_rcu); + tcf_hash_cleanup(a, est); return ret; } d->tcf_action = parm->action; ret = ACT_P_CREATED; } else { - d = to_defact(pc); - if (!ovr) { - tcf_simp_release(d, bind); + d = to_defact(a); + + if (bind) + return 0; + tcf_hash_release(a, bind); + if (!ovr) return -EEXIST; - } + reset_policy(d, defdata, parm); } if (ret == ACT_P_CREATED) - tcf_hash_insert(pc, &simp_hash_info); + tcf_hash_insert(a); return ret; } -static int tcf_simp_cleanup(struct tc_action *a, int bind) -{ - struct tcf_defact *d = a->priv; - - if (d) - return tcf_simp_release(d, bind); - return 0; -} - static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { @@ -193,13 +162,11 @@ nla_put_failure: static struct tc_action_ops act_simp_ops = { .kind = "simple", - .hinfo = &simp_hash_info, .type = TCA_ACT_SIMP, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_simp, .dump = tcf_simp_dump, - .cleanup = tcf_simp_cleanup, + .cleanup = tcf_simp_release, .init = tcf_simp_init, }; @@ -209,7 +176,8 @@ MODULE_LICENSE("GPL"); static int __init simp_init_module(void) { - int ret = tcf_register_action(&act_simp_ops); + int ret; + ret = tcf_register_action(&act_simp_ops, SIMP_TAB_MASK); if (!ret) pr_info("Simple TC action Loaded\n"); return ret; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index cf20add1c3f..fcfeeaf838b 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -28,15 +28,6 @@ #include <net/tc_act/tc_skbedit.h> #define SKBEDIT_TAB_MASK 15 -static struct tcf_common *tcf_skbedit_ht[SKBEDIT_TAB_MASK + 1]; -static u32 skbedit_idx_gen; -static DEFINE_RWLOCK(skbedit_lock); - -static struct tcf_hashinfo skbedit_hash_info = { - .htab = tcf_skbedit_ht, - .hmask = SKBEDIT_TAB_MASK, - .lock = &skbedit_lock, -}; static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) @@ -73,7 +64,6 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; struct tc_skbedit *parm; struct tcf_skbedit *d; - struct tcf_common *pc; u32 flags = 0, *priority = NULL, *mark = NULL; u16 *queue_mapping = NULL; int ret = 0, err; @@ -108,21 +98,20 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_SKBEDIT_PARMS]); - pc = tcf_hash_check(parm->index, a, bind, &skbedit_hash_info); - if (!pc) { - pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind, - &skbedit_idx_gen, &skbedit_hash_info); - if (IS_ERR(pc)) - return PTR_ERR(pc); + if (!tcf_hash_check(parm->index, a, 
bind)) { + ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind); + if (ret) + return ret; - d = to_skbedit(pc); + d = to_skbedit(a); ret = ACT_P_CREATED; } else { - d = to_skbedit(pc); - if (!ovr) { - tcf_hash_release(pc, bind, &skbedit_hash_info); + d = to_skbedit(a); + if (bind) + return 0; + tcf_hash_release(a, bind); + if (!ovr) return -EEXIST; - } } spin_lock_bh(&d->tcf_lock); @@ -140,19 +129,10 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, spin_unlock_bh(&d->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(pc, &skbedit_hash_info); + tcf_hash_insert(a); return ret; } -static int tcf_skbedit_cleanup(struct tc_action *a, int bind) -{ - struct tcf_skbedit *d = a->priv; - - if (d) - return tcf_hash_release(&d->common, bind, &skbedit_hash_info); - return 0; -} - static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { @@ -194,13 +174,10 @@ nla_put_failure: static struct tc_action_ops act_skbedit_ops = { .kind = "skbedit", - .hinfo = &skbedit_hash_info, .type = TCA_ACT_SKBEDIT, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_skbedit, .dump = tcf_skbedit_dump, - .cleanup = tcf_skbedit_cleanup, .init = tcf_skbedit_init, }; @@ -210,7 +187,7 @@ MODULE_LICENSE("GPL"); static int __init skbedit_init_module(void) { - return tcf_register_action(&act_skbedit_ops); + return tcf_register_action(&act_skbedit_ops, SKBEDIT_TAB_MASK); } static void __exit skbedit_cleanup_module(void) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 8e118af9097..45527e6b52d 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -31,8 +31,7 @@ #include <net/pkt_cls.h> /* The list of all installed classifier types */ - -static struct tcf_proto_ops *tcf_proto_base __read_mostly; +static LIST_HEAD(tcf_proto_base); /* Protects list of registered TC modules. It is pure SMP lock. 
*/ static DEFINE_RWLOCK(cls_mod_lock); @@ -41,36 +40,35 @@ static DEFINE_RWLOCK(cls_mod_lock); static const struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind) { - const struct tcf_proto_ops *t = NULL; + const struct tcf_proto_ops *t, *res = NULL; if (kind) { read_lock(&cls_mod_lock); - for (t = tcf_proto_base; t; t = t->next) { + list_for_each_entry(t, &tcf_proto_base, head) { if (nla_strcmp(kind, t->kind) == 0) { - if (!try_module_get(t->owner)) - t = NULL; + if (try_module_get(t->owner)) + res = t; break; } } read_unlock(&cls_mod_lock); } - return t; + return res; } /* Register(unregister) new classifier type */ int register_tcf_proto_ops(struct tcf_proto_ops *ops) { - struct tcf_proto_ops *t, **tp; + struct tcf_proto_ops *t; int rc = -EEXIST; write_lock(&cls_mod_lock); - for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next) + list_for_each_entry(t, &tcf_proto_base, head) if (!strcmp(ops->kind, t->kind)) goto out; - ops->next = NULL; - *tp = ops; + list_add_tail(&ops->head, &tcf_proto_base); rc = 0; out: write_unlock(&cls_mod_lock); @@ -80,19 +78,17 @@ EXPORT_SYMBOL(register_tcf_proto_ops); int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) { - struct tcf_proto_ops *t, **tp; + struct tcf_proto_ops *t; int rc = -ENOENT; write_lock(&cls_mod_lock); - for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next) - if (t == ops) + list_for_each_entry(t, &tcf_proto_base, head) { + if (t == ops) { + list_del(&t->head); + rc = 0; break; - - if (!t) - goto out; - *tp = t->next; - rc = 0; -out: + } + } write_unlock(&cls_mod_lock); return rc; } @@ -138,7 +134,8 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n) int err; int tp_created = 0; - if ((n->nlmsg_type != RTM_GETTFILTER) && !capable(CAP_NET_ADMIN)) + if ((n->nlmsg_type != RTM_GETTFILTER) && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; replay: @@ -321,7 +318,8 @@ replay: } } - err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh); + err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, + n->nlmsg_flags & NLM_F_CREATE ? 
TCA_ACT_NOREPLACE : TCA_ACT_REPLACE); if (err == 0) { if (tp_created) { spin_lock_bh(root_lock); @@ -344,7 +342,7 @@ errout: return err; } -static int tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, +static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh, u32 portid, u32 seq, u16 flags, int event) { struct tcmsg *tcm; @@ -366,7 +364,7 @@ static int tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, tcm->tcm_handle = fh; if (RTM_DELTFILTER != event) { tcm->tcm_handle = 0; - if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0) + if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0) goto nla_put_failure; } nlh->nlmsg_len = skb_tail_pointer(skb) - b; @@ -389,7 +387,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, if (!skb) return -ENOBUFS; - if (tcf_fill_node(skb, tp, fh, portid, n->nlmsg_seq, 0, event) <= 0) { + if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq, 0, event) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -408,8 +406,9 @@ static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, struct tcf_walker *arg) { struct tcf_dump_args *a = (void *)arg; + struct net *net = sock_net(a->skb->sk); - return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).portid, + return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); } @@ -467,7 +466,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) if (t > s_t) memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); if (cb->args[1] == 0) { - if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).portid, + if (tcf_fill_node(net, skb, tp, 0, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) break; @@ -500,46 +499,41 @@ out: void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - if (exts->action) { - tcf_action_destroy(exts->action, TCA_ACT_UNBIND); - exts->action = NULL; - } + tcf_action_destroy(&exts->actions, TCA_ACT_UNBIND); + INIT_LIST_HEAD(&exts->actions); #endif } EXPORT_SYMBOL(tcf_exts_destroy); int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, - struct nlattr *rate_tlv, struct tcf_exts *exts, - const struct tcf_ext_map *map) + struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr) { - memset(exts, 0, sizeof(*exts)); - #ifdef CONFIG_NET_CLS_ACT { struct tc_action *act; - if (map->police && tb[map->police]) { - act = tcf_action_init_1(net, tb[map->police], rate_tlv, - "police", TCA_ACT_NOREPLACE, + INIT_LIST_HEAD(&exts->actions); + if (exts->police && tb[exts->police]) { + act = tcf_action_init_1(net, tb[exts->police], rate_tlv, + "police", ovr, TCA_ACT_BIND); if (IS_ERR(act)) return PTR_ERR(act); - act->type = TCA_OLD_COMPAT; - exts->action = act; - } else if (map->action && tb[map->action]) { - act = tcf_action_init(net, tb[map->action], rate_tlv, - NULL, TCA_ACT_NOREPLACE, - TCA_ACT_BIND); - if (IS_ERR(act)) - return PTR_ERR(act); - - exts->action = act; + act->type = exts->type = TCA_OLD_COMPAT; + list_add(&act->list, &exts->actions); + } else if (exts->action && tb[exts->action]) { + int err; + err = tcf_action_init(net, tb[exts->action], rate_tlv, + NULL, ovr, + TCA_ACT_BIND, &exts->actions); + if (err) + return err; } } #else - if ((map->action && tb[map->action]) || - (map->police && tb[map->police])) + if ((exts->action && tb[exts->action]) || + (exts->police && tb[exts->police])) return -EOPNOTSUPP; #endif @@ -551,43 +545,42 
@@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, struct tcf_exts *src) { #ifdef CONFIG_NET_CLS_ACT - if (src->action) { - struct tc_action *act; - tcf_tree_lock(tp); - act = dst->action; - dst->action = src->action; - tcf_tree_unlock(tp); - if (act) - tcf_action_destroy(act, TCA_ACT_UNBIND); - } + LIST_HEAD(tmp); + tcf_tree_lock(tp); + list_splice_init(&dst->actions, &tmp); + list_splice(&src->actions, &dst->actions); + tcf_tree_unlock(tp); + tcf_action_destroy(&tmp, TCA_ACT_UNBIND); #endif } EXPORT_SYMBOL(tcf_exts_change); -int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts, - const struct tcf_ext_map *map) +#define tcf_exts_first_act(ext) \ + list_first_entry(&(exts)->actions, struct tc_action, list) + +int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - if (map->action && exts->action) { + if (exts->action && !list_empty(&exts->actions)) { /* * again for backward compatible mode - we want * to work with both old and new modes of entering * tc data even if iproute2 was newer - jhs */ struct nlattr *nest; - - if (exts->action->type != TCA_OLD_COMPAT) { - nest = nla_nest_start(skb, map->action); + if (exts->type != TCA_OLD_COMPAT) { + nest = nla_nest_start(skb, exts->action); if (nest == NULL) goto nla_put_failure; - if (tcf_action_dump(skb, exts->action, 0, 0) < 0) + if (tcf_action_dump(skb, &exts->actions, 0, 0) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - } else if (map->police) { - nest = nla_nest_start(skb, map->police); - if (nest == NULL) + } else if (exts->police) { + struct tc_action *act = tcf_exts_first_act(exts); + nest = nla_nest_start(skb, exts->police); + if (nest == NULL || !act) goto nla_put_failure; - if (tcf_action_dump_old(skb, exts->action, 0, 0) < 0) + if (tcf_action_dump_old(skb, act, 0, 0) < 0) goto nla_put_failure; nla_nest_end(skb, nest); } @@ -600,17 +593,14 @@ nla_put_failure: __attribute__ ((unused)) EXPORT_SYMBOL(tcf_exts_dump); -int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts, - const struct tcf_ext_map *map) +int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - if (exts->action) - if (tcf_action_copy_stats(skb, exts->action, 1) < 0) - goto nla_put_failure; + struct tc_action *a = tcf_exts_first_act(exts); + if (tcf_action_copy_stats(skb, a, 1) < 0) + return -1; #endif return 0; -nla_put_failure: __attribute__ ((unused)) - return -1; } EXPORT_SYMBOL(tcf_exts_dump_stats); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 636d9131d87..0ae1813e3e9 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -34,16 +34,11 @@ struct basic_filter { struct list_head link; }; -static const struct tcf_ext_map basic_ext_map = { - .action = TCA_BASIC_ACT, - .police = TCA_BASIC_POLICE -}; - static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { int r; - struct basic_head *head = (struct basic_head *) tp->root; + struct basic_head *head = tp->root; struct basic_filter *f; list_for_each_entry(f, &head->flist, link) { @@ -61,7 +56,7 @@ static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, static unsigned long basic_get(struct tcf_proto *tp, u32 handle) { unsigned long l = 0UL; - struct basic_head *head = (struct basic_head *) tp->root; + struct basic_head *head = tp->root; struct basic_filter *f; if (head == NULL) @@ -112,7 +107,7 @@ static void basic_destroy(struct tcf_proto *tp) static int basic_delete(struct tcf_proto *tp, unsigned long arg) { - 
struct basic_head *head = (struct basic_head *) tp->root; + struct basic_head *head = tp->root; struct basic_filter *t, *f = (struct basic_filter *) arg; list_for_each_entry(t, &head->flist, link) @@ -135,13 +130,14 @@ static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = { static int basic_set_parms(struct net *net, struct tcf_proto *tp, struct basic_filter *f, unsigned long base, struct nlattr **tb, - struct nlattr *est) + struct nlattr *est, bool ovr) { int err; struct tcf_exts e; struct tcf_ematch_tree t; - err = tcf_exts_validate(net, tp, tb, est, &e, &basic_ext_map); + tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); if (err < 0) return err; @@ -165,10 +161,10 @@ errout: static int basic_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg, bool ovr) { int err; - struct basic_head *head = (struct basic_head *) tp->root; + struct basic_head *head = tp->root; struct nlattr *tb[TCA_BASIC_MAX + 1]; struct basic_filter *f = (struct basic_filter *) *arg; @@ -183,7 +179,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, if (f != NULL) { if (handle && f->handle != handle) return -EINVAL; - return basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE]); + return basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr); } err = -ENOBUFS; @@ -191,6 +187,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, if (f == NULL) goto errout; + tcf_exts_init(&f->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE); err = -EINVAL; if (handle) f->handle = handle; @@ -209,7 +206,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, f->handle = head->hgenerator; } - err = basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE]); + err = basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr); if (err < 0) goto errout; @@ -228,7 +225,7 @@ errout: static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct basic_head *head = (struct basic_head *) tp->root; + struct basic_head *head = tp->root; struct basic_filter *f; list_for_each_entry(f, &head->flist, link) { @@ -244,7 +241,7 @@ skip: } } -static int basic_dump(struct tcf_proto *tp, unsigned long fh, +static int basic_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct basic_filter *f = (struct basic_filter *) fh; @@ -263,13 +260,13 @@ static int basic_dump(struct tcf_proto *tp, unsigned long fh, nla_put_u32(skb, TCA_BASIC_CLASSID, f->res.classid)) goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 || + if (tcf_exts_dump(skb, &f->exts) < 0 || tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &f->exts, &basic_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 1002a822628..13f64df2c71 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -46,11 +46,6 @@ static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, }; -static const struct tcf_ext_map bpf_ext_map = { - .action = TCA_BPF_ACT, - .police = TCA_BPF_POLICE, -}; - static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { @@ -161,11 +156,11 @@ static void 
cls_bpf_put(struct tcf_proto *tp, unsigned long f) static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, struct cls_bpf_prog *prog, unsigned long base, struct nlattr **tb, - struct nlattr *est) + struct nlattr *est, bool ovr) { struct sock_filter *bpf_ops, *bpf_old; struct tcf_exts exts; - struct sock_fprog tmp; + struct sock_fprog_kern tmp; struct sk_filter *fp, *fp_old; u16 bpf_size, bpf_len; u32 classid; @@ -174,7 +169,8 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, if (!tb[TCA_BPF_OPS_LEN] || !tb[TCA_BPF_OPS] || !tb[TCA_BPF_CLASSID]) return -EINVAL; - ret = tcf_exts_validate(net, tp, tb, est, &exts, &bpf_ext_map); + tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); + ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr); if (ret < 0) return ret; @@ -195,7 +191,7 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size); tmp.len = bpf_len; - tmp.filter = (struct sock_filter __user *) bpf_ops; + tmp.filter = bpf_ops; ret = sk_unattached_filter_create(&fp, &tmp); if (ret) @@ -246,7 +242,7 @@ static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - unsigned long *arg) + unsigned long *arg, bool ovr) { struct cls_bpf_head *head = tp->root; struct cls_bpf_prog *prog = (struct cls_bpf_prog *) *arg; @@ -264,13 +260,14 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (handle && prog->handle != handle) return -EINVAL; return cls_bpf_modify_existing(net, tp, prog, base, tb, - tca[TCA_RATE]); + tca[TCA_RATE], ovr); } prog = kzalloc(sizeof(*prog), GFP_KERNEL); if (prog == NULL) return -ENOBUFS; + tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE); if (handle == 0) prog->handle = cls_bpf_grab_new_handle(tp, head); else @@ -280,7 +277,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, goto errout; } - ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE]); + ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE], ovr); if (ret < 0) goto errout; @@ -298,7 +295,7 @@ errout: return ret; } -static int cls_bpf_dump(struct tcf_proto *tp, unsigned long fh, +static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *tm) { struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh; @@ -323,14 +320,14 @@ static int cls_bpf_dump(struct tcf_proto *tp, unsigned long fh, if (nla == NULL) goto nla_put_failure; - memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); + memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); - if (tcf_exts_dump(skb, &prog->exts, &bpf_ext_map) < 0) + if (tcf_exts_dump(skb, &prog->exts) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &prog->exts, &bpf_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &prog->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 16006c92c3f..cacf01bd04f 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -11,109 +11,13 @@ #include <linux/module.h> #include <linux/slab.h> -#include <linux/types.h> -#include <linux/string.h> -#include <linux/errno.h> #include <linux/skbuff.h> -#include <linux/cgroup.h> #include <linux/rcupdate.h> -#include <linux/fdtable.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/sock.h> #include <net/cls_cgroup.h> 
-static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct cgroup_cls_state, css) : NULL; -} - -static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) -{ - return css_cls_state(task_css(p, net_cls_subsys_id)); -} - -static struct cgroup_subsys_state * -cgrp_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct cgroup_cls_state *cs; - - cs = kzalloc(sizeof(*cs), GFP_KERNEL); - if (!cs) - return ERR_PTR(-ENOMEM); - return &cs->css; -} - -static int cgrp_css_online(struct cgroup_subsys_state *css) -{ - struct cgroup_cls_state *cs = css_cls_state(css); - struct cgroup_cls_state *parent = css_cls_state(css_parent(css)); - - if (parent) - cs->classid = parent->classid; - return 0; -} - -static void cgrp_css_free(struct cgroup_subsys_state *css) -{ - kfree(css_cls_state(css)); -} - -static int update_classid(const void *v, struct file *file, unsigned n) -{ - int err; - struct socket *sock = sock_from_file(file, &err); - if (sock) - sock->sk->sk_classid = (u32)(unsigned long)v; - return 0; -} - -static void cgrp_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) -{ - struct task_struct *p; - struct cgroup_cls_state *cs = css_cls_state(css); - void *v = (void *)(unsigned long)cs->classid; - - cgroup_taskset_for_each(p, css, tset) { - task_lock(p); - iterate_fd(p->files, 0, update_classid, v); - task_unlock(p); - } -} - -static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) -{ - return css_cls_state(css)->classid; -} - -static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, - u64 value) -{ - css_cls_state(css)->classid = (u32) value; - return 0; -} - -static struct cftype ss_files[] = { - { - .name = "classid", - .read_u64 = read_classid, - .write_u64 = write_classid, - }, - { } /* terminate */ -}; - -struct cgroup_subsys net_cls_subsys = { - .name = "net_cls", - .css_alloc = cgrp_css_alloc, - .css_online = cgrp_css_online, - .css_free = cgrp_css_free, - .attach = cgrp_attach, - .subsys_id = net_cls_subsys_id, - .base_cftypes = ss_files, - .module = THIS_MODULE, -}; - struct cls_cgroup_head { u32 handle; struct tcf_exts exts; @@ -172,11 +76,6 @@ static int cls_cgroup_init(struct tcf_proto *tp) return 0; } -static const struct tcf_ext_map cgroup_ext_map = { - .action = TCA_CGROUP_ACT, - .police = TCA_CGROUP_POLICE, -}; - static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = { [TCA_CGROUP_EMATCHES] = { .type = NLA_NESTED }, }; @@ -184,7 +83,7 @@ static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = { static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - unsigned long *arg) + unsigned long *arg, bool ovr) { struct nlattr *tb[TCA_CGROUP_MAX + 1]; struct cls_cgroup_head *head = tp->root; @@ -203,6 +102,7 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (head == NULL) return -ENOBUFS; + tcf_exts_init(&head->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); head->handle = handle; tcf_tree_lock(tp); @@ -218,8 +118,8 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, - &cgroup_ext_map); + tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); if (err < 0) return err; @@ -264,7 +164,7 @@ skip: arg->count++; } -static int cls_cgroup_dump(struct 
tcf_proto *tp, unsigned long fh, +static int cls_cgroup_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct cls_cgroup_head *head = tp->root; @@ -277,13 +177,13 @@ static int cls_cgroup_dump(struct tcf_proto *tp, unsigned long fh, if (nest == NULL) goto nla_put_failure; - if (tcf_exts_dump(skb, &head->exts, &cgroup_ext_map) < 0 || + if (tcf_exts_dump(skb, &head->exts) < 0 || tcf_em_tree_dump(skb, &head->ematches, TCA_CGROUP_EMATCHES) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &head->exts, &cgroup_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &head->exts) < 0) goto nla_put_failure; return skb->len; @@ -309,25 +209,12 @@ static struct tcf_proto_ops cls_cgroup_ops __read_mostly = { static int __init init_cgroup_cls(void) { - int ret; - - ret = cgroup_load_subsys(&net_cls_subsys); - if (ret) - goto out; - - ret = register_tcf_proto_ops(&cls_cgroup_ops); - if (ret) - cgroup_unload_subsys(&net_cls_subsys); - -out: - return ret; + return register_tcf_proto_ops(&cls_cgroup_ops); } static void __exit exit_cgroup_cls(void) { unregister_tcf_proto_ops(&cls_cgroup_ops); - - cgroup_unload_subsys(&net_cls_subsys); } module_init(init_cgroup_cls); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 7881e2fccbc..35be16f7c19 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -56,11 +56,6 @@ struct flow_filter { u32 hashrnd; }; -static const struct tcf_ext_map flow_ext_map = { - .action = TCA_FLOW_ACT, - .police = TCA_FLOW_POLICE, -}; - static inline u32 addr_fold(void *addr) { unsigned long a = (unsigned long)addr; @@ -220,7 +215,7 @@ static u32 flow_get_vlan_tag(const struct sk_buff *skb) static u32 flow_get_rxhash(struct sk_buff *skb) { - return skb_get_rxhash(skb); + return skb_get_hash(skb); } static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow) @@ -354,7 +349,7 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { static int flow_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - unsigned long *arg) + unsigned long *arg, bool ovr) { struct flow_head *head = tp->root; struct flow_filter *f; @@ -397,7 +392,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, return -EOPNOTSUPP; } - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, &flow_ext_map); + tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); if (err < 0) return err; @@ -455,6 +451,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, f->handle = handle; f->mask = ~0U; + tcf_exts_init(&f->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE); get_random_bytes(&f->hashrnd, 4); f->perturb_timer.function = flow_perturbation; @@ -566,7 +563,7 @@ static void flow_put(struct tcf_proto *tp, unsigned long f) { } -static int flow_dump(struct tcf_proto *tp, unsigned long fh, +static int flow_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct flow_filter *f = (struct flow_filter *)fh; @@ -608,7 +605,7 @@ static int flow_dump(struct tcf_proto *tp, unsigned long fh, nla_put_u32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ)) goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts, &flow_ext_map) < 0) + if (tcf_exts_dump(skb, &f->exts) < 0) goto nla_put_failure; #ifdef CONFIG_NET_EMATCH if (f->ematches.hdr.nmatches && @@ -617,7 +614,7 @@ static int flow_dump(struct tcf_proto *tp, unsigned long 
fh, #endif nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &f->exts, &flow_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 9b97172db84..861b03ccfed 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -29,11 +29,11 @@ #include <net/act_api.h> #include <net/pkt_cls.h> -#define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *)) +#define HTSIZE 256 struct fw_head { - struct fw_filter *ht[HTSIZE]; - u32 mask; + u32 mask; + struct fw_filter *ht[HTSIZE]; }; struct fw_filter { @@ -41,46 +41,22 @@ struct fw_filter { u32 id; struct tcf_result res; #ifdef CONFIG_NET_CLS_IND - char indev[IFNAMSIZ]; + int ifindex; #endif /* CONFIG_NET_CLS_IND */ struct tcf_exts exts; }; -static const struct tcf_ext_map fw_ext_map = { - .action = TCA_FW_ACT, - .police = TCA_FW_POLICE -}; - -static inline int fw_hash(u32 handle) +static u32 fw_hash(u32 handle) { - if (HTSIZE == 4096) - return ((handle >> 24) & 0xFFF) ^ - ((handle >> 12) & 0xFFF) ^ - (handle & 0xFFF); - else if (HTSIZE == 2048) - return ((handle >> 22) & 0x7FF) ^ - ((handle >> 11) & 0x7FF) ^ - (handle & 0x7FF); - else if (HTSIZE == 1024) - return ((handle >> 20) & 0x3FF) ^ - ((handle >> 10) & 0x3FF) ^ - (handle & 0x3FF); - else if (HTSIZE == 512) - return (handle >> 27) ^ - ((handle >> 18) & 0x1FF) ^ - ((handle >> 9) & 0x1FF) ^ - (handle & 0x1FF); - else if (HTSIZE == 256) { - u8 *t = (u8 *) &handle; - return t[0] ^ t[1] ^ t[2] ^ t[3]; - } else - return handle & (HTSIZE - 1); + handle ^= (handle >> 16); + handle ^= (handle >> 8); + return handle % HTSIZE; } static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct fw_head *head = (struct fw_head *)tp->root; + struct fw_head *head = tp->root; struct fw_filter *f; int r; u32 id = skb->mark; @@ -91,7 +67,7 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, if (f->id == id) { *res = f->res; #ifdef CONFIG_NET_CLS_IND - if (!tcf_match_indev(skb, f->indev)) + if (!tcf_match_indev(skb, f->ifindex)) continue; #endif /* CONFIG_NET_CLS_IND */ r = tcf_exts_exec(skb, &f->exts, res); @@ -116,7 +92,7 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, static unsigned long fw_get(struct tcf_proto *tp, u32 handle) { - struct fw_head *head = (struct fw_head *)tp->root; + struct fw_head *head = tp->root; struct fw_filter *f; if (head == NULL) @@ -165,7 +141,7 @@ static void fw_destroy(struct tcf_proto *tp) static int fw_delete(struct tcf_proto *tp, unsigned long arg) { - struct fw_head *head = (struct fw_head *)tp->root; + struct fw_head *head = tp->root; struct fw_filter *f = (struct fw_filter *)arg; struct fw_filter **fp; @@ -193,14 +169,15 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = { static int fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f, - struct nlattr **tb, struct nlattr **tca, unsigned long base) + struct nlattr **tb, struct nlattr **tca, unsigned long base, bool ovr) { - struct fw_head *head = (struct fw_head *)tp->root; + struct fw_head *head = tp->root; struct tcf_exts e; u32 mask; int err; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, &fw_ext_map); + tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); if (err < 0) return err; @@ -211,9 +188,13 @@ fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f, #ifdef CONFIG_NET_CLS_IND if (tb[TCA_FW_INDEV]) { - err = 
tcf_change_indev(tp, f->indev, tb[TCA_FW_INDEV]); - if (err < 0) + int ret; + ret = tcf_change_indev(net, tb[TCA_FW_INDEV]); + if (ret < 0) { + err = ret; goto errout; + } + f->ifindex = ret; } #endif /* CONFIG_NET_CLS_IND */ @@ -237,9 +218,9 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - unsigned long *arg) + unsigned long *arg, bool ovr) { - struct fw_head *head = (struct fw_head *)tp->root; + struct fw_head *head = tp->root; struct fw_filter *f = (struct fw_filter *) *arg; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_FW_MAX + 1]; @@ -255,7 +236,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, if (f != NULL) { if (f->id != handle && handle) return -EINVAL; - return fw_change_attrs(net, tp, f, tb, tca, base); + return fw_change_attrs(net, tp, f, tb, tca, base, ovr); } if (!handle) @@ -280,9 +261,10 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, if (f == NULL) return -ENOBUFS; + tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE); f->id = handle; - err = fw_change_attrs(net, tp, f, tb, tca, base); + err = fw_change_attrs(net, tp, f, tb, tca, base, ovr); if (err < 0) goto errout; @@ -301,7 +283,7 @@ errout: static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct fw_head *head = (struct fw_head *)tp->root; + struct fw_head *head = tp->root; int h; if (head == NULL) @@ -327,10 +309,10 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } -static int fw_dump(struct tcf_proto *tp, unsigned long fh, +static int fw_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct fw_head *head = (struct fw_head *)tp->root; + struct fw_head *head = tp->root; struct fw_filter *f = (struct fw_filter *)fh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; @@ -351,20 +333,23 @@ static int fw_dump(struct tcf_proto *tp, unsigned long fh, nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid)) goto nla_put_failure; #ifdef CONFIG_NET_CLS_IND - if (strlen(f->indev) && - nla_put_string(skb, TCA_FW_INDEV, f->indev)) - goto nla_put_failure; + if (f->ifindex) { + struct net_device *dev; + dev = __dev_get_by_index(net, f->ifindex); + if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name)) + goto nla_put_failure; + } #endif /* CONFIG_NET_CLS_IND */ if (head->mask != 0xFFFFFFFF && nla_put_u32(skb, TCA_FW_MASK, head->mask)) goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0) + if (tcf_exts_dump(skb, &f->exts) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &f->exts, &fw_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 37da567d833..dd9fc2523c7 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -59,11 +59,6 @@ struct route4_filter { #define ROUTE4_FAILURE ((struct route4_filter *)(-1L)) -static const struct tcf_ext_map route_ext_map = { - .police = TCA_ROUTE4_POLICE, - .action = TCA_ROUTE4_ACT -}; - static inline int route4_fastmap_hash(u32 id, int iif) { return id & 0xF; @@ -128,7 +123,7 @@ static inline int route4_hash_wild(void) static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct route4_head *head = (struct route4_head *)tp->root; + struct route4_head *head = tp->root; struct dst_entry *dst; struct route4_bucket *b; struct route4_filter 
*f; @@ -218,7 +213,7 @@ static inline u32 from_hash(u32 id) static unsigned long route4_get(struct tcf_proto *tp, u32 handle) { - struct route4_head *head = (struct route4_head *)tp->root; + struct route4_head *head = tp->root; struct route4_bucket *b; struct route4_filter *f; unsigned int h1, h2; @@ -289,7 +284,7 @@ static void route4_destroy(struct tcf_proto *tp) static int route4_delete(struct tcf_proto *tp, unsigned long arg) { - struct route4_head *head = (struct route4_head *)tp->root; + struct route4_head *head = tp->root; struct route4_filter **fp, *f = (struct route4_filter *)arg; unsigned int h = 0; struct route4_bucket *b; @@ -338,7 +333,8 @@ static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = { static int route4_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct route4_filter *f, u32 handle, struct route4_head *head, - struct nlattr **tb, struct nlattr *est, int new) + struct nlattr **tb, struct nlattr *est, int new, + bool ovr) { int err; u32 id = 0, to = 0, nhandle = 0x8000; @@ -347,7 +343,8 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, struct route4_bucket *b; struct tcf_exts e; - err = tcf_exts_validate(net, tp, tb, est, &e, &route_ext_map); + tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); if (err < 0) return err; @@ -432,7 +429,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - unsigned long *arg) + unsigned long *arg, bool ovr) { struct route4_head *head = tp->root; struct route4_filter *f, *f1, **fp; @@ -459,7 +456,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, old_handle = f->handle; err = route4_set_parms(net, tp, base, f, handle, head, tb, - tca[TCA_RATE], 0); + tca[TCA_RATE], 0, ovr); if (err < 0) return err; @@ -481,8 +478,9 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, if (f == NULL) goto errout; + tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); err = route4_set_parms(net, tp, base, f, handle, head, tb, - tca[TCA_RATE], 1); + tca[TCA_RATE], 1, ovr); if (err < 0) goto errout; @@ -554,7 +552,7 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } -static int route4_dump(struct tcf_proto *tp, unsigned long fh, +static int route4_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct route4_filter *f = (struct route4_filter *)fh; @@ -589,12 +587,12 @@ static int route4_dump(struct tcf_proto *tp, unsigned long fh, nla_put_u32(skb, TCA_ROUTE4_CLASSID, f->res.classid)) goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts, &route_ext_map) < 0) + if (tcf_exts_dump(skb, &f->exts) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &f->exts, &route_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 252d8b05872..1020e233a5d 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -116,11 +116,6 @@ static inline unsigned int hash_src(__be32 *src) return h & 0xF; } -static struct tcf_ext_map rsvp_ext_map = { - .police = TCA_RSVP_POLICE, - .action = TCA_RSVP_ACT -}; - #define RSVP_APPLY_RESULT() \ { \ int r = tcf_exts_exec(skb, &f->exts, res); \ @@ -420,7 +415,7 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 
handle, struct nlattr **tca, - unsigned long *arg) + unsigned long *arg, bool ovr) { struct rsvp_head *data = tp->root; struct rsvp_filter *f, **fp; @@ -440,7 +435,8 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, &rsvp_ext_map); + tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); if (err < 0) return err; @@ -471,6 +467,7 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, if (f == NULL) goto errout2; + tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE); h2 = 16; if (tb[TCA_RSVP_SRC]) { memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src)); @@ -597,7 +594,7 @@ static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } -static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, +static int rsvp_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct rsvp_filter *f = (struct rsvp_filter *)fh; @@ -633,12 +630,12 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, nla_put(skb, TCA_RSVP_SRC, sizeof(f->src), f->src)) goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0) + if (tcf_exts_dump(skb, &f->exts) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &f->exts, &rsvp_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index b86535a4016..c721cd4a469 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -24,9 +24,6 @@ #define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ -#define PRIV(tp) ((struct tcindex_data *) (tp)->root) - - struct tcindex_filter_result { struct tcf_exts exts; struct tcf_result res; @@ -50,11 +47,6 @@ struct tcindex_data { int fall_through; /* 0: only classify if explicit match */ }; -static const struct tcf_ext_map tcindex_ext_map = { - .police = TCA_TCINDEX_POLICE, - .action = TCA_TCINDEX_ACT -}; - static inline int tcindex_filter_is_set(struct tcindex_filter_result *r) { @@ -82,7 +74,7 @@ tcindex_lookup(struct tcindex_data *p, u16 key) static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcindex_filter_result *f; int key = (skb->tc_index & p->mask) >> p->shift; @@ -107,7 +99,7 @@ static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcindex_filter_result *r; pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle); @@ -145,7 +137,7 @@ static int tcindex_init(struct tcf_proto *tp) static int __tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg; struct tcindex_filter *f = NULL; @@ -196,11 +188,17 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 }, }; +static void tcindex_filter_result_init(struct tcindex_filter_result *r) +{ + memset(r, 0, sizeof(*r)); + tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); +} + static int tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long 
base, u32 handle, struct tcindex_data *p, struct tcindex_filter_result *r, struct nlattr **tb, - struct nlattr *est) + struct nlattr *est, bool ovr) { int err, balloc = 0; struct tcindex_filter_result new_filter_result, *old_r = r; @@ -209,17 +207,17 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct tcindex_filter *f = NULL; /* make gcc behave */ struct tcf_exts e; - err = tcf_exts_validate(net, tp, tb, est, &e, &tcindex_ext_map); + tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); if (err < 0) return err; memcpy(&cp, p, sizeof(cp)); - memset(&new_filter_result, 0, sizeof(new_filter_result)); + tcindex_filter_result_init(&new_filter_result); + tcindex_filter_result_init(&cr); if (old_r) - memcpy(&cr, r, sizeof(cr)); - else - memset(&cr, 0, sizeof(cr)); + cr.res = r->res; if (tb[TCA_TCINDEX_HASH]) cp.hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); @@ -271,9 +269,14 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, err = -ENOMEM; if (!cp.perfect && !cp.h) { if (valid_perfect_hash(&cp)) { + int i; + cp.perfect = kcalloc(cp.hash, sizeof(*r), GFP_KERNEL); if (!cp.perfect) goto errout; + for (i = 0; i < cp.hash; i++) + tcf_exts_init(&cp.perfect[i].exts, TCA_TCINDEX_ACT, + TCA_TCINDEX_POLICE); balloc = 1; } else { cp.h = kcalloc(cp.hash, sizeof(f), GFP_KERNEL); @@ -299,14 +302,17 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, tcf_bind_filter(tp, &cr.res, base); } - tcf_exts_change(tp, &cr.exts, &e); + if (old_r) + tcf_exts_change(tp, &r->exts, &e); + else + tcf_exts_change(tp, &cr.exts, &e); tcf_tree_lock(tp); if (old_r && old_r != r) - memset(old_r, 0, sizeof(*old_r)); + tcindex_filter_result_init(old_r); memcpy(p, &cp, sizeof(cp)); - memcpy(r, &cr, sizeof(cr)); + r->res = cr.res; if (r == &new_filter_result) { struct tcindex_filter **fp; @@ -335,11 +341,11 @@ errout: static int tcindex_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg, bool ovr) { struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_TCINDEX_MAX + 1]; - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg; int err; @@ -355,13 +361,13 @@ tcindex_change(struct net *net, struct sk_buff *in_skb, return err; return tcindex_set_parms(net, tp, base, handle, p, r, tb, - tca[TCA_RATE]); + tca[TCA_RATE], ovr); } static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcindex_filter *f, *next; int i; @@ -408,7 +414,7 @@ static int tcindex_destroy_element(struct tcf_proto *tp, static void tcindex_destroy(struct tcf_proto *tp) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcf_walker walker; pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p); @@ -423,10 +429,10 @@ static void tcindex_destroy(struct tcf_proto *tp) } -static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, +static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; @@ -468,11 +474,11 @@ 
static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid)) goto nla_put_failure; - if (tcf_exts_dump(skb, &r->exts, &tcindex_ext_map) < 0) + if (tcf_exts_dump(skb, &r->exts) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &r->exts, &tcindex_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &r->exts) < 0) goto nla_put_failure; } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index eb07a1e536e..70c0be8d012 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -38,6 +38,7 @@ #include <linux/errno.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> +#include <linux/bitmap.h> #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> @@ -48,7 +49,7 @@ struct tc_u_knode { struct tc_u_hnode *ht_up; struct tcf_exts exts; #ifdef CONFIG_NET_CLS_IND - char indev[IFNAMSIZ]; + int ifindex; #endif u8 fshift; struct tcf_result res; @@ -79,11 +80,6 @@ struct tc_u_common { u32 hgenerator; }; -static const struct tcf_ext_map u32_ext_map = { - .action = TCA_U32_ACT, - .police = TCA_U32_POLICE -}; - static inline unsigned int u32_hash_fold(__be32 key, const struct tc_u32_sel *sel, u8 fshift) @@ -100,7 +96,7 @@ static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct unsigned int off; } stack[TC_U32_MAXDEPTH]; - struct tc_u_hnode *ht = (struct tc_u_hnode *)tp->root; + struct tc_u_hnode *ht = tp->root; unsigned int off = skb_network_offset(skb); struct tc_u_knode *n; int sdepth = 0; @@ -157,7 +153,7 @@ check_terminal: *res = n->res; #ifdef CONFIG_NET_CLS_IND - if (!tcf_match_indev(skb, n->indev)) { + if (!tcf_match_indev(skb, n->ifindex)) { n = n->next; goto next_knode; } @@ -352,7 +348,7 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) return 0; } -static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key) +static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) { struct tc_u_knode **kp; struct tc_u_hnode *ht = key->ht_up; @@ -465,17 +461,25 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg) return 0; } +#define NR_U32_NODE (1<<12) static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) { struct tc_u_knode *n; - unsigned int i = 0x7FF; + unsigned long i; + unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long), + GFP_KERNEL); + if (!bitmap) + return handle | 0xFFF; for (n = ht->ht[TC_U32_HASH(handle)]; n; n = n->next) - if (i < TC_U32_NODE(n->handle)) - i = TC_U32_NODE(n->handle); - i++; + set_bit(TC_U32_NODE(n->handle), bitmap); + + i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800); + if (i >= NR_U32_NODE) + i = find_next_zero_bit(bitmap, NR_U32_NODE, 1); - return handle | (i > 0xFFF ? 0xFFF : i); + kfree(bitmap); + return handle | (i >= NR_U32_NODE ? 
0xFFF : i); } static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { @@ -491,12 +495,13 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { static int u32_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct tc_u_hnode *ht, struct tc_u_knode *n, struct nlattr **tb, - struct nlattr *est) + struct nlattr *est, bool ovr) { int err; struct tcf_exts e; - err = tcf_exts_validate(net, tp, tb, est, &e, &u32_ext_map); + tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); if (err < 0) return err; @@ -531,9 +536,11 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, #ifdef CONFIG_NET_CLS_IND if (tb[TCA_U32_INDEV]) { - err = tcf_change_indev(tp, n->indev, tb[TCA_U32_INDEV]); - if (err < 0) + int ret; + ret = tcf_change_indev(net, tb[TCA_U32_INDEV]); + if (ret < 0) goto errout; + n->ifindex = ret; } #endif tcf_exts_change(tp, &n->exts, &e); @@ -547,7 +554,7 @@ errout: static int u32_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - unsigned long *arg) + unsigned long *arg, bool ovr) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *ht; @@ -571,7 +578,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; return u32_set_parms(net, tp, base, n->ht_up, n, tb, - tca[TCA_RATE]); + tca[TCA_RATE], ovr); } if (tb[TCA_U32_DIVISOR]) { @@ -646,6 +653,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, n->ht_up = ht; n->handle = handle; n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; + tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); #ifdef CONFIG_CLS_U32_MARK if (tb[TCA_U32_MARK]) { @@ -657,7 +665,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, } #endif - err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE]); + err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr); if (err == 0) { struct tc_u_knode **ins; for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next) @@ -715,7 +723,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } -static int u32_dump(struct tcf_proto *tp, unsigned long fh, +static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct tc_u_knode *n = (struct tc_u_knode *)fh; @@ -759,13 +767,16 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh, goto nla_put_failure; #endif - if (tcf_exts_dump(skb, &n->exts, &u32_ext_map) < 0) + if (tcf_exts_dump(skb, &n->exts) < 0) goto nla_put_failure; #ifdef CONFIG_NET_CLS_IND - if (strlen(n->indev) && - nla_put_string(skb, TCA_U32_INDEV, n->indev)) - goto nla_put_failure; + if (n->ifindex) { + struct net_device *dev; + dev = __dev_get_by_index(net, n->ifindex); + if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name)) + goto nla_put_failure; + } #endif #ifdef CONFIG_CLS_U32_PERF if (nla_put(skb, TCA_U32_PCNT, @@ -778,7 +789,7 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh, nla_nest_end(skb, nest); if (TC_U32_KEY(n->handle)) - if (tcf_exts_dump_stats(skb, &n->exts, &u32_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &n->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index e5cef956722..9b8c0b0e60d 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -222,7 +222,7 @@ META_COLLECTOR(int_maclen) META_COLLECTOR(int_rxhash) { - dst->value = skb_get_rxhash(skb); + dst->value = skb_get_hash(skb); } 
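The classifier hunks above (cls_route, cls_rsvp, cls_tcindex, cls_u32) all apply one conversion: the per-classifier tcf_ext_map tables are removed, each tcf_exts is primed with tcf_exts_init() naming the classifier's own ACT/POLICE attributes, and the new 'ovr' flag threaded through ->change() is forwarded to tcf_exts_validate(). A minimal sketch of the resulting pattern (illustration only, not a hunk from this diff), using the cls_u32 attribute ids and assuming the in-tree <net/pkt_cls.h> helpers:

#include <linux/pkt_cls.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>

/* Sketch only: validate netlink-supplied actions and attach them to a
 * filter's extensions after this series.  "fexts" is the tcf_exts embedded
 * in whatever filter structure the classifier uses.
 */
static int example_set_parms(struct net *net, struct tcf_proto *tp,
			     struct tcf_exts *fexts, struct nlattr **tb,
			     struct nlattr *est, bool ovr)
{
	struct tcf_exts e;
	int err;

	/* attribute ids are named once, at init time ... */
	tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE);
	/* ... so validate/dump no longer take a tcf_ext_map argument */
	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
	if (err < 0)
		return err;

	tcf_exts_change(tp, fexts, &e);
	return 0;
}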
/************************************************************************** @@ -271,40 +271,52 @@ META_COLLECTOR(int_rtiif) * Socket Attributes **************************************************************************/ -#define SKIP_NONLOCAL(skb) \ - if (unlikely(skb->sk == NULL)) { \ - *err = -1; \ - return; \ - } +#define skip_nonlocal(skb) \ + (unlikely(skb->sk == NULL)) META_COLLECTOR(int_sk_family) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_family; } META_COLLECTOR(int_sk_state) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_state; } META_COLLECTOR(int_sk_reuse) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_reuse; } META_COLLECTOR(int_sk_bound_if) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } /* No error if bound_dev_if is 0, legal userspace check */ dst->value = skb->sk->sk_bound_dev_if; } META_COLLECTOR(var_sk_bound_if) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } if (skb->sk->sk_bound_dev_if == 0) { dst->value = (unsigned long) "any"; @@ -322,151 +334,226 @@ META_COLLECTOR(var_sk_bound_if) META_COLLECTOR(int_sk_refcnt) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = atomic_read(&skb->sk->sk_refcnt); } META_COLLECTOR(int_sk_rcvbuf) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_rcvbuf; } META_COLLECTOR(int_sk_shutdown) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_shutdown; } META_COLLECTOR(int_sk_proto) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_protocol; } META_COLLECTOR(int_sk_type) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_type; } META_COLLECTOR(int_sk_rmem_alloc) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = sk_rmem_alloc_get(skb->sk); } META_COLLECTOR(int_sk_wmem_alloc) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = sk_wmem_alloc_get(skb->sk); } META_COLLECTOR(int_sk_omem_alloc) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = atomic_read(&skb->sk->sk_omem_alloc); } META_COLLECTOR(int_sk_rcv_qlen) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_receive_queue.qlen; } META_COLLECTOR(int_sk_snd_qlen) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_write_queue.qlen; } META_COLLECTOR(int_sk_wmem_queued) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_wmem_queued; } META_COLLECTOR(int_sk_fwd_alloc) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_forward_alloc; } META_COLLECTOR(int_sk_sndbuf) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_sndbuf; } META_COLLECTOR(int_sk_alloc) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = (__force int) skb->sk->sk_allocation; } META_COLLECTOR(int_sk_hash) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_hash; } 
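The em_meta conversion above swaps a macro that hid a return statement in its expansion for a plain predicate, so the early exit is written out at every collector; presumably the point is keeping control flow visible rather than any behavioural change. Condensed before/after of the pattern (illustration only, copied from the surrounding hunks):

/* old style: the return is buried inside the macro expansion */
#define SKIP_NONLOCAL(skb)			\
	if (unlikely(skb->sk == NULL)) {	\
		*err = -1;			\
		return;				\
	}

/* new style: the test is a plain expression, the exit stays visible */
#define skip_nonlocal(skb)			\
	(unlikely(skb->sk == NULL))

META_COLLECTOR(int_sk_family)
{
	if (skip_nonlocal(skb)) {
		*err = -1;
		return;
	}
	dst->value = skb->sk->sk_family;
}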
META_COLLECTOR(int_sk_lingertime) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_lingertime / HZ; } META_COLLECTOR(int_sk_err_qlen) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_error_queue.qlen; } META_COLLECTOR(int_sk_ack_bl) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_ack_backlog; } META_COLLECTOR(int_sk_max_ack_bl) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_max_ack_backlog; } META_COLLECTOR(int_sk_prio) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_priority; } META_COLLECTOR(int_sk_rcvlowat) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_rcvlowat; } META_COLLECTOR(int_sk_rcvtimeo) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_rcvtimeo / HZ; } META_COLLECTOR(int_sk_sndtimeo) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_sndtimeo / HZ; } META_COLLECTOR(int_sk_sendmsg_off) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_frag.offset; } META_COLLECTOR(int_sk_write_pend) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_write_pending; } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 547b4a88ae2..58bed7599db 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -135,7 +135,7 @@ static DEFINE_RWLOCK(qdisc_mod_lock); static struct Qdisc_ops *qdisc_base; -/* Register/uregister queueing discipline */ +/* Register/unregister queueing discipline */ int register_qdisc(struct Qdisc_ops *qops) { @@ -273,8 +273,12 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) void qdisc_list_add(struct Qdisc *q) { - if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) - list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list); + if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { + struct Qdisc *root = qdisc_dev(q)->qdisc; + + WARN_ON_ONCE(root == &noop_qdisc); + list_add_tail(&q->list, &root->list); + } } EXPORT_SYMBOL(qdisc_list_add); @@ -559,7 +563,7 @@ out: } EXPORT_SYMBOL(__qdisc_calculate_pkt_len); -void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc) +void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) { if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", @@ -1080,7 +1084,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n) struct Qdisc *p = NULL; int err; - if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN)) + if ((n->nlmsg_type != RTM_GETQDISC) && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); @@ -1147,7 +1152,7 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n) struct Qdisc *q, *p; int err; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; replay: @@ -1300,6 +1305,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, struct gnet_dump d; struct qdisc_size_table *stab; + cond_resched(); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) goto out_nlmsg_trim; @@ -1431,9 +1437,9 @@ static int 
tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) s_idx = cb->args[0]; s_q_idx = q_idx = cb->args[1]; - rcu_read_lock(); idx = 0; - for_each_netdev_rcu(net, dev) { + ASSERT_RTNL(); + for_each_netdev(net, dev) { struct netdev_queue *dev_queue; if (idx < s_idx) @@ -1456,8 +1462,6 @@ cont: } done: - rcu_read_unlock(); - cb->args[0] = idx; cb->args[1] = q_idx; @@ -1487,7 +1491,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n) u32 qid; int err; - if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN)) + if ((n->nlmsg_type != RTM_GETTCLASS) && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); @@ -1614,6 +1619,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, struct gnet_dump d; const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; + cond_resched(); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) goto out_nlmsg_trim; diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 1f9c31411f1..8449b337f9e 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -623,8 +623,7 @@ static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, if (nla_put_u32(skb, TCA_ATM_EXCESS, 0)) goto nla_put_failure; } - nla_nest_end(skb, nest); - return skb->len; + return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 7a42c81a19e..ead526467cc 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1058,9 +1058,10 @@ static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ q->quanta[prio]; } - if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) { - pr_warning("CBQ: class %08x has bad quantum==%ld, repaired.\n", - cl->common.classid, cl->quantum); + if (cl->quantum <= 0 || + cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) { + pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n", + cl->common.classid, cl->quantum); cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1; } } @@ -1562,8 +1563,7 @@ static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; if (cbq_dump_attr(skb, &q->link) < 0) goto nla_put_failure; - nla_nest_end(skb, nest); - return skb->len; + return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); @@ -1598,8 +1598,7 @@ cbq_dump_class(struct Qdisc *sch, unsigned long arg, goto nla_put_failure; if (cbq_dump_attr(skb, cl) < 0) goto nla_put_failure; - nla_nest_end(skb, nest); - return skb->len; + return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); @@ -1782,8 +1781,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) { - if (rtab) - qdisc_put_rtab(rtab); + qdisc_put_rtab(rtab); return err; } } diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index ddd73cb2d7b..ed30e436128 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -14,7 +14,6 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> -#include <linux/reciprocal_div.h> #include <linux/vmalloc.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> @@ -77,12 +76,6 @@ struct choke_sched_data { struct sk_buff **tab; }; -/* deliver a random number between 0 and N - 1 */ -static u32 random_N(unsigned int N) -{ - return reciprocal_divide(prandom_u32(), N); -} - /* number of elements in queue including holes */ static 
unsigned int choke_len(const struct choke_sched_data *q) { @@ -233,7 +226,7 @@ static struct sk_buff *choke_peek_random(const struct choke_sched_data *q, int retrys = 3; do { - *pidx = (q->head + random_N(choke_len(q))) & q->tab_mask; + *pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask; skb = q->tab[*pidx]; if (skb) return skb; @@ -398,12 +391,7 @@ static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = { static void choke_free(void *addr) { - if (addr) { - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); - } + kvfree(addr); } static int choke_change(struct Qdisc *sch, struct nlattr *opt) diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 8302717ea30..7bbbfe11219 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -391,8 +391,10 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch) while (1) { cl = list_first_entry(&q->active, struct drr_class, alist); skb = cl->qdisc->ops->peek(cl->qdisc); - if (skb == NULL) + if (skb == NULL) { + qdisc_warn_nonwc(__func__, cl->qdisc); goto out; + } len = qdisc_pkt_len(skb); if (len <= cl->deficit) { diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 3886365cc20..49d6ef338b5 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -47,7 +47,7 @@ struct dsmark_qdisc_data { static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index) { - return (index <= p->indices && index > 0); + return index <= p->indices && index > 0; } /* ------------------------- Class/flow operations ------------------------- */ @@ -57,8 +57,8 @@ static int dsmark_graft(struct Qdisc *sch, unsigned long arg, { struct dsmark_qdisc_data *p = qdisc_priv(sch); - pr_debug("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n", - sch, p, new, old); + pr_debug("%s(sch %p,[qdisc %p],new %p,old %p)\n", + __func__, sch, p, new, old); if (new == NULL) { new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, @@ -85,8 +85,8 @@ static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg) static unsigned long dsmark_get(struct Qdisc *sch, u32 classid) { - pr_debug("dsmark_get(sch %p,[qdisc %p],classid %x)\n", - sch, qdisc_priv(sch), classid); + pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", + __func__, sch, qdisc_priv(sch), classid); return TC_H_MIN(classid) + 1; } @@ -118,8 +118,8 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, int err = -EINVAL; u8 mask = 0; - pr_debug("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x)," - "arg 0x%lx\n", sch, p, classid, parent, *arg); + pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n", + __func__, sch, p, classid, parent, *arg); if (!dsmark_valid_index(p, *arg)) { err = -ENOENT; @@ -166,7 +166,8 @@ static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker) struct dsmark_qdisc_data *p = qdisc_priv(sch); int i; - pr_debug("dsmark_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); + pr_debug("%s(sch %p,[qdisc %p],walker %p)\n", + __func__, sch, p, walker); if (walker->stop) return; @@ -199,7 +200,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) struct dsmark_qdisc_data *p = qdisc_priv(sch); int err; - pr_debug("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); + pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p); if (p->set_tc_index) { switch (skb->protocol) { @@ -275,7 +276,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) struct sk_buff *skb; u32 index; - pr_debug("dsmark_dequeue(sch %p,[qdisc %p])\n", sch, p); + pr_debug("%s(sch 
%p,[qdisc %p])\n", __func__, sch, p); skb = p->q->ops->dequeue(p->q); if (skb == NULL) @@ -303,8 +304,8 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) * and don't need yet another qdisc as a bypass. */ if (p->mask[index] != 0xff || p->value[index]) - pr_warning("dsmark_dequeue: unsupported protocol %d\n", - ntohs(skb->protocol)); + pr_warn("%s: unsupported protocol %d\n", + __func__, ntohs(skb->protocol)); break; } @@ -315,7 +316,7 @@ static struct sk_buff *dsmark_peek(struct Qdisc *sch) { struct dsmark_qdisc_data *p = qdisc_priv(sch); - pr_debug("dsmark_peek(sch %p,[qdisc %p])\n", sch, p); + pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); return p->q->ops->peek(p->q); } @@ -325,7 +326,7 @@ static unsigned int dsmark_drop(struct Qdisc *sch) struct dsmark_qdisc_data *p = qdisc_priv(sch); unsigned int len; - pr_debug("dsmark_reset(sch %p,[qdisc %p])\n", sch, p); + pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); if (p->q->ops->drop == NULL) return 0; @@ -346,7 +347,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) u16 indices; u8 *mask; - pr_debug("dsmark_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); + pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt); if (!opt) goto errout; @@ -384,7 +385,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) if (p->q == NULL) p->q = &noop_qdisc; - pr_debug("dsmark_init: qdisc %p\n", p->q); + pr_debug("%s: qdisc %p\n", __func__, p->q); err = 0; errout: @@ -395,7 +396,7 @@ static void dsmark_reset(struct Qdisc *sch) { struct dsmark_qdisc_data *p = qdisc_priv(sch); - pr_debug("dsmark_reset(sch %p,[qdisc %p])\n", sch, p); + pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); qdisc_reset(p->q); sch->q.qlen = 0; } @@ -404,7 +405,7 @@ static void dsmark_destroy(struct Qdisc *sch) { struct dsmark_qdisc_data *p = qdisc_priv(sch); - pr_debug("dsmark_destroy(sch %p,[qdisc %p])\n", sch, p); + pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); tcf_destroy_chain(&p->filter_list); qdisc_destroy(p->q); @@ -417,7 +418,7 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, struct dsmark_qdisc_data *p = qdisc_priv(sch); struct nlattr *opts = NULL; - pr_debug("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n", sch, p, cl); + pr_debug("%s(sch %p,[qdisc %p],class %ld\n", __func__, sch, p, cl); if (!dsmark_valid_index(p, cl)) return -EINVAL; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 95d84396190..ba32c2b005d 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -47,6 +47,7 @@ #include <linux/rbtree.h> #include <linux/hash.h> #include <linux/prefetch.h> +#include <linux/vmalloc.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/sock.h> @@ -225,7 +226,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) /* By forcing low order bit to 1, we make sure to not * collide with a local flow (socket pointers are word aligned) */ - sk = (struct sock *)(skb_get_rxhash(skb) | 1L); + sk = (struct sock *)(skb_get_hash(skb) | 1L); } root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)]; @@ -578,28 +579,53 @@ static void fq_rehash(struct fq_sched_data *q, q->stat_gc_flows += fcnt; } -static int fq_resize(struct fq_sched_data *q, u32 log) +static void *fq_alloc_node(size_t sz, int node) { + void *ptr; + + ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node); + if (!ptr) + ptr = vmalloc_node(sz, node); + return ptr; +} + +static void fq_free(void *addr) +{ + kvfree(addr); +} + +static int fq_resize(struct 
Qdisc *sch, u32 log) +{ + struct fq_sched_data *q = qdisc_priv(sch); struct rb_root *array; + void *old_fq_root; u32 idx; if (q->fq_root && log == q->fq_trees_log) return 0; - array = kmalloc(sizeof(struct rb_root) << log, GFP_KERNEL); + /* If XPS was setup, we can allocate memory on right NUMA node */ + array = fq_alloc_node(sizeof(struct rb_root) << log, + netdev_queue_numa_node_read(sch->dev_queue)); if (!array) return -ENOMEM; for (idx = 0; idx < (1U << log); idx++) array[idx] = RB_ROOT; - if (q->fq_root) { - fq_rehash(q, q->fq_root, q->fq_trees_log, array, log); - kfree(q->fq_root); - } + sch_tree_lock(sch); + + old_fq_root = q->fq_root; + if (old_fq_root) + fq_rehash(q, old_fq_root, q->fq_trees_log, array, log); + q->fq_root = array; q->fq_trees_log = log; + sch_tree_unlock(sch); + + fq_free(old_fq_root); + return 0; } @@ -675,9 +701,11 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) q->flow_refill_delay = usecs_to_jiffies(usecs_delay); } - if (!err) - err = fq_resize(q, fq_log); - + if (!err) { + sch_tree_unlock(sch); + err = fq_resize(sch, fq_log); + sch_tree_lock(sch); + } while (sch->q.qlen > sch->limit) { struct sk_buff *skb = fq_dequeue(sch); @@ -697,7 +725,7 @@ static void fq_destroy(struct Qdisc *sch) struct fq_sched_data *q = qdisc_priv(sch); fq_reset(sch); - kfree(q->fq_root); + fq_free(q->fq_root); qdisc_watchdog_cancel(&q->watchdog); } @@ -723,7 +751,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt) if (opt) err = fq_change(sch, opt); else - err = fq_resize(q, q->fq_trees_log); + err = fq_resize(sch, q->fq_trees_log); return err; } @@ -750,8 +778,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) goto nla_put_failure; - nla_nest_end(skb, opts); - return skb->len; + return nla_nest_end(skb, opts); nla_put_failure: return -1; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 55786283a3d..063b726bf1f 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -365,12 +365,7 @@ static void *fq_codel_zalloc(size_t sz) static void fq_codel_free(void *addr) { - if (addr) { - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); - } + kvfree(addr); } static void fq_codel_destroy(struct Qdisc *sch) @@ -390,7 +385,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) sch->limit = 10*1024; q->flows_cnt = 1024; q->quantum = psched_mtu(qdisc_dev(sch)); - q->perturbation = net_random(); + q->perturbation = prandom_u32(); INIT_LIST_HEAD(&q->new_flows); INIT_LIST_HEAD(&q->old_flows); codel_params_init(&q->cparams); @@ -450,8 +445,7 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) q->flows_cnt)) goto nla_put_failure; - nla_nest_end(skb, opts); - return skb->len; + return nla_nest_end(skb, opts); nla_put_failure: return -1; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 922a09406ba..e1543b03e39 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -126,7 +126,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_stopped(txq)) - ret = dev_hard_start_xmit(skb, dev, txq, NULL); + ret = dev_hard_start_xmit(skb, dev, txq); HARD_TX_UNLOCK(dev, txq); @@ -310,6 +310,7 @@ void netif_carrier_on(struct net_device *dev) if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) { if (dev->reg_state == NETREG_UNINITIALIZED) return; + atomic_inc(&dev->carrier_changes); linkwatch_fire_event(dev); if 
(netif_running(dev)) __netdev_watchdog_up(dev); @@ -328,6 +329,7 @@ void netif_carrier_off(struct net_device *dev) if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) { if (dev->reg_state == NETREG_UNINITIALIZED) return; + atomic_inc(&dev->carrier_changes); linkwatch_fire_event(dev); } } @@ -338,13 +340,13 @@ EXPORT_SYMBOL(netif_carrier_off); cheaper. */ -static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc) +static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc) { kfree_skb(skb); return NET_XMIT_CN; } -static struct sk_buff *noop_dequeue(struct Qdisc * qdisc) +static struct sk_buff *noop_dequeue(struct Qdisc *qdisc) { return NULL; } @@ -718,8 +720,8 @@ static void attach_default_qdiscs(struct net_device *dev) } else { qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT); if (qdisc) { - qdisc->ops->attach(qdisc); dev->qdisc = qdisc; + qdisc->ops->attach(qdisc); } } } diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index d42234c0f13..12cbc09157f 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -370,8 +370,8 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) for (i = table->DPs; i < MAX_DPs; i++) { if (table->tab[i]) { - pr_warning("GRED: Warning: Destroying " - "shadowed VQ 0x%x\n", i); + pr_warn("GRED: Warning: Destroying shadowed VQ 0x%x\n", + i); gred_destroy_vq(table->tab[i]); table->tab[i] = NULL; } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index c4075610502..ec8aeaac1dd 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1353,8 +1353,7 @@ hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, goto nla_put_failure; if (hfsc_dump_curves(skb, cl) < 0) goto nla_put_failure; - nla_nest_end(skb, nest); - return skb->len; + return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c new file mode 100644 index 00000000000..d85b6812a7d --- /dev/null +++ b/net/sched/sch_hhf.c @@ -0,0 +1,740 @@ +/* net/sched/sch_hhf.c Heavy-Hitter Filter (HHF) + * + * Copyright (C) 2013 Terry Lam <vtlam@google.com> + * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com> + */ + +#include <linux/jhash.h> +#include <linux/jiffies.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/vmalloc.h> +#include <net/flow_keys.h> +#include <net/pkt_sched.h> +#include <net/sock.h> + +/* Heavy-Hitter Filter (HHF) + * + * Principles : + * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter + * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified + * as heavy-hitter, it is immediately switched to the heavy-hitter bucket. + * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler, + * in which the heavy-hitter bucket is served with less weight. + * In other words, non-heavy-hitters (e.g., short bursts of critical traffic) + * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have + * higher share of bandwidth. + * + * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the + * following paper: + * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and + * Accounting", in ACM SIGCOMM, 2002. + * + * Conceptually, a multi-stage filter comprises k independent hash functions + * and k counter arrays. Packets are indexed into k counter arrays by k hash + * functions, respectively. The counters are then increased by the packet sizes. 
+ * Therefore, + * - For a heavy-hitter flow: *all* of its k array counters must be large. + * - For a non-heavy-hitter flow: some of its k array counters can be large + * due to hash collision with other small flows; however, with high + * probability, not *all* k counters are large. + * + * By the design of the multi-stage filter algorithm, the false negative rate + * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is + * susceptible to false positives (non-heavy-hitters mistakenly classified as + * heavy-hitters). + * Therefore, we also implement the following optimizations to reduce false + * positives by avoiding unnecessary increment of the counter values: + * - Optimization O1: once a heavy-hitter is identified, its bytes are not + * accounted in the array counters. This technique is called "shielding" + * in Section 3.3.1 of [EV02]. + * - Optimization O2: conservative update of counters + * (Section 3.3.2 of [EV02]), + * New counter value = max {old counter value, + * smallest counter value + packet bytes} + * + * Finally, we refresh the counters periodically since otherwise the counter + * values will keep accumulating. + * + * Once a flow is classified as heavy-hitter, we also save its per-flow state + * in an exact-matching flow table so that its subsequent packets can be + * dispatched to the heavy-hitter bucket accordingly. + * + * + * At a high level, this qdisc works as follows: + * Given a packet p: + * - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching + * heavy-hitter flow table, denoted table T, then send p to the heavy-hitter + * bucket. + * - Otherwise, forward p to the multi-stage filter, denoted filter F + * + If F decides that p belongs to a non-heavy-hitter flow, then send p + * to the non-heavy-hitter bucket. + * + Otherwise, if F decides that p belongs to a new heavy-hitter flow, + * then set up a new flow entry for the flow-id of p in the table T and + * send p to the heavy-hitter bucket. + * + * In this implementation: + * - T is a fixed-size hash-table with 1024 entries. Hash collision is + * resolved by linked-list chaining. + * - F has four counter arrays, each array containing 1024 32-bit counters. + * That means 4 * 1024 * 32 bits = 16KB of memory. + * - Since each array in F contains 1024 counters, 10 bits are sufficient to + * index into each array. + * Hence, instead of having four hash functions, we chop the 32-bit + * skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is + * computed as XOR sum of those three chunks. + * - We need to clear the counter arrays periodically; however, directly + * memsetting 16KB of memory can lead to cache eviction and unwanted delay. + * So by representing each counter by a valid bit, we only need to reset + * 4K of 1 bit (i.e. 512 bytes) instead of 16KB of memory. + * - The Deficit Round Robin engine is taken from fq_codel implementation + * (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to + * fq_codel_flow in fq_codel implementation. 
+ * + */ + +/* Non-configurable parameters */ +#define HH_FLOWS_CNT 1024 /* number of entries in exact-matching table T */ +#define HHF_ARRAYS_CNT 4 /* number of arrays in multi-stage filter F */ +#define HHF_ARRAYS_LEN 1024 /* number of counters in each array of F */ +#define HHF_BIT_MASK_LEN 10 /* masking 10 bits */ +#define HHF_BIT_MASK 0x3FF /* bitmask of 10 bits */ + +#define WDRR_BUCKET_CNT 2 /* two buckets for Weighted DRR */ +enum wdrr_bucket_idx { + WDRR_BUCKET_FOR_HH = 0, /* bucket id for heavy-hitters */ + WDRR_BUCKET_FOR_NON_HH = 1 /* bucket id for non-heavy-hitters */ +}; + +#define hhf_time_before(a, b) \ + (typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0)) + +/* Heavy-hitter per-flow state */ +struct hh_flow_state { + u32 hash_id; /* hash of flow-id (e.g. TCP 5-tuple) */ + u32 hit_timestamp; /* last time heavy-hitter was seen */ + struct list_head flowchain; /* chaining under hash collision */ +}; + +/* Weighted Deficit Round Robin (WDRR) scheduler */ +struct wdrr_bucket { + struct sk_buff *head; + struct sk_buff *tail; + struct list_head bucketchain; + int deficit; +}; + +struct hhf_sched_data { + struct wdrr_bucket buckets[WDRR_BUCKET_CNT]; + u32 perturbation; /* hash perturbation */ + u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ + u32 drop_overlimit; /* number of times max qdisc packet + * limit was hit + */ + struct list_head *hh_flows; /* table T (currently active HHs) */ + u32 hh_flows_limit; /* max active HH allocs */ + u32 hh_flows_overlimit; /* num of disallowed HH allocs */ + u32 hh_flows_total_cnt; /* total admitted HHs */ + u32 hh_flows_current_cnt; /* total current HHs */ + u32 *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */ + u32 hhf_arrays_reset_timestamp; /* last time hhf_arrays + * was reset + */ + unsigned long *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits + * of hhf_arrays + */ + /* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */ + struct list_head new_buckets; /* list of new buckets */ + struct list_head old_buckets; /* list of old buckets */ + + /* Configurable HHF parameters */ + u32 hhf_reset_timeout; /* interval to reset counter + * arrays in filter F + * (default 40ms) + */ + u32 hhf_admit_bytes; /* counter thresh to classify as + * HH (default 128KB). + * With these default values, + * 128KB / 40ms = 25 Mbps + * i.e., we expect to capture HHs + * sending > 25 Mbps. + */ + u32 hhf_evict_timeout; /* aging threshold to evict idle + * HHs out of table T. This should + * be large enough to avoid + * reordering during HH eviction. + * (default 1s) + */ + u32 hhf_non_hh_weight; /* WDRR weight for non-HHs + * (default 2, + * i.e., non-HH : HH = 2 : 1) + */ +}; + +static u32 hhf_time_stamp(void) +{ + return jiffies; +} + +static unsigned int skb_hash(const struct hhf_sched_data *q, + const struct sk_buff *skb) +{ + struct flow_keys keys; + unsigned int hash; + + if (skb->sk && skb->sk->sk_hash) + return skb->sk->sk_hash; + + skb_flow_dissect(skb, &keys); + hash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src ^ keys.ip_proto, + (__force u32)keys.ports, q->perturbation); + return hash; +} + +/* Looks up a heavy-hitter flow in a chaining list of table T. 
*/ +static struct hh_flow_state *seek_list(const u32 hash, + struct list_head *head, + struct hhf_sched_data *q) +{ + struct hh_flow_state *flow, *next; + u32 now = hhf_time_stamp(); + + if (list_empty(head)) + return NULL; + + list_for_each_entry_safe(flow, next, head, flowchain) { + u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; + + if (hhf_time_before(prev, now)) { + /* Delete expired heavy-hitters, but preserve one entry + * to avoid kzalloc() when next time this slot is hit. + */ + if (list_is_last(&flow->flowchain, head)) + return NULL; + list_del(&flow->flowchain); + kfree(flow); + q->hh_flows_current_cnt--; + } else if (flow->hash_id == hash) { + return flow; + } + } + return NULL; +} + +/* Returns a flow state entry for a new heavy-hitter. Either reuses an expired + * entry or dynamically alloc a new entry. + */ +static struct hh_flow_state *alloc_new_hh(struct list_head *head, + struct hhf_sched_data *q) +{ + struct hh_flow_state *flow; + u32 now = hhf_time_stamp(); + + if (!list_empty(head)) { + /* Find an expired heavy-hitter flow entry. */ + list_for_each_entry(flow, head, flowchain) { + u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; + + if (hhf_time_before(prev, now)) + return flow; + } + } + + if (q->hh_flows_current_cnt >= q->hh_flows_limit) { + q->hh_flows_overlimit++; + return NULL; + } + /* Create new entry. */ + flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC); + if (!flow) + return NULL; + + q->hh_flows_current_cnt++; + INIT_LIST_HEAD(&flow->flowchain); + list_add_tail(&flow->flowchain, head); + + return flow; +} + +/* Assigns packets to WDRR buckets. Implements a multi-stage filter to + * classify heavy-hitters. + */ +static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + u32 tmp_hash, hash; + u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos; + struct hh_flow_state *flow; + u32 pkt_len, min_hhf_val; + int i; + u32 prev; + u32 now = hhf_time_stamp(); + + /* Reset the HHF counter arrays if this is the right time. */ + prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout; + if (hhf_time_before(prev, now)) { + for (i = 0; i < HHF_ARRAYS_CNT; i++) + bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN); + q->hhf_arrays_reset_timestamp = now; + } + + /* Get hashed flow-id of the skb. */ + hash = skb_hash(q, skb); + + /* Check if this packet belongs to an already established HH flow. */ + flow_pos = hash & HHF_BIT_MASK; + flow = seek_list(hash, &q->hh_flows[flow_pos], q); + if (flow) { /* found its HH flow */ + flow->hit_timestamp = now; + return WDRR_BUCKET_FOR_HH; + } + + /* Now pass the packet through the multi-stage filter. */ + tmp_hash = hash; + xorsum = 0; + for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) { + /* Split the skb_hash into three 10-bit chunks. */ + filter_pos[i] = tmp_hash & HHF_BIT_MASK; + xorsum ^= filter_pos[i]; + tmp_hash >>= HHF_BIT_MASK_LEN; + } + /* The last chunk is computed as XOR sum of other chunks. */ + filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash; + + pkt_len = qdisc_pkt_len(skb); + min_hhf_val = ~0U; + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + u32 val; + + if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) { + q->hhf_arrays[i][filter_pos[i]] = 0; + __set_bit(filter_pos[i], q->hhf_valid_bits[i]); + } + + val = q->hhf_arrays[i][filter_pos[i]] + pkt_len; + if (min_hhf_val > val) + min_hhf_val = val; + } + + /* Found a new HH iff all counter values > HH admit threshold. 
*/ + if (min_hhf_val > q->hhf_admit_bytes) { + /* Just captured a new heavy-hitter. */ + flow = alloc_new_hh(&q->hh_flows[flow_pos], q); + if (!flow) /* memory alloc problem */ + return WDRR_BUCKET_FOR_NON_HH; + flow->hash_id = hash; + flow->hit_timestamp = now; + q->hh_flows_total_cnt++; + + /* By returning without updating counters in q->hhf_arrays, + * we implicitly implement "shielding" (see Optimization O1). + */ + return WDRR_BUCKET_FOR_HH; + } + + /* Conservative update of HHF arrays (see Optimization O2). */ + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val) + q->hhf_arrays[i][filter_pos[i]] = min_hhf_val; + } + return WDRR_BUCKET_FOR_NON_HH; +} + +/* Removes one skb from head of bucket. */ +static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket) +{ + struct sk_buff *skb = bucket->head; + + bucket->head = skb->next; + skb->next = NULL; + return skb; +} + +/* Tail-adds skb to bucket. */ +static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb) +{ + if (bucket->head == NULL) + bucket->head = skb; + else + bucket->tail->next = skb; + bucket->tail = skb; + skb->next = NULL; +} + +static unsigned int hhf_drop(struct Qdisc *sch) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct wdrr_bucket *bucket; + + /* Always try to drop from heavy-hitters first. */ + bucket = &q->buckets[WDRR_BUCKET_FOR_HH]; + if (!bucket->head) + bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH]; + + if (bucket->head) { + struct sk_buff *skb = dequeue_head(bucket); + + sch->q.qlen--; + sch->qstats.drops++; + sch->qstats.backlog -= qdisc_pkt_len(skb); + kfree_skb(skb); + } + + /* Return id of the bucket from which the packet was dropped. */ + return bucket - q->buckets; +} + +static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + enum wdrr_bucket_idx idx; + struct wdrr_bucket *bucket; + + idx = hhf_classify(skb, sch); + + bucket = &q->buckets[idx]; + bucket_add(bucket, skb); + sch->qstats.backlog += qdisc_pkt_len(skb); + + if (list_empty(&bucket->bucketchain)) { + unsigned int weight; + + /* The logic of new_buckets vs. old_buckets is the same as + * new_flows vs. old_flows in the implementation of fq_codel, + * i.e., short bursts of non-HHs should have strict priority. + */ + if (idx == WDRR_BUCKET_FOR_HH) { + /* Always move heavy-hitters to old bucket. */ + weight = 1; + list_add_tail(&bucket->bucketchain, &q->old_buckets); + } else { + weight = q->hhf_non_hh_weight; + list_add_tail(&bucket->bucketchain, &q->new_buckets); + } + bucket->deficit = weight * q->quantum; + } + if (++sch->q.qlen <= sch->limit) + return NET_XMIT_SUCCESS; + + q->drop_overlimit++; + /* Return Congestion Notification only if we dropped a packet from this + * bucket. + */ + if (hhf_drop(sch) == idx) + return NET_XMIT_CN; + + /* As we dropped a packet, better let upper stack know this. */ + qdisc_tree_decrease_qlen(sch, 1); + return NET_XMIT_SUCCESS; +} + +static struct sk_buff *hhf_dequeue(struct Qdisc *sch) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = NULL; + struct wdrr_bucket *bucket; + struct list_head *head; + +begin: + head = &q->new_buckets; + if (list_empty(head)) { + head = &q->old_buckets; + if (list_empty(head)) + return NULL; + } + bucket = list_first_entry(head, struct wdrr_bucket, bucketchain); + + if (bucket->deficit <= 0) { + int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ? 
+ 1 : q->hhf_non_hh_weight; + + bucket->deficit += weight * q->quantum; + list_move_tail(&bucket->bucketchain, &q->old_buckets); + goto begin; + } + + if (bucket->head) { + skb = dequeue_head(bucket); + sch->q.qlen--; + sch->qstats.backlog -= qdisc_pkt_len(skb); + } + + if (!skb) { + /* Force a pass through old_buckets to prevent starvation. */ + if ((head == &q->new_buckets) && !list_empty(&q->old_buckets)) + list_move_tail(&bucket->bucketchain, &q->old_buckets); + else + list_del_init(&bucket->bucketchain); + goto begin; + } + qdisc_bstats_update(sch, skb); + bucket->deficit -= qdisc_pkt_len(skb); + + return skb; +} + +static void hhf_reset(struct Qdisc *sch) +{ + struct sk_buff *skb; + + while ((skb = hhf_dequeue(sch)) != NULL) + kfree_skb(skb); +} + +static void *hhf_zalloc(size_t sz) +{ + void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); + + if (!ptr) + ptr = vzalloc(sz); + + return ptr; +} + +static void hhf_free(void *addr) +{ + kvfree(addr); +} + +static void hhf_destroy(struct Qdisc *sch) +{ + int i; + struct hhf_sched_data *q = qdisc_priv(sch); + + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + hhf_free(q->hhf_arrays[i]); + hhf_free(q->hhf_valid_bits[i]); + } + + for (i = 0; i < HH_FLOWS_CNT; i++) { + struct hh_flow_state *flow, *next; + struct list_head *head = &q->hh_flows[i]; + + if (list_empty(head)) + continue; + list_for_each_entry_safe(flow, next, head, flowchain) { + list_del(&flow->flowchain); + kfree(flow); + } + } + hhf_free(q->hh_flows); +} + +static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { + [TCA_HHF_BACKLOG_LIMIT] = { .type = NLA_U32 }, + [TCA_HHF_QUANTUM] = { .type = NLA_U32 }, + [TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 }, + [TCA_HHF_RESET_TIMEOUT] = { .type = NLA_U32 }, + [TCA_HHF_ADMIT_BYTES] = { .type = NLA_U32 }, + [TCA_HHF_EVICT_TIMEOUT] = { .type = NLA_U32 }, + [TCA_HHF_NON_HH_WEIGHT] = { .type = NLA_U32 }, +}; + +static int hhf_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_HHF_MAX + 1]; + unsigned int qlen; + int err; + u64 non_hh_quantum; + u32 new_quantum = q->quantum; + u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy); + if (err < 0) + return err; + + if (tb[TCA_HHF_QUANTUM]) + new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]); + + if (tb[TCA_HHF_NON_HH_WEIGHT]) + new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]); + + non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight; + if (non_hh_quantum > INT_MAX) + return -EINVAL; + + sch_tree_lock(sch); + + if (tb[TCA_HHF_BACKLOG_LIMIT]) + sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]); + + q->quantum = new_quantum; + q->hhf_non_hh_weight = new_hhf_non_hh_weight; + + if (tb[TCA_HHF_HH_FLOWS_LIMIT]) + q->hh_flows_limit = nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]); + + if (tb[TCA_HHF_RESET_TIMEOUT]) { + u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]); + + q->hhf_reset_timeout = usecs_to_jiffies(us); + } + + if (tb[TCA_HHF_ADMIT_BYTES]) + q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]); + + if (tb[TCA_HHF_EVICT_TIMEOUT]) { + u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]); + + q->hhf_evict_timeout = usecs_to_jiffies(us); + } + + qlen = sch->q.qlen; + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = hhf_dequeue(sch); + + kfree_skb(skb); + } + qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + + sch_tree_unlock(sch); + return 0; +} + +static int hhf_init(struct Qdisc *sch, struct nlattr *opt) +{ + 
struct hhf_sched_data *q = qdisc_priv(sch); + int i; + + sch->limit = 1000; + q->quantum = psched_mtu(qdisc_dev(sch)); + q->perturbation = prandom_u32(); + INIT_LIST_HEAD(&q->new_buckets); + INIT_LIST_HEAD(&q->old_buckets); + + /* Configurable HHF parameters */ + q->hhf_reset_timeout = HZ / 25; /* 40 ms */ + q->hhf_admit_bytes = 131072; /* 128 KB */ + q->hhf_evict_timeout = HZ; /* 1 sec */ + q->hhf_non_hh_weight = 2; + + if (opt) { + int err = hhf_change(sch, opt); + + if (err) + return err; + } + + if (!q->hh_flows) { + /* Initialize heavy-hitter flow table. */ + q->hh_flows = hhf_zalloc(HH_FLOWS_CNT * + sizeof(struct list_head)); + if (!q->hh_flows) + return -ENOMEM; + for (i = 0; i < HH_FLOWS_CNT; i++) + INIT_LIST_HEAD(&q->hh_flows[i]); + + /* Cap max active HHs at twice len of hh_flows table. */ + q->hh_flows_limit = 2 * HH_FLOWS_CNT; + q->hh_flows_overlimit = 0; + q->hh_flows_total_cnt = 0; + q->hh_flows_current_cnt = 0; + + /* Initialize heavy-hitter filter arrays. */ + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN * + sizeof(u32)); + if (!q->hhf_arrays[i]) { + hhf_destroy(sch); + return -ENOMEM; + } + } + q->hhf_arrays_reset_timestamp = hhf_time_stamp(); + + /* Initialize valid bits of heavy-hitter filter arrays. */ + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN / + BITS_PER_BYTE); + if (!q->hhf_valid_bits[i]) { + hhf_destroy(sch); + return -ENOMEM; + } + } + + /* Initialize Weighted DRR buckets. */ + for (i = 0; i < WDRR_BUCKET_CNT; i++) { + struct wdrr_bucket *bucket = q->buckets + i; + + INIT_LIST_HEAD(&bucket->bucketchain); + } + } + + return 0; +} + +static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) || + nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) || + nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) || + nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT, + jiffies_to_usecs(q->hhf_reset_timeout)) || + nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) || + nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT, + jiffies_to_usecs(q->hhf_evict_timeout)) || + nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + return -1; +} + +static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct tc_hhf_xstats st = { + .drop_overlimit = q->drop_overlimit, + .hh_overlimit = q->hh_flows_overlimit, + .hh_tot_count = q->hh_flows_total_cnt, + .hh_cur_count = q->hh_flows_current_cnt, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct Qdisc_ops hhf_qdisc_ops __read_mostly = { + .id = "hhf", + .priv_size = sizeof(struct hhf_sched_data), + + .enqueue = hhf_enqueue, + .dequeue = hhf_dequeue, + .peek = qdisc_peek_dequeued, + .drop = hhf_drop, + .init = hhf_init, + .reset = hhf_reset, + .destroy = hhf_destroy, + .change = hhf_change, + .dump = hhf_dump, + .dump_stats = hhf_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init hhf_module_init(void) +{ + return register_qdisc(&hhf_qdisc_ops); +} + +static void __exit hhf_module_exit(void) +{ + unregister_qdisc(&hhf_qdisc_ops); +} + +module_init(hhf_module_init) +module_exit(hhf_module_exit) +MODULE_AUTHOR("Terry Lam"); +MODULE_AUTHOR("Nandita Dukkipati"); 
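The multi-stage filter indexing described in the sch_hhf.c header comment, one 32-bit flow hash chopped into three 10-bit array positions plus a fourth position derived by XOR, can be restated on its own outside the kernel as the sketch below; the constant names mirror the ones defined above.

#include <stdint.h>

#define HHF_ARRAYS_CNT		4
#define HHF_ARRAYS_LEN		1024
#define HHF_BIT_MASK_LEN	10
#define HHF_BIT_MASK		0x3FF

/* Derive the four counter-array indices used by hhf_classify() from a
 * single 32-bit hash.  Each index is below HHF_ARRAYS_LEN: the first three
 * are 10-bit slices of the hash, and the last is their XOR folded with the
 * remaining top two bits.
 */
static void hhf_filter_positions(uint32_t hash,
				 uint32_t pos[HHF_ARRAYS_CNT])
{
	uint32_t tmp = hash, xorsum = 0;
	int i;

	for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) {
		pos[i] = tmp & HHF_BIT_MASK;
		xorsum ^= pos[i];
		tmp >>= HHF_BIT_MASK_LEN;
	}
	pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp;
}

Each array counter at pos[i] is then increased by the packet length, and a flow is only admitted as a heavy-hitter once the minimum of the four counters crosses hhf_admit_bytes (128 KB by default, i.e. roughly 25 Mbps over the 40 ms reset interval).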
+MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 0e1e38b4002..9f949abcace 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -219,11 +219,16 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, if (skb->priority == sch->handle) return HTB_DIRECT; /* X:0 (direct flow) selected */ cl = htb_find(skb->priority, sch); - if (cl && cl->level == 0) - return cl; + if (cl) { + if (cl->level == 0) + return cl; + /* Start with inner filter chain if a non-leaf class is selected */ + tcf = cl->filter_list; + } else { + tcf = q->filter_list; + } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - tcf = q->filter_list; while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -712,7 +717,7 @@ static s64 htb_do_events(struct htb_sched *q, const int level, /* too much load - let's continue after a break for scheduling */ if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) { - pr_warning("htb: too many events!\n"); + pr_warn("htb: too many events!\n"); q->warned |= HTB_WARN_TOOMANYEVENTS; } @@ -1057,12 +1062,13 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) { - spinlock_t *root_lock = qdisc_root_sleeping_lock(sch); struct htb_sched *q = qdisc_priv(sch); struct nlattr *nest; struct tc_htb_glob gopt; - spin_lock_bh(root_lock); + /* Its safe to not acquire qdisc lock. As we hold RTNL, + * no change can happen on the qdisc parameters. + */ gopt.direct_pkts = q->direct_pkts; gopt.version = HTB_VER; @@ -1076,13 +1082,10 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) || nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen)) goto nla_put_failure; - nla_nest_end(skb, nest); - spin_unlock_bh(root_lock); - return skb->len; + return nla_nest_end(skb, nest); nla_put_failure: - spin_unlock_bh(root_lock); nla_nest_cancel(skb, nest); return -1; } @@ -1091,11 +1094,12 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, struct tcmsg *tcm) { struct htb_class *cl = (struct htb_class *)arg; - spinlock_t *root_lock = qdisc_root_sleeping_lock(sch); struct nlattr *nest; struct tc_htb_opt opt; - spin_lock_bh(root_lock); + /* Its safe to not acquire qdisc lock. As we hold RTNL, + * no change can happen on the class parameters. + */ tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT; tcm->tcm_handle = cl->common.classid; if (!cl->level && cl->un.leaf.q) @@ -1123,12 +1127,9 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, nla_put_u64(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps)) goto nla_put_failure; - nla_nest_end(skb, nest); - spin_unlock_bh(root_lock); - return skb->len; + return nla_nest_end(skb, nest); nla_put_failure: - spin_unlock_bh(root_lock); nla_nest_cancel(skb, nest); return -1; } @@ -1276,9 +1277,10 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) struct Qdisc *new_q = NULL; int last_child = 0; - // TODO: why don't allow to delete subtree ? references ? does - // tc subsys quarantee us that in htb_destroy it holds no class - // refs so that we can remove children safely there ? + /* TODO: why don't allow to delete subtree ? references ? does + * tc subsys guarantee us that in htb_destroy it holds no class + * refs so that we can remove children safely there ? 
+ */ if (cl->children || cl->filter_cnt) return -EBUSY; @@ -1337,7 +1339,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)*arg, *parent; struct nlattr *opt = tca[TCA_OPTIONS]; - struct qdisc_rate_table *rtab = NULL, *ctab = NULL; struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_opt *hopt; u64 rate64, ceil64; @@ -1361,16 +1362,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, goto failure; /* Keeping backward compatible with rate_table based iproute2 tc */ - if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE) { - rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]); - if (rtab) - qdisc_put_rtab(rtab); - } - if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE) { - ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]); - if (ctab) - qdisc_put_rtab(ctab); - } + if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE) + qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB])); + + if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE) + qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB])); if (!cl) { /* new class */ struct Qdisc *new_q; @@ -1477,21 +1473,30 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, sch_tree_lock(sch); } + rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0; + + ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0; + + psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64); + psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64); + /* it used to be a nasty bug here, we have to check that node * is really leaf before changing cl->un.leaf ! */ if (!cl->level) { - cl->quantum = hopt->rate.rate / q->rate2quantum; + u64 quantum = cl->rate.rate_bytes_ps; + + do_div(quantum, q->rate2quantum); + cl->quantum = min_t(u64, quantum, INT_MAX); + if (!hopt->quantum && cl->quantum < 1000) { - pr_warning( - "HTB: quantum of class %X is small. Consider r2q change.\n", - cl->common.classid); + pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n", + cl->common.classid); cl->quantum = 1000; } if (!hopt->quantum && cl->quantum > 200000) { - pr_warning( - "HTB: quantum of class %X is big. Consider r2q change.\n", - cl->common.classid); + pr_warn("HTB: quantum of class %X is big. Consider r2q change.\n", + cl->common.classid); cl->quantum = 200000; } if (hopt->quantum) @@ -1500,13 +1505,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->prio = TC_HTB_NUMPRIO - 1; } - rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0; - - ceil64 = tb[TCA_HTB_CEIL64] ? 
nla_get_u64(tb[TCA_HTB_CEIL64]) : 0; - - psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64); - psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64); - cl->buffer = PSCHED_TICKS2NS(hopt->buffer); cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer); diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index bce1665239b..62871c14e1f 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -100,8 +100,7 @@ static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; - nla_nest_end(skb, nest); - return skb->len; + return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index bccd52b36e9..111d70fddae 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -88,10 +88,10 @@ struct netem_sched_data { u32 duplicate; u32 reorder; u32 corrupt; - u32 rate; + u64 rate; s32 packet_overhead; u32 cell_size; - u32 cell_size_reciprocal; + struct reciprocal_value cell_size_reciprocal; s32 cell_overhead; struct crndstate { @@ -110,6 +110,18 @@ struct netem_sched_data { CLG_GILB_ELL, } loss_model; + enum { + TX_IN_GAP_PERIOD = 1, + TX_IN_BURST_PERIOD, + LOST_IN_GAP_PERIOD, + LOST_IN_BURST_PERIOD, + } _4_state_model; + + enum { + GOOD_STATE = 1, + BAD_STATE, + } GE_state_model; + /* Correlated Loss Generation models */ struct clgstate { /* state of the Markov chain */ @@ -169,7 +181,7 @@ static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) static void init_crandom(struct crndstate *state, unsigned long rho) { state->rho = rho; - state->last = net_random(); + state->last = prandom_u32(); } /* get_crandom - correlated random number generator @@ -182,9 +194,9 @@ static u32 get_crandom(struct crndstate *state) unsigned long answer; if (state->rho == 0) /* no correlation */ - return net_random(); + return prandom_u32(); - value = net_random(); + value = prandom_u32(); rho = (u64)state->rho + 1; answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32; state->last = answer; @@ -198,50 +210,52 @@ static u32 get_crandom(struct crndstate *state) static bool loss_4state(struct netem_sched_data *q) { struct clgstate *clg = &q->clg; - u32 rnd = net_random(); + u32 rnd = prandom_u32(); /* * Makes a comparison between rnd and the transition * probabilities outgoing from the current state, then decides the * next state and if the next packet has to be transmitted or lost. 
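+	 * (Illustration: each of the clg->a1..a5 thresholds is a probability
+	 * scaled to the full u32 range, so the "rnd < threshold" comparisons
+	 * below act as Bernoulli trials; a 1% probability corresponds to
+	 * roughly 0.01 * 2^32 ~= 42949673.)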
* The four states correspond to: - * 1 => successfully transmitted packets within a gap period - * 4 => isolated losses within a gap period - * 3 => lost packets within a burst period - * 2 => successfully transmitted packets within a burst period + * TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period + * LOST_IN_BURST_PERIOD => isolated losses within a gap period + * LOST_IN_GAP_PERIOD => lost packets within a burst period + * TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period */ switch (clg->state) { - case 1: + case TX_IN_GAP_PERIOD: if (rnd < clg->a4) { - clg->state = 4; + clg->state = LOST_IN_BURST_PERIOD; return true; } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) { - clg->state = 3; + clg->state = LOST_IN_GAP_PERIOD; return true; - } else if (clg->a1 + clg->a4 < rnd) - clg->state = 1; + } else if (clg->a1 + clg->a4 < rnd) { + clg->state = TX_IN_GAP_PERIOD; + } break; - case 2: + case TX_IN_BURST_PERIOD: if (rnd < clg->a5) { - clg->state = 3; + clg->state = LOST_IN_GAP_PERIOD; return true; - } else - clg->state = 2; + } else { + clg->state = TX_IN_BURST_PERIOD; + } break; - case 3: + case LOST_IN_GAP_PERIOD: if (rnd < clg->a3) - clg->state = 2; + clg->state = TX_IN_BURST_PERIOD; else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) { - clg->state = 1; + clg->state = TX_IN_GAP_PERIOD; } else if (clg->a2 + clg->a3 < rnd) { - clg->state = 3; + clg->state = LOST_IN_GAP_PERIOD; return true; } break; - case 4: - clg->state = 1; + case LOST_IN_BURST_PERIOD: + clg->state = TX_IN_GAP_PERIOD; break; } @@ -263,16 +277,16 @@ static bool loss_gilb_ell(struct netem_sched_data *q) struct clgstate *clg = &q->clg; switch (clg->state) { - case 1: - if (net_random() < clg->a1) - clg->state = 2; - if (net_random() < clg->a4) + case GOOD_STATE: + if (prandom_u32() < clg->a1) + clg->state = BAD_STATE; + if (prandom_u32() < clg->a4) return true; break; - case 2: - if (net_random() < clg->a2) - clg->state = 1; - if (net_random() > clg->a3) + case BAD_STATE: + if (prandom_u32() < clg->a2) + clg->state = GOOD_STATE; + if (prandom_u32() > clg->a3) return true; } @@ -457,7 +471,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) skb_checksum_help(skb))) return qdisc_drop(skb, sch); - skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8); + skb->data[prandom_u32() % skb_headlen(skb)] ^= + 1<<(prandom_u32() % 8); } if (unlikely(skb_queue_len(&sch->q) >= sch->limit)) @@ -495,7 +510,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) now = netem_skb_cb(last)->time_to_send; } - delay += packet_len_2_sched_time(skb->len, q); + delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q); } cb->time_to_send = now + delay; @@ -633,12 +648,7 @@ static void netem_reset(struct Qdisc *sch) static void dist_free(struct disttable *d) { - if (d) { - if (is_vmalloc_addr(d)) - vfree(d); - else - kfree(d); - } + kvfree(d); } /* @@ -679,9 +689,8 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) return 0; } -static void get_correlation(struct Qdisc *sch, const struct nlattr *attr) +static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); const struct tc_netem_corr *c = nla_data(attr); init_crandom(&q->delay_cor, c->delay_corr); @@ -689,47 +698,45 @@ static void get_correlation(struct Qdisc *sch, const struct nlattr *attr) init_crandom(&q->dup_cor, c->dup_corr); } -static void get_reorder(struct Qdisc *sch, const struct nlattr *attr) 
+static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); const struct tc_netem_reorder *r = nla_data(attr); q->reorder = r->probability; init_crandom(&q->reorder_cor, r->correlation); } -static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr) +static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); const struct tc_netem_corrupt *r = nla_data(attr); q->corrupt = r->probability; init_crandom(&q->corrupt_cor, r->correlation); } -static void get_rate(struct Qdisc *sch, const struct nlattr *attr) +static void get_rate(struct netem_sched_data *q, const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); const struct tc_netem_rate *r = nla_data(attr); q->rate = r->rate; q->packet_overhead = r->packet_overhead; q->cell_size = r->cell_size; + q->cell_overhead = r->cell_overhead; if (q->cell_size) q->cell_size_reciprocal = reciprocal_value(q->cell_size); - q->cell_overhead = r->cell_overhead; + else + q->cell_size_reciprocal = (struct reciprocal_value) { 0 }; } -static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr) +static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); const struct nlattr *la; int rem; nla_for_each_nested(la, attr, rem) { u16 type = nla_type(la); - switch(type) { + switch (type) { case NETEM_LOSS_GI: { const struct tc_netem_gimodel *gi = nla_data(la); @@ -740,7 +747,7 @@ static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr) q->loss_model = CLG_4_STATES; - q->clg.state = 1; + q->clg.state = TX_IN_GAP_PERIOD; q->clg.a1 = gi->p13; q->clg.a2 = gi->p31; q->clg.a3 = gi->p32; @@ -758,7 +765,7 @@ static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr) } q->loss_model = CLG_GILB_ELL; - q->clg.state = 1; + q->clg.state = GOOD_STATE; q->clg.a1 = ge->p; q->clg.a2 = ge->r; q->clg.a3 = ge->h; @@ -782,6 +789,7 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) }, [TCA_NETEM_LOSS] = { .type = NLA_NESTED }, [TCA_NETEM_ECN] = { .type = NLA_U32 }, + [TCA_NETEM_RATE64] = { .type = NLA_U64 }, }; static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, @@ -808,6 +816,8 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) struct netem_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_NETEM_MAX + 1]; struct tc_netem_qopt *qopt; + struct clgstate old_clg; + int old_loss_model = CLG_RANDOM; int ret; if (opt == NULL) @@ -818,6 +828,33 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) if (ret < 0) return ret; + /* backup q->clg and q->loss_model */ + old_clg = q->clg; + old_loss_model = q->loss_model; + + if (tb[TCA_NETEM_LOSS]) { + ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]); + if (ret) { + q->loss_model = old_loss_model; + return ret; + } + } else { + q->loss_model = CLG_RANDOM; + } + + if (tb[TCA_NETEM_DELAY_DIST]) { + ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]); + if (ret) { + /* recover clg and loss_model, in case of + * q->clg and q->loss_model were modified + * in get_loss_clg() + */ + q->clg = old_clg; + q->loss_model = old_loss_model; + return ret; + } + } + sch->limit = qopt->limit; q->latency = qopt->latency; @@ -835,30 +872,24 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) q->reorder = ~0; if (tb[TCA_NETEM_CORR]) - get_correlation(sch, 
tb[TCA_NETEM_CORR]); - - if (tb[TCA_NETEM_DELAY_DIST]) { - ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]); - if (ret) - return ret; - } + get_correlation(q, tb[TCA_NETEM_CORR]); if (tb[TCA_NETEM_REORDER]) - get_reorder(sch, tb[TCA_NETEM_REORDER]); + get_reorder(q, tb[TCA_NETEM_REORDER]); if (tb[TCA_NETEM_CORRUPT]) - get_corrupt(sch, tb[TCA_NETEM_CORRUPT]); + get_corrupt(q, tb[TCA_NETEM_CORRUPT]); if (tb[TCA_NETEM_RATE]) - get_rate(sch, tb[TCA_NETEM_RATE]); + get_rate(q, tb[TCA_NETEM_RATE]); + + if (tb[TCA_NETEM_RATE64]) + q->rate = max_t(u64, q->rate, + nla_get_u64(tb[TCA_NETEM_RATE64])); if (tb[TCA_NETEM_ECN]) q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]); - q->loss_model = CLG_RANDOM; - if (tb[TCA_NETEM_LOSS]) - ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]); - return ret; } @@ -974,7 +1005,13 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt)) goto nla_put_failure; - rate.rate = q->rate; + if (q->rate >= (1ULL << 32)) { + if (nla_put_u64(skb, TCA_NETEM_RATE64, q->rate)) + goto nla_put_failure; + rate.rate = ~0U; + } else { + rate.rate = q->rate; + } rate.packet_overhead = q->packet_overhead; rate.cell_size = q->cell_size; rate.cell_overhead = q->cell_overhead; diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c new file mode 100644 index 00000000000..fefeeb73f15 --- /dev/null +++ b/net/sched/sch_pie.c @@ -0,0 +1,566 @@ +/* Copyright (C) 2013 Cisco Systems, Inc, 2013. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Author: Vijay Subramanian <vijaynsu@cisco.com> + * Author: Mythili Prabhu <mysuryan@cisco.com> + * + * ECN support is added by Naeem Khademi <naeemk@ifi.uio.no> + * University of Oslo, Norway. + * + * References: + * IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00 + * IEEE Conference on High Performance Switching and Routing 2013 : + * "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem" + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <net/pkt_sched.h> +#include <net/inet_ecn.h> + +#define QUEUE_THRESHOLD 10000 +#define DQCOUNT_INVALID -1 +#define MAX_PROB 0xffffffff +#define PIE_SCALE 8 + +/* parameters used */ +struct pie_params { + psched_time_t target; /* user specified target delay in pschedtime */ + u32 tupdate; /* timer frequency (in jiffies) */ + u32 limit; /* number of packets that can be enqueued */ + u32 alpha; /* alpha and beta are between 0 and 32 */ + u32 beta; /* and are used for shift relative to 1 */ + bool ecn; /* true if ecn is enabled */ + bool bytemode; /* to scale drop early prob based on pkt size */ +}; + +/* variables used */ +struct pie_vars { + u32 prob; /* probability but scaled by u32 limit. 
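+			 * (MAX_PROB = 0xffffffff corresponds to a drop
+			 * probability of 1.0, so e.g. MAX_PROB / 100 is
+			 * roughly 1%.)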
*/ + psched_time_t burst_time; + psched_time_t qdelay; + psched_time_t qdelay_old; + u64 dq_count; /* measured in bytes */ + psched_time_t dq_tstamp; /* drain rate */ + u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */ + u32 qlen_old; /* in bytes */ +}; + +/* statistics gathering */ +struct pie_stats { + u32 packets_in; /* total number of packets enqueued */ + u32 dropped; /* packets dropped due to pie_action */ + u32 overlimit; /* dropped due to lack of space in queue */ + u32 maxq; /* maximum queue size */ + u32 ecn_mark; /* packets marked with ECN */ +}; + +/* private data for the Qdisc */ +struct pie_sched_data { + struct pie_params params; + struct pie_vars vars; + struct pie_stats stats; + struct timer_list adapt_timer; +}; + +static void pie_params_init(struct pie_params *params) +{ + params->alpha = 2; + params->beta = 20; + params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC); /* 30 ms */ + params->limit = 1000; /* default of 1000 packets */ + params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC); /* 20 ms */ + params->ecn = false; + params->bytemode = false; +} + +static void pie_vars_init(struct pie_vars *vars) +{ + vars->dq_count = DQCOUNT_INVALID; + vars->avg_dq_rate = 0; + /* default of 100 ms in pschedtime */ + vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC); +} + +static bool drop_early(struct Qdisc *sch, u32 packet_size) +{ + struct pie_sched_data *q = qdisc_priv(sch); + u32 rnd; + u32 local_prob = q->vars.prob; + u32 mtu = psched_mtu(qdisc_dev(sch)); + + /* If there is still burst allowance left skip random early drop */ + if (q->vars.burst_time > 0) + return false; + + /* If current delay is less than half of target, and + * if drop prob is low already, disable early_drop + */ + if ((q->vars.qdelay < q->params.target / 2) + && (q->vars.prob < MAX_PROB / 5)) + return false; + + /* If we have fewer than 2 mtu-sized packets, disable drop_early, + * similar to min_th in RED + */ + if (sch->qstats.backlog < 2 * mtu) + return false; + + /* If bytemode is turned on, use packet size to compute new + * probablity. Smaller packets will have lower drop prob in this case + */ + if (q->params.bytemode && packet_size <= mtu) + local_prob = (local_prob / mtu) * packet_size; + else + local_prob = q->vars.prob; + + rnd = prandom_u32(); + if (rnd < local_prob) + return true; + + return false; +} + +static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct pie_sched_data *q = qdisc_priv(sch); + bool enqueue = false; + + if (unlikely(qdisc_qlen(sch) >= sch->limit)) { + q->stats.overlimit++; + goto out; + } + + if (!drop_early(sch, skb->len)) { + enqueue = true; + } else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) && + INET_ECN_set_ce(skb)) { + /* If packet is ecn capable, mark it if drop probability + * is lower than 10%, else drop it. 
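+		 * (The 10% threshold is the MAX_PROB / 10 test in the
+		 * condition above.)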
+ */ + q->stats.ecn_mark++; + enqueue = true; + } + + /* we can enqueue the packet */ + if (enqueue) { + q->stats.packets_in++; + if (qdisc_qlen(sch) > q->stats.maxq) + q->stats.maxq = qdisc_qlen(sch); + + return qdisc_enqueue_tail(skb, sch); + } + +out: + q->stats.dropped++; + return qdisc_drop(skb, sch); +} + +static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { + [TCA_PIE_TARGET] = {.type = NLA_U32}, + [TCA_PIE_LIMIT] = {.type = NLA_U32}, + [TCA_PIE_TUPDATE] = {.type = NLA_U32}, + [TCA_PIE_ALPHA] = {.type = NLA_U32}, + [TCA_PIE_BETA] = {.type = NLA_U32}, + [TCA_PIE_ECN] = {.type = NLA_U32}, + [TCA_PIE_BYTEMODE] = {.type = NLA_U32}, +}; + +static int pie_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct pie_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_PIE_MAX + 1]; + unsigned int qlen; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_PIE_MAX, opt, pie_policy); + if (err < 0) + return err; + + sch_tree_lock(sch); + + /* convert from microseconds to pschedtime */ + if (tb[TCA_PIE_TARGET]) { + /* target is in us */ + u32 target = nla_get_u32(tb[TCA_PIE_TARGET]); + + /* convert to pschedtime */ + q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC); + } + + /* tupdate is in jiffies */ + if (tb[TCA_PIE_TUPDATE]) + q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE])); + + if (tb[TCA_PIE_LIMIT]) { + u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]); + + q->params.limit = limit; + sch->limit = limit; + } + + if (tb[TCA_PIE_ALPHA]) + q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]); + + if (tb[TCA_PIE_BETA]) + q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]); + + if (tb[TCA_PIE_ECN]) + q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]); + + if (tb[TCA_PIE_BYTEMODE]) + q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]); + + /* Drop excess packets if new limit is lower */ + qlen = sch->q.qlen; + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = __skb_dequeue(&sch->q); + + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_drop(skb, sch); + } + qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + + sch_tree_unlock(sch); + return 0; +} + +static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) +{ + + struct pie_sched_data *q = qdisc_priv(sch); + int qlen = sch->qstats.backlog; /* current queue size in bytes */ + + /* If current queue is about 10 packets or more and dq_count is unset + * we have enough packets to calculate the drain rate. Save + * current time as dq_tstamp and start measurement cycle. + */ + if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) { + q->vars.dq_tstamp = psched_get_time(); + q->vars.dq_count = 0; + } + + /* Calculate the average drain rate from this value. If queue length + * has receded to a small value viz., <= QUEUE_THRESHOLD bytes,reset + * the dq_count to -1 as we don't have enough packets to calculate the + * drain rate anymore The following if block is entered only when we + * have a substantial queue built up (QUEUE_THRESHOLD bytes or more) + * and we calculate the drain rate for the threshold here. dq_count is + * in bytes, time difference in psched_time, hence rate is in + * bytes/psched_time. 
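+	 * The update below is an EWMA with weight 1/8:
+	 * avg_dq_rate = avg_dq_rate - avg_dq_rate/8 + count/8, where
+	 * count = (dq_count << PIE_SCALE) / dtime.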
+ */ + if (q->vars.dq_count != DQCOUNT_INVALID) { + q->vars.dq_count += skb->len; + + if (q->vars.dq_count >= QUEUE_THRESHOLD) { + psched_time_t now = psched_get_time(); + u32 dtime = now - q->vars.dq_tstamp; + u32 count = q->vars.dq_count << PIE_SCALE; + + if (dtime == 0) + return; + + count = count / dtime; + + if (q->vars.avg_dq_rate == 0) + q->vars.avg_dq_rate = count; + else + q->vars.avg_dq_rate = + (q->vars.avg_dq_rate - + (q->vars.avg_dq_rate >> 3)) + (count >> 3); + + /* If the queue has receded below the threshold, we hold + * on to the last drain rate calculated, else we reset + * dq_count to 0 to re-enter the if block when the next + * packet is dequeued + */ + if (qlen < QUEUE_THRESHOLD) + q->vars.dq_count = DQCOUNT_INVALID; + else { + q->vars.dq_count = 0; + q->vars.dq_tstamp = psched_get_time(); + } + + if (q->vars.burst_time > 0) { + if (q->vars.burst_time > dtime) + q->vars.burst_time -= dtime; + else + q->vars.burst_time = 0; + } + } + } +} + +static void calculate_probability(struct Qdisc *sch) +{ + struct pie_sched_data *q = qdisc_priv(sch); + u32 qlen = sch->qstats.backlog; /* queue size in bytes */ + psched_time_t qdelay = 0; /* in pschedtime */ + psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */ + s32 delta = 0; /* determines the change in probability */ + u32 oldprob; + u32 alpha, beta; + bool update_prob = true; + + q->vars.qdelay_old = q->vars.qdelay; + + if (q->vars.avg_dq_rate > 0) + qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate; + else + qdelay = 0; + + /* If qdelay is zero and qlen is not, it means qlen is very small, less + * than dequeue_rate, so we do not update probabilty in this round + */ + if (qdelay == 0 && qlen != 0) + update_prob = false; + + /* In the algorithm, alpha and beta are between 0 and 2 with typical + * value for alpha as 0.125. In this implementation, we use values 0-32 + * passed from user space to represent this. Also, alpha and beta have + * unit of HZ and need to be scaled before they can used to update + * probability. alpha/beta are updated locally below by 1) scaling them + * appropriately 2) scaling down by 16 to come to 0-2 range. + * Please see paper for details. + * + * We scale alpha and beta differently depending on whether we are in + * light, medium or high dropping mode. 
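+	 * (Illustration: a user-space alpha of 2 corresponds to 2/16 = 0.125;
+	 * in the lightest mode below, prob < MAX_PROB / 100, the >>7 instead
+	 * of >>4 damps the adjustment by a further factor of 8.)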
+ */ + if (q->vars.prob < MAX_PROB / 100) { + alpha = + (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; + beta = + (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; + } else if (q->vars.prob < MAX_PROB / 10) { + alpha = + (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; + beta = + (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; + } else { + alpha = + (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + beta = + (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + } + + /* alpha and beta should be between 0 and 32, in multiples of 1/16 */ + delta += alpha * ((qdelay - q->params.target)); + delta += beta * ((qdelay - qdelay_old)); + + oldprob = q->vars.prob; + + /* to ensure we increase probability in steps of no more than 2% */ + if (delta > (s32) (MAX_PROB / (100 / 2)) && + q->vars.prob >= MAX_PROB / 10) + delta = (MAX_PROB / 100) * 2; + + /* Non-linear drop: + * Tune drop probability to increase quickly for high delays(>= 250ms) + * 250ms is derived through experiments and provides error protection + */ + + if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC))) + delta += MAX_PROB / (100 / 2); + + q->vars.prob += delta; + + if (delta > 0) { + /* prevent overflow */ + if (q->vars.prob < oldprob) { + q->vars.prob = MAX_PROB; + /* Prevent normalization error. If probability is at + * maximum value already, we normalize it here, and + * skip the check to do a non-linear drop in the next + * section. + */ + update_prob = false; + } + } else { + /* prevent underflow */ + if (q->vars.prob > oldprob) + q->vars.prob = 0; + } + + /* Non-linear drop in probability: Reduce drop probability quickly if + * delay is 0 for 2 consecutive Tupdate periods. + */ + + if ((qdelay == 0) && (qdelay_old == 0) && update_prob) + q->vars.prob = (q->vars.prob * 98) / 100; + + q->vars.qdelay = qdelay; + q->vars.qlen_old = qlen; + + /* We restart the measurement cycle if the following conditions are met + * 1. If the delay has been low for 2 consecutive Tupdate periods + * 2. Calculated drop probability is zero + * 3. We have atleast one estimate for the avg_dq_rate ie., + * is a non-zero value + */ + if ((q->vars.qdelay < q->params.target / 2) && + (q->vars.qdelay_old < q->params.target / 2) && + (q->vars.prob == 0) && + (q->vars.avg_dq_rate > 0)) + pie_vars_init(&q->vars); +} + +static void pie_timer(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc *)arg; + struct pie_sched_data *q = qdisc_priv(sch); + spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + + spin_lock(root_lock); + calculate_probability(sch); + + /* reset the timer to fire after 'tupdate'. tupdate is in jiffies. 
*/ + if (q->params.tupdate) + mod_timer(&q->adapt_timer, jiffies + q->params.tupdate); + spin_unlock(root_lock); + +} + +static int pie_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct pie_sched_data *q = qdisc_priv(sch); + + pie_params_init(&q->params); + pie_vars_init(&q->vars); + sch->limit = q->params.limit; + + setup_timer(&q->adapt_timer, pie_timer, (unsigned long)sch); + mod_timer(&q->adapt_timer, jiffies + HZ / 2); + + if (opt) { + int err = pie_change(sch, opt); + + if (err) + return err; + } + + return 0; +} + +static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct pie_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + /* convert target from pschedtime to us */ + if (nla_put_u32(skb, TCA_PIE_TARGET, + ((u32) PSCHED_TICKS2NS(q->params.target)) / + NSEC_PER_USEC) || + nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) || + nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) || + nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) || + nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) || + nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) || + nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -1; + +} + +static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct pie_sched_data *q = qdisc_priv(sch); + struct tc_pie_xstats st = { + .prob = q->vars.prob, + .delay = ((u32) PSCHED_TICKS2NS(q->vars.qdelay)) / + NSEC_PER_USEC, + /* unscale and return dq_rate in bytes per sec */ + .avg_dq_rate = q->vars.avg_dq_rate * + (PSCHED_TICKS_PER_SEC) >> PIE_SCALE, + .packets_in = q->stats.packets_in, + .overlimit = q->stats.overlimit, + .maxq = q->stats.maxq, + .dropped = q->stats.dropped, + .ecn_mark = q->stats.ecn_mark, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) +{ + struct sk_buff *skb; + skb = __qdisc_dequeue_head(sch, &sch->q); + + if (!skb) + return NULL; + + pie_process_dequeue(sch, skb); + return skb; +} + +static void pie_reset(struct Qdisc *sch) +{ + struct pie_sched_data *q = qdisc_priv(sch); + qdisc_reset_queue(sch); + pie_vars_init(&q->vars); +} + +static void pie_destroy(struct Qdisc *sch) +{ + struct pie_sched_data *q = qdisc_priv(sch); + q->params.tupdate = 0; + del_timer_sync(&q->adapt_timer); +} + +static struct Qdisc_ops pie_qdisc_ops __read_mostly = { + .id = "pie", + .priv_size = sizeof(struct pie_sched_data), + .enqueue = pie_qdisc_enqueue, + .dequeue = pie_qdisc_dequeue, + .peek = qdisc_peek_dequeued, + .init = pie_init, + .destroy = pie_destroy, + .reset = pie_reset, + .change = pie_change, + .dump = pie_dump, + .dump_stats = pie_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init pie_module_init(void) +{ + return register_qdisc(&pie_qdisc_ops); +} + +static void __exit pie_module_exit(void) +{ + unregister_qdisc(&pie_qdisc_ops); +} + +module_init(pie_module_init); +module_exit(pie_module_exit); + +MODULE_DESCRIPTION("Proportional Integral controller Enhanced (PIE) scheduler"); +MODULE_AUTHOR("Vijay Subramanian"); +MODULE_AUTHOR("Mythili Prabhu"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 30ea4674cab..9b0f7093d97 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -220,7 +220,7 @@ static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_da static void 
sfb_init_perturbation(u32 slot, struct sfb_sched_data *q) { - q->bins[slot].perturbation = net_random(); + q->bins[slot].perturbation = prandom_u32(); } static void sfb_swap_slot(struct sfb_sched_data *q) @@ -381,7 +381,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) goto enqueue; } - r = net_random() & SFB_MAX_PROB; + r = prandom_u32() & SFB_MAX_PROB; if (unlikely(r < p_min)) { if (unlikely(p_min > SFB_MAX_PROB / 2)) { diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index d3a1bc26dbf..1af2f73906d 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -237,10 +237,12 @@ static inline void sfq_link(struct sfq_sched_data *q, sfq_index x) } #define sfq_unlink(q, x, n, p) \ - n = q->slots[x].dep.next; \ - p = q->slots[x].dep.prev; \ - sfq_dep_head(q, p)->next = n; \ - sfq_dep_head(q, n)->prev = p + do { \ + n = q->slots[x].dep.next; \ + p = q->slots[x].dep.prev; \ + sfq_dep_head(q, p)->next = n; \ + sfq_dep_head(q, n)->prev = p; \ + } while (0) static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x) @@ -627,7 +629,7 @@ static void sfq_perturbation(unsigned long arg) spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); spin_lock(root_lock); - q->perturbation = net_random(); + q->perturbation = prandom_u32(); if (!q->filter_list && q->tail) sfq_rehash(sch); spin_unlock(root_lock); @@ -696,7 +698,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) del_timer(&q->perturb_timer); if (q->perturb_period) { mod_timer(&q->perturb_timer, jiffies + q->perturb_period); - q->perturbation = net_random(); + q->perturbation = prandom_u32(); } sch_tree_unlock(sch); kfree(p); @@ -714,12 +716,7 @@ static void *sfq_alloc(size_t sz) static void sfq_free(void *addr) { - if (addr) { - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); - } + kvfree(addr); } static void sfq_destroy(struct Qdisc *sch) @@ -757,7 +754,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt) q->quantum = psched_mtu(qdisc_dev(sch)); q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); q->perturb_period = 0; - q->perturbation = net_random(); + q->perturbation = prandom_u32(); if (opt) { int err = sfq_change(sch, opt); diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index a6090051c5d..18ff6343370 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -21,7 +21,6 @@ #include <net/netlink.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> -#include <net/tcp.h> /* Simple Token Bucket Filter. @@ -102,12 +101,11 @@ struct tbf_sched_data { /* Parameters */ u32 limit; /* Maximal length of backlog: bytes */ + u32 max_size; s64 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */ s64 mtu; - u32 max_size; struct psched_ratecfg rate; struct psched_ratecfg peak; - bool peak_present; /* Variables */ s64 tokens; /* Current number of B tokens */ @@ -118,20 +116,40 @@ struct tbf_sched_data { }; +/* Time to Length, convert time in ns to length in bytes + * to determinate how many bytes can be sent in given time. 
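+ * For example, at rate_bytes_ps = 12500000 (100 Mbit/s) a time of
+ * 10 ms (10000000 ns) corresponds to 125000 bytes, before the ATM
+ * (48/53 cell payload) and overhead adjustments below.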
+ */ +static u64 psched_ns_t2l(const struct psched_ratecfg *r, + u64 time_in_ns) +{ + /* The formula is : + * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC + */ + u64 len = time_in_ns * r->rate_bytes_ps; + + do_div(len, NSEC_PER_SEC); + + if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) { + do_div(len, 53); + len = len * 48; + } + + if (len > r->overhead) + len -= r->overhead; + else + len = 0; + + return len; +} + /* * Return length of individual segments of a gso packet, * including all headers (MAC, IP, TCP/UDP) */ -static unsigned int skb_gso_seglen(const struct sk_buff *skb) +static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) { unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); - const struct skb_shared_info *shinfo = skb_shinfo(skb); - - if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) - hdr_len += tcp_hdrlen(skb); - else - hdr_len += sizeof(struct udphdr); - return hdr_len + shinfo->gso_size; + return hdr_len + skb_gso_transport_seglen(skb); } /* GSO packet is too big, segment it so that tbf can transmit @@ -176,7 +194,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) int ret; if (qdisc_pkt_len(skb) > q->max_size) { - if (skb_is_gso(skb) && skb_gso_seglen(skb) <= q->max_size) + if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size) return tbf_segment(skb, sch); return qdisc_reshape_fail(skb, sch); } @@ -203,6 +221,11 @@ static unsigned int tbf_drop(struct Qdisc *sch) return len; } +static bool tbf_peak_present(const struct tbf_sched_data *q) +{ + return q->peak.rate_bytes_ps; +} + static struct sk_buff *tbf_dequeue(struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); @@ -219,7 +242,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) now = ktime_to_ns(ktime_get()); toks = min_t(s64, now - q->t_c, q->buffer); - if (q->peak_present) { + if (tbf_peak_present(q)) { ptoks = toks + q->ptokens; if (ptoks > q->mtu) ptoks = q->mtu; @@ -281,6 +304,8 @@ static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = { [TCA_TBF_PTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, [TCA_TBF_RATE64] = { .type = NLA_U64 }, [TCA_TBF_PRATE64] = { .type = NLA_U64 }, + [TCA_TBF_BURST] = { .type = NLA_U32 }, + [TCA_TBF_PBURST] = { .type = NLA_U32 }, }; static int tbf_change(struct Qdisc *sch, struct nlattr *opt) @@ -289,10 +314,11 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) struct tbf_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_TBF_MAX + 1]; struct tc_tbf_qopt *qopt; - struct qdisc_rate_table *rtab = NULL; - struct qdisc_rate_table *ptab = NULL; struct Qdisc *child = NULL; - int max_size, n; + struct psched_ratecfg rate; + struct psched_ratecfg peak; + u64 max_size; + s64 buffer, mtu; u64 rate64 = 0, prate64 = 0; err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy); @@ -304,39 +330,60 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) goto done; qopt = nla_data(tb[TCA_TBF_PARMS]); - rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]); - if (rtab == NULL) - goto done; + if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE) + qdisc_put_rtab(qdisc_get_rtab(&qopt->rate, + tb[TCA_TBF_RTAB])); + + if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE) + qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate, + tb[TCA_TBF_PTAB])); + + buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U); + mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U); + + if (tb[TCA_TBF_RATE64]) + rate64 = nla_get_u64(tb[TCA_TBF_RATE64]); + psched_ratecfg_precompute(&rate, &qopt->rate, rate64); + + if 
(tb[TCA_TBF_BURST]) { + max_size = nla_get_u32(tb[TCA_TBF_BURST]); + buffer = psched_l2t_ns(&rate, max_size); + } else { + max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U); + } if (qopt->peakrate.rate) { - if (qopt->peakrate.rate > qopt->rate.rate) - ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]); - if (ptab == NULL) + if (tb[TCA_TBF_PRATE64]) + prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]); + psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64); + if (peak.rate_bytes_ps <= rate.rate_bytes_ps) { + pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n", + peak.rate_bytes_ps, rate.rate_bytes_ps); + err = -EINVAL; goto done; - } + } - for (n = 0; n < 256; n++) - if (rtab->data[n] > qopt->buffer) - break; - max_size = (n << qopt->rate.cell_log) - 1; - if (ptab) { - int size; - - for (n = 0; n < 256; n++) - if (ptab->data[n] > qopt->mtu) - break; - size = (n << qopt->peakrate.cell_log) - 1; - if (size < max_size) - max_size = size; + if (tb[TCA_TBF_PBURST]) { + u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]); + max_size = min_t(u32, max_size, pburst); + mtu = psched_l2t_ns(&peak, pburst); + } else { + max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu)); + } + } else { + memset(&peak, 0, sizeof(peak)); } - if (max_size < 0) - goto done; if (max_size < psched_mtu(qdisc_dev(sch))) - pr_warn_ratelimited("sch_tbf: burst %u is lower than device %s mtu (%u) !\n", + pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n", max_size, qdisc_dev(sch)->name, psched_mtu(qdisc_dev(sch))); + if (!max_size) { + err = -EINVAL; + goto done; + } + if (q->qdisc != &noop_qdisc) { err = fifo_set_limit(q->qdisc, qopt->limit); if (err) @@ -356,31 +403,24 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) q->qdisc = child; } q->limit = qopt->limit; - q->mtu = PSCHED_TICKS2NS(qopt->mtu); + if (tb[TCA_TBF_PBURST]) + q->mtu = mtu; + else + q->mtu = PSCHED_TICKS2NS(qopt->mtu); q->max_size = max_size; - q->buffer = PSCHED_TICKS2NS(qopt->buffer); + if (tb[TCA_TBF_BURST]) + q->buffer = buffer; + else + q->buffer = PSCHED_TICKS2NS(qopt->buffer); q->tokens = q->buffer; q->ptokens = q->mtu; - if (tb[TCA_TBF_RATE64]) - rate64 = nla_get_u64(tb[TCA_TBF_RATE64]); - psched_ratecfg_precompute(&q->rate, &rtab->rate, rate64); - if (ptab) { - if (tb[TCA_TBF_PRATE64]) - prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]); - psched_ratecfg_precompute(&q->peak, &ptab->rate, prate64); - q->peak_present = true; - } else { - q->peak_present = false; - } + memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg)); + memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg)); sch_tree_unlock(sch); err = 0; done: - if (rtab) - qdisc_put_rtab(rtab); - if (ptab) - qdisc_put_rtab(ptab); return err; } @@ -419,7 +459,7 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) opt.limit = q->limit; psched_ratecfg_getrate(&opt.rate, &q->rate); - if (q->peak_present) + if (tbf_peak_present(q)) psched_ratecfg_getrate(&opt.peakrate, &q->peak); else memset(&opt.peakrate, 0, sizeof(opt.peakrate)); @@ -430,13 +470,12 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) if (q->rate.rate_bytes_ps >= (1ULL << 32) && nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps)) goto nla_put_failure; - if (q->peak_present && + if (tbf_peak_present(q) && q->peak.rate_bytes_ps >= (1ULL << 32) && nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps)) goto nla_put_failure; - nla_nest_end(skb, nest); - return skb->len; + return nla_nest_end(skb, nest); nla_put_failure: 
nla_nest_cancel(skb, nest); diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 3d7c6bd4631..06a9ee6b2d3 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -55,6 +55,7 @@ #include <net/sctp/sm.h> /* Forward declarations for internal functions. */ +static void sctp_select_active_and_retran_path(struct sctp_association *asoc); static void sctp_assoc_bh_rcv(struct work_struct *work); static void sctp_assoc_free_asconf_acks(struct sctp_association *asoc); static void sctp_assoc_free_asconf_queue(struct sctp_association *asoc); @@ -145,8 +146,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a = 5 * asoc->rto_max; asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay; - asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = - min_t(unsigned long, sp->autoclose, net->sctp.max_autoclose) * HZ; + asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; /* Initializes the timers */ for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) @@ -254,8 +254,6 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a asoc->peer.ipv6_address = 1; INIT_LIST_HEAD(&asoc->asocs); - asoc->autoclose = sp->autoclose; - asoc->default_stream = sp->default_stream; asoc->default_ppid = sp->default_ppid; asoc->default_flags = sp->default_flags; @@ -333,7 +331,7 @@ void sctp_association_free(struct sctp_association *asoc) /* Only real associations count against the endpoint, so * don't bother for if this is a temporary association. */ - if (!asoc->temp) { + if (!list_empty(&asoc->asocs)) { list_del(&asoc->asocs); /* Decrement the backlog value for a TCP-style listening @@ -777,9 +775,6 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, sctp_transport_cmd_t command, sctp_sn_error_t error) { - struct sctp_transport *t = NULL; - struct sctp_transport *first; - struct sctp_transport *second; struct sctp_ulpevent *event; struct sockaddr_storage addr; int spc_state = 0; @@ -832,13 +827,14 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, return; } - /* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the - * user. + /* Generate and send a SCTP_PEER_ADDR_CHANGE notification + * to the user. */ if (ulp_notify) { memset(&addr, 0, sizeof(struct sockaddr_storage)); memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len); + event = sctp_ulpevent_make_peer_addr_change(asoc, &addr, 0, spc_state, error, GFP_ATOMIC); if (event) @@ -846,60 +842,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, } /* Select new active and retran paths. */ - - /* Look for the two most recently used active transports. - * - * This code produces the wrong ordering whenever jiffies - * rolls over, but we still get usable transports, so we don't - * worry about it. - */ - first = NULL; second = NULL; - - list_for_each_entry(t, &asoc->peer.transport_addr_list, - transports) { - - if ((t->state == SCTP_INACTIVE) || - (t->state == SCTP_UNCONFIRMED) || - (t->state == SCTP_PF)) - continue; - if (!first || t->last_time_heard > first->last_time_heard) { - second = first; - first = t; - } else if (!second || - t->last_time_heard > second->last_time_heard) - second = t; - } - - /* RFC 2960 6.4 Multi-Homed SCTP Endpoints - * - * By default, an endpoint should always transmit to the - * primary path, unless the SCTP user explicitly specifies the - * destination transport address (and possibly source - * transport address) to use. 
- * - * [If the primary is active but not most recent, bump the most - * recently used transport.] - */ - if (((asoc->peer.primary_path->state == SCTP_ACTIVE) || - (asoc->peer.primary_path->state == SCTP_UNKNOWN)) && - first != asoc->peer.primary_path) { - second = first; - first = asoc->peer.primary_path; - } - - if (!second) - second = first; - /* If we failed to find a usable transport, just camp on the - * primary, even if it is inactive. - */ - if (!first) { - first = asoc->peer.primary_path; - second = asoc->peer.primary_path; - } - - /* Set the active and retran transports. */ - asoc->peer.active_path = first; - asoc->peer.retran_path = second; + sctp_select_active_and_retran_path(asoc); } /* Hold a reference to an association. */ @@ -1093,7 +1036,7 @@ static void sctp_assoc_bh_rcv(struct work_struct *work) } if (chunk->transport) - chunk->transport->last_time_heard = jiffies; + chunk->transport->last_time_heard = ktime_get(); /* Run through the state machine. */ error = sctp_do_sm(net, SCTP_EVENT_T_CHUNK, subtype, @@ -1154,6 +1097,7 @@ void sctp_assoc_update(struct sctp_association *asoc, asoc->c = new->c; asoc->peer.rwnd = new->peer.rwnd; asoc->peer.sack_needed = new->peer.sack_needed; + asoc->peer.auth_capable = new->peer.auth_capable; asoc->peer.i = new->peer.i; sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, asoc->peer.i.initial_tsn, GFP_ATOMIC); @@ -1242,78 +1186,204 @@ void sctp_assoc_update(struct sctp_association *asoc, } /* Update the retran path for sending a retransmitted packet. - * Round-robin through the active transports, else round-robin - * through the inactive transports as this is the next best thing - * we can try. + * See also RFC4960, 6.4. Multi-Homed SCTP Endpoints: + * + * When there is outbound data to send and the primary path + * becomes inactive (e.g., due to failures), or where the + * SCTP user explicitly requests to send data to an + * inactive destination transport address, before reporting + * an error to its ULP, the SCTP endpoint should try to send + * the data to an alternate active destination transport + * address if one exists. + * + * When retransmitting data that timed out, if the endpoint + * is multihomed, it should consider each source-destination + * address pair in its retransmission selection policy. + * When retransmitting timed-out data, the endpoint should + * attempt to pick the most divergent source-destination + * pair from the original source-destination pair to which + * the packet was transmitted. + * + * Note: Rules for picking the most divergent source-destination + * pair are an implementation decision and are not specified + * within this document. + * + * Our basic strategy is to round-robin transports in priorities + * according to sctp_state_prio_map[] e.g., if no such + * transport with state SCTP_ACTIVE exists, round-robin through + * SCTP_UNKNOWN, etc. You get the picture. 
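+ * (An ACTIVE transport thus always wins over UNKNOWN, PF and INACTIVE
+ * ones; ties between equally scored transports are broken on
+ * error_count and then last_time_heard in sctp_trans_elect_tie().)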
*/ +static const u8 sctp_trans_state_to_prio_map[] = { + [SCTP_ACTIVE] = 3, /* best case */ + [SCTP_UNKNOWN] = 2, + [SCTP_PF] = 1, + [SCTP_INACTIVE] = 0, /* worst case */ +}; + +static u8 sctp_trans_score(const struct sctp_transport *trans) +{ + return sctp_trans_state_to_prio_map[trans->state]; +} + +static struct sctp_transport *sctp_trans_elect_tie(struct sctp_transport *trans1, + struct sctp_transport *trans2) +{ + if (trans1->error_count > trans2->error_count) { + return trans2; + } else if (trans1->error_count == trans2->error_count && + ktime_after(trans2->last_time_heard, + trans1->last_time_heard)) { + return trans2; + } else { + return trans1; + } +} + +static struct sctp_transport *sctp_trans_elect_best(struct sctp_transport *curr, + struct sctp_transport *best) +{ + u8 score_curr, score_best; + + if (best == NULL) + return curr; + + score_curr = sctp_trans_score(curr); + score_best = sctp_trans_score(best); + + /* First, try a score-based selection if both transport states + * differ. If we're in a tie, lets try to make a more clever + * decision here based on error counts and last time heard. + */ + if (score_curr > score_best) + return curr; + else if (score_curr == score_best) + return sctp_trans_elect_tie(curr, best); + else + return best; +} + void sctp_assoc_update_retran_path(struct sctp_association *asoc) { - struct sctp_transport *t, *next; - struct list_head *head = &asoc->peer.transport_addr_list; - struct list_head *pos; + struct sctp_transport *trans = asoc->peer.retran_path; + struct sctp_transport *trans_next = NULL; + /* We're done as we only have the one and only path. */ if (asoc->peer.transport_count == 1) return; + /* If active_path and retran_path are the same and active, + * then this is the only active path. Use it. + */ + if (asoc->peer.active_path == asoc->peer.retran_path && + asoc->peer.active_path->state == SCTP_ACTIVE) + return; - /* Find the next transport in a round-robin fashion. */ - t = asoc->peer.retran_path; - pos = &t->transports; - next = NULL; + /* Iterate from retran_path's successor back to retran_path. */ + for (trans = list_next_entry(trans, transports); 1; + trans = list_next_entry(trans, transports)) { + /* Manually skip the head element. */ + if (&trans->transports == &asoc->peer.transport_addr_list) + continue; + if (trans->state == SCTP_UNCONFIRMED) + continue; + trans_next = sctp_trans_elect_best(trans, trans_next); + /* Active is good enough for immediate return. */ + if (trans_next->state == SCTP_ACTIVE) + break; + /* We've reached the end, time to update path. */ + if (trans == asoc->peer.retran_path) + break; + } - while (1) { - /* Skip the head. */ - if (pos->next == head) - pos = head->next; - else - pos = pos->next; + asoc->peer.retran_path = trans_next; - t = list_entry(pos, struct sctp_transport, transports); + pr_debug("%s: association:%p updated new path to addr:%pISpc\n", + __func__, asoc, &asoc->peer.retran_path->ipaddr.sa); +} - /* We have exhausted the list, but didn't find any - * other active transports. If so, use the next - * transport. +static void sctp_select_active_and_retran_path(struct sctp_association *asoc) +{ + struct sctp_transport *trans, *trans_pri = NULL, *trans_sec = NULL; + struct sctp_transport *trans_pf = NULL; + + /* Look for the two most recently used active transports. */ + list_for_each_entry(trans, &asoc->peer.transport_addr_list, + transports) { + /* Skip uninteresting transports. 
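+		 * (INACTIVE and UNCONFIRMED transports are not considered
+		 * here; the fallback at the end may still camp on an
+		 * inactive primary if nothing better exists.)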
*/ + if (trans->state == SCTP_INACTIVE || + trans->state == SCTP_UNCONFIRMED) + continue; + /* Keep track of the best PF transport from our + * list in case we don't find an active one. */ - if (t == asoc->peer.retran_path) { - t = next; - break; + if (trans->state == SCTP_PF) { + trans_pf = sctp_trans_elect_best(trans, trans_pf); + continue; } - - /* Try to find an active transport. */ - - if ((t->state == SCTP_ACTIVE) || - (t->state == SCTP_UNKNOWN)) { - break; - } else { - /* Keep track of the next transport in case - * we don't find any active transport. - */ - if (t->state != SCTP_UNCONFIRMED && !next) - next = t; + /* For active transports, pick the most recent ones. */ + if (trans_pri == NULL || + ktime_after(trans->last_time_heard, + trans_pri->last_time_heard)) { + trans_sec = trans_pri; + trans_pri = trans; + } else if (trans_sec == NULL || + ktime_after(trans->last_time_heard, + trans_sec->last_time_heard)) { + trans_sec = trans; } } - if (t) - asoc->peer.retran_path = t; - else - t = asoc->peer.retran_path; + /* RFC 2960 6.4 Multi-Homed SCTP Endpoints + * + * By default, an endpoint should always transmit to the primary + * path, unless the SCTP user explicitly specifies the + * destination transport address (and possibly source transport + * address) to use. [If the primary is active but not most recent, + * bump the most recently used transport.] + */ + if ((asoc->peer.primary_path->state == SCTP_ACTIVE || + asoc->peer.primary_path->state == SCTP_UNKNOWN) && + asoc->peer.primary_path != trans_pri) { + trans_sec = trans_pri; + trans_pri = asoc->peer.primary_path; + } + + /* We did not find anything useful for a possible retransmission + * path; either primary path that we found is the the same as + * the current one, or we didn't generally find an active one. + */ + if (trans_sec == NULL) + trans_sec = trans_pri; - pr_debug("%s: association:%p addr:%pISpc\n", __func__, asoc, - &t->ipaddr.sa); + /* If we failed to find a usable transport, just camp on the + * primary or retran, even if they are inactive, if possible + * pick a PF iff it's the better choice. + */ + if (trans_pri == NULL) { + trans_pri = sctp_trans_elect_best(asoc->peer.primary_path, + asoc->peer.retran_path); + trans_pri = sctp_trans_elect_best(trans_pri, trans_pf); + trans_sec = asoc->peer.primary_path; + } + + /* Set the active and retran transports. */ + asoc->peer.active_path = trans_pri; + asoc->peer.retran_path = trans_sec; } -/* Choose the transport for sending retransmit packet. */ -struct sctp_transport *sctp_assoc_choose_alter_transport( - struct sctp_association *asoc, struct sctp_transport *last_sent_to) +struct sctp_transport * +sctp_assoc_choose_alter_transport(struct sctp_association *asoc, + struct sctp_transport *last_sent_to) { /* If this is the first time packet is sent, use the active path, * else use the retran path. If the last packet was sent over the * retran path, update the retran path and use it. */ - if (!last_sent_to) + if (last_sent_to == NULL) { return asoc->peer.active_path; - else { + } else { if (last_sent_to == asoc->peer.retran_path) sctp_assoc_update_retran_path(asoc); + return asoc->peer.retran_path; } } @@ -1522,7 +1592,7 @@ int sctp_assoc_lookup_laddr(struct sctp_association *asoc, /* Set an association id for a given association */ int sctp_assoc_set_id(struct sctp_association *asoc, gfp_t gfp) { - bool preload = gfp & __GFP_WAIT; + bool preload = !!(gfp & __GFP_WAIT); int ret; /* If the id is already assigned, keep it. 
*/ diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 5c9f64c1c90..0e8529113dc 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -41,7 +41,7 @@ static struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = { }, { .hmac_id = SCTP_AUTH_HMAC_ID_SHA1, - .hmac_name="hmac(sha1)", + .hmac_name = "hmac(sha1)", .hmac_len = SCTP_SHA1_SIG_SIZE, }, { @@ -51,7 +51,7 @@ static struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = { #if defined (CONFIG_CRYPTO_SHA256) || defined (CONFIG_CRYPTO_SHA256_MODULE) { .hmac_id = SCTP_AUTH_HMAC_ID_SHA256, - .hmac_name="hmac(sha256)", + .hmac_name = "hmac(sha256)", .hmac_len = SCTP_SHA256_SIG_SIZE, } #endif @@ -163,7 +163,7 @@ static int sctp_auth_compare_vectors(struct sctp_auth_bytes *vector1, * lead-zero padded. If it is not, it * is automatically larger numerically. */ - for (i = 0; i < abs(diff); i++ ) { + for (i = 0; i < abs(diff); i++) { if (longer[i] != 0) return diff; } @@ -226,9 +226,9 @@ static struct sctp_auth_bytes *sctp_auth_make_local_vector( gfp_t gfp) { return sctp_auth_make_key_vector( - (sctp_random_param_t*)asoc->c.auth_random, - (sctp_chunks_param_t*)asoc->c.auth_chunks, - (sctp_hmac_algo_param_t*)asoc->c.auth_hmacs, + (sctp_random_param_t *)asoc->c.auth_random, + (sctp_chunks_param_t *)asoc->c.auth_chunks, + (sctp_hmac_algo_param_t *)asoc->c.auth_hmacs, gfp); } @@ -386,14 +386,13 @@ nomem: */ int sctp_auth_asoc_init_active_key(struct sctp_association *asoc, gfp_t gfp) { - struct net *net = sock_net(asoc->base.sk); struct sctp_auth_bytes *secret; struct sctp_shared_key *ep_key; /* If we don't support AUTH, or peer is not capable * we don't need to do anything. */ - if (!net->sctp.auth_enable || !asoc->peer.auth_capable) + if (!asoc->ep->auth_enable || !asoc->peer.auth_capable) return 0; /* If the key_id is non-zero and we couldn't find an @@ -440,16 +439,16 @@ struct sctp_shared_key *sctp_auth_get_shkey( */ int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp) { - struct net *net = sock_net(ep->base.sk); struct crypto_hash *tfm = NULL; __u16 id; - /* if the transforms are already allocted, we are done */ - if (!net->sctp.auth_enable) { + /* If AUTH extension is disabled, we are done */ + if (!ep->auth_enable) { ep->auth_hmacs = NULL; return 0; } + /* If the transforms are already allocated, we are done */ if (ep->auth_hmacs) return 0; @@ -499,8 +498,7 @@ void sctp_auth_destroy_hmacs(struct crypto_hash *auth_hmacs[]) if (!auth_hmacs) return; - for (i = 0; i < SCTP_AUTH_NUM_HMACS; i++) - { + for (i = 0; i < SCTP_AUTH_NUM_HMACS; i++) { if (auth_hmacs[i]) crypto_free_hash(auth_hmacs[i]); } @@ -647,15 +645,15 @@ static int __sctp_auth_cid(sctp_cid_t chunk, struct sctp_chunks_param *param) */ for (i = 0; !found && i < len; i++) { switch (param->chunks[i]) { - case SCTP_CID_INIT: - case SCTP_CID_INIT_ACK: - case SCTP_CID_SHUTDOWN_COMPLETE: - case SCTP_CID_AUTH: + case SCTP_CID_INIT: + case SCTP_CID_INIT_ACK: + case SCTP_CID_SHUTDOWN_COMPLETE: + case SCTP_CID_AUTH: break; - default: + default: if (param->chunks[i] == chunk) - found = 1; + found = 1; break; } } @@ -666,12 +664,10 @@ static int __sctp_auth_cid(sctp_cid_t chunk, struct sctp_chunks_param *param) /* Check if peer requested that this chunk is authenticated */ int sctp_auth_send_cid(sctp_cid_t chunk, const struct sctp_association *asoc) { - struct net *net; if (!asoc) return 0; - net = sock_net(asoc->base.sk); - if (!net->sctp.auth_enable || !asoc->peer.auth_capable) + if (!asoc->ep->auth_enable || !asoc->peer.auth_capable) return 0; return __sctp_auth_cid(chunk, 
asoc->peer.peer_chunks); @@ -680,12 +676,10 @@ int sctp_auth_send_cid(sctp_cid_t chunk, const struct sctp_association *asoc) /* Check if we requested that peer authenticate this chunk. */ int sctp_auth_recv_cid(sctp_cid_t chunk, const struct sctp_association *asoc) { - struct net *net; if (!asoc) return 0; - net = sock_net(asoc->base.sk); - if (!net->sctp.auth_enable) + if (!asoc->ep->auth_enable) return 0; return __sctp_auth_cid(chunk, diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 5573e425b0c..158701da2d3 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -254,7 +254,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, SCTP_INC_STATS_USER(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS); /* Create chunks for all the full sized DATA chunks. */ - for (i=0, len=first_len; i < whole; i++) { + for (i = 0, len = first_len; i < whole; i++) { frag = SCTP_DATA_MIDDLE_FRAG; if (0 == i) @@ -317,7 +317,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, goto errout; } - err = sctp_user_addto_chunk(chunk, offset, over,msgh->msg_iov); + err = sctp_user_addto_chunk(chunk, offset, over, msgh->msg_iov); /* Put the chunk->skb back into the form expected by send. */ __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 6ffb6c1b13b..9da76ba4d10 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -68,7 +68,8 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, if (!ep->digest) return NULL; - if (net->sctp.auth_enable) { + ep->auth_enable = net->sctp.auth_enable; + if (ep->auth_enable) { /* Allocate space for HMACS and CHUNKS authentication * variables. There are arrays that we encode directly * into parameters to make the rest of the operations easier. @@ -368,9 +369,9 @@ struct sctp_association *sctp_endpoint_lookup_assoc( { struct sctp_association *asoc; - sctp_local_bh_disable(); + local_bh_disable(); asoc = __sctp_endpoint_lookup_assoc(ep, paddr, transport); - sctp_local_bh_enable(); + local_bh_enable(); return asoc; } @@ -480,7 +481,7 @@ normal: } if (chunk->transport) - chunk->transport->last_time_heard = jiffies; + chunk->transport->last_time_heard = ktime_get(); error = sctp_do_sm(net, SCTP_EVENT_T_CHUNK, subtype, state, ep, asoc, chunk, GFP_ATOMIC); diff --git a/net/sctp/input.c b/net/sctp/input.c index 66038533ca5..f2e2cbd2d75 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -119,7 +119,7 @@ int sctp_rcv(struct sk_buff *skb) struct sctp_af *af; struct net *net = dev_net(skb->dev); - if (skb->pkt_type!=PACKET_HOST) + if (skb->pkt_type != PACKET_HOST) goto discard_it; SCTP_INC_STATS_BH(net, SCTP_MIB_INSCTPPACKS); @@ -180,8 +180,7 @@ int sctp_rcv(struct sk_buff *skb) * If a frame arrives on an interface and the receiving socket is * bound to another interface, via SO_BINDTODEVICE, treat it as OOTB */ - if (sk->sk_bound_dev_if && (sk->sk_bound_dev_if != af->skb_iif(skb))) - { + if (sk->sk_bound_dev_if && (sk->sk_bound_dev_if != af->skb_iif(skb))) { if (asoc) { sctp_association_put(asoc); asoc = NULL; @@ -239,7 +238,7 @@ int sctp_rcv(struct sk_buff *skb) * bottom halves on this lock, but a user may be in the lock too, * so check if it is busy. */ - sctp_bh_lock_sock(sk); + bh_lock_sock(sk); if (sk != rcvr->sk) { /* Our cached sk is different from the rcvr->sk. This is @@ -249,14 +248,14 @@ int sctp_rcv(struct sk_buff *skb) * be doing something with the new socket. Switch our veiw * of the current sk. 
*/ - sctp_bh_unlock_sock(sk); + bh_unlock_sock(sk); sk = rcvr->sk; - sctp_bh_lock_sock(sk); + bh_lock_sock(sk); } if (sock_owned_by_user(sk)) { if (sctp_add_backlog(sk, skb)) { - sctp_bh_unlock_sock(sk); + bh_unlock_sock(sk); sctp_chunk_free(chunk); skb = NULL; /* sctp_chunk_free already freed the skb */ goto discard_release; @@ -267,7 +266,7 @@ int sctp_rcv(struct sk_buff *skb) sctp_inq_push(&chunk->rcvr->inqueue, chunk); } - sctp_bh_unlock_sock(sk); + bh_unlock_sock(sk); /* Release the asoc/ep ref we took in the lookup calls. */ if (asoc) @@ -328,7 +327,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) */ sk = rcvr->sk; - sctp_bh_lock_sock(sk); + bh_lock_sock(sk); if (sock_owned_by_user(sk)) { if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) @@ -338,7 +337,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) } else sctp_inq_push(inqueue, chunk); - sctp_bh_unlock_sock(sk); + bh_unlock_sock(sk); /* If the chunk was backloged again, don't drop refs */ if (backloged) @@ -523,7 +522,7 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, goto out; } - sctp_bh_lock_sock(sk); + bh_lock_sock(sk); /* If too many ICMPs get dropped on busy * servers this needs to be solved differently. @@ -536,17 +535,15 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, return sk; out: - if (asoc) - sctp_association_put(asoc); + sctp_association_put(asoc); return NULL; } /* Common cleanup code for icmp/icmpv6 error handler. */ void sctp_err_finish(struct sock *sk, struct sctp_association *asoc) { - sctp_bh_unlock_sock(sk); - if (asoc) - sctp_association_put(asoc); + bh_unlock_sock(sk); + sctp_association_put(asoc); } /* @@ -612,8 +609,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) if (ICMP_FRAG_NEEDED == code) { sctp_icmp_frag_needed(sk, asoc, transport, info); goto out_unlock; - } - else { + } else { if (ICMP_PROT_UNREACH == code) { sctp_icmp_proto_unreachable(sk, asoc, transport); @@ -722,17 +718,17 @@ static void __sctp_hash_endpoint(struct sctp_endpoint *ep) epb->hashent = sctp_ep_hashfn(net, epb->bind_addr.port); head = &sctp_ep_hashtable[epb->hashent]; - sctp_write_lock(&head->lock); + write_lock(&head->lock); hlist_add_head(&epb->node, &head->chain); - sctp_write_unlock(&head->lock); + write_unlock(&head->lock); } /* Add an endpoint to the hash. Local BH-safe. */ void sctp_hash_endpoint(struct sctp_endpoint *ep) { - sctp_local_bh_disable(); + local_bh_disable(); __sctp_hash_endpoint(ep); - sctp_local_bh_enable(); + local_bh_enable(); } /* Remove endpoint from the hash table. */ @@ -748,17 +744,17 @@ static void __sctp_unhash_endpoint(struct sctp_endpoint *ep) head = &sctp_ep_hashtable[epb->hashent]; - sctp_write_lock(&head->lock); + write_lock(&head->lock); hlist_del_init(&epb->node); - sctp_write_unlock(&head->lock); + write_unlock(&head->lock); } /* Remove endpoint from the hash. Local BH-safe. */ void sctp_unhash_endpoint(struct sctp_endpoint *ep) { - sctp_local_bh_disable(); + local_bh_disable(); __sctp_unhash_endpoint(ep); - sctp_local_bh_enable(); + local_bh_enable(); } /* Look up an endpoint. */ @@ -802,9 +798,9 @@ static void __sctp_hash_established(struct sctp_association *asoc) head = &sctp_assoc_hashtable[epb->hashent]; - sctp_write_lock(&head->lock); + write_lock(&head->lock); hlist_add_head(&epb->node, &head->chain); - sctp_write_unlock(&head->lock); + write_unlock(&head->lock); } /* Add an association to the hash. Local BH-safe. 
*/ @@ -813,9 +809,9 @@ void sctp_hash_established(struct sctp_association *asoc) if (asoc->temp) return; - sctp_local_bh_disable(); + local_bh_disable(); __sctp_hash_established(asoc); - sctp_local_bh_enable(); + local_bh_enable(); } /* Remove association from the hash table. */ @@ -832,9 +828,9 @@ static void __sctp_unhash_established(struct sctp_association *asoc) head = &sctp_assoc_hashtable[epb->hashent]; - sctp_write_lock(&head->lock); + write_lock(&head->lock); hlist_del_init(&epb->node); - sctp_write_unlock(&head->lock); + write_unlock(&head->lock); } /* Remove association from the hash table. Local BH-safe. */ @@ -843,9 +839,9 @@ void sctp_unhash_established(struct sctp_association *asoc) if (asoc->temp) return; - sctp_local_bh_disable(); + local_bh_disable(); __sctp_unhash_established(asoc); - sctp_local_bh_enable(); + local_bh_enable(); } /* Look up an association. */ @@ -895,9 +891,9 @@ struct sctp_association *sctp_lookup_association(struct net *net, { struct sctp_association *asoc; - sctp_local_bh_disable(); + local_bh_disable(); asoc = __sctp_lookup_association(net, laddr, paddr, transportp); - sctp_local_bh_enable(); + local_bh_enable(); return asoc; } @@ -1057,31 +1053,31 @@ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, if (ch_end > skb_tail_pointer(skb)) break; - switch(ch->type) { - case SCTP_CID_AUTH: - have_auth = chunk_num; - break; - - case SCTP_CID_COOKIE_ECHO: - /* If a packet arrives containing an AUTH chunk as - * a first chunk, a COOKIE-ECHO chunk as the second - * chunk, and possibly more chunks after them, and - * the receiver does not have an STCB for that - * packet, then authentication is based on - * the contents of the COOKIE- ECHO chunk. - */ - if (have_auth == 1 && chunk_num == 2) - return NULL; - break; - - case SCTP_CID_ASCONF: - if (have_auth || net->sctp.addip_noauth) - asoc = __sctp_rcv_asconf_lookup( - net, ch, laddr, - sctp_hdr(skb)->source, - transportp); - default: - break; + switch (ch->type) { + case SCTP_CID_AUTH: + have_auth = chunk_num; + break; + + case SCTP_CID_COOKIE_ECHO: + /* If a packet arrives containing an AUTH chunk as + * a first chunk, a COOKIE-ECHO chunk as the second + * chunk, and possibly more chunks after them, and + * the receiver does not have an STCB for that + * packet, then authentication is based on + * the contents of the COOKIE- ECHO chunk. + */ + if (have_auth == 1 && chunk_num == 2) + return NULL; + break; + + case SCTP_CID_ASCONF: + if (have_auth || net->sctp.addip_noauth) + asoc = __sctp_rcv_asconf_lookup( + net, ch, laddr, + sctp_hdr(skb)->source, + transportp); + default: + break; } if (asoc) @@ -1118,19 +1114,10 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net, return NULL; /* If this is INIT/INIT-ACK look inside the chunk too. */ - switch (ch->type) { - case SCTP_CID_INIT: - case SCTP_CID_INIT_ACK: + if (ch->type == SCTP_CID_INIT || ch->type == SCTP_CID_INIT_ACK) return __sctp_rcv_init_lookup(net, skb, laddr, transportp); - break; - default: - return __sctp_rcv_walk_lookup(net, skb, laddr, transportp); - break; - } - - - return NULL; + return __sctp_rcv_walk_lookup(net, skb, laddr, transportp); } /* Lookup an association for an inbound skb. 
*/ diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 32db816ffba..1999592ba88 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -172,7 +172,8 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, switch (type) { case ICMPV6_PKT_TOOBIG: - sctp_icmp_frag_needed(sk, asoc, transport, ntohl(info)); + if (ip6_sk_accept_pmtu(sk)) + sctp_icmp_frag_needed(sk, asoc, transport, ntohl(info)); goto out_unlock; case ICMPV6_PARAMPROB: if (ICMPV6_UNK_NEXTHDR == code) { @@ -215,7 +216,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) IP6_ECN_flow_xmit(sk, fl6->flowlabel); if (!(transport->param_flags & SPP_PMTUD_ENABLE)) - skb->local_df = 1; + skb->ignore_df = 1; SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS); @@ -262,7 +263,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, } final_p = fl6_update_dst(fl6, np->opt, &final); - dst = ip6_dst_lookup_flow(sk, fl6, final_p, false); + dst = ip6_dst_lookup_flow(sk, fl6, final_p); if (!asoc || saddr) goto out; @@ -321,7 +322,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl6->saddr = baddr->v6.sin6_addr; fl6->fl6_sport = baddr->v6.sin6_port; final_p = fl6_update_dst(fl6, np->opt, &final); - dst = ip6_dst_lookup_flow(sk, fl6, final_p, false); + dst = ip6_dst_lookup_flow(sk, fl6, final_p); } out: @@ -401,7 +402,7 @@ static void sctp_v6_copy_addrlist(struct list_head *addrlist, } /* Initialize a sockaddr_storage from in incoming skb. */ -static void sctp_v6_from_skb(union sctp_addr *addr,struct sk_buff *skb, +static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb, int is_saddr) { __be16 *port; @@ -661,6 +662,8 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, */ sctp_v6_to_sk_daddr(&asoc->peer.primary_addr, newsk); + newsk->sk_v6_rcv_saddr = sk->sk_v6_rcv_saddr; + sk_refcnt_debug_inc(newsk); if (newsk->sk_prot->init(newsk)) { @@ -940,7 +943,6 @@ static struct inet_protosw sctpv6_seqpacket_protosw = { .protocol = IPPROTO_SCTP, .prot = &sctpv6_prot, .ops = &inet6_seqpacket_ops, - .no_check = 0, .flags = SCTP_PROTOSW_FLAG }; static struct inet_protosw sctpv6_stream_protosw = { @@ -948,7 +950,6 @@ static struct inet_protosw sctpv6_stream_protosw = { .protocol = IPPROTO_SCTP, .prot = &sctpv6_prot, .ops = &inet6_seqpacket_ops, - .no_check = 0, .flags = SCTP_PROTOSW_FLAG, }; diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c index 0c28e8a5532..40e7fac96c4 100644 --- a/net/sctp/objcnt.c +++ b/net/sctp/objcnt.c @@ -97,7 +97,7 @@ static void sctp_objcnt_seq_stop(struct seq_file *seq, void *v) { } -static void * sctp_objcnt_seq_next(struct seq_file *seq, void *v, loff_t *pos) +static void *sctp_objcnt_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return (*pos >= ARRAY_SIZE(sctp_dbg_objcnt)) ? NULL : (void *)pos; diff --git a/net/sctp/output.c b/net/sctp/output.c index 6371337e1fe..01ab8e0723f 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -280,7 +280,7 @@ static sctp_xmit_t __sctp_packet_append_chunk(struct sctp_packet *packet, /* We believe that this chunk is OK to add to the packet */ switch (chunk->chunk_hdr->type) { - case SCTP_CID_DATA: + case SCTP_CID_DATA: /* Account for the data being in the packet */ sctp_packet_append_data(packet, chunk); /* Disallow SACK bundling after DATA. 
*/ @@ -292,17 +292,17 @@ static sctp_xmit_t __sctp_packet_append_chunk(struct sctp_packet *packet, /* timestamp the chunk for rtx purposes */ chunk->sent_at = jiffies; break; - case SCTP_CID_COOKIE_ECHO: + case SCTP_CID_COOKIE_ECHO: packet->has_cookie_echo = 1; break; - case SCTP_CID_SACK: + case SCTP_CID_SACK: packet->has_sack = 1; if (chunk->asoc) chunk->asoc->stats.osacks++; break; - case SCTP_CID_AUTH: + case SCTP_CID_AUTH: packet->has_auth = 1; packet->auth = chunk; break; @@ -387,7 +387,7 @@ int sctp_packet_transmit(struct sctp_packet *packet) int err = 0; int padding; /* How much padding do we need? */ __u8 has_data = 0; - struct dst_entry *dst = tp->dst; + struct dst_entry *dst; unsigned char *auth = NULL; /* pointer to auth in skb data */ pr_debug("%s: packet:%p\n", __func__, packet); @@ -420,9 +420,9 @@ int sctp_packet_transmit(struct sctp_packet *packet) } } dst = dst_clone(tp->dst); - skb_dst_set(nskb, dst); if (!dst) goto no_route; + skb_dst_set(nskb, dst); /* Build the SCTP header. */ sh = (struct sctphdr *)skb_push(nskb, sizeof(struct sctphdr)); @@ -540,8 +540,7 @@ int sctp_packet_transmit(struct sctp_packet *packet) } else { /* no need to seed pseudo checksum for SCTP */ nskb->ip_summed = CHECKSUM_PARTIAL; - nskb->csum_start = (skb_transport_header(nskb) - - nskb->head); + nskb->csum_start = skb_transport_header(nskb) - nskb->head; nskb->csum_offset = offsetof(struct sctphdr, checksum); } } @@ -558,7 +557,7 @@ int sctp_packet_transmit(struct sctp_packet *packet) * Note: The works for IPv6 layer checks this bit too later * in transmission. See IP6_ECN_flow_xmit(). */ - (*tp->af_specific->ecn_capable)(nskb->sk); + tp->af_specific->ecn_capable(nskb->sk); /* Set up the IP options. */ /* BUG: not implemented @@ -580,7 +579,8 @@ int sctp_packet_transmit(struct sctp_packet *packet) unsigned long timeout; /* Restart the AUTOCLOSE timer when sending data. */ - if (sctp_state(asoc, ESTABLISHED) && asoc->autoclose) { + if (sctp_state(asoc, ESTABLISHED) && + asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) { timer = &asoc->timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE]; timeout = asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]; @@ -591,8 +591,8 @@ int sctp_packet_transmit(struct sctp_packet *packet) pr_debug("***sctp_transmit_packet*** skb->len:%d\n", nskb->len); - nskb->local_df = packet->ipfragok; - (*tp->af_specific->sctp_xmit)(nskb, tp); + nskb->ignore_df = packet->ipfragok; + tp->af_specific->sctp_xmit(nskb, tp); out: sctp_packet_reset(packet); diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index b6b09f3f1a8..9c77947c059 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -110,7 +110,7 @@ static inline int sctp_cacc_skip_3_1_d(struct sctp_transport *primary, struct sctp_transport *transport, int count_of_newacks) { - if (count_of_newacks >=2 && transport != primary) + if (count_of_newacks >= 2 && transport != primary) return 1; return 0; } @@ -207,8 +207,6 @@ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) INIT_LIST_HEAD(&q->retransmit); INIT_LIST_HEAD(&q->sacked); INIT_LIST_HEAD(&q->abandoned); - - q->empty = 1; } /* Free the outqueue structure and any related pending chunks. 
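Note on the sctp_packet_transmit() hunk above: the cloned dst is now checked for NULL before it is attached to the new skb, instead of being attached first and checked afterwards. Below is a minimal user-space sketch of that check-before-use ordering; the my_dst/my_skb types and helper names are simplified stand-ins for illustration only, not the kernel's dst/skb API.

#include <stdio.h>

struct my_dst { int refcnt; };
struct my_skb { struct my_dst *dst; };

/* Stand-in for dst_clone(): may return NULL when there is no route. */
static struct my_dst *my_dst_clone(struct my_dst *dst)
{
	if (!dst)
		return NULL;
	dst->refcnt++;
	return dst;
}

/* Corrected ordering: validate the clone, then hand it to the skb. */
static int transmit(struct my_skb *skb, struct my_dst *route)
{
	struct my_dst *dst = my_dst_clone(route);

	if (!dst)
		return -1;	/* no_route: bail out before any use */
	skb->dst = dst;		/* only a known-good dst gets attached */
	return 0;
}

int main(void)
{
	struct my_skb skb = { 0 };

	if (transmit(&skb, NULL) < 0)
		puts("no route: dropped before attaching dst");
	return 0;
}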
@@ -331,7 +329,6 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk) SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS); else SCTP_INC_STATS(net, SCTP_MIB_OUTORDERCHUNKS); - q->empty = 0; break; } } else { @@ -470,7 +467,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, struct net *net = sock_net(q->asoc->base.sk); int error = 0; - switch(reason) { + switch (reason) { case SCTP_RTXR_T3_RTX: SCTP_INC_STATS(net, SCTP_MIB_T3_RETRANSMITS); sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_T3_RTX); @@ -653,7 +650,6 @@ redo: if (chunk->fast_retransmit == SCTP_NEED_FRTX) chunk->fast_retransmit = SCTP_DONT_FRTX; - q->empty = 0; q->asoc->stats.rtxchunks++; break; } @@ -1064,8 +1060,6 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) sctp_transport_reset_timers(transport); - q->empty = 0; - /* Only let one DATA chunk get bundled with a * COOKIE-ECHO chunk. */ @@ -1088,7 +1082,7 @@ sctp_flush_out: * * --xguo */ - while ((ltransport = sctp_list_dequeue(&transport_list)) != NULL ) { + while ((ltransport = sctp_list_dequeue(&transport_list)) != NULL) { struct sctp_transport *t = list_entry(ltransport, struct sctp_transport, send_ready); @@ -1217,7 +1211,7 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) * destinations for which cacc_saw_newack is set. */ if (transport->cacc.cacc_saw_newack) - count_of_newacks ++; + count_of_newacks++; } /* Move the Cumulative TSN Ack Point if appropriate. */ @@ -1274,29 +1268,17 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) "advertised peer ack point:0x%x\n", __func__, asoc, ctsn, asoc->adv_peer_ack_point); - /* See if all chunks are acked. - * Make sure the empty queue handler will get run later. - */ - q->empty = (list_empty(&q->out_chunk_list) && - list_empty(&q->retransmit)); - if (!q->empty) - goto finish; - - list_for_each_entry(transport, transport_list, transports) { - q->empty = q->empty && list_empty(&transport->transmitted); - if (!q->empty) - goto finish; - } - - pr_debug("%s: sack queue is empty\n", __func__); -finish: - return q->empty; + return sctp_outq_is_empty(q); } -/* Is the outqueue empty? */ +/* Is the outqueue empty? + * The queue is empty when we have no pending data, no in-flight data + * and no pending retransmissions.
+ */ int sctp_outq_is_empty(const struct sctp_outq *q) { - return q->empty; + return q->out_qlen == 0 && q->outstanding_bytes == 0 && + list_empty(&q->retransmit); } /******************************************************************** diff --git a/net/sctp/probe.c b/net/sctp/probe.c index 53c452efb40..5e68b94ee64 100644 --- a/net/sctp/probe.c +++ b/net/sctp/probe.c @@ -38,6 +38,7 @@ #include <net/sctp/sctp.h> #include <net/sctp/sm.h> +MODULE_SOFTDEP("pre: sctp"); MODULE_AUTHOR("Wei Yongjun <yjwei@cn.fujitsu.com>"); MODULE_DESCRIPTION("SCTP snooper"); MODULE_LICENSE("GPL"); @@ -182,6 +183,20 @@ static struct jprobe sctp_recv_probe = { .entry = jsctp_sf_eat_sack, }; +static __init int sctp_setup_jprobe(void) +{ + int ret = register_jprobe(&sctp_recv_probe); + + if (ret) { + if (request_module("sctp")) + goto out; + ret = register_jprobe(&sctp_recv_probe); + } + +out: + return ret; +} + static __init int sctpprobe_init(void) { int ret = -ENOMEM; @@ -202,7 +217,7 @@ static __init int sctpprobe_init(void) &sctpprobe_fops)) goto free_kfifo; - ret = register_jprobe(&sctp_recv_probe); + ret = sctp_setup_jprobe(); if (ret) goto remove_proc; diff --git a/net/sctp/proc.c b/net/sctp/proc.c index de32e14f739..34229ee7f37 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -78,7 +78,7 @@ static int sctp_snmp_seq_show(struct seq_file *seq, void *v) for (i = 0; sctp_snmp_list[i].name != NULL; i++) seq_printf(seq, "%-32s\t%ld\n", sctp_snmp_list[i].name, - snmp_fold_field((void __percpu **)net->sctp.sctp_statistics, + snmp_fold_field(net->sctp.sctp_statistics, sctp_snmp_list[i].entry)); return 0; @@ -177,7 +177,7 @@ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_associa rcu_read_unlock(); } -static void * sctp_eps_seq_start(struct seq_file *seq, loff_t *pos) +static void *sctp_eps_seq_start(struct seq_file *seq, loff_t *pos) { if (*pos >= sctp_ep_hashsize) return NULL; @@ -196,7 +196,7 @@ static void sctp_eps_seq_stop(struct seq_file *seq, void *v) } -static void * sctp_eps_seq_next(struct seq_file *seq, void *v, loff_t *pos) +static void *sctp_eps_seq_next(struct seq_file *seq, void *v, loff_t *pos) { if (++*pos >= sctp_ep_hashsize) return NULL; @@ -218,7 +218,7 @@ static int sctp_eps_seq_show(struct seq_file *seq, void *v) return -ENOMEM; head = &sctp_ep_hashtable[hash]; - sctp_local_bh_disable(); + local_bh_disable(); read_lock(&head->lock); sctp_for_each_hentry(epb, &head->chain) { ep = sctp_ep(epb); @@ -235,7 +235,7 @@ static int sctp_eps_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } read_unlock(&head->lock); - sctp_local_bh_enable(); + local_bh_enable(); return 0; } @@ -282,7 +282,7 @@ void sctp_eps_proc_exit(struct net *net) } -static void * sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos) +static void *sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos) { if (*pos >= sctp_assoc_hashsize) return NULL; @@ -305,7 +305,7 @@ static void sctp_assocs_seq_stop(struct seq_file *seq, void *v) } -static void * sctp_assocs_seq_next(struct seq_file *seq, void *v, loff_t *pos) +static void *sctp_assocs_seq_next(struct seq_file *seq, void *v, loff_t *pos) { if (++*pos >= sctp_assoc_hashsize) return NULL; @@ -326,7 +326,7 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) return -ENOMEM; head = &sctp_assoc_hashtable[hash]; - sctp_local_bh_disable(); + local_bh_disable(); read_lock(&head->lock); sctp_for_each_hentry(epb, &head->chain) { assoc = sctp_assoc(epb); @@ -362,7 +362,7 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void 
*v) seq_printf(seq, "\n"); } read_unlock(&head->lock); - sctp_local_bh_enable(); + local_bh_enable(); return 0; } @@ -446,7 +446,7 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) return -ENOMEM; head = &sctp_assoc_hashtable[hash]; - sctp_local_bh_disable(); + local_bh_disable(); read_lock(&head->lock); rcu_read_lock(); sctp_for_each_hentry(epb, &head->chain) { @@ -505,7 +505,7 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) rcu_read_unlock(); read_unlock(&head->lock); - sctp_local_bh_enable(); + local_bh_enable(); return 0; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 19bd4c5bdae..6789d785e69 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -491,8 +491,13 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, continue; if ((laddr->state == SCTP_ADDR_SRC) && (AF_INET == laddr->a.sa.sa_family)) { - fl4->saddr = laddr->a.v4.sin_addr.s_addr; fl4->fl4_sport = laddr->a.v4.sin_port; + flowi4_update_output(fl4, + asoc->base.sk->sk_bound_dev_if, + RT_CONN_FLAGS(asoc->base.sk), + daddr->v4.sin_addr.s_addr, + laddr->a.v4.sin_addr.s_addr); + rt = ip_route_output_key(sock_net(sk), fl4); if (!IS_ERR(rt)) { dst = &rt->dst; @@ -634,10 +639,10 @@ static void sctp_addr_wq_timeout_handler(unsigned long arg) /* ignore bound-specific endpoints */ if (!sctp_is_ep_boundall(sk)) continue; - sctp_bh_lock_sock(sk); + bh_lock_sock(sk); if (sctp_asconf_mgmt(sp, addrw) < 0) pr_debug("%s: sctp_asconf_mgmt failed\n", __func__); - sctp_bh_unlock_sock(sk); + bh_unlock_sock(sk); } #if IS_ENABLED(CONFIG_IPV6) free_next: @@ -957,7 +962,7 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, SCTP_INC_STATS(sock_net(&inet->sk), SCTP_MIB_OUTSCTPPACKS); - return ip_queue_xmit(skb, &transport->fl); + return ip_queue_xmit(&inet->sk, skb, &transport->fl); } static struct sctp_af sctp_af_inet; @@ -1012,7 +1017,6 @@ static struct inet_protosw sctp_seqpacket_protosw = { .protocol = IPPROTO_SCTP, .prot = &sctp_prot, .ops = &inet_seqpacket_ops, - .no_check = 0, .flags = SCTP_PROTOSW_FLAG }; static struct inet_protosw sctp_stream_protosw = { @@ -1020,7 +1024,6 @@ static struct inet_protosw sctp_stream_protosw = { .protocol = IPPROTO_SCTP, .prot = &sctp_prot, .ops = &inet_seqpacket_ops, - .no_check = 0, .flags = SCTP_PROTOSW_FLAG }; @@ -1030,6 +1033,7 @@ static const struct net_protocol sctp_protocol = { .err_handler = sctp_v4_err, .no_policy = 1, .netns_ok = 1, + .icmp_strict_tag_validation = 1, }; /* IPv4 address related functions. 
*/ @@ -1065,8 +1069,8 @@ static struct sctp_af sctp_af_inet = { #endif }; -struct sctp_pf *sctp_get_pf_specific(sa_family_t family) { - +struct sctp_pf *sctp_get_pf_specific(sa_family_t family) +{ switch (family) { case PF_INET: return sctp_pf_inet_specific; @@ -1099,14 +1103,15 @@ int sctp_register_pf(struct sctp_pf *pf, sa_family_t family) static inline int init_sctp_mibs(struct net *net) { - return snmp_mib_init((void __percpu **)net->sctp.sctp_statistics, - sizeof(struct sctp_mib), - __alignof__(struct sctp_mib)); + net->sctp.sctp_statistics = alloc_percpu(struct sctp_mib); + if (!net->sctp.sctp_statistics) + return -ENOMEM; + return 0; } static inline void cleanup_sctp_mibs(struct net *net) { - snmp_mib_free((void __percpu **)net->sctp.sctp_statistics); + free_percpu(net->sctp.sctp_statistics); } static void sctp_v4_pf_init(void) @@ -1460,7 +1465,6 @@ static __init int sctp_init(void) if (status) goto err_v6_add_protocol; - status = 0; out: return status; err_v6_add_protocol: diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index d9aaf9641aa..ae0e616a7ca 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -78,6 +78,8 @@ static int sctp_process_param(struct sctp_association *asoc, gfp_t gfp); static void *sctp_addto_param(struct sctp_chunk *chunk, int len, const void *data); +static void *sctp_addto_chunk_fixed(struct sctp_chunk *, int len, + const void *data); /* Control chunk destructor */ static void sctp_control_release_owner(struct sk_buff *skb) @@ -217,6 +219,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, gfp_t gfp, int vparam_len) { struct net *net = sock_net(asoc->base.sk); + struct sctp_endpoint *ep = asoc->ep; sctp_inithdr_t init; union sctp_params addrs; size_t chunksize; @@ -276,7 +279,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, chunksize += vparam_len; /* Account for AUTH related parameters */ - if (net->sctp.auth_enable) { + if (ep->auth_enable) { /* Add random parameter length*/ chunksize += sizeof(asoc->c.auth_random); @@ -361,7 +364,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, } /* Add SCTP-AUTH chunks to the parameter list */ - if (net->sctp.auth_enable) { + if (ep->auth_enable) { sctp_addto_chunk(retval, sizeof(asoc->c.auth_random), asoc->c.auth_random); if (auth_hmacs) @@ -1419,8 +1422,8 @@ static void sctp_chunk_destroy(struct sctp_chunk *chunk) BUG_ON(!list_empty(&chunk->list)); list_del_init(&chunk->transmitted_list); - /* Free the chunk skb data and the SCTP_chunk stub itself. */ - dev_kfree_skb(chunk->skb); + consume_skb(chunk->skb); + consume_skb(chunk->auth_chunk); SCTP_DBG_OBJCNT_DEC(chunk); kmem_cache_free(sctp_chunk_cachep, chunk); @@ -1475,8 +1478,8 @@ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data) /* Append bytes to the end of a chunk. 
Returns NULL if there isn't sufficient * space in the chunk */ -void *sctp_addto_chunk_fixed(struct sctp_chunk *chunk, - int len, const void *data) +static void *sctp_addto_chunk_fixed(struct sctp_chunk *chunk, + int len, const void *data) { if (skb_tailroom(chunk->skb) >= len) return sctp_addto_chunk(chunk, len, data); @@ -1779,7 +1782,7 @@ no_hmac: else kt = ktime_get(); - if (!asoc && ktime_compare(bear_cookie->expiration, kt) < 0) { + if (!asoc && ktime_before(bear_cookie->expiration, kt)) { /* * Section 3.3.10.3 Stale Cookie Error (3) * @@ -1967,13 +1970,13 @@ static int sctp_verify_ext_param(struct net *net, union sctp_params param) for (i = 0; i < num_ext; i++) { switch (param.ext->chunks[i]) { - case SCTP_CID_AUTH: - have_auth = 1; - break; - case SCTP_CID_ASCONF: - case SCTP_CID_ASCONF_ACK: - have_asconf = 1; - break; + case SCTP_CID_AUTH: + have_auth = 1; + break; + case SCTP_CID_ASCONF: + case SCTP_CID_ASCONF_ACK: + have_asconf = 1; + break; } } @@ -2000,25 +2003,24 @@ static void sctp_process_ext_param(struct sctp_association *asoc, for (i = 0; i < num_ext; i++) { switch (param.ext->chunks[i]) { - case SCTP_CID_FWD_TSN: - if (net->sctp.prsctp_enable && - !asoc->peer.prsctp_capable) + case SCTP_CID_FWD_TSN: + if (net->sctp.prsctp_enable && !asoc->peer.prsctp_capable) asoc->peer.prsctp_capable = 1; - break; - case SCTP_CID_AUTH: - /* if the peer reports AUTH, assume that he - * supports AUTH. - */ - if (net->sctp.auth_enable) - asoc->peer.auth_capable = 1; - break; - case SCTP_CID_ASCONF: - case SCTP_CID_ASCONF_ACK: - if (net->sctp.addip_enable) - asoc->peer.asconf_capable = 1; - break; - default: - break; + break; + case SCTP_CID_AUTH: + /* if the peer reports AUTH, assume that he + * supports AUTH. + */ + if (asoc->ep->auth_enable) + asoc->peer.auth_capable = 1; + break; + case SCTP_CID_ASCONF: + case SCTP_CID_ASCONF_ACK: + if (net->sctp.addip_enable) + asoc->peer.asconf_capable = 1; + break; + default: + break; } } } @@ -2101,6 +2103,7 @@ static sctp_ierror_t sctp_process_unk_param(const struct sctp_association *asoc, * SCTP_IERROR_NO_ERROR - continue with the chunk */ static sctp_ierror_t sctp_verify_param(struct net *net, + const struct sctp_endpoint *ep, const struct sctp_association *asoc, union sctp_params param, sctp_cid_t cid, @@ -2151,7 +2154,7 @@ static sctp_ierror_t sctp_verify_param(struct net *net, goto fallthrough; case SCTP_PARAM_RANDOM: - if (!net->sctp.auth_enable) + if (!ep->auth_enable) goto fallthrough; /* SCTP-AUTH: Secion 6.1 @@ -2168,7 +2171,7 @@ static sctp_ierror_t sctp_verify_param(struct net *net, break; case SCTP_PARAM_CHUNKS: - if (!net->sctp.auth_enable) + if (!ep->auth_enable) goto fallthrough; /* SCTP-AUTH: Section 3.2 @@ -2184,7 +2187,7 @@ static sctp_ierror_t sctp_verify_param(struct net *net, break; case SCTP_PARAM_HMAC_ALGO: - if (!net->sctp.auth_enable) + if (!ep->auth_enable) goto fallthrough; hmacs = (struct sctp_hmac_algo_param *)param.p; @@ -2219,10 +2222,9 @@ fallthrough: } /* Verify the INIT packet before we process it. */ -int sctp_verify_init(struct net *net, const struct sctp_association *asoc, - sctp_cid_t cid, - sctp_init_chunk_t *peer_init, - struct sctp_chunk *chunk, +int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep, + const struct sctp_association *asoc, sctp_cid_t cid, + sctp_init_chunk_t *peer_init, struct sctp_chunk *chunk, struct sctp_chunk **errp) { union sctp_params param; @@ -2251,7 +2253,7 @@ int sctp_verify_init(struct net *net, const struct sctp_association *asoc, * VIOLATION error. 
We build the ERROR chunk here and let the normal * error handling code build and send the packet. */ - if (param.v != (void*)chunk->chunk_end) + if (param.v != (void *)chunk->chunk_end) return sctp_process_inv_paramlength(asoc, param.p, chunk, errp); /* The only missing mandatory param possible today is @@ -2263,17 +2265,17 @@ int sctp_verify_init(struct net *net, const struct sctp_association *asoc, /* Verify all the variable length parameters */ sctp_walk_params(param, peer_init, init_hdr.params) { - - result = sctp_verify_param(net, asoc, param, cid, chunk, errp); + result = sctp_verify_param(net, ep, asoc, param, cid, + chunk, errp); switch (result) { - case SCTP_IERROR_ABORT: - case SCTP_IERROR_NOMEM: - return 0; - case SCTP_IERROR_ERROR: - return 1; - case SCTP_IERROR_NO_ERROR: - default: - break; + case SCTP_IERROR_ABORT: + case SCTP_IERROR_NOMEM: + return 0; + case SCTP_IERROR_ERROR: + return 1; + case SCTP_IERROR_NO_ERROR: + default: + break; } } /* for (loop through all parameters) */ @@ -2308,7 +2310,7 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, * added as the primary transport. The source address seems to * be a a better choice than any of the embedded addresses. */ - if(!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE)) + if (!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE)) goto nomem; if (sctp_cmp_addr_exact(sctp_source(chunk), peer_addr)) @@ -2496,6 +2498,7 @@ static int sctp_process_param(struct sctp_association *asoc, struct sctp_af *af; union sctp_addr_param *addr_param; struct sctp_transport *t; + struct sctp_endpoint *ep = asoc->ep; /* We maintain all INIT parameters in network byte order all the * time. This allows us to not worry about whether the parameters @@ -2635,7 +2638,7 @@ do_addr_param: goto fall_through; case SCTP_PARAM_RANDOM: - if (!net->sctp.auth_enable) + if (!ep->auth_enable) goto fall_through; /* Save peer's random parameter */ @@ -2648,7 +2651,7 @@ do_addr_param: break; case SCTP_PARAM_HMAC_ALGO: - if (!net->sctp.auth_enable) + if (!ep->auth_enable) goto fall_through; /* Save peer's HMAC list */ @@ -2664,7 +2667,7 @@ do_addr_param: break; case SCTP_PARAM_CHUNKS: - if (!net->sctp.auth_enable) + if (!ep->auth_enable) goto fall_through; asoc->peer.peer_chunks = kmemdup(param.p, @@ -3334,7 +3337,7 @@ static __be16 sctp_get_asconf_response(struct sctp_chunk *asconf_ack, while (asconf_ack_len > 0) { if (asconf_ack_param->crr_id == asconf_param->crr_id) { - switch(asconf_ack_param->param_hdr.type) { + switch (asconf_ack_param->param_hdr.type) { case SCTP_PARAM_SUCCESS_REPORT: return SCTP_ERROR_NO_ERROR; case SCTP_PARAM_ERR_CAUSE: diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 02b7ad1ff46..fef2acdf4a2 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -248,7 +248,7 @@ void sctp_generate_t3_rtx_event(unsigned long peer) /* Check whether a task is in the sock. 
*/ - sctp_bh_lock_sock(asoc->base.sk); + bh_lock_sock(asoc->base.sk); if (sock_owned_by_user(asoc->base.sk)) { pr_debug("%s: sock is busy\n", __func__); @@ -275,7 +275,7 @@ void sctp_generate_t3_rtx_event(unsigned long peer) asoc->base.sk->sk_err = -error; out_unlock: - sctp_bh_unlock_sock(asoc->base.sk); + bh_unlock_sock(asoc->base.sk); sctp_transport_put(transport); } @@ -288,7 +288,7 @@ static void sctp_generate_timeout_event(struct sctp_association *asoc, struct net *net = sock_net(asoc->base.sk); int error = 0; - sctp_bh_lock_sock(asoc->base.sk); + bh_lock_sock(asoc->base.sk); if (sock_owned_by_user(asoc->base.sk)) { pr_debug("%s: sock is busy: timer %d\n", __func__, timeout_type); @@ -315,7 +315,7 @@ static void sctp_generate_timeout_event(struct sctp_association *asoc, asoc->base.sk->sk_err = -error; out_unlock: - sctp_bh_unlock_sock(asoc->base.sk); + bh_unlock_sock(asoc->base.sk); sctp_association_put(asoc); } @@ -367,7 +367,7 @@ void sctp_generate_heartbeat_event(unsigned long data) struct sctp_association *asoc = transport->asoc; struct net *net = sock_net(asoc->base.sk); - sctp_bh_lock_sock(asoc->base.sk); + bh_lock_sock(asoc->base.sk); if (sock_owned_by_user(asoc->base.sk)) { pr_debug("%s: sock is busy\n", __func__); @@ -392,7 +392,7 @@ void sctp_generate_heartbeat_event(unsigned long data) asoc->base.sk->sk_err = -error; out_unlock: - sctp_bh_unlock_sock(asoc->base.sk); + bh_unlock_sock(asoc->base.sk); sctp_transport_put(transport); } @@ -404,8 +404,8 @@ void sctp_generate_proto_unreach_event(unsigned long data) struct sctp_transport *transport = (struct sctp_transport *) data; struct sctp_association *asoc = transport->asoc; struct net *net = sock_net(asoc->base.sk); - - sctp_bh_lock_sock(asoc->base.sk); + + bh_lock_sock(asoc->base.sk); if (sock_owned_by_user(asoc->base.sk)) { pr_debug("%s: sock is busy\n", __func__); @@ -427,7 +427,7 @@ void sctp_generate_proto_unreach_event(unsigned long data) asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); out_unlock: - sctp_bh_unlock_sock(asoc->base.sk); + bh_unlock_sock(asoc->base.sk); sctp_association_put(asoc); } @@ -495,11 +495,11 @@ static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands, } /* If the transport error count is greater than the pf_retrans - * threshold, and less than pathmaxrtx, then mark this transport - * as Partially Failed, ee SCTP Quick Failover Draft, secon 5.1, - * point 1 + * threshold, and less than pathmaxrtx, and if the current state + * is SCTP_ACTIVE, then mark this transport as Partially Failed, + * see SCTP Quick Failover Draft, section 5.1 */ - if ((transport->state != SCTP_PF) && + if ((transport->state == SCTP_ACTIVE) && (asoc->pf_retrans < transport->pathmaxrxt) && (transport->error_count > asoc->pf_retrans)) { @@ -543,7 +543,7 @@ static void sctp_cmd_init_failed(sctp_cmd_seq_t *commands, { struct sctp_ulpevent *event; - event = sctp_ulpevent_make_assoc_change(asoc,0, SCTP_CANT_STR_ASSOC, + event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_CANT_STR_ASSOC, (__u16)error, 0, 0, NULL, GFP_ATOMIC); @@ -1115,7 +1115,7 @@ int sctp_do_sm(struct net *net, sctp_event_t event_type, sctp_subtype_t subtype, sctp_init_cmd_seq(&commands); debug_pre_sfn(); - status = (*state_fn->fn)(net, ep, asoc, subtype, event_arg, &commands); + status = state_fn->fn(net, ep, asoc, subtype, event_arg, &commands); debug_post_sfn(); error = sctp_side_effects(event_type, subtype, state, diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index dd0eba919a8..5170a1ff95a 100644 --- a/net/sctp/sm_statefuns.c 
+++ b/net/sctp/sm_statefuns.c @@ -357,7 +357,7 @@ sctp_disposition_t sctp_sf_do_5_1B_init(struct net *net, /* Verify the INIT chunk before processing it. */ err_chunk = NULL; - if (!sctp_verify_init(net, asoc, chunk->chunk_hdr->type, + if (!sctp_verify_init(net, ep, asoc, chunk->chunk_hdr->type, (sctp_init_chunk_t *)chunk->chunk_hdr, chunk, &err_chunk)) { /* This chunk contains fatal error. It is to be discarded. @@ -524,7 +524,7 @@ sctp_disposition_t sctp_sf_do_5_1C_ack(struct net *net, /* Verify the INIT chunk before processing it. */ err_chunk = NULL; - if (!sctp_verify_init(net, asoc, chunk->chunk_hdr->type, + if (!sctp_verify_init(net, ep, asoc, chunk->chunk_hdr->type, (sctp_init_chunk_t *)chunk->chunk_hdr, chunk, &err_chunk)) { @@ -758,6 +758,12 @@ sctp_disposition_t sctp_sf_do_5_1D_ce(struct net *net, struct sctp_chunk auth; sctp_ierror_t ret; + /* Make sure that we and the peer are AUTH capable */ + if (!net->sctp.auth_enable || !new_asoc->peer.auth_capable) { + sctp_association_free(new_asoc); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + } + /* set-up our fake chunk so that we can process it */ auth.skb = chunk->auth_chunk; auth.asoc = chunk->asoc; @@ -768,10 +774,6 @@ sctp_disposition_t sctp_sf_do_5_1D_ce(struct net *net, auth.transport = chunk->transport; ret = sctp_sf_authenticate(net, ep, new_asoc, type, &auth); - - /* We can now safely free the auth_chunk clone */ - kfree_skb(chunk->auth_chunk); - if (ret != SCTP_IERROR_NO_ERROR) { sctp_association_free(new_asoc); return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); @@ -819,7 +821,7 @@ sctp_disposition_t sctp_sf_do_5_1D_ce(struct net *net, SCTP_INC_STATS(net, SCTP_MIB_PASSIVEESTABS); sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL()); - if (new_asoc->autoclose) + if (new_asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START, SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); @@ -907,7 +909,7 @@ sctp_disposition_t sctp_sf_do_5_1E_ca(struct net *net, SCTP_INC_STATS(net, SCTP_MIB_CURRESTAB); SCTP_INC_STATS(net, SCTP_MIB_ACTIVEESTABS); sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL()); - if (asoc->autoclose) + if (asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START, SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); @@ -1428,7 +1430,7 @@ static sctp_disposition_t sctp_sf_do_unexpected_init( /* Verify the INIT chunk before processing it. */ err_chunk = NULL; - if (!sctp_verify_init(net, asoc, chunk->chunk_hdr->type, + if (!sctp_verify_init(net, ep, asoc, chunk->chunk_hdr->type, (sctp_init_chunk_t *)chunk->chunk_hdr, chunk, &err_chunk)) { /* This chunk contains fatal error. It is to be discarded. 
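The state-function hunks above and below replace tests of asoc->autoclose with tests of asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]; together with the clamp against net->sctp.max_autoclose added to sctp_setsockopt_autoclose() in socket.c, the timeout slot becomes the single place that says whether autoclose is armed (zero means disabled). A rough user-space model of that idea follows; the assoc type is simplified and MAX_AUTOCLOSE_SECS is an illustrative constant standing in for the sysctl, not the kernel's actual values.

#include <stdbool.h>
#include <stdio.h>

#define TIMEOUT_AUTOCLOSE	0
#define MAX_AUTOCLOSE_SECS	3600UL	/* stand-in for net->sctp.max_autoclose */

struct assoc {
	unsigned long timeouts[8];	/* jiffies-like values, 0 = timer disabled */
};

/* Clamp the user-supplied value once, at set time (cf. sctp_setsockopt_autoclose). */
static void set_autoclose(struct assoc *a, unsigned long secs)
{
	if (secs > MAX_AUTOCLOSE_SECS)
		secs = MAX_AUTOCLOSE_SECS;
	a->timeouts[TIMEOUT_AUTOCLOSE] = secs;	/* the kernel stores this in jiffies */
}

/* State-machine side: the slot itself says whether the timer is armed. */
static bool autoclose_enabled(const struct assoc *a)
{
	return a->timeouts[TIMEOUT_AUTOCLOSE] != 0;
}

int main(void)
{
	struct assoc a = { { 0 } };

	set_autoclose(&a, 7200);	/* request above the cap gets clamped */
	printf("autoclose %s, value %lu\n",
	       autoclose_enabled(&a) ? "on" : "off",
	       a.timeouts[TIMEOUT_AUTOCLOSE]);
	return 0;
}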
@@ -2945,7 +2947,7 @@ sctp_disposition_t sctp_sf_eat_data_6_2(struct net *net, return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); - error = sctp_eat_data(asoc, chunk, commands ); + error = sctp_eat_data(asoc, chunk, commands); switch (error) { case SCTP_IERROR_NO_ERROR: break; @@ -2969,7 +2971,7 @@ sctp_disposition_t sctp_sf_eat_data_6_2(struct net *net, if (chunk->chunk_hdr->flags & SCTP_DATA_SACK_IMM) force = SCTP_FORCE(); - if (asoc->autoclose) { + if (asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) { sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); } @@ -3066,7 +3068,7 @@ sctp_disposition_t sctp_sf_eat_data_fast_4_4(struct net *net, return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); - error = sctp_eat_data(asoc, chunk, commands ); + error = sctp_eat_data(asoc, chunk, commands); switch (error) { case SCTP_IERROR_NO_ERROR: case SCTP_IERROR_HIGH_TSN: @@ -3681,8 +3683,7 @@ sctp_disposition_t sctp_sf_do_asconf(struct net *net, asconf_ack->dest = chunk->source; sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(asconf_ack)); if (asoc->new_transport) { - sctp_sf_heartbeat(ep, asoc, type, asoc->new_transport, - commands); + sctp_sf_heartbeat(ep, asoc, type, asoc->new_transport, commands); ((struct sctp_association *)asoc)->new_transport = NULL; } @@ -3765,7 +3766,7 @@ sctp_disposition_t sctp_sf_do_asconf_ack(struct net *net, */ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); - sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET,SCTP_NULL()); + sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET, SCTP_NULL()); sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNABORTED)); sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, @@ -3799,7 +3800,7 @@ sctp_disposition_t sctp_sf_do_asconf_ack(struct net *net, /* We are going to ABORT, so we might as well stop * processing the rest of the chunks in the packet. */ - sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET,SCTP_NULL()); + sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET, SCTP_NULL()); sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNABORTED)); sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, @@ -3877,7 +3878,7 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(struct net *net, SCTP_CHUNK(chunk)); /* Count this as receiving DATA. 
*/ - if (asoc->autoclose) { + if (asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) { sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); } @@ -4451,7 +4452,7 @@ static sctp_disposition_t sctp_sf_violation_chunklen( void *arg, sctp_cmd_seq_t *commands) { - static const char err_str[]="The following chunk had invalid length:"; + static const char err_str[] = "The following chunk had invalid length:"; return sctp_sf_abort_violation(net, ep, asoc, arg, commands, err_str, sizeof(err_str)); @@ -4514,7 +4515,7 @@ static sctp_disposition_t sctp_sf_violation_ctsn( void *arg, sctp_cmd_seq_t *commands) { - static const char err_str[]="The cumulative tsn ack beyond the max tsn currently sent:"; + static const char err_str[] = "The cumulative tsn ack beyond the max tsn currently sent:"; return sctp_sf_abort_violation(net, ep, asoc, arg, commands, err_str, sizeof(err_str)); @@ -4534,7 +4535,7 @@ static sctp_disposition_t sctp_sf_violation_chunk( void *arg, sctp_cmd_seq_t *commands) { - static const char err_str[]="The following chunk violates protocol:"; + static const char err_str[] = "The following chunk violates protocol:"; if (!asoc) return sctp_sf_violation(net, ep, asoc, type, arg, commands); @@ -4610,7 +4611,7 @@ sctp_disposition_t sctp_sf_do_prm_asoc(struct net *net, sctp_cmd_seq_t *commands) { struct sctp_chunk *repl; - struct sctp_association* my_asoc; + struct sctp_association *my_asoc; /* The comment below says that we enter COOKIE-WAIT AFTER * sending the INIT, but that doesn't actually work in our @@ -5266,7 +5267,7 @@ sctp_disposition_t sctp_sf_do_9_2_start_shutdown( sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD)); - if (asoc->autoclose) + if (asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); @@ -5345,7 +5346,7 @@ sctp_disposition_t sctp_sf_do_9_2_shutdown_ack( sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); - if (asoc->autoclose) + if (asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); @@ -6000,7 +6001,7 @@ static struct sctp_packet *sctp_ootb_pkt_new(struct net *net, /* Special case the INIT-ACK as there is no peer's vtag * yet. */ - switch(chunk->chunk_hdr->type) { + switch (chunk->chunk_hdr->type) { case SCTP_CID_INIT_ACK: { sctp_initack_chunk_t *initack; @@ -6017,7 +6018,7 @@ static struct sctp_packet *sctp_ootb_pkt_new(struct net *net, /* Special case the INIT and stale COOKIE_ECHO as there is no * vtag yet. */ - switch(chunk->chunk_hdr->type) { + switch (chunk->chunk_hdr->type) { case SCTP_CID_INIT: { sctp_init_chunk_t *init; @@ -6207,7 +6208,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, */ if (*sk->sk_prot_creator->memory_pressure) { if (sctp_tsnmap_has_gap(map) && - (sctp_tsnmap_get_ctsn(map) + 1) == tsn) { + (sctp_tsnmap_get_ctsn(map) + 1) == tsn) { pr_debug("%s: under pressure, reneging for tsn:%u\n", __func__, tsn); deliver = SCTP_CMD_RENEGE; @@ -6231,7 +6232,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, /* We are going to ABORT, so we might as well stop * processing the rest of the chunks in the packet. 
*/ - sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET,SCTP_NULL()); + sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET, SCTP_NULL()); sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNABORTED)); sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index a4f17437fb2..a987d54b379 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -69,7 +69,7 @@ static const sctp_sm_table_entry_t bug = { if ((event_subtype._type > (_max))) { \ pr_warn("table %p possible attack: event %d exceeds max %d\n", \ _table, event_subtype._type, _max); \ - rtn = &bug; \ + rtn = &bug; \ } else \ rtn = &_table[event_subtype._type][(int)state]; \ \ diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 191cd925780..42989968940 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -64,12 +64,14 @@ #include <linux/crypto.h> #include <linux/slab.h> #include <linux/file.h> +#include <linux/compat.h> #include <net/ip.h> #include <net/icmp.h> #include <net/route.h> #include <net/ipv6.h> #include <net/inet_common.h> +#include <net/busy_poll.h> #include <linux/socket.h> /* for sa_family_t */ #include <linux/export.h> @@ -82,7 +84,7 @@ static int sctp_writeable(struct sock *sk); static void sctp_wfree(struct sk_buff *skb); static int sctp_wait_for_sndbuf(struct sctp_association *, long *timeo_p, size_t msg_len); -static int sctp_wait_for_packet(struct sock * sk, int *err, long *timeo_p); +static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p); static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p); static int sctp_wait_for_accept(struct sock *sk, long timeo); static void sctp_wait_for_close(struct sock *sk, long timeo); @@ -272,7 +274,7 @@ static int sctp_bind(struct sock *sk, struct sockaddr *addr, int addr_len) { int retval = 0; - sctp_lock_sock(sk); + lock_sock(sk); pr_debug("%s: sk:%p, addr:%p, addr_len:%d\n", __func__, sk, addr, addr_len); @@ -284,7 +286,7 @@ static int sctp_bind(struct sock *sk, struct sockaddr *addr, int addr_len) else retval = -EINVAL; - sctp_release_sock(sk); + release_sock(sk); return retval; } @@ -952,7 +954,7 @@ int sctp_asconf_mgmt(struct sctp_sock *sp, struct sctp_sockaddr_entry *addrw) * * Returns 0 if ok, <0 errno code on error. */ -static int sctp_setsockopt_bindx(struct sock* sk, +static int sctp_setsockopt_bindx(struct sock *sk, struct sockaddr __user *addrs, int addrs_size, int op) { @@ -1039,7 +1041,7 @@ out: * Common routine for handling connect() and sctp_connectx(). * Connect will come in with just a single address. */ -static int __sctp_connect(struct sock* sk, +static int __sctp_connect(struct sock *sk, struct sockaddr *kaddrs, int addrs_size, sctp_assoc_t *assoc_id) @@ -1299,7 +1301,7 @@ out_free: * * Returns >=0 if ok, <0 errno code on error. */ -static int __sctp_setsockopt_connectx(struct sock* sk, +static int __sctp_setsockopt_connectx(struct sock *sk, struct sockaddr __user *addrs, int addrs_size, sctp_assoc_t *assoc_id) @@ -1337,7 +1339,7 @@ static int __sctp_setsockopt_connectx(struct sock* sk, * This is an older interface. It's kept for backward compatibility * to the option that doesn't provide association id. */ -static int sctp_setsockopt_connectx_old(struct sock* sk, +static int sctp_setsockopt_connectx_old(struct sock *sk, struct sockaddr __user *addrs, int addrs_size) { @@ -1350,7 +1352,7 @@ static int sctp_setsockopt_connectx_old(struct sock* sk, * indication to the call. 
Error is always negative and association id is * always positive. */ -static int sctp_setsockopt_connectx(struct sock* sk, +static int sctp_setsockopt_connectx(struct sock *sk, struct sockaddr __user *addrs, int addrs_size) { @@ -1368,12 +1370,20 @@ static int sctp_setsockopt_connectx(struct sock* sk, /* * New (hopefully final) interface for the API. * We use the sctp_getaddrs_old structure so that use-space library - * can avoid any unnecessary allocations. The only defferent part + * can avoid any unnecessary allocations. The only different part * is that we store the actual length of the address buffer into the - * addrs_num structure member. That way we can re-use the existing + * addrs_num structure member. That way we can re-use the existing * code. */ -static int sctp_getsockopt_connectx3(struct sock* sk, int len, +#ifdef CONFIG_COMPAT +struct compat_sctp_getaddrs_old { + sctp_assoc_t assoc_id; + s32 addr_num; + compat_uptr_t addrs; /* struct sockaddr * */ +}; +#endif + +static int sctp_getsockopt_connectx3(struct sock *sk, int len, char __user *optval, int __user *optlen) { @@ -1381,16 +1391,30 @@ static int sctp_getsockopt_connectx3(struct sock* sk, int len, sctp_assoc_t assoc_id = 0; int err = 0; - if (len < sizeof(param)) - return -EINVAL; +#ifdef CONFIG_COMPAT + if (is_compat_task()) { + struct compat_sctp_getaddrs_old param32; - if (copy_from_user(¶m, optval, sizeof(param))) - return -EFAULT; + if (len < sizeof(param32)) + return -EINVAL; + if (copy_from_user(¶m32, optval, sizeof(param32))) + return -EFAULT; - err = __sctp_setsockopt_connectx(sk, - (struct sockaddr __user *)param.addrs, - param.addr_num, &assoc_id); + param.assoc_id = param32.assoc_id; + param.addr_num = param32.addr_num; + param.addrs = compat_ptr(param32.addrs); + } else +#endif + { + if (len < sizeof(param)) + return -EINVAL; + if (copy_from_user(¶m, optval, sizeof(param))) + return -EFAULT; + } + err = __sctp_setsockopt_connectx(sk, (struct sockaddr __user *) + param.addrs, param.addr_num, + &assoc_id); if (err == 0 || err == -EINPROGRESS) { if (copy_to_user(optval, &assoc_id, sizeof(assoc_id))) return -EFAULT; @@ -1461,7 +1485,7 @@ static void sctp_close(struct sock *sk, long timeout) pr_debug("%s: sk:%p, timeout:%ld\n", __func__, sk, timeout); - sctp_lock_sock(sk); + lock_sock(sk); sk->sk_shutdown = SHUTDOWN_MASK; sk->sk_state = SCTP_SS_CLOSING; @@ -1505,13 +1529,13 @@ static void sctp_close(struct sock *sk, long timeout) sctp_wait_for_close(sk, timeout); /* This will run the backlog queue. */ - sctp_release_sock(sk); + release_sock(sk); /* Supposedly, no process has access to the socket, but * the net layers still may. */ - sctp_local_bh_disable(); - sctp_bh_lock_sock(sk); + local_bh_disable(); + bh_lock_sock(sk); /* Hold the sock, since sk_common_release() will put sock_put() * and we have just a little more cleanup. 
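The SCTP_SOCKOPT_CONNECTX3 hunk above teaches sctp_getsockopt_connectx3() to accept the 32-bit layout of sctp_getaddrs_old by reading a compat struct whose pointer member is a compat_uptr_t and widening it with compat_ptr(). The sketch below shows the same translation step in plain user-space C; the struct and helper names are illustrative only, and the hard-coded values stand in for data the kernel would copy_from_user().

#include <stdint.h>
#include <stdio.h>

/* Native (64-bit) layout used by the regular path. */
struct getaddrs_old {
	int	assoc_id;
	int	addr_num;
	void	*addrs;
};

/* What a 32-bit caller actually lays out: the pointer is a 32-bit value. */
struct compat_getaddrs_old {
	int32_t		assoc_id;
	int32_t		addr_num;
	uint32_t	addrs;		/* compat_uptr_t equivalent */
};

/* Widen the 32-bit user pointer (stand-in for compat_ptr()). */
static void *widen_ptr(uint32_t uptr)
{
	return (void *)(uintptr_t)uptr;
}

/* Translate the compat layout into the native one before common code runs. */
static void from_compat(struct getaddrs_old *out,
			const struct compat_getaddrs_old *in)
{
	out->assoc_id = in->assoc_id;
	out->addr_num = in->addr_num;
	out->addrs    = widen_ptr(in->addrs);
}

int main(void)
{
	struct compat_getaddrs_old c = { 1, 2, 0x1000 };	/* fake user data */
	struct getaddrs_old n;

	from_compat(&n, &c);
	printf("assoc %d, %d addrs at %p\n", n.assoc_id, n.addr_num, n.addrs);
	return 0;
}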
@@ -1519,8 +1543,8 @@ static void sctp_close(struct sock *sk, long timeout) sock_hold(sk); sk_common_release(sk); - sctp_bh_unlock_sock(sk); - sctp_local_bh_enable(); + bh_unlock_sock(sk); + local_bh_enable(); sock_put(sk); @@ -1568,7 +1592,7 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, struct net *net = sock_net(sk); struct sctp_sock *sp; struct sctp_endpoint *ep; - struct sctp_association *new_asoc=NULL, *asoc=NULL; + struct sctp_association *new_asoc = NULL, *asoc = NULL; struct sctp_transport *transport, *chunk_tp; struct sctp_chunk *chunk; union sctp_addr to; @@ -1665,7 +1689,7 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, pr_debug("%s: about to look up association\n", __func__); - sctp_lock_sock(sk); + lock_sock(sk); /* If a msg_name has been specified, assume this is to be used. */ if (msg_name) { @@ -1743,7 +1767,7 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, * either the default or the user specified stream counts. */ if (sinfo) { - if (!sinit || (sinit && !sinit->sinit_num_ostreams)) { + if (!sinit || !sinit->sinit_num_ostreams) { /* Check against the defaults. */ if (sinfo->sinfo_stream >= sp->initmsg.sinit_num_ostreams) { @@ -1949,7 +1973,7 @@ out_free: sctp_association_free(asoc); } out_unlock: - sctp_release_sock(sk); + release_sock(sk); out_nounlock: return sctp_error(sk, msg_flags, err); @@ -2035,7 +2059,7 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, "addr_len:%p)\n", __func__, sk, msg, len, noblock, flags, addr_len); - sctp_lock_sock(sk); + lock_sock(sk); if (sctp_style(sk, TCP) && !sctp_sstate(sk, ESTABLISHED)) { err = -ENOTCONN; @@ -2119,7 +2143,7 @@ out_free: sctp_ulpevent_free(event); } out: - sctp_release_sock(sk); + release_sock(sk); return err; } @@ -2195,6 +2219,7 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); + struct net *net = sock_net(sk); /* Applicable to UDP-style socket only */ if (sctp_style(sk, TCP)) @@ -2204,6 +2229,9 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, if (copy_from_user(&sp->autoclose, optval, optlen)) return -EFAULT; + if (sp->autoclose > net->sctp.max_autoclose) + sp->autoclose = net->sctp.max_autoclose; + return 0; } @@ -2458,7 +2486,7 @@ static int sctp_setsockopt_peer_addr_params(struct sock *sk, int hb_change, pmtud_change, sackdelay_change; if (optlen != sizeof(struct sctp_paddrparams)) - return - EINVAL; + return -EINVAL; if (copy_from_user(¶ms, optval, optlen)) return -EFAULT; @@ -2479,7 +2507,7 @@ static int sctp_setsockopt_peer_addr_params(struct sock *sk, /* If an address other than INADDR_ANY is specified, and * no transport is found, then the request is invalid. */ - if (!sctp_is_any(sk, ( union sctp_addr *)¶ms.spp_address)) { + if (!sctp_is_any(sk, (union sctp_addr *)¶ms.spp_address)) { trans = sctp_addr_id2transport(sk, ¶ms.spp_address, params.spp_assoc_id); if (!trans) @@ -2523,6 +2551,16 @@ static int sctp_setsockopt_peer_addr_params(struct sock *sk, return 0; } +static inline __u32 sctp_spp_sackdelay_enable(__u32 param_flags) +{ + return (param_flags & ~SPP_SACKDELAY) | SPP_SACKDELAY_ENABLE; +} + +static inline __u32 sctp_spp_sackdelay_disable(__u32 param_flags) +{ + return (param_flags & ~SPP_SACKDELAY) | SPP_SACKDELAY_DISABLE; +} + /* * 7.1.23. 
Get or set delayed ack timer (SCTP_DELAYED_SACK) * @@ -2574,8 +2612,11 @@ static int sctp_setsockopt_delayed_ack(struct sock *sk, if (params.sack_delay == 0 && params.sack_freq == 0) return 0; } else if (optlen == sizeof(struct sctp_assoc_value)) { - pr_warn("Use of struct sctp_assoc_value in delayed_ack socket option deprecated\n"); - pr_warn("Use struct sctp_sack_info instead\n"); + pr_warn_ratelimited(DEPRECATED + "%s (pid %d) " + "Use of struct sctp_assoc_value in delayed_ack socket option.\n" + "Use struct sctp_sack_info instead\n", + current->comm, task_pid_nr(current)); if (copy_from_user(¶ms, optval, optlen)) return -EFAULT; @@ -2584,7 +2625,7 @@ static int sctp_setsockopt_delayed_ack(struct sock *sk, else params.sack_freq = 0; } else - return - EINVAL; + return -EINVAL; /* Validate value parameter. */ if (params.sack_delay > 500) @@ -2603,37 +2644,31 @@ static int sctp_setsockopt_delayed_ack(struct sock *sk, asoc->sackdelay = msecs_to_jiffies(params.sack_delay); asoc->param_flags = - (asoc->param_flags & ~SPP_SACKDELAY) | - SPP_SACKDELAY_ENABLE; + sctp_spp_sackdelay_enable(asoc->param_flags); } else { sp->sackdelay = params.sack_delay; sp->param_flags = - (sp->param_flags & ~SPP_SACKDELAY) | - SPP_SACKDELAY_ENABLE; + sctp_spp_sackdelay_enable(sp->param_flags); } } if (params.sack_freq == 1) { if (asoc) { asoc->param_flags = - (asoc->param_flags & ~SPP_SACKDELAY) | - SPP_SACKDELAY_DISABLE; + sctp_spp_sackdelay_disable(asoc->param_flags); } else { sp->param_flags = - (sp->param_flags & ~SPP_SACKDELAY) | - SPP_SACKDELAY_DISABLE; + sctp_spp_sackdelay_disable(sp->param_flags); } } else if (params.sack_freq > 1) { if (asoc) { asoc->sackfreq = params.sack_freq; asoc->param_flags = - (asoc->param_flags & ~SPP_SACKDELAY) | - SPP_SACKDELAY_ENABLE; + sctp_spp_sackdelay_enable(asoc->param_flags); } else { sp->sackfreq = params.sack_freq; sp->param_flags = - (sp->param_flags & ~SPP_SACKDELAY) | - SPP_SACKDELAY_ENABLE; + sctp_spp_sackdelay_enable(sp->param_flags); } } @@ -2645,18 +2680,15 @@ static int sctp_setsockopt_delayed_ack(struct sock *sk, trans->sackdelay = msecs_to_jiffies(params.sack_delay); trans->param_flags = - (trans->param_flags & ~SPP_SACKDELAY) | - SPP_SACKDELAY_ENABLE; + sctp_spp_sackdelay_enable(trans->param_flags); } if (params.sack_freq == 1) { trans->param_flags = - (trans->param_flags & ~SPP_SACKDELAY) | - SPP_SACKDELAY_DISABLE; + sctp_spp_sackdelay_disable(trans->param_flags); } else if (params.sack_freq > 1) { trans->sackfreq = params.sack_freq; trans->param_flags = - (trans->param_flags & ~SPP_SACKDELAY) | - SPP_SACKDELAY_ENABLE; + sctp_spp_sackdelay_enable(trans->param_flags); } } } @@ -2810,6 +2842,8 @@ static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval, unsigne { struct sctp_rtoinfo rtoinfo; struct sctp_association *asoc; + unsigned long rto_min, rto_max; + struct sctp_sock *sp = sctp_sk(sk); if (optlen != sizeof (struct sctp_rtoinfo)) return -EINVAL; @@ -2823,26 +2857,36 @@ static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval, unsigne if (!asoc && rtoinfo.srto_assoc_id && sctp_style(sk, UDP)) return -EINVAL; + rto_max = rtoinfo.srto_max; + rto_min = rtoinfo.srto_min; + + if (rto_max) + rto_max = asoc ? msecs_to_jiffies(rto_max) : rto_max; + else + rto_max = asoc ? asoc->rto_max : sp->rtoinfo.srto_max; + + if (rto_min) + rto_min = asoc ? msecs_to_jiffies(rto_min) : rto_min; + else + rto_min = asoc ? 
asoc->rto_min : sp->rtoinfo.srto_min; + + if (rto_min > rto_max) + return -EINVAL; + if (asoc) { if (rtoinfo.srto_initial != 0) asoc->rto_initial = msecs_to_jiffies(rtoinfo.srto_initial); - if (rtoinfo.srto_max != 0) - asoc->rto_max = msecs_to_jiffies(rtoinfo.srto_max); - if (rtoinfo.srto_min != 0) - asoc->rto_min = msecs_to_jiffies(rtoinfo.srto_min); + asoc->rto_max = rto_max; + asoc->rto_min = rto_min; } else { /* If there is no association or the association-id = 0 * set the values to the endpoint. */ - struct sctp_sock *sp = sctp_sk(sk); - if (rtoinfo.srto_initial != 0) sp->rtoinfo.srto_initial = rtoinfo.srto_initial; - if (rtoinfo.srto_max != 0) - sp->rtoinfo.srto_max = rtoinfo.srto_max; - if (rtoinfo.srto_min != 0) - sp->rtoinfo.srto_min = rtoinfo.srto_min; + sp->rtoinfo.srto_max = rto_max; + sp->rtoinfo.srto_min = rto_min; } return 0; @@ -2978,8 +3022,11 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned int val; if (optlen == sizeof(int)) { - pr_warn("Use of int in maxseg socket option deprecated\n"); - pr_warn("Use struct sctp_assoc_value instead\n"); + pr_warn_ratelimited(DEPRECATED + "%s (pid %d) " + "Use of int in maxseg socket option.\n" + "Use struct sctp_assoc_value instead\n", + current->comm, task_pid_nr(current)); if (copy_from_user(&val, optval, optlen)) return -EFAULT; params.assoc_id = 0; @@ -3236,8 +3283,11 @@ static int sctp_setsockopt_maxburst(struct sock *sk, int assoc_id = 0; if (optlen == sizeof(int)) { - pr_warn("Use of int in max_burst socket option deprecated\n"); - pr_warn("Use struct sctp_assoc_value instead\n"); + pr_warn_ratelimited(DEPRECATED + "%s (pid %d) " + "Use of int in max_burst socket option deprecated.\n" + "Use struct sctp_assoc_value instead\n", + current->comm, task_pid_nr(current)); if (copy_from_user(&val, optval, optlen)) return -EFAULT; } else if (optlen == sizeof(struct sctp_assoc_value)) { @@ -3272,10 +3322,10 @@ static int sctp_setsockopt_auth_chunk(struct sock *sk, char __user *optval, unsigned int optlen) { - struct net *net = sock_net(sk); + struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_authchunk val; - if (!net->sctp.auth_enable) + if (!ep->auth_enable) return -EACCES; if (optlen != sizeof(struct sctp_authchunk)) @@ -3292,7 +3342,7 @@ static int sctp_setsockopt_auth_chunk(struct sock *sk, } /* add this chunk id to the endpoint */ - return sctp_auth_ep_add_chunkid(sctp_sk(sk)->ep, val.sauth_chunk); + return sctp_auth_ep_add_chunkid(ep, val.sauth_chunk); } /* @@ -3305,18 +3355,18 @@ static int sctp_setsockopt_hmac_ident(struct sock *sk, char __user *optval, unsigned int optlen) { - struct net *net = sock_net(sk); + struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_hmacalgo *hmacs; u32 idents; int err; - if (!net->sctp.auth_enable) + if (!ep->auth_enable) return -EACCES; if (optlen < sizeof(struct sctp_hmacalgo)) return -EINVAL; - hmacs= memdup_user(optval, optlen); + hmacs = memdup_user(optval, optlen); if (IS_ERR(hmacs)) return PTR_ERR(hmacs); @@ -3327,7 +3377,7 @@ static int sctp_setsockopt_hmac_ident(struct sock *sk, goto out; } - err = sctp_auth_ep_set_hmacs(sctp_sk(sk)->ep, hmacs); + err = sctp_auth_ep_set_hmacs(ep, hmacs); out: kfree(hmacs); return err; @@ -3343,18 +3393,18 @@ static int sctp_setsockopt_auth_key(struct sock *sk, char __user *optval, unsigned int optlen) { - struct net *net = sock_net(sk); + struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_authkey *authkey; struct sctp_association *asoc; int ret; - if (!net->sctp.auth_enable) + if (!ep->auth_enable) return 
-EACCES; if (optlen <= sizeof(struct sctp_authkey)) return -EINVAL; - authkey= memdup_user(optval, optlen); + authkey = memdup_user(optval, optlen); if (IS_ERR(authkey)) return PTR_ERR(authkey); @@ -3369,7 +3419,7 @@ static int sctp_setsockopt_auth_key(struct sock *sk, goto out; } - ret = sctp_auth_set_key(sctp_sk(sk)->ep, asoc, authkey); + ret = sctp_auth_set_key(ep, asoc, authkey); out: kzfree(authkey); return ret; @@ -3385,11 +3435,11 @@ static int sctp_setsockopt_active_key(struct sock *sk, char __user *optval, unsigned int optlen) { - struct net *net = sock_net(sk); + struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_authkeyid val; struct sctp_association *asoc; - if (!net->sctp.auth_enable) + if (!ep->auth_enable) return -EACCES; if (optlen != sizeof(struct sctp_authkeyid)) @@ -3401,8 +3451,7 @@ static int sctp_setsockopt_active_key(struct sock *sk, if (!asoc && val.scact_assoc_id && sctp_style(sk, UDP)) return -EINVAL; - return sctp_auth_set_active_key(sctp_sk(sk)->ep, asoc, - val.scact_keynumber); + return sctp_auth_set_active_key(ep, asoc, val.scact_keynumber); } /* @@ -3414,11 +3463,11 @@ static int sctp_setsockopt_del_key(struct sock *sk, char __user *optval, unsigned int optlen) { - struct net *net = sock_net(sk); + struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_authkeyid val; struct sctp_association *asoc; - if (!net->sctp.auth_enable) + if (!ep->auth_enable) return -EACCES; if (optlen != sizeof(struct sctp_authkeyid)) @@ -3430,8 +3479,7 @@ static int sctp_setsockopt_del_key(struct sock *sk, if (!asoc && val.scact_assoc_id && sctp_style(sk, UDP)) return -EINVAL; - return sctp_auth_del_key_id(sctp_sk(sk)->ep, asoc, - val.scact_keynumber); + return sctp_auth_del_key_id(ep, asoc, val.scact_keynumber); } @@ -3564,7 +3612,7 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, goto out_nounlock; } - sctp_lock_sock(sk); + lock_sock(sk); switch (optname) { case SCTP_SOCKOPT_BINDX_ADD: @@ -3682,7 +3730,7 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, break; } - sctp_release_sock(sk); + release_sock(sk); out_nounlock: return retval; @@ -3710,7 +3758,7 @@ static int sctp_connect(struct sock *sk, struct sockaddr *addr, int err = 0; struct sctp_af *af; - sctp_lock_sock(sk); + lock_sock(sk); pr_debug("%s: sk:%p, sockaddr:%p, addr_len:%d\n", __func__, sk, addr, addr_len); @@ -3726,7 +3774,7 @@ static int sctp_connect(struct sock *sk, struct sockaddr *addr, err = __sctp_connect(sk, addr, af->sockaddr_len, NULL); } - sctp_release_sock(sk); + release_sock(sk); return err; } @@ -3752,7 +3800,7 @@ static struct sock *sctp_accept(struct sock *sk, int flags, int *err) long timeo; int error = 0; - sctp_lock_sock(sk); + lock_sock(sk); sp = sctp_sk(sk); ep = sp->ep; @@ -3790,7 +3838,7 @@ static struct sock *sctp_accept(struct sock *sk, int flags, int *err) sctp_sock_migrate(sk, newsk, asoc, SCTP_SOCKET_TCP); out: - sctp_release_sock(sk); + release_sock(sk); *err = error; return newsk; } @@ -3800,7 +3848,7 @@ static int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg) { int rc = -ENOTCONN; - sctp_lock_sock(sk); + lock_sock(sk); /* * SEQPACKET-style sockets in LISTENING state are valid, for @@ -3830,7 +3878,7 @@ static int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg) break; } out: - sctp_release_sock(sk); + release_sock(sk); return rc; } @@ -3908,7 +3956,7 @@ static int sctp_init_sock(struct sock *sk) */ sp->hbinterval = net->sctp.hb_interval; sp->pathmaxrxt = net->sctp.max_retrans_path; - sp->pathmtu = 0; // allow default 
discovery + sp->pathmtu = 0; /* allow default discovery */ sp->sackdelay = net->sctp.sack_timeout; sp->sackfreq = 2; sp->param_flags = SPP_HB_ENABLE | @@ -4451,7 +4499,7 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, /* If an address other than INADDR_ANY is specified, and * no transport is found, then the request is invalid. */ - if (!sctp_is_any(sk, ( union sctp_addr *)¶ms.spp_address)) { + if (!sctp_is_any(sk, (union sctp_addr *)¶ms.spp_address)) { trans = sctp_addr_id2transport(sk, ¶ms.spp_address, params.spp_assoc_id); if (!trans) { @@ -4557,12 +4605,15 @@ static int sctp_getsockopt_delayed_ack(struct sock *sk, int len, if (copy_from_user(¶ms, optval, len)) return -EFAULT; } else if (len == sizeof(struct sctp_assoc_value)) { - pr_warn("Use of struct sctp_assoc_value in delayed_ack socket option deprecated\n"); - pr_warn("Use struct sctp_sack_info instead\n"); + pr_warn_ratelimited(DEPRECATED + "%s (pid %d) " + "Use of struct sctp_assoc_value in delayed_ack socket option.\n" + "Use struct sctp_sack_info instead\n", + current->comm, task_pid_nr(current)); if (copy_from_user(¶ms, optval, len)) return -EFAULT; } else - return - EINVAL; + return -EINVAL; /* Get association, if sack_assoc_id != 0 and the socket is a one * to many style socket, and an association was not found, then @@ -4652,8 +4703,8 @@ static int sctp_getsockopt_peer_addrs(struct sock *sk, int len, if (!asoc) return -EINVAL; - to = optval + offsetof(struct sctp_getaddrs,addrs); - space_left = len - offsetof(struct sctp_getaddrs,addrs); + to = optval + offsetof(struct sctp_getaddrs, addrs); + space_left = len - offsetof(struct sctp_getaddrs, addrs); list_for_each_entry(from, &asoc->peer.transport_addr_list, transports) { @@ -4713,7 +4764,7 @@ static int sctp_copy_laddrs(struct sock *sk, __u16 port, void *to, memcpy(to, &temp, addrlen); to += addrlen; - cnt ++; + cnt++; space_left -= addrlen; *bytes_copied += addrlen; } @@ -4762,8 +4813,8 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len, bp = &asoc->base.bind_addr; } - to = optval + offsetof(struct sctp_getaddrs,addrs); - space_left = len - offsetof(struct sctp_getaddrs,addrs); + to = optval + offsetof(struct sctp_getaddrs, addrs); + space_left = len - offsetof(struct sctp_getaddrs, addrs); addrs = kmalloc(space_left, GFP_KERNEL); if (!addrs) @@ -4802,7 +4853,7 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len, memcpy(buf, &temp, addrlen); buf += addrlen; bytes_copied += addrlen; - cnt ++; + cnt++; space_left -= addrlen; } @@ -5074,7 +5125,7 @@ static int sctp_getsockopt_associnfo(struct sock *sk, int len, assocparams.sasoc_cookie_life = ktime_to_ms(asoc->cookie_life); list_for_each(pos, &asoc->peer.transport_addr_list) { - cnt ++; + cnt++; } assocparams.sasoc_number_peer_destinations = cnt; @@ -5202,8 +5253,11 @@ static int sctp_getsockopt_maxseg(struct sock *sk, int len, struct sctp_association *asoc; if (len == sizeof(int)) { - pr_warn("Use of int in maxseg socket option deprecated\n"); - pr_warn("Use struct sctp_assoc_value instead\n"); + pr_warn_ratelimited(DEPRECATED + "%s (pid %d) " + "Use of int in maxseg socket option.\n" + "Use struct sctp_assoc_value instead\n", + current->comm, task_pid_nr(current)); params.assoc_id = 0; } else if (len >= sizeof(struct sctp_assoc_value)) { len = sizeof(struct sctp_assoc_value); @@ -5294,8 +5348,11 @@ static int sctp_getsockopt_maxburst(struct sock *sk, int len, struct sctp_association *asoc; if (len == sizeof(int)) { - pr_warn("Use of int in max_burst socket option 
deprecated\n"); - pr_warn("Use struct sctp_assoc_value instead\n"); + pr_warn_ratelimited(DEPRECATED + "%s (pid %d) " + "Use of int in max_burst socket option.\n" + "Use struct sctp_assoc_value instead\n", + current->comm, task_pid_nr(current)); params.assoc_id = 0; } else if (len >= sizeof(struct sctp_assoc_value)) { len = sizeof(struct sctp_assoc_value); @@ -5329,16 +5386,16 @@ static int sctp_getsockopt_maxburst(struct sock *sk, int len, static int sctp_getsockopt_hmac_ident(struct sock *sk, int len, char __user *optval, int __user *optlen) { - struct net *net = sock_net(sk); + struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_hmacalgo __user *p = (void __user *)optval; struct sctp_hmac_algo_param *hmacs; __u16 data_len = 0; u32 num_idents; - if (!net->sctp.auth_enable) + if (!ep->auth_enable) return -EACCES; - hmacs = sctp_sk(sk)->ep->auth_hmacs_list; + hmacs = ep->auth_hmacs_list; data_len = ntohs(hmacs->param_hdr.length) - sizeof(sctp_paramhdr_t); if (len < sizeof(struct sctp_hmacalgo) + data_len) @@ -5359,11 +5416,11 @@ static int sctp_getsockopt_hmac_ident(struct sock *sk, int len, static int sctp_getsockopt_active_key(struct sock *sk, int len, char __user *optval, int __user *optlen) { - struct net *net = sock_net(sk); + struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_authkeyid val; struct sctp_association *asoc; - if (!net->sctp.auth_enable) + if (!ep->auth_enable) return -EACCES; if (len < sizeof(struct sctp_authkeyid)) @@ -5378,7 +5435,7 @@ static int sctp_getsockopt_active_key(struct sock *sk, int len, if (asoc) val.scact_keynumber = asoc->active_key_id; else - val.scact_keynumber = sctp_sk(sk)->ep->active_key_id; + val.scact_keynumber = ep->active_key_id; len = sizeof(struct sctp_authkeyid); if (put_user(len, optlen)) @@ -5392,7 +5449,7 @@ static int sctp_getsockopt_active_key(struct sock *sk, int len, static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len, char __user *optval, int __user *optlen) { - struct net *net = sock_net(sk); + struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_authchunks __user *p = (void __user *)optval; struct sctp_authchunks val; struct sctp_association *asoc; @@ -5400,7 +5457,7 @@ static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len, u32 num_chunks = 0; char __user *to; - if (!net->sctp.auth_enable) + if (!ep->auth_enable) return -EACCES; if (len < sizeof(struct sctp_authchunks)) @@ -5427,7 +5484,8 @@ static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len, return -EFAULT; num: len = sizeof(struct sctp_authchunks) + num_chunks; - if (put_user(len, optlen)) return -EFAULT; + if (put_user(len, optlen)) + return -EFAULT; if (put_user(num_chunks, &p->gauth_number_of_chunks)) return -EFAULT; return 0; @@ -5436,7 +5494,7 @@ num: static int sctp_getsockopt_local_auth_chunks(struct sock *sk, int len, char __user *optval, int __user *optlen) { - struct net *net = sock_net(sk); + struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_authchunks __user *p = (void __user *)optval; struct sctp_authchunks val; struct sctp_association *asoc; @@ -5444,7 +5502,7 @@ static int sctp_getsockopt_local_auth_chunks(struct sock *sk, int len, u32 num_chunks = 0; char __user *to; - if (!net->sctp.auth_enable) + if (!ep->auth_enable) return -EACCES; if (len < sizeof(struct sctp_authchunks)) @@ -5459,9 +5517,9 @@ static int sctp_getsockopt_local_auth_chunks(struct sock *sk, int len, return -EINVAL; if (asoc) - ch = (struct sctp_chunks_param*)asoc->c.auth_chunks; + ch = (struct sctp_chunks_param 
*)asoc->c.auth_chunks; else - ch = sctp_sk(sk)->ep->auth_chunk_list; + ch = ep->auth_chunk_list; if (!ch) goto num; @@ -5718,7 +5776,7 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, if (get_user(len, optlen)) return -EFAULT; - sctp_lock_sock(sk); + lock_sock(sk); switch (optname) { case SCTP_STATUS: @@ -5842,7 +5900,7 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, break; } - sctp_release_sock(sk); + release_sock(sk); return retval; } @@ -5882,33 +5940,34 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) pr_debug("%s: begins, snum:%d\n", __func__, snum); - sctp_local_bh_disable(); + local_bh_disable(); if (snum == 0) { /* Search for an available port. */ int low, high, remaining, index; unsigned int rover; + struct net *net = sock_net(sk); - inet_get_local_port_range(sock_net(sk), &low, &high); + inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - rover = net_random() % remaining + low; + rover = prandom_u32() % remaining + low; do { rover++; if ((rover < low) || (rover > high)) rover = low; - if (inet_is_reserved_local_port(rover)) + if (inet_is_local_reserved_port(net, rover)) continue; index = sctp_phashfn(sock_net(sk), rover); head = &sctp_port_hashtable[index]; - sctp_spin_lock(&head->lock); + spin_lock(&head->lock); sctp_for_each_hentry(pp, &head->chain) if ((pp->port == rover) && net_eq(sock_net(sk), pp->net)) goto next; break; next: - sctp_spin_unlock(&head->lock); + spin_unlock(&head->lock); } while (--remaining > 0); /* Exhausted local port range during search? */ @@ -5929,7 +5988,7 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) * port iterator, pp being NULL. */ head = &sctp_port_hashtable[sctp_phashfn(sock_net(sk), snum)]; - sctp_spin_lock(&head->lock); + spin_lock(&head->lock); sctp_for_each_hentry(pp, &head->chain) { if ((pp->port == snum) && net_eq(pp->net, sock_net(sk))) goto pp_found; @@ -6013,10 +6072,10 @@ success: ret = 0; fail_unlock: - sctp_spin_unlock(&head->lock); + spin_unlock(&head->lock); fail: - sctp_local_bh_enable(); + local_bh_enable(); return ret; } @@ -6108,7 +6167,7 @@ int sctp_inet_listen(struct socket *sock, int backlog) if (unlikely(backlog < 0)) return err; - sctp_lock_sock(sk); + lock_sock(sk); /* Peeled-off sockets are not allowed to listen(). */ if (sctp_style(sk, UDP_HIGH_BANDWIDTH)) @@ -6141,7 +6200,7 @@ int sctp_inet_listen(struct socket *sock, int backlog) err = 0; out: - sctp_release_sock(sk); + release_sock(sk); return err; } @@ -6250,20 +6309,20 @@ static inline void __sctp_put_port(struct sock *sk) inet_sk(sk)->inet_num)]; struct sctp_bind_bucket *pp; - sctp_spin_lock(&head->lock); + spin_lock(&head->lock); pp = sctp_sk(sk)->bind_hash; __sk_del_bind_node(sk); sctp_sk(sk)->bind_hash = NULL; inet_sk(sk)->inet_num = 0; sctp_bucket_destroy(pp); - sctp_spin_unlock(&head->lock); + spin_unlock(&head->lock); } void sctp_put_port(struct sock *sk) { - sctp_local_bh_disable(); + local_bh_disable(); __sctp_put_port(sk); - sctp_local_bh_enable(); + local_bh_enable(); } /* @@ -6401,7 +6460,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) * Note: This function is the same function as in core/datagram.c * with a few modifications to make lksctp work. 
*/ -static int sctp_wait_for_packet(struct sock * sk, int *err, long *timeo_p) +static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p) { int error; DEFINE_WAIT(wait); @@ -6438,9 +6497,9 @@ static int sctp_wait_for_packet(struct sock * sk, int *err, long *timeo_p) * does not fit in the user's buffer, but this seems to be the * only way to honor MSG_DONTWAIT realistically. */ - sctp_release_sock(sk); + release_sock(sk); *timeo_p = schedule_timeout(*timeo_p); - sctp_lock_sock(sk); + lock_sock(sk); ready: finish_wait(sk_sleep(sk), &wait); @@ -6500,6 +6559,10 @@ static struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags, if (sk->sk_shutdown & RCV_SHUTDOWN) break; + if (sk_can_busy_loop(sk) && + sk_busy_loop(sk, noblock)) + continue; + /* User doesn't want to wait. */ error = -EAGAIN; if (!timeo) @@ -6540,6 +6603,46 @@ static void __sctp_write_space(struct sctp_association *asoc) } } +static void sctp_wake_up_waiters(struct sock *sk, + struct sctp_association *asoc) +{ + struct sctp_association *tmp = asoc; + + /* We do accounting for the sndbuf space per association, + * so we only need to wake our own association. + */ + if (asoc->ep->sndbuf_policy) + return __sctp_write_space(asoc); + + /* If association goes down and is just flushing its + * outq, then just normally notify others. + */ + if (asoc->base.dead) + return sctp_write_space(sk); + + /* Accounting for the sndbuf space is per socket, so we + * need to wake up others, try to be fair and in case of + * other associations, let them have a go first instead + * of just doing a sctp_write_space() call. + * + * Note that we reach sctp_wake_up_waiters() only when + * associations free up queued chunks, thus we are under + * lock and the list of associations on a socket is + * guaranteed not to change. + */ + for (tmp = list_next_entry(tmp, asocs); 1; + tmp = list_next_entry(tmp, asocs)) { + /* Manually skip the head element. */ + if (&tmp->asocs == &((sctp_sk(sk))->ep->asocs)) + continue; + /* Wake up association. */ + __sctp_write_space(tmp); + /* We've reached the end. */ + if (tmp == asoc) + break; + } +} + /* Do accounting for the sndbuf space. * Decrement the used sndbuf space of the corresponding association by the * data size which was just transmitted(freed). @@ -6567,7 +6670,7 @@ static void sctp_wfree(struct sk_buff *skb) sk_mem_uncharge(sk, skb->truesize); sock_wfree(skb); - __sctp_write_space(asoc); + sctp_wake_up_waiters(sk, asoc); sctp_association_put(asoc); } @@ -6623,10 +6726,10 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, /* Let another process have a go. Since we are going * to sleep anyway. */ - sctp_release_sock(sk); + release_sock(sk); current_timeo = schedule_timeout(current_timeo); BUG_ON(sk != asoc->base.sk); - sctp_lock_sock(sk); + lock_sock(sk); *timeo_p = current_timeo; } @@ -6652,7 +6755,7 @@ do_nonblock: goto out; } -void sctp_data_ready(struct sock *sk, int len) +void sctp_data_ready(struct sock *sk) { struct socket_wq *wq; @@ -6731,9 +6834,9 @@ static int sctp_wait_for_connect(struct sctp_association *asoc, long *timeo_p) /* Let another process have a go. Since we are going * to sleep anyway. 
*/ - sctp_release_sock(sk); + release_sock(sk); current_timeo = schedule_timeout(current_timeo); - sctp_lock_sock(sk); + lock_sock(sk); *timeo_p = current_timeo; } @@ -6776,9 +6879,9 @@ static int sctp_wait_for_accept(struct sock *sk, long timeo) TASK_INTERRUPTIBLE); if (list_empty(&ep->asocs)) { - sctp_release_sock(sk); + release_sock(sk); timeo = schedule_timeout(timeo); - sctp_lock_sock(sk); + lock_sock(sk); } err = -EINVAL; @@ -6811,9 +6914,9 @@ static void sctp_wait_for_close(struct sock *sk, long timeout) prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (list_empty(&sctp_sk(sk)->ep->asocs)) break; - sctp_release_sock(sk); + release_sock(sk); timeout = schedule_timeout(timeout); - sctp_lock_sock(sk); + lock_sock(sk); } while (!signal_pending(current) && timeout); finish_wait(sk_sleep(sk), &wait); @@ -6843,7 +6946,8 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newsk->sk_type = sk->sk_type; newsk->sk_bound_dev_if = sk->sk_bound_dev_if; newsk->sk_flags = sk->sk_flags; - newsk->sk_no_check = sk->sk_no_check; + newsk->sk_no_check_tx = sk->sk_no_check_tx; + newsk->sk_no_check_rx = sk->sk_no_check_rx; newsk->sk_reuse = sk->sk_reuse; newsk->sk_shutdown = sk->sk_shutdown; @@ -6914,14 +7018,14 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, /* Hook this new socket in to the bind_hash list. */ head = &sctp_port_hashtable[sctp_phashfn(sock_net(oldsk), inet_sk(oldsk)->inet_num)]; - sctp_local_bh_disable(); - sctp_spin_lock(&head->lock); + local_bh_disable(); + spin_lock(&head->lock); pp = sctp_sk(oldsk)->bind_hash; sk_add_bind_node(newsk, &pp->owner); sctp_sk(newsk)->bind_hash = pp; inet_sk(newsk)->inet_num = inet_sk(oldsk)->inet_num; - sctp_spin_unlock(&head->lock); - sctp_local_bh_enable(); + spin_unlock(&head->lock); + local_bh_enable(); /* Copy the bind_addr list from the original endpoint to the new * endpoint so that we can handle restarts properly @@ -7010,7 +7114,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, newsk->sk_shutdown |= RCV_SHUTDOWN; newsk->sk_state = SCTP_SS_ESTABLISHED; - sctp_release_sock(newsk); + release_sock(newsk); } diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 80b17b5df6b..12c7e01c267 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -34,6 +34,8 @@ * Sridhar Samudrala <sri@us.ibm.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <net/sctp/structs.h> #include <net/sctp/sctp.h> #include <linux/sysctl.h> @@ -46,6 +48,11 @@ static int sack_timer_min = 1; static int sack_timer_max = 500; static int addr_scope_max = 3; /* check sctp_scope_policy_t in include/net/sctp/constants.h for max entries */ static int rwnd_scale_max = 16; +static int rto_alpha_min = 0; +static int rto_beta_min = 0; +static int rto_alpha_max = 1000; +static int rto_beta_max = 1000; + static unsigned long max_autoclose_min = 0; static unsigned long max_autoclose_max = (MAX_SCHEDULE_TIMEOUT / HZ > UINT_MAX) @@ -55,11 +62,22 @@ extern long sysctl_sctp_mem[3]; extern int sysctl_sctp_rmem[3]; extern int sysctl_sctp_wmem[3]; -static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, - int write, +static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, - loff_t *ppos); +static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +static int proc_sctp_do_alpha_beta(struct 
ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +static int proc_sctp_do_auth(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + static struct ctl_table sctp_table[] = { { .procname = "sctp_mem", @@ -101,32 +119,36 @@ static struct ctl_table sctp_net_table[] = { .data = &init_net.sctp.rto_min, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_sctp_do_rto_min, .extra1 = &one, - .extra2 = &timer_max + .extra2 = &init_net.sctp.rto_max }, { .procname = "rto_max", .data = &init_net.sctp.rto_max, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .proc_handler = proc_sctp_do_rto_max, + .extra1 = &init_net.sctp.rto_min, .extra2 = &timer_max }, { .procname = "rto_alpha_exp_divisor", .data = &init_net.sctp.rto_alpha, .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = proc_dointvec, + .mode = 0644, + .proc_handler = proc_sctp_do_alpha_beta, + .extra1 = &rto_alpha_min, + .extra2 = &rto_alpha_max, }, { .procname = "rto_beta_exp_divisor", .data = &init_net.sctp.rto_beta, .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = proc_dointvec, + .mode = 0644, + .proc_handler = proc_sctp_do_alpha_beta, + .extra1 = &rto_beta_min, + .extra2 = &rto_beta_max, }, { .procname = "max_burst", @@ -146,6 +168,7 @@ static struct ctl_table sctp_net_table[] = { }, { .procname = "cookie_hmac_alg", + .data = &init_net.sctp.sctp_hmac_alg, .maxlen = 8, .mode = 0644, .proc_handler = proc_sctp_do_hmac_alg, @@ -260,7 +283,7 @@ static struct ctl_table sctp_net_table[] = { .data = &init_net.sctp.auth_enable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_sctp_do_auth, }, { .procname = "addr_scope_policy", @@ -293,47 +316,45 @@ static struct ctl_table sctp_net_table[] = { { /* sentinel */ } }; -static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, - int write, +static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; - char tmp[8]; struct ctl_table tbl; - int ret; - int changed = 0; + bool changed = false; char *none = "none"; + char tmp[8]; + int ret; memset(&tbl, 0, sizeof(struct ctl_table)); if (write) { tbl.data = tmp; - tbl.maxlen = 8; + tbl.maxlen = sizeof(tmp); } else { tbl.data = net->sctp.sctp_hmac_alg ? 
: none; tbl.maxlen = strlen(tbl.data); } - ret = proc_dostring(&tbl, write, buffer, lenp, ppos); - if (write) { + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) { #ifdef CONFIG_CRYPTO_MD5 if (!strncmp(tmp, "md5", 3)) { net->sctp.sctp_hmac_alg = "md5"; - changed = 1; + changed = true; } #endif #ifdef CONFIG_CRYPTO_SHA1 if (!strncmp(tmp, "sha1", 4)) { net->sctp.sctp_hmac_alg = "sha1"; - changed = 1; + changed = true; } #endif if (!strncmp(tmp, "none", 4)) { net->sctp.sctp_hmac_alg = NULL; - changed = 1; + changed = true; } - if (!changed) ret = -EINVAL; } @@ -341,6 +362,104 @@ static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, return ret; } +static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct net *net = current->nsproxy->net_ns; + unsigned int min = *(unsigned int *) ctl->extra1; + unsigned int max = *(unsigned int *) ctl->extra2; + struct ctl_table tbl; + int ret, new_value; + + memset(&tbl, 0, sizeof(struct ctl_table)); + tbl.maxlen = sizeof(unsigned int); + + if (write) + tbl.data = &new_value; + else + tbl.data = &net->sctp.rto_min; + + ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) { + if (new_value > max || new_value < min) + return -EINVAL; + + net->sctp.rto_min = new_value; + } + + return ret; +} + +static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct net *net = current->nsproxy->net_ns; + unsigned int min = *(unsigned int *) ctl->extra1; + unsigned int max = *(unsigned int *) ctl->extra2; + struct ctl_table tbl; + int ret, new_value; + + memset(&tbl, 0, sizeof(struct ctl_table)); + tbl.maxlen = sizeof(unsigned int); + + if (write) + tbl.data = &new_value; + else + tbl.data = &net->sctp.rto_max; + + ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) { + if (new_value > max || new_value < min) + return -EINVAL; + + net->sctp.rto_max = new_value; + } + + return ret; +} + +static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + pr_warn_once("Changing rto_alpha or rto_beta may lead to " + "suboptimal rtt/srtt estimations!\n"); + + return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); +} + +static int proc_sctp_do_auth(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct net *net = current->nsproxy->net_ns; + struct ctl_table tbl; + int new_value, ret; + + memset(&tbl, 0, sizeof(struct ctl_table)); + tbl.maxlen = sizeof(unsigned int); + + if (write) + tbl.data = &new_value; + else + tbl.data = &net->sctp.auth_enable; + + ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) { + struct sock *sk = net->sctp.ctl_sock; + + net->sctp.auth_enable = new_value; + /* Update the value in the control socket */ + lock_sock(sk); + sctp_sk(sk)->ep->auth_enable = new_value; + release_sock(sk); + } + + return ret; +} + int sctp_sysctl_net_register(struct net *net) { struct ctl_table *table; @@ -354,6 +473,10 @@ int sctp_sysctl_net_register(struct net *net) table[i].data += (char *)(&net->sctp) - (char *)&init_net.sctp; net->sctp.sysctl_header = register_net_sysctl(net, "net/sctp", table); + if (net->sctp.sysctl_header == NULL) { + kfree(table); + return -ENOMEM; + } return 0; } @@ -366,7 +489,7 @@ void sctp_sysctl_net_unregister(struct net *net) kfree(table); } -static struct ctl_table_header * 
sctp_sysctl_header; +static struct ctl_table_header *sctp_sysctl_header; /* Sysctl registration. */ void sctp_sysctl_register(void) diff --git a/net/sctp/transport.c b/net/sctp/transport.c index d0810dc5f07..7dd672fa651 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -72,7 +72,7 @@ static struct sctp_transport *sctp_transport_init(struct net *net, */ peer->rto = msecs_to_jiffies(net->sctp.rto_initial); - peer->last_time_heard = jiffies; + peer->last_time_heard = ktime_get(); peer->last_time_ecne_reduced = jiffies; peer->param_flags = SPP_HB_DISABLE | @@ -652,5 +652,4 @@ void sctp_transport_immediate_rtx(struct sctp_transport *t) if (!mod_timer(&t->T3_rtx_timer, jiffies + t->rto)) sctp_transport_hold(t); } - return; } diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 85c64658bd0..b6842fdb53d 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -366,9 +366,10 @@ fail: * specification [SCTP] and any extensions for a list of possible * error formats. */ -struct sctp_ulpevent *sctp_ulpevent_make_remote_error( - const struct sctp_association *asoc, struct sctp_chunk *chunk, - __u16 flags, gfp_t gfp) +struct sctp_ulpevent * +sctp_ulpevent_make_remote_error(const struct sctp_association *asoc, + struct sctp_chunk *chunk, __u16 flags, + gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_remote_error *sre; @@ -387,8 +388,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_remote_error( /* Copy the skb to a new skb with room for us to prepend * notification with. */ - skb = skb_copy_expand(chunk->skb, sizeof(struct sctp_remote_error), - 0, gfp); + skb = skb_copy_expand(chunk->skb, sizeof(*sre), 0, gfp); /* Pull off the rest of the cause TLV from the chunk. */ skb_pull(chunk->skb, elen); @@ -399,62 +399,21 @@ struct sctp_ulpevent *sctp_ulpevent_make_remote_error( event = sctp_skb2event(skb); sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize); - sre = (struct sctp_remote_error *) - skb_push(skb, sizeof(struct sctp_remote_error)); + sre = (struct sctp_remote_error *) skb_push(skb, sizeof(*sre)); /* Trim the buffer to the right length. */ - skb_trim(skb, sizeof(struct sctp_remote_error) + elen); + skb_trim(skb, sizeof(*sre) + elen); - /* Socket Extensions for SCTP - * 5.3.1.3 SCTP_REMOTE_ERROR - * - * sre_type: - * It should be SCTP_REMOTE_ERROR. - */ + /* RFC6458, Section 6.1.3. SCTP_REMOTE_ERROR */ + memset(sre, 0, sizeof(*sre)); sre->sre_type = SCTP_REMOTE_ERROR; - - /* - * Socket Extensions for SCTP - * 5.3.1.3 SCTP_REMOTE_ERROR - * - * sre_flags: 16 bits (unsigned integer) - * Currently unused. - */ sre->sre_flags = 0; - - /* Socket Extensions for SCTP - * 5.3.1.3 SCTP_REMOTE_ERROR - * - * sre_length: sizeof (__u32) - * - * This field is the total length of the notification data, - * including the notification header. - */ sre->sre_length = skb->len; - - /* Socket Extensions for SCTP - * 5.3.1.3 SCTP_REMOTE_ERROR - * - * sre_error: 16 bits (unsigned integer) - * This value represents one of the Operational Error causes defined in - * the SCTP specification, in network byte order. - */ sre->sre_error = cause; - - /* Socket Extensions for SCTP - * 5.3.1.3 SCTP_REMOTE_ERROR - * - * sre_assoc_id: sizeof (sctp_assoc_t) - * - * The association id field, holds the identifier for the association. - * All notifications for a given association have the same association - * identifier. For TCP style socket, this field is ignored. 
- */ sctp_ulpevent_set_owner(event, asoc); sre->sre_assoc_id = sctp_assoc2id(asoc); return event; - fail: return NULL; } @@ -899,7 +858,9 @@ __u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event) return notification->sn_header.sn_type; } -/* Copy out the sndrcvinfo into a msghdr. */ +/* RFC6458, Section 5.3.2. SCTP Header Information Structure + * (SCTP_SNDRCV, DEPRECATED) + */ void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event, struct msghdr *msghdr) { @@ -908,74 +869,21 @@ void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event, if (sctp_ulpevent_is_notification(event)) return; - /* Sockets API Extensions for SCTP - * Section 5.2.2 SCTP Header Information Structure (SCTP_SNDRCV) - * - * sinfo_stream: 16 bits (unsigned integer) - * - * For recvmsg() the SCTP stack places the message's stream number in - * this value. - */ + memset(&sinfo, 0, sizeof(sinfo)); sinfo.sinfo_stream = event->stream; - /* sinfo_ssn: 16 bits (unsigned integer) - * - * For recvmsg() this value contains the stream sequence number that - * the remote endpoint placed in the DATA chunk. For fragmented - * messages this is the same number for all deliveries of the message - * (if more than one recvmsg() is needed to read the message). - */ sinfo.sinfo_ssn = event->ssn; - /* sinfo_ppid: 32 bits (unsigned integer) - * - * In recvmsg() this value is - * the same information that was passed by the upper layer in the peer - * application. Please note that byte order issues are NOT accounted - * for and this information is passed opaquely by the SCTP stack from - * one end to the other. - */ sinfo.sinfo_ppid = event->ppid; - /* sinfo_flags: 16 bits (unsigned integer) - * - * This field may contain any of the following flags and is composed of - * a bitwise OR of these values. - * - * recvmsg() flags: - * - * SCTP_UNORDERED - This flag is present when the message was sent - * non-ordered. - */ sinfo.sinfo_flags = event->flags; - /* sinfo_tsn: 32 bit (unsigned integer) - * - * For the receiving side, this field holds a TSN that was - * assigned to one of the SCTP Data Chunks. - */ sinfo.sinfo_tsn = event->tsn; - /* sinfo_cumtsn: 32 bit (unsigned integer) - * - * This field will hold the current cumulative TSN as - * known by the underlying SCTP layer. Note this field is - * ignored when sending and only valid for a receive - * operation when sinfo_flags are set to SCTP_UNORDERED. - */ sinfo.sinfo_cumtsn = event->cumtsn; - /* sinfo_assoc_id: sizeof (sctp_assoc_t) - * - * The association handle field, sinfo_assoc_id, holds the identifier - * for the association announced in the COMMUNICATION_UP notification. - * All notifications for a given association have the same identifier. - * Ignored for one-to-one style sockets. - */ sinfo.sinfo_assoc_id = sctp_assoc2id(event->asoc); - - /* context value that is set via SCTP_CONTEXT socket option. */ + /* Context value that is set via SCTP_CONTEXT socket option. */ sinfo.sinfo_context = event->asoc->default_rcv_context; - /* These fields are not used while receiving. 
*/ sinfo.sinfo_timetolive = 0; put_cmsg(msghdr, IPPROTO_SCTP, SCTP_SNDRCV, - sizeof(struct sctp_sndrcvinfo), (void *)&sinfo); + sizeof(sinfo), &sinfo); } /* Do accounting for bytes received and hold a reference to the association diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index a67470083be..d49dc2ed30a 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -38,14 +38,15 @@ #include <linux/types.h> #include <linux/skbuff.h> #include <net/sock.h> +#include <net/busy_poll.h> #include <net/sctp/structs.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> /* Forward declarations for internal helpers. */ -static struct sctp_ulpevent * sctp_ulpq_reasm(struct sctp_ulpq *ulpq, +static struct sctp_ulpevent *sctp_ulpq_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *); -static struct sctp_ulpevent * sctp_ulpq_order(struct sctp_ulpq *, +static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *, struct sctp_ulpevent *); static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq); @@ -107,7 +108,7 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, event = sctp_ulpq_reasm(ulpq, event); /* Do ordering if needed. */ - if ((event) && (event->msg_flags & MSG_EOR)){ + if ((event) && (event->msg_flags & MSG_EOR)) { /* Create a temporary list to collect chunks on. */ skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); @@ -204,6 +205,9 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) if (sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN)) goto out_free; + if (!sctp_ulpevent_is_notification(event)) + sk_mark_napi_id(sk, skb); + /* Check if the user wishes to receive this event. */ if (!sctp_ulpevent_is_enabled(event, &sctp_sk(sk)->subscribe)) goto out_free; @@ -259,7 +263,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) sctp_ulpq_clear_pd(ulpq); if (queue == &sk->sk_receive_queue) - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); return 1; out_free: @@ -336,7 +340,8 @@ static struct sctp_ulpevent *sctp_make_reassembled_event(struct net *net, pos = f_frag->next; /* Get the last skb in the f_frag's frag_list if present. */ - for (last = list; list; last = list, list = list->next); + for (last = list; list; last = list, list = list->next) + ; /* Add the list of remaining fragments to the first fragments * frag_list. @@ -726,7 +731,7 @@ static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq) while ((event = sctp_ulpq_retrieve_reassembled(ulpq)) != NULL) { /* Do ordering if needed. */ - if ((event) && (event->msg_flags & MSG_EOR)){ + if ((event) && (event->msg_flags & MSG_EOR)) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); @@ -1134,5 +1139,5 @@ void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) /* If there is data waiting, send it up the socket now. 
*/ if (sctp_ulpq_clear_pd(ulpq) || ev) - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } diff --git a/net/socket.c b/net/socket.c index cd38a785f7d..abf56b2a14f 100644 --- a/net/socket.c +++ b/net/socket.c @@ -72,6 +72,7 @@ #include <linux/if_bridge.h> #include <linux/if_frad.h> #include <linux/if_vlan.h> +#include <linux/ptp_classify.h> #include <linux/init.h> #include <linux/poll.h> #include <linux/cache.h> @@ -450,16 +451,17 @@ EXPORT_SYMBOL(sockfd_lookup); static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed) { - struct file *file; + struct fd f = fdget(fd); struct socket *sock; *err = -EBADF; - file = fget_light(fd, fput_needed); - if (file) { - sock = sock_from_file(file, err); - if (sock) + if (f.file) { + sock = sock_from_file(f.file, err); + if (likely(sock)) { + *fput_needed = f.flags; return sock; - fput_light(file, *fput_needed); + } + fdput(f); } return NULL; } @@ -593,7 +595,7 @@ void sock_release(struct socket *sock) } if (rcu_dereference_protected(sock->wq, 1)->fasync_list) - printk(KERN_ERR "sock_release: fasync list not empty!\n"); + pr_err("%s: fasync list not empty!\n", __func__); if (test_bit(SOCK_EXTERNALLY_ALLOCATED, &sock->flags)) return; @@ -1265,8 +1267,8 @@ int __sock_create(struct net *net, int family, int type, int protocol, static int warned; if (!warned) { warned = 1; - printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n", - current->comm); + pr_info("%s uses obsolete (PF_INET,SOCK_PACKET)\n", + current->comm); } family = PF_PACKET; } @@ -1445,48 +1447,61 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, err = fd1; goto out_release_both; } + fd2 = get_unused_fd_flags(flags); if (unlikely(fd2 < 0)) { err = fd2; - put_unused_fd(fd1); - goto out_release_both; + goto out_put_unused_1; } newfile1 = sock_alloc_file(sock1, flags, NULL); if (unlikely(IS_ERR(newfile1))) { err = PTR_ERR(newfile1); - put_unused_fd(fd1); - put_unused_fd(fd2); - goto out_release_both; + goto out_put_unused_both; } newfile2 = sock_alloc_file(sock2, flags, NULL); if (IS_ERR(newfile2)) { err = PTR_ERR(newfile2); - fput(newfile1); - put_unused_fd(fd1); - put_unused_fd(fd2); - sock_release(sock2); - goto out; + goto out_fput_1; } + err = put_user(fd1, &usockvec[0]); + if (err) + goto out_fput_both; + + err = put_user(fd2, &usockvec[1]); + if (err) + goto out_fput_both; + audit_fd_pair(fd1, fd2); + fd_install(fd1, newfile1); fd_install(fd2, newfile2); /* fd1 and fd2 may be already another descriptors. * Not kernel problem. */ - err = put_user(fd1, &usockvec[0]); - if (!err) - err = put_user(fd2, &usockvec[1]); - if (!err) - return 0; + return 0; - sys_close(fd2); - sys_close(fd1); - return err; +out_fput_both: + fput(newfile2); + fput(newfile1); + put_unused_fd(fd2); + put_unused_fd(fd1); + goto out; +out_fput_1: + fput(newfile1); + put_unused_fd(fd2); + put_unused_fd(fd1); + sock_release(sock2); + goto out; + +out_put_unused_both: + put_unused_fd(fd2); +out_put_unused_1: + put_unused_fd(fd1); out_release_both: sock_release(sock2); out_release_1: @@ -1865,8 +1880,8 @@ out: * Receive a datagram from a socket. 
*/ -asmlinkage long sys_recv(int fd, void __user *ubuf, size_t size, - unsigned int flags) +SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, + unsigned int, flags) { return sys_recvfrom(fd, ubuf, size, flags, NULL, NULL); } @@ -1972,6 +1987,10 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, { if (copy_from_user(kmsg, umsg, sizeof(struct msghdr))) return -EFAULT; + + if (kmsg->msg_namelen < 0) + return -EINVAL; + if (kmsg->msg_namelen > sizeof(struct sockaddr_storage)) kmsg->msg_namelen = sizeof(struct sockaddr_storage); return 0; @@ -2582,8 +2601,7 @@ int sock_register(const struct net_proto_family *ops) int err; if (ops->family >= NPROTO) { - printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family, - NPROTO); + pr_crit("protocol %d >= NPROTO(%d)\n", ops->family, NPROTO); return -ENOBUFS; } @@ -2597,7 +2615,7 @@ int sock_register(const struct net_proto_family *ops) } spin_unlock(&net_family_lock); - printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family); + pr_info("NET: Registered protocol family %d\n", ops->family); return err; } EXPORT_SYMBOL(sock_register); @@ -2625,7 +2643,7 @@ void sock_unregister(int family) synchronize_rcu(); - printk(KERN_INFO "NET: Unregistered protocol family %d\n", family); + pr_info("NET: Unregistered protocol family %d\n", family); } EXPORT_SYMBOL(sock_unregister); @@ -2668,9 +2686,7 @@ static int __init sock_init(void) goto out; #endif -#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING - skb_timestamping_init(); -#endif + ptp_classifier_init(); out: return err; diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 241b54f3020..0754d0f466d 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -9,19 +9,6 @@ config SUNRPC_BACKCHANNEL bool depends on SUNRPC -config SUNRPC_XPRT_RDMA - tristate - depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS - default SUNRPC && INFINIBAND - help - This option allows the NFS client and server to support - an RDMA-enabled transport. - - To compile RPC client RDMA transport support as a module, - choose M here: the module will be called xprtrdma. - - If unsure, say N. - config SUNRPC_SWAP bool depends on SUNRPC @@ -57,3 +44,29 @@ config SUNRPC_DEBUG but makes troubleshooting NFS issues significantly harder. If unsure, say Y. + +config SUNRPC_XPRT_RDMA_CLIENT + tristate "RPC over RDMA Client Support" + depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS + default SUNRPC && INFINIBAND + help + This option allows the NFS client to support an RDMA-enabled + transport. + + To compile RPC client RDMA transport support as a module, + choose M here: the module will be called xprtrdma. + + If unsure, say N. + +config SUNRPC_XPRT_RDMA_SERVER + tristate "RPC over RDMA Server Support" + depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS + default SUNRPC && INFINIBAND + help + This option allows the NFS server to support an RDMA-enabled + transport. + + To compile RPC server RDMA transport support as a module, + choose M here: the module will be called svcrdma. + + If unsure, say N. 
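The copy_msghdr_from_user() hunk in net/socket.c above now rejects a negative msg_namelen with -EINVAL and clamps oversized values to sizeof(struct sockaddr_storage). The following minimal userspace sketch is illustrative only and not part of the patch; it simply shows the behaviour that check enforces on the recvmsg() path:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

int main(void)
{
	char buf[16];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct sockaddr_storage name;
	struct msghdr msg = {
		.msg_name    = &name,
		/* Deliberately invalid: the kernel copies this into an int
		 * msg_namelen, sees it as negative, and fails the request.
		 */
		.msg_namelen = (socklen_t)-1,
		.msg_iov     = &iov,
		.msg_iovlen  = 1,
	};
	int fd = socket(AF_INET, SOCK_DGRAM, 0);	/* error handling elided */

	/* With the check above, this fails with EINVAL before any data
	 * transfer or address copy is attempted.
	 */
	if (recvmsg(fd, &msg, MSG_DONTWAIT) < 0)
		printf("recvmsg: %s\n", strerror(errno));
	return 0;
}

A value larger than sizeof(struct sockaddr_storage) but still positive is not rejected; it is silently truncated to the storage size, matching the second branch of the added check.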
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index 8209a0411bc..e5a7a1cac8f 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -5,7 +5,8 @@ obj-$(CONFIG_SUNRPC) += sunrpc.o obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ -obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/ + +obj-y += xprtrdma/ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ auth.o auth_null.o auth_unix.o auth_generic.o \ diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 5285ead196c..f7736671742 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -296,7 +296,7 @@ static void rpcauth_unhash_cred_locked(struct rpc_cred *cred) { hlist_del_rcu(&cred->cr_hash); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags); } @@ -592,6 +592,7 @@ rpcauth_lookupcred(struct rpc_auth *auth, int flags) put_group_info(acred.group_info); return ret; } +EXPORT_SYMBOL_GPL(rpcauth_lookupcred); void rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 97912b40c25..b6e440baccc 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -108,6 +108,7 @@ struct gss_auth { static DEFINE_SPINLOCK(pipe_version_lock); static struct rpc_wait_queue pipe_version_rpc_waitqueue; static DECLARE_WAIT_QUEUE_HEAD(pipe_version_waitqueue); +static void gss_put_auth(struct gss_auth *gss_auth); static void gss_free_ctx(struct gss_cl_ctx *); static const struct rpc_pipe_ops gss_upcall_ops_v0; @@ -142,7 +143,7 @@ gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx) gss_get_ctx(ctx); rcu_assign_pointer(gss_cred->gc_ctx, ctx); set_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags); } @@ -320,6 +321,7 @@ gss_release_msg(struct gss_upcall_msg *gss_msg) if (gss_msg->ctx != NULL) gss_put_ctx(gss_msg->ctx); rpc_destroy_wait_queue(&gss_msg->rpc_waitqueue); + gss_put_auth(gss_msg->auth); kfree(gss_msg); } @@ -498,9 +500,12 @@ gss_alloc_msg(struct gss_auth *gss_auth, default: err = gss_encode_v1_msg(gss_msg, service_name, gss_auth->target_name); if (err) - goto err_free_msg; + goto err_put_pipe_version; }; + kref_get(&gss_auth->kref); return gss_msg; +err_put_pipe_version: + put_pipe_version(gss_auth->net); err_free_msg: kfree(gss_msg); err: @@ -532,14 +537,7 @@ gss_setup_upcall(struct gss_auth *gss_auth, struct rpc_cred *cred) static void warn_gssd(void) { - static unsigned long ratelimit; - unsigned long now = jiffies; - - if (time_after(now, ratelimit)) { - printk(KERN_WARNING "RPC: AUTH_GSS upcall timed out.\n" - "Please check user daemon is running.\n"); - ratelimit = now + 15*HZ; - } + dprintk("AUTH_GSS upcall failed. 
Please check user daemon is running.\n"); } static inline int @@ -600,7 +598,6 @@ gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) struct rpc_pipe *pipe; struct rpc_cred *cred = &gss_cred->gc_base; struct gss_upcall_msg *gss_msg; - unsigned long timeout; DEFINE_WAIT(wait); int err; @@ -608,17 +605,16 @@ gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) __func__, from_kuid(&init_user_ns, cred->cr_uid)); retry: err = 0; - /* Default timeout is 15s unless we know that gssd is not running */ - timeout = 15 * HZ; - if (!sn->gssd_running) - timeout = HZ >> 2; + /* if gssd is down, just skip upcalling altogether */ + if (!gssd_running(net)) { + warn_gssd(); + return -EACCES; + } gss_msg = gss_setup_upcall(gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { err = wait_event_interruptible_timeout(pipe_version_waitqueue, - sn->pipe_version >= 0, timeout); + sn->pipe_version >= 0, 15 * HZ); if (sn->pipe_version < 0) { - if (err == 0) - sn->gssd_running = 0; warn_gssd(); err = -EACCES; } @@ -1000,6 +996,8 @@ gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt) gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor); if (gss_auth->service == 0) goto err_put_mech; + if (!gssd_running(gss_auth->net)) + goto err_put_mech; auth = &gss_auth->rpc_auth; auth->au_cslack = GSS_CRED_SLACK >> 2; auth->au_rslack = GSS_VERF_SLACK >> 2; @@ -1071,6 +1069,12 @@ gss_free_callback(struct kref *kref) } static void +gss_put_auth(struct gss_auth *gss_auth) +{ + kref_put(&gss_auth->kref, gss_free_callback); +} + +static void gss_destroy(struct rpc_auth *auth) { struct gss_auth *gss_auth = container_of(auth, @@ -1091,7 +1095,7 @@ gss_destroy(struct rpc_auth *auth) gss_auth->gss_pipe[1] = NULL; rpcauth_destroy_credcache(auth); - kref_put(&gss_auth->kref, gss_free_callback); + gss_put_auth(gss_auth); } /* @@ -1262,7 +1266,7 @@ gss_destroy_nullcred(struct rpc_cred *cred) call_rcu(&cred->cr_rcu, gss_free_cred_callback); if (ctx) gss_put_ctx(ctx); - kref_put(&gss_auth->kref, gss_free_callback); + gss_put_auth(gss_auth); } static void @@ -1517,7 +1521,7 @@ out: static int gss_refresh_null(struct rpc_task *task) { - return -EACCES; + return 0; } static __be32 * diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c index 76e42e6be75..24589bd2a4b 100644 --- a/net/sunrpc/auth_gss/gss_krb5_keys.c +++ b/net/sunrpc/auth_gss/gss_krb5_keys.c @@ -59,6 +59,7 @@ #include <linux/crypto.h> #include <linux/sunrpc/gss_krb5.h> #include <linux/sunrpc/xdr.h> +#include <linux/lcm.h> #ifdef RPC_DEBUG # define RPCDBG_FACILITY RPCDBG_AUTH @@ -72,7 +73,7 @@ static void krb5_nfold(u32 inbits, const u8 *in, u32 outbits, u8 *out) { - int a, b, c, lcm; + unsigned long ulcm; int byte, i, msbit; /* the code below is more readable if I make these bytes @@ -82,17 +83,7 @@ static void krb5_nfold(u32 inbits, const u8 *in, outbits >>= 3; /* first compute lcm(n,k) */ - - a = outbits; - b = inbits; - - while (b != 0) { - c = b; - b = a%b; - a = c; - } - - lcm = outbits*inbits/a; + ulcm = lcm(inbits, outbits); /* now do the real work */ @@ -101,7 +92,7 @@ static void krb5_nfold(u32 inbits, const u8 *in, /* this will end up cycling through k lcm(k,n)/k times, which is correct */ - for (i = lcm-1; i >= 0; i--) { + for (i = ulcm-1; i >= 0; i--) { /* compute the msbit in k which gets added into this byte */ msbit = ( /* first, start with the msbit in the first, diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 
27ce2624093..92d5ab99fbf 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -218,10 +218,8 @@ static struct gss_api_mech *_gss_mech_get_by_pseudoflavor(u32 pseudoflavor) spin_lock(®istered_mechs_lock); list_for_each_entry(pos, ®istered_mechs, gm_list) { - if (!mech_supports_pseudoflavor(pos, pseudoflavor)) { - module_put(pos->gm_owner); + if (!mech_supports_pseudoflavor(pos, pseudoflavor)) continue; - } if (try_module_get(pos->gm_owner)) gm = pos; break; diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index 458f85e9b0b..abbb7dcd168 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -137,7 +137,6 @@ void init_gssp_clnt(struct sunrpc_net *sn) { mutex_init(&sn->gssp_lock); sn->gssp_clnt = NULL; - init_waitqueue_head(&sn->gssp_wq); } int set_gssp_clnt(struct net *net) @@ -154,7 +153,6 @@ int set_gssp_clnt(struct net *net) sn->gssp_clnt = clnt; } mutex_unlock(&sn->gssp_lock); - wake_up(&sn->gssp_wq); return ret; } diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 008cdade5aa..4ce5eccec1f 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1263,65 +1263,34 @@ out: return ret; } -DEFINE_SPINLOCK(use_gssp_lock); - -static bool use_gss_proxy(struct net *net) -{ - struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); - - if (sn->use_gss_proxy != -1) - return sn->use_gss_proxy; - spin_lock(&use_gssp_lock); - /* - * If you wanted gss-proxy, you should have said so before - * starting to accept requests: - */ - sn->use_gss_proxy = 0; - spin_unlock(&use_gssp_lock); - return 0; -} - -#ifdef CONFIG_PROC_FS - +/* + * Try to set the sn->use_gss_proxy variable to a new value. We only allow + * it to be changed if it's currently undefined (-1). If it's any other value + * then return -EBUSY unless the type wouldn't have changed anyway. 
+ */ static int set_gss_proxy(struct net *net, int type) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); - int ret = 0; + int ret; WARN_ON_ONCE(type != 0 && type != 1); - spin_lock(&use_gssp_lock); - if (sn->use_gss_proxy == -1 || sn->use_gss_proxy == type) - sn->use_gss_proxy = type; - else - ret = -EBUSY; - spin_unlock(&use_gssp_lock); - wake_up(&sn->gssp_wq); - return ret; -} - -static inline bool gssp_ready(struct sunrpc_net *sn) -{ - switch (sn->use_gss_proxy) { - case -1: - return false; - case 0: - return true; - case 1: - return sn->gssp_clnt; - } - WARN_ON_ONCE(1); - return false; + ret = cmpxchg(&sn->use_gss_proxy, -1, type); + if (ret != -1 && ret != type) + return -EBUSY; + return 0; } -static int wait_for_gss_proxy(struct net *net, struct file *file) +static bool use_gss_proxy(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); - if (file->f_flags & O_NONBLOCK && !gssp_ready(sn)) - return -EAGAIN; - return wait_event_interruptible(sn->gssp_wq, gssp_ready(sn)); + /* If use_gss_proxy is still undefined, then try to disable it */ + if (sn->use_gss_proxy == -1) + set_gss_proxy(net, 0); + return sn->use_gss_proxy; } +#ifdef CONFIG_PROC_FS static ssize_t write_gssp(struct file *file, const char __user *buf, size_t count, loff_t *ppos) @@ -1342,10 +1311,10 @@ static ssize_t write_gssp(struct file *file, const char __user *buf, return res; if (i != 1) return -EINVAL; - res = set_gss_proxy(net, 1); + res = set_gssp_clnt(net); if (res) return res; - res = set_gssp_clnt(net); + res = set_gss_proxy(net, 1); if (res) return res; return count; @@ -1355,16 +1324,12 @@ static ssize_t read_gssp(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct net *net = PDE_DATA(file_inode(file)); + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); unsigned long p = *ppos; char tbuf[10]; size_t len; - int ret; - ret = wait_for_gss_proxy(net, file); - if (ret) - return ret; - - snprintf(tbuf, sizeof(tbuf), "%d\n", use_gss_proxy(net)); + snprintf(tbuf, sizeof(tbuf), "%d\n", sn->use_gss_proxy); len = strlen(tbuf); if (p >= len) return 0; @@ -1538,6 +1503,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) if (unwrap_integ_data(rqstp, &rqstp->rq_arg, gc->gc_seq, rsci->mechctx)) goto garbage_args; + rqstp->rq_auth_slack = RPC_MAX_AUTH_SIZE; break; case RPC_GSS_SVC_PRIVACY: /* placeholders for length and seq. 
number: */ @@ -1546,6 +1512,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) if (unwrap_priv_data(rqstp, &rqstp->rq_arg, gc->gc_seq, rsci->mechctx)) goto garbage_args; + rqstp->rq_auth_slack = RPC_MAX_AUTH_SIZE * 2; break; default: goto auth_err; @@ -1626,8 +1593,7 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) BUG_ON(integ_len % 4); *p++ = htonl(integ_len); *p++ = htonl(gc->gc_seq); - if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, - integ_len)) + if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, integ_len)) BUG(); if (resbuf->tail[0].iov_base == NULL) { if (resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE > PAGE_SIZE) @@ -1635,10 +1601,8 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) resbuf->tail[0].iov_base = resbuf->head[0].iov_base + resbuf->head[0].iov_len; resbuf->tail[0].iov_len = 0; - resv = &resbuf->tail[0]; - } else { - resv = &resbuf->tail[0]; } + resv = &resbuf->tail[0]; mic.data = (u8 *)resv->iov_base + resv->iov_len + 4; if (gss_get_mic(gsd->rsci->mechctx, &integ_buf, &mic)) goto out_err; diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 890a29912d5..9761a0da964 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -64,7 +64,6 @@ static void xprt_free_allocation(struct rpc_rqst *req) free_page((unsigned long)xbufp->head[0].iov_base); xbufp = &req->rq_snd_buf; free_page((unsigned long)xbufp->head[0].iov_base); - list_del(&req->rq_bc_pa_list); kfree(req); } @@ -168,8 +167,10 @@ out_free: /* * Memory allocation failed, free the temporary list */ - list_for_each_entry_safe(req, tmp, &tmp_list, rq_bc_pa_list) + list_for_each_entry_safe(req, tmp, &tmp_list, rq_bc_pa_list) { + list_del(&req->rq_bc_pa_list); xprt_free_allocation(req); + } dprintk("RPC: setup backchannel transport failed\n"); return -ENOMEM; @@ -198,6 +199,7 @@ void xprt_destroy_backchannel(struct rpc_xprt *xprt, unsigned int max_reqs) xprt_dec_alloc_count(xprt, max_reqs); list_for_each_entry_safe(req, tmp, &xprt->bc_pa_list, rq_bc_pa_list) { dprintk("RPC: req=%p\n", req); + list_del(&req->rq_bc_pa_list); xprt_free_allocation(req); if (--max_reqs == 0) break; @@ -210,39 +212,23 @@ out: } EXPORT_SYMBOL_GPL(xprt_destroy_backchannel); -/* - * One or more rpc_rqst structure have been preallocated during the - * backchannel setup. Buffer space for the send and private XDR buffers - * has been preallocated as well. Use xprt_alloc_bc_request to allocate - * to this request. Use xprt_free_bc_request to return it. - * - * We know that we're called in soft interrupt context, grab the spin_lock - * since there is no need to grab the bottom half spin_lock. - * - * Return an available rpc_rqst, otherwise NULL if non are available. 
- */ -struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt) +static struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt, __be32 xid) { - struct rpc_rqst *req; + struct rpc_rqst *req = NULL; dprintk("RPC: allocate a backchannel request\n"); - spin_lock(&xprt->bc_pa_lock); - if (!list_empty(&xprt->bc_pa_list)) { - req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst, - rq_bc_pa_list); - list_del(&req->rq_bc_pa_list); - } else { - req = NULL; - } - spin_unlock(&xprt->bc_pa_lock); + if (list_empty(&xprt->bc_pa_list)) + goto not_found; - if (req != NULL) { - set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); - req->rq_reply_bytes_recvd = 0; - req->rq_bytes_sent = 0; - memcpy(&req->rq_private_buf, &req->rq_rcv_buf, + req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst, + rq_bc_pa_list); + req->rq_reply_bytes_recvd = 0; + req->rq_bytes_sent = 0; + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(req->rq_private_buf)); - } + req->rq_xid = xid; + req->rq_connect_cookie = xprt->connect_cookie; +not_found: dprintk("RPC: backchannel req=%p\n", req); return req; } @@ -257,10 +243,11 @@ void xprt_free_bc_request(struct rpc_rqst *req) dprintk("RPC: free backchannel req=%p\n", req); - smp_mb__before_clear_bit(); + req->rq_connect_cookie = xprt->connect_cookie - 1; + smp_mb__before_atomic(); WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state)); clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (!xprt_need_to_requeue(xprt)) { /* @@ -279,7 +266,57 @@ void xprt_free_bc_request(struct rpc_rqst *req) * may be reused by a new callback request. */ spin_lock_bh(&xprt->bc_pa_lock); - list_add(&req->rq_bc_pa_list, &xprt->bc_pa_list); + list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list); spin_unlock_bh(&xprt->bc_pa_lock); } +/* + * One or more rpc_rqst structure have been preallocated during the + * backchannel setup. Buffer space for the send and private XDR buffers + * has been preallocated as well. Use xprt_alloc_bc_request to allocate + * to this request. Use xprt_free_bc_request to return it. + * + * We know that we're called in soft interrupt context, grab the spin_lock + * since there is no need to grab the bottom half spin_lock. + * + * Return an available rpc_rqst, otherwise NULL if non are available. + */ +struct rpc_rqst *xprt_lookup_bc_request(struct rpc_xprt *xprt, __be32 xid) +{ + struct rpc_rqst *req; + + spin_lock(&xprt->bc_pa_lock); + list_for_each_entry(req, &xprt->bc_pa_list, rq_bc_pa_list) { + if (req->rq_connect_cookie != xprt->connect_cookie) + continue; + if (req->rq_xid == xid) + goto found; + } + req = xprt_alloc_bc_request(xprt, xid); +found: + spin_unlock(&xprt->bc_pa_lock); + return req; +} + +/* + * Add callback request to callback list. The callback + * service sleeps on the sv_cb_waitq waiting for new + * requests. Wake it up after adding enqueing the + * request. 
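[Note on the backchannel rework above] xprt_lookup_bc_request() now searches bc_pa_list for a preallocated request whose connect cookie and XID match (a callback already partially received) before falling back to claiming and tagging a fresh entry. A simplified, self-contained model of that lookup-or-claim step; the struct, the "claim the head" fallback, and the demo values are invented for illustration:

#include <stdint.h>
#include <stdio.h>

struct bc_req {
    uint32_t xid;
    uint32_t cookie;
    struct bc_req *next;
};

/* Prefer an entry already tagged for this call (same connection cookie and
 * XID); otherwise claim the first preallocated entry and tag it. */
static struct bc_req *bc_lookup(struct bc_req *head, uint32_t xid, uint32_t cookie)
{
    struct bc_req *req;

    for (req = head; req; req = req->next)
        if (req->cookie == cookie && req->xid == xid)
            return req;
    if (head) {                 /* "allocate" from the preallocated pool */
        head->xid = xid;
        head->cookie = cookie;
    }
    return head;
}

int main(void)
{
    struct bc_req b = { .xid = 7, .cookie = 1, .next = NULL };
    struct bc_req a = { .xid = 3, .cookie = 2, .next = &b };

    printf("%u\n", bc_lookup(&a, 7, 1)->xid);   /* finds b: prints 7 */
    printf("%u\n", bc_lookup(&a, 9, 2)->xid);   /* claims and tags a: prints 9 */
    return 0;
}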
+ */ +void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied) +{ + struct rpc_xprt *xprt = req->rq_xprt; + struct svc_serv *bc_serv = xprt->bc_serv; + + req->rq_private_buf.len = copied; + set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); + + dprintk("RPC: add callback request to list\n"); + spin_lock(&bc_serv->sv_cb_lock); + list_del(&req->rq_bc_pa_list); + list_add(&req->rq_bc_list, &bc_serv->sv_cb_list); + wake_up(&bc_serv->sv_cb_waitq); + spin_unlock(&bc_serv->sv_cb_lock); +} + diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index a72de074172..06636214113 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -374,7 +374,7 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd) } return; out: - printk(KERN_ERR "nfsd: failed to unregister %s cache\n", cd->name); + printk(KERN_ERR "RPC: failed to unregister %s cache\n", cd->name); } EXPORT_SYMBOL_GPL(sunrpc_destroy_cache_detail); @@ -619,7 +619,7 @@ static void cache_limit_defers(void) /* Consider removing either the first or the last */ if (cache_defer_cnt > DFR_MAX) { - if (net_random() & 1) + if (prandom_u32() & 1) discard = list_entry(cache_defer_list.next, struct cache_deferred_req, recent); else @@ -1111,9 +1111,7 @@ void qword_addhex(char **bpp, int *lp, char *buf, int blen) *bp++ = 'x'; len -= 2; while (blen && len >= 2) { - unsigned char c = *buf++; - *bp++ = '0' + ((c&0xf0)>>4) + (c>=0xa0)*('a'-'9'-1); - *bp++ = '0' + (c&0x0f) + ((c&0x0f)>=0x0a)*('a'-'9'-1); + bp = hex_byte_pack(bp, *buf++); len -= 2; blen--; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index f09b7db2c49..2e6ab10734f 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -438,6 +438,38 @@ out_no_rpciod: return ERR_PTR(err); } +struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, + struct rpc_xprt *xprt) +{ + struct rpc_clnt *clnt = NULL; + + clnt = rpc_new_client(args, xprt, NULL); + if (IS_ERR(clnt)) + return clnt; + + if (!(args->flags & RPC_CLNT_CREATE_NOPING)) { + int err = rpc_ping(clnt); + if (err != 0) { + rpc_shutdown_client(clnt); + return ERR_PTR(err); + } + } + + clnt->cl_softrtry = 1; + if (args->flags & RPC_CLNT_CREATE_HARDRTRY) + clnt->cl_softrtry = 0; + + if (args->flags & RPC_CLNT_CREATE_AUTOBIND) + clnt->cl_autobind = 1; + if (args->flags & RPC_CLNT_CREATE_DISCRTRY) + clnt->cl_discrtry = 1; + if (!(args->flags & RPC_CLNT_CREATE_QUIET)) + clnt->cl_chatty = 1; + + return clnt; +} +EXPORT_SYMBOL_GPL(rpc_create_xprt); + /** * rpc_create - create an RPC client and transport with one call * @args: rpc_clnt create argument structure @@ -451,7 +483,6 @@ out_no_rpciod: struct rpc_clnt *rpc_create(struct rpc_create_args *args) { struct rpc_xprt *xprt; - struct rpc_clnt *clnt; struct xprt_create xprtargs = { .net = args->net, .ident = args->protocol, @@ -515,30 +546,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT) xprt->resvport = 0; - clnt = rpc_new_client(args, xprt, NULL); - if (IS_ERR(clnt)) - return clnt; - - if (!(args->flags & RPC_CLNT_CREATE_NOPING)) { - int err = rpc_ping(clnt); - if (err != 0) { - rpc_shutdown_client(clnt); - return ERR_PTR(err); - } - } - - clnt->cl_softrtry = 1; - if (args->flags & RPC_CLNT_CREATE_HARDRTRY) - clnt->cl_softrtry = 0; - - if (args->flags & RPC_CLNT_CREATE_AUTOBIND) - clnt->cl_autobind = 1; - if (args->flags & RPC_CLNT_CREATE_DISCRTRY) - clnt->cl_discrtry = 1; - if (!(args->flags & RPC_CLNT_CREATE_QUIET)) - clnt->cl_chatty = 1; - - return clnt; + return rpc_create_xprt(args, xprt); } 
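[Note on the cache.c hunk above] qword_addhex() drops the open-coded nibble arithmetic in favour of the kernel's hex_byte_pack() helper. For reference, a standalone equivalent of what that helper does; only the lookup-table idea is taken from the kernel, the main() demo is illustrative:

#include <stdio.h>

static const char hex_asc[] = "0123456789abcdef";

/* Write the two lowercase hex digits of byte into buf, return buf + 2. */
static char *hex_byte_pack(char *buf, unsigned char byte)
{
    *buf++ = hex_asc[(byte >> 4) & 0x0f];
    *buf++ = hex_asc[byte & 0x0f];
    return buf;
}

int main(void)
{
    char out[3] = "";

    hex_byte_pack(out, 0xa5);
    printf("%s\n", out);    /* prints "a5" */
    return 0;
}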
EXPORT_SYMBOL_GPL(rpc_create); @@ -1363,6 +1371,7 @@ rpc_restart_call_prepare(struct rpc_task *task) if (RPC_ASSASSINATED(task)) return 0; task->tk_action = call_start; + task->tk_status = 0; if (task->tk_ops->rpc_call_prepare != NULL) task->tk_action = rpc_prepare_task; return 1; @@ -1379,6 +1388,7 @@ rpc_restart_call(struct rpc_task *task) if (RPC_ASSASSINATED(task)) return 0; task->tk_action = call_start; + task->tk_status = 0; return 1; } EXPORT_SYMBOL_GPL(rpc_restart_call); @@ -1529,9 +1539,13 @@ call_refreshresult(struct rpc_task *task) task->tk_action = call_refresh; switch (status) { case 0: - if (rpcauth_uptodatecred(task)) + if (rpcauth_uptodatecred(task)) { task->tk_action = call_allocate; - return; + return; + } + /* Use rate-limiting and a max number of retries if refresh + * had status 0 but failed to update the cred. + */ case -ETIMEDOUT: rpc_delay(task, 3*HZ); case -EAGAIN: @@ -1724,11 +1738,10 @@ call_bind_status(struct rpc_task *task) case -EPROTONOSUPPORT: dprintk("RPC: %5u remote rpcbind version unavailable, retrying\n", task->tk_pid); - task->tk_status = 0; - task->tk_action = call_bind; - return; + goto retry_timeout; case -ECONNREFUSED: /* connection problems */ case -ECONNRESET: + case -ECONNABORTED: case -ENOTCONN: case -EHOSTDOWN: case -EHOSTUNREACH: @@ -1751,6 +1764,7 @@ call_bind_status(struct rpc_task *task) return; retry_timeout: + task->tk_status = 0; task->tk_action = call_timeout; } @@ -1793,19 +1807,19 @@ call_connect_status(struct rpc_task *task) trace_rpc_connect_status(task, status); task->tk_status = 0; switch (status) { - /* if soft mounted, test if we've timed out */ - case -ETIMEDOUT: - task->tk_action = call_timeout; - return; case -ECONNREFUSED: case -ECONNRESET: + case -ECONNABORTED: case -ENETUNREACH: - /* retry with existing socket, after a delay */ - rpc_delay(task, 3*HZ); + case -EHOSTUNREACH: if (RPC_IS_SOFTCONN(task)) break; + /* retry with existing socket, after a delay */ + rpc_delay(task, 3*HZ); case -EAGAIN: - task->tk_action = call_bind; + /* Check for timeouts before looping back to call_bind */ + case -ETIMEDOUT: + task->tk_action = call_timeout; return; case 0: clnt->cl_stats->netreconn++; @@ -1902,6 +1916,7 @@ call_transmit_status(struct rpc_task *task) break; } case -ECONNRESET: + case -ECONNABORTED: case -ENOTCONN: case -EPIPE: rpc_task_force_reencode(task); @@ -1999,6 +2014,10 @@ call_status(struct rpc_task *task) case -EHOSTDOWN: case -EHOSTUNREACH: case -ENETUNREACH: + if (RPC_IS_SOFTCONN(task)) { + rpc_exit(task, status); + break; + } /* * Delay any retries for 3 seconds, then handle as if it * were a timeout. 
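[Note on the call_bind_status/call_connect_status/call_status hunks below] The error handling is consolidated: -ECONNABORTED joins the other hard connection errors, soft-connect tasks fail immediately, and everything else funnels through the timeout check so retries stay bounded. A rough standalone model of that disposition logic, with invented enum and function names; it mirrors the fall-through structure of the reworked call_connect_status():

#include <errno.h>
#include <stdbool.h>

enum next_step { STEP_DONE, STEP_CHECK_TIMEOUT, STEP_FAIL };

/* Hard connection errors fail a "softconn" task at once; otherwise we delay
 * and fall through so the timeout logic still bounds the number of retries. */
static enum next_step connect_disposition(int status, bool softconn, bool *delay)
{
    *delay = false;
    switch (status) {
    case 0:
        return STEP_DONE;
    case -ECONNREFUSED:
    case -ECONNRESET:
    case -ECONNABORTED:
    case -ENETUNREACH:
    case -EHOSTUNREACH:
        if (softconn)
            return STEP_FAIL;
        *delay = true;          /* wait before retrying on the same socket */
        /* fall through */
    case -EAGAIN:
        /* fall through */
    case -ETIMEDOUT:
        return STEP_CHECK_TIMEOUT;
    default:
        return STEP_FAIL;
    }
}

int main(void)
{
    bool delay;
    enum next_step s = connect_disposition(-ECONNRESET, false, &delay);

    return (s == STEP_CHECK_TIMEOUT && delay) ? 0 : 1;
}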
@@ -2011,8 +2030,9 @@ call_status(struct rpc_task *task) xprt_conditional_disconnect(req->rq_xprt, req->rq_connect_cookie); break; - case -ECONNRESET: case -ECONNREFUSED: + case -ECONNRESET: + case -ECONNABORTED: rpc_force_rebind(clnt); rpc_delay(task, 3*HZ); case -EPIPE: diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h index 779742cfc1f..df582687653 100644 --- a/net/sunrpc/netns.h +++ b/net/sunrpc/netns.h @@ -14,6 +14,7 @@ struct sunrpc_net { struct cache_detail *rsi_cache; struct super_block *pipefs_sb; + struct rpc_pipe *gssd_dummy; struct mutex pipefs_sb_lock; struct list_head all_clients; @@ -26,14 +27,11 @@ struct sunrpc_net { unsigned int rpcb_is_af_local : 1; struct mutex gssp_lock; - wait_queue_head_t gssp_wq; struct rpc_clnt *gssp_clnt; int use_gss_proxy; int pipe_version; atomic_t pipe_users; struct proc_dir_entry *use_gssp_proc; - - unsigned int gssd_running; }; extern int sunrpc_net_id; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index bf04b30a788..b1855489856 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -17,6 +17,7 @@ #include <linux/fsnotify.h> #include <linux/kernel.h> #include <linux/rcupdate.h> +#include <linux/utsname.h> #include <asm/ioctls.h> #include <linux/poll.h> @@ -38,7 +39,7 @@ #define NET_NAME(net) ((net == &init_net) ? " (init_net)" : "") static struct file_system_type rpc_pipe_fs_type; - +static const struct rpc_pipe_ops gssd_dummy_pipe_ops; static struct kmem_cache *rpc_inode_cachep __read_mostly; @@ -216,14 +217,11 @@ rpc_destroy_inode(struct inode *inode) static int rpc_pipe_open(struct inode *inode, struct file *filp) { - struct net *net = inode->i_sb->s_fs_info; - struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_pipe *pipe; int first_open; int res = -ENXIO; mutex_lock(&inode->i_mutex); - sn->gssd_running = 1; pipe = RPC_I(inode)->pipe; if (pipe == NULL) goto out; @@ -1159,6 +1157,7 @@ enum { RPCAUTH_nfsd4_cb, RPCAUTH_cache, RPCAUTH_nfsd, + RPCAUTH_gssd, RPCAUTH_RootEOF }; @@ -1195,6 +1194,10 @@ static const struct rpc_filelist files[] = { .name = "nfsd", .mode = S_IFDIR | S_IRUGO | S_IXUGO, }, + [RPCAUTH_gssd] = { + .name = "gssd", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, }; /* @@ -1208,13 +1211,24 @@ struct dentry *rpc_d_lookup_sb(const struct super_block *sb, } EXPORT_SYMBOL_GPL(rpc_d_lookup_sb); -void rpc_pipefs_init_net(struct net *net) +int rpc_pipefs_init_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + sn->gssd_dummy = rpc_mkpipe_data(&gssd_dummy_pipe_ops, 0); + if (IS_ERR(sn->gssd_dummy)) + return PTR_ERR(sn->gssd_dummy); + mutex_init(&sn->pipefs_sb_lock); - sn->gssd_running = 1; sn->pipe_version = -1; + return 0; +} + +void rpc_pipefs_exit_net(struct net *net) +{ + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + + rpc_destroy_pipe_data(sn->gssd_dummy); } /* @@ -1244,11 +1258,134 @@ void rpc_put_sb_net(const struct net *net) } EXPORT_SYMBOL_GPL(rpc_put_sb_net); +static const struct rpc_filelist gssd_dummy_clnt_dir[] = { + [0] = { + .name = "clntXX", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, +}; + +static ssize_t +dummy_downcall(struct file *filp, const char __user *src, size_t len) +{ + return -EINVAL; +} + +static const struct rpc_pipe_ops gssd_dummy_pipe_ops = { + .upcall = rpc_pipe_generic_upcall, + .downcall = dummy_downcall, +}; + +/* + * Here we present a bogus "info" file to keep rpc.gssd happy. 
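[Note on the rpc_pipe.c hunks above] rpc_pipefs_init_net() can now fail (it allocates the dummy gssd pipe), so it returns an int, gains a matching rpc_pipefs_exit_net(), and callers have to unwind whatever they had already set up. The usual shape of that init/teardown pattern, as a hedged sketch with made-up step names standing in for the cache, proc and pipefs steps:

#include <errno.h>
#include <stdio.h>

static int  setup_cache(void)    { return 0; }
static void teardown_cache(void) { }
static int  setup_proc(void)     { return 0; }
static void teardown_proc(void)  { }
static int  setup_pipe(void)     { return -ENOMEM; }   /* pretend this fails */

/* Each step that can fail unwinds the steps that already succeeded,
 * in reverse order; the error labels mirror the setup order. */
static int subsystem_init(void)
{
    int err;

    err = setup_cache();
    if (err)
        goto out;
    err = setup_proc();
    if (err)
        goto err_cache;
    err = setup_pipe();
    if (err)
        goto err_proc;
    return 0;

err_proc:
    teardown_proc();
err_cache:
    teardown_cache();
out:
    return err;
}

int main(void)
{
    printf("init: %d\n", subsystem_init());   /* -ENOMEM from the pipe step */
    return 0;
}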
We don't expect + * that it will ever use this info to handle an upcall, but rpc.gssd expects + * that this file will be there and have a certain format. + */ +static int +rpc_show_dummy_info(struct seq_file *m, void *v) +{ + seq_printf(m, "RPC server: %s\n", utsname()->nodename); + seq_printf(m, "service: foo (1) version 0\n"); + seq_printf(m, "address: 127.0.0.1\n"); + seq_printf(m, "protocol: tcp\n"); + seq_printf(m, "port: 0\n"); + return 0; +} + +static int +rpc_dummy_info_open(struct inode *inode, struct file *file) +{ + return single_open(file, rpc_show_dummy_info, NULL); +} + +static const struct file_operations rpc_dummy_info_operations = { + .owner = THIS_MODULE, + .open = rpc_dummy_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct rpc_filelist gssd_dummy_info_file[] = { + [0] = { + .name = "info", + .i_fop = &rpc_dummy_info_operations, + .mode = S_IFREG | S_IRUSR, + }, +}; + +/** + * rpc_gssd_dummy_populate - create a dummy gssd pipe + * @root: root of the rpc_pipefs filesystem + * @pipe_data: pipe data created when netns is initialized + * + * Create a dummy set of directories and a pipe that gssd can hold open to + * indicate that it is up and running. + */ +static struct dentry * +rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data) +{ + int ret = 0; + struct dentry *gssd_dentry; + struct dentry *clnt_dentry = NULL; + struct dentry *pipe_dentry = NULL; + struct qstr q = QSTR_INIT(files[RPCAUTH_gssd].name, + strlen(files[RPCAUTH_gssd].name)); + + /* We should never get this far if "gssd" doesn't exist */ + gssd_dentry = d_hash_and_lookup(root, &q); + if (!gssd_dentry) + return ERR_PTR(-ENOENT); + + ret = rpc_populate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1, NULL); + if (ret) { + pipe_dentry = ERR_PTR(ret); + goto out; + } + + q.name = gssd_dummy_clnt_dir[0].name; + q.len = strlen(gssd_dummy_clnt_dir[0].name); + clnt_dentry = d_hash_and_lookup(gssd_dentry, &q); + if (!clnt_dentry) { + pipe_dentry = ERR_PTR(-ENOENT); + goto out; + } + + ret = rpc_populate(clnt_dentry, gssd_dummy_info_file, 0, 1, NULL); + if (ret) { + __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); + pipe_dentry = ERR_PTR(ret); + goto out; + } + + pipe_dentry = rpc_mkpipe_dentry(clnt_dentry, "gssd", NULL, pipe_data); + if (IS_ERR(pipe_dentry)) { + __rpc_depopulate(clnt_dentry, gssd_dummy_info_file, 0, 1); + __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); + } +out: + dput(clnt_dentry); + dput(gssd_dentry); + return pipe_dentry; +} + +static void +rpc_gssd_dummy_depopulate(struct dentry *pipe_dentry) +{ + struct dentry *clnt_dir = pipe_dentry->d_parent; + struct dentry *gssd_dir = clnt_dir->d_parent; + + __rpc_rmpipe(clnt_dir->d_inode, pipe_dentry); + __rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1); + __rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1); + dput(pipe_dentry); +} + static int rpc_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; - struct dentry *root; + struct dentry *root, *gssd_dentry; struct net *net = data; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int err; @@ -1266,6 +1403,13 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; if (rpc_populate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF, NULL)) return -ENOMEM; + + gssd_dentry = rpc_gssd_dummy_populate(root, sn->gssd_dummy); + if (IS_ERR(gssd_dentry)) { + __rpc_depopulate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF); + return PTR_ERR(gssd_dentry); + } + 
dprintk("RPC: sending pipefs MOUNT notification for net %p%s\n", net, NET_NAME(net)); mutex_lock(&sn->pipefs_sb_lock); @@ -1280,6 +1424,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) return 0; err_depopulate: + rpc_gssd_dummy_depopulate(gssd_dentry); blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_UMOUNT, sb); @@ -1289,6 +1434,16 @@ err_depopulate: return err; } +bool +gssd_running(struct net *net) +{ + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + struct rpc_pipe *pipe = sn->gssd_dummy; + + return pipe->nreaders || pipe->nwriters; +} +EXPORT_SYMBOL_GPL(gssd_running); + static struct dentry * rpc_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index ff3cc4bf4b2..c0365c14b85 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -637,7 +637,8 @@ static void __rpc_queue_timer_fn(unsigned long ptr) static void __rpc_atrun(struct rpc_task *task) { - task->tk_status = 0; + if (task->tk_status == -ETIMEDOUT) + task->tk_status = 0; } /* @@ -831,7 +832,8 @@ static void rpc_async_schedule(struct work_struct *work) * @size: requested byte size * * To prevent rpciod from hanging, this allocator never sleeps, - * returning NULL if the request cannot be serviced immediately. + * returning NULL and suppressing warning if the request cannot be serviced + * immediately. * The caller can arrange to sleep in a way that is safe for rpciod. * * Most requests are 'small' (under 2KiB) and can be serviced from a @@ -844,7 +846,7 @@ static void rpc_async_schedule(struct work_struct *work) void *rpc_malloc(struct rpc_task *task, size_t size) { struct rpc_buffer *buf; - gfp_t gfp = GFP_NOWAIT; + gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN; if (RPC_IS_SWAPPER(task)) gfp |= __GFP_MEMALLOC; diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 0a648c502fc..2df87f78e51 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -173,7 +173,8 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) return -1; if (csum_fold(desc.csum)) return -1; - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) netdev_rx_csum_fault(skb->dev); return 0; no_checksum: diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h index 14c9f6d1c5f..f2b7cb540e6 100644 --- a/net/sunrpc/sunrpc.h +++ b/net/sunrpc/sunrpc.h @@ -43,6 +43,19 @@ static inline int rpc_reply_expected(struct rpc_task *task) (task->tk_msg.rpc_proc->p_decode != NULL); } +static inline int sock_is_loopback(struct sock *sk) +{ + struct dst_entry *dst; + int loopback = 0; + rcu_read_lock(); + dst = rcu_dereference(sk->sk_dst_cache); + if (dst && dst->dev && + (dst->dev->features & NETIF_F_LOOPBACK)) + loopback = 1; + rcu_read_unlock(); + return loopback; +} + int svc_send_common(struct socket *sock, struct xdr_buf *xdr, struct page *headpage, unsigned long headoffset, struct page *tailpage, unsigned long tailoffset); diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 3d6498af9ad..cd30120de9e 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -44,12 +44,17 @@ static __net_init int sunrpc_init_net(struct net *net) if (err) goto err_unixgid; - rpc_pipefs_init_net(net); + err = rpc_pipefs_init_net(net); + if (err) + goto err_pipefs; + INIT_LIST_HEAD(&sn->all_clients); spin_lock_init(&sn->rpc_client_lock); spin_lock_init(&sn->rpcb_clnt_lock); return 0; +err_pipefs: + 
unix_gid_cache_destroy(net); err_unixgid: ip_map_cache_destroy(net); err_ipmap: @@ -60,6 +65,7 @@ err_proc: static __net_exit void sunrpc_exit_net(struct net *net) { + rpc_pipefs_exit_net(net); unix_gid_cache_destroy(net); ip_map_cache_destroy(net); rpc_proc_exit(net); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index e7fbe368b4a..5de6801cd92 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -916,9 +916,6 @@ static int __svc_register(struct net *net, const char *progname, #endif } - if (error < 0) - printk(KERN_WARNING "svc: failed to register %sv%u RPC " - "service (errno %d).\n", progname, version, -error); return error; } @@ -937,6 +934,7 @@ int svc_register(const struct svc_serv *serv, struct net *net, const unsigned short port) { struct svc_program *progp; + struct svc_version *vers; unsigned int i; int error = 0; @@ -946,7 +944,8 @@ int svc_register(const struct svc_serv *serv, struct net *net, for (progp = serv->sv_program; progp; progp = progp->pg_next) { for (i = 0; i < progp->pg_nvers; i++) { - if (progp->pg_vers[i] == NULL) + vers = progp->pg_vers[i]; + if (vers == NULL) continue; dprintk("svc: svc_register(%sv%d, %s, %u, %u)%s\n", @@ -955,16 +954,26 @@ int svc_register(const struct svc_serv *serv, struct net *net, proto == IPPROTO_UDP? "udp" : "tcp", port, family, - progp->pg_vers[i]->vs_hidden? - " (but not telling portmap)" : ""); + vers->vs_hidden ? + " (but not telling portmap)" : ""); - if (progp->pg_vers[i]->vs_hidden) + if (vers->vs_hidden) continue; error = __svc_register(net, progp->pg_name, progp->pg_prog, i, family, proto, port); - if (error < 0) + + if (vers->vs_rpcb_optnl) { + error = 0; + continue; + } + + if (error < 0) { + printk(KERN_WARNING "svc: failed to register " + "%sv%u RPC service (errno %d).\n", + progp->pg_name, i, -error); break; + } } } diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 80a6640f329..b4737fbdec1 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -571,7 +571,7 @@ static void svc_check_conn_limits(struct svc_serv *serv) } } -int svc_alloc_arg(struct svc_rqst *rqstp) +static int svc_alloc_arg(struct svc_rqst *rqstp) { struct svc_serv *serv = rqstp->rq_server; struct xdr_buf *arg; @@ -597,6 +597,7 @@ int svc_alloc_arg(struct svc_rqst *rqstp) } rqstp->rq_pages[i] = p; } + rqstp->rq_page_end = &rqstp->rq_pages[i]; rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ /* Make arg->head point to first page and arg->pages point to rest */ @@ -612,7 +613,7 @@ int svc_alloc_arg(struct svc_rqst *rqstp) return 0; } -struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) +static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) { struct svc_xprt *xprt; struct svc_pool *pool = rqstp->rq_pool; @@ -691,7 +692,7 @@ struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) return xprt; } -void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt) +static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt) { spin_lock_bh(&serv->sv_lock); set_bit(XPT_TEMP, &newxpt->xpt_flags); @@ -730,6 +731,8 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) newxpt = xprt->xpt_ops->xpo_accept(xprt); if (newxpt) svc_add_new_temp_xprt(serv, newxpt); + else + module_put(xprt->xpt_class->xcl_owner); } else if (xprt->xpt_ops->xpo_has_wspace(xprt)) { /* XPT_DATA|XPT_DEFERRED case: */ dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", @@ -793,7 +796,7 @@ int svc_recv(struct svc_rqst *rqstp, 
long timeout) clear_bit(XPT_OLD, &xprt->xpt_flags); - rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp)); + rqstp->rq_secure = xprt->xpt_ops->xpo_secure_port(rqstp); rqstp->rq_chandle.defer = svc_defer; if (serv->sv_stats) diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 2af7b0cba43..79c0f3459b5 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -54,6 +54,8 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) } spin_unlock(&authtab_lock); + rqstp->rq_auth_slack = 0; + rqstp->rq_authop = aops; return aops->accept(rqstp, authp); } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index b6e59f0a947..b507cd327d9 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -60,7 +60,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, int flags); -static void svc_udp_data_ready(struct sock *, int); +static void svc_udp_data_ready(struct sock *); static int svc_udp_recvfrom(struct svc_rqst *); static int svc_udp_sendto(struct svc_rqst *); static void svc_sock_detach(struct svc_xprt *); @@ -400,17 +400,23 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd, release_sock(sock->sk); #endif } + +static int svc_sock_secure_port(struct svc_rqst *rqstp) +{ + return svc_port_is_privileged(svc_addr(rqstp)); +} + /* * INET callback when data has been received on the socket. */ -static void svc_udp_data_ready(struct sock *sk, int count) +static void svc_udp_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; wait_queue_head_t *wq = sk_sleep(sk); if (svsk) { - dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", - svsk, sk, count, + dprintk("svc: socket %p(inet %p), busy=%d\n", + svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); svc_xprt_enqueue(&svsk->sk_xprt); @@ -678,6 +684,7 @@ static struct svc_xprt_ops svc_udp_ops = { .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr, .xpo_has_wspace = svc_udp_has_wspace, .xpo_accept = svc_udp_accept, + .xpo_secure_port = svc_sock_secure_port, }; static struct svc_xprt_class svc_udp_class = { @@ -731,7 +738,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) * A data_ready event on a listening socket means there's a connection * pending. Do not use state_change as a substitute for it. */ -static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused) +static void svc_tcp_listen_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; wait_queue_head_t *wq; @@ -783,7 +790,7 @@ static void svc_tcp_state_change(struct sock *sk) wake_up_interruptible_all(wq); } -static void svc_tcp_data_ready(struct sock *sk, int count) +static void svc_tcp_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; wait_queue_head_t *wq = sk_sleep(sk); @@ -842,8 +849,7 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) * tell us anything. For now just warn about unpriv connections. 
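[Note on the xpo_secure_port hunks above] Each transport class now decides what "secure" means; for sockets, svc_sock_secure_port() simply asks whether the peer used a privileged source port. A standalone sketch of that check for an IPv4/IPv6 sockaddr; PROT_SOCK is the kernel constant, the helper name and demo are assumptions made here:

#include <stdbool.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

#define PROT_SOCK 1024   /* first non-privileged port, as in the kernel */

/* True if the peer bound a privileged source port (classic AUTH_UNIX hint). */
static bool port_is_privileged(const struct sockaddr *sa)
{
    in_port_t port;

    switch (sa->sa_family) {
    case AF_INET:
        port = ntohs(((const struct sockaddr_in *)sa)->sin_port);
        break;
    case AF_INET6:
        port = ntohs(((const struct sockaddr_in6 *)sa)->sin6_port);
        break;
    default:
        return false;
    }
    return port < PROT_SOCK;
}

int main(void)
{
    struct sockaddr_in sin = { .sin_family = AF_INET, .sin_port = htons(703) };

    return port_is_privileged((struct sockaddr *)&sin) ? 0 : 1;
}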
*/ if (!svc_port_is_privileged(sin)) { - dprintk(KERN_WARNING - "%s: connect from unprivileged port: %s\n", + dprintk("%s: connect from unprivileged port: %s\n", serv->sv_name, __svc_print_addr(sin, buf, sizeof(buf))); } @@ -867,6 +873,10 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) } svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen); + if (sock_is_loopback(newsock->sk)) + set_bit(XPT_LOCAL, &newsvsk->sk_xprt.xpt_flags); + else + clear_bit(XPT_LOCAL, &newsvsk->sk_xprt.xpt_flags); if (serv->sv_stats) serv->sv_stats->nettcpconn++; @@ -1112,6 +1122,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) rqstp->rq_xprt_ctxt = NULL; rqstp->rq_prot = IPPROTO_TCP; + rqstp->rq_local = !!test_bit(XPT_LOCAL, &svsk->sk_xprt.xpt_flags); p = (__be32 *)rqstp->rq_arg.head[0].iov_base; calldir = p[1]; @@ -1234,6 +1245,7 @@ static struct svc_xprt_ops svc_tcp_bc_ops = { .xpo_detach = svc_bc_tcp_sock_detach, .xpo_free = svc_bc_sock_free, .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, + .xpo_secure_port = svc_sock_secure_port, }; static struct svc_xprt_class svc_tcp_bc_class = { @@ -1272,6 +1284,7 @@ static struct svc_xprt_ops svc_tcp_ops = { .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, .xpo_has_wspace = svc_tcp_has_wspace, .xpo_accept = svc_tcp_accept, + .xpo_secure_port = svc_sock_secure_port, }; static struct svc_xprt_class svc_tcp_class = { @@ -1397,6 +1410,22 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, return svsk; } +bool svc_alien_sock(struct net *net, int fd) +{ + int err; + struct socket *sock = sockfd_lookup(fd, &err); + bool ret = false; + + if (!sock) + goto out; + if (sock_net(sock->sk) != net) + ret = true; + sockfd_put(sock); +out: + return ret; +} +EXPORT_SYMBOL_GPL(svc_alien_sock); + /** * svc_addsock - add a listener socket to an RPC service * @serv: pointer to RPC service to which to add a new listener diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 1504bb11e4f..23fb4e75e24 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -462,6 +462,7 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) struct kvec *iov = buf->head; int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len; + xdr_set_scratch_buffer(xdr, NULL, 0); BUG_ON(scratch_len < 0); xdr->buf = buf; xdr->iov = iov; @@ -482,6 +483,73 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) EXPORT_SYMBOL_GPL(xdr_init_encode); /** + * xdr_commit_encode - Ensure all data is written to buffer + * @xdr: pointer to xdr_stream + * + * We handle encoding across page boundaries by giving the caller a + * temporary location to write to, then later copying the data into + * place; xdr_commit_encode does that copying. + * + * Normally the caller doesn't need to call this directly, as the + * following xdr_reserve_space will do it. But an explicit call may be + * required at the end of encoding, or any other time when the xdr_buf + * data might be read. 
+ */ +void xdr_commit_encode(struct xdr_stream *xdr) +{ + int shift = xdr->scratch.iov_len; + void *page; + + if (shift == 0) + return; + page = page_address(*xdr->page_ptr); + memcpy(xdr->scratch.iov_base, page, shift); + memmove(page, page + shift, (void *)xdr->p - page); + xdr->scratch.iov_len = 0; +} +EXPORT_SYMBOL_GPL(xdr_commit_encode); + +__be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, size_t nbytes) +{ + static __be32 *p; + int space_left; + int frag1bytes, frag2bytes; + + if (nbytes > PAGE_SIZE) + return NULL; /* Bigger buffers require special handling */ + if (xdr->buf->len + nbytes > xdr->buf->buflen) + return NULL; /* Sorry, we're totally out of space */ + frag1bytes = (xdr->end - xdr->p) << 2; + frag2bytes = nbytes - frag1bytes; + if (xdr->iov) + xdr->iov->iov_len += frag1bytes; + else + xdr->buf->page_len += frag1bytes; + xdr->page_ptr++; + xdr->iov = NULL; + /* + * If the last encode didn't end exactly on a page boundary, the + * next one will straddle boundaries. Encode into the next + * page, then copy it back later in xdr_commit_encode. We use + * the "scratch" iov to track any temporarily unused fragment of + * space at the end of the previous buffer: + */ + xdr->scratch.iov_base = xdr->p; + xdr->scratch.iov_len = frag1bytes; + p = page_address(*xdr->page_ptr); + /* + * Note this is where the next encode will start after we've + * shifted this one back: + */ + xdr->p = (void *)p + frag2bytes; + space_left = xdr->buf->buflen - xdr->buf->len; + xdr->end = (void *)p + min_t(int, space_left, PAGE_SIZE); + xdr->buf->page_len += frag2bytes; + xdr->buf->len += nbytes; + return p; +} + +/** * xdr_reserve_space - Reserve buffer space for sending * @xdr: pointer to xdr_stream * @nbytes: number of bytes to reserve @@ -495,20 +563,122 @@ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes) __be32 *p = xdr->p; __be32 *q; + xdr_commit_encode(xdr); /* align nbytes on the next 32-bit boundary */ nbytes += 3; nbytes &= ~3; q = p + (nbytes >> 2); if (unlikely(q > xdr->end || q < p)) - return NULL; + return xdr_get_next_encode_buffer(xdr, nbytes); xdr->p = q; - xdr->iov->iov_len += nbytes; + if (xdr->iov) + xdr->iov->iov_len += nbytes; + else + xdr->buf->page_len += nbytes; xdr->buf->len += nbytes; return p; } EXPORT_SYMBOL_GPL(xdr_reserve_space); /** + * xdr_truncate_encode - truncate an encode buffer + * @xdr: pointer to xdr_stream + * @len: new length of buffer + * + * Truncates the xdr stream, so that xdr->buf->len == len, + * and xdr->p points at offset len from the start of the buffer, and + * head, tail, and page lengths are adjusted to correspond. + * + * If this means moving xdr->p to a different buffer, we assume that + * that the end pointer should be set to the end of the current page, + * except in the case of the head buffer when we assume the head + * buffer's current length represents the end of the available buffer. + * + * This is *not* safe to use on a buffer that already has inlined page + * cache pages (as in a zero-copy server read reply), except for the + * simple case of truncating from one position in the tail to another. 
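[Note on xdr_commit_encode()/xdr_get_next_encode_buffer() above] An encode that would straddle a page boundary is written to the start of the next page, with the unused tail of the previous page remembered in the scratch iovec; the commit step copies that fragment back and slides the rest down. A much-simplified model of the copy-back over two flat buffers; sizes, names and the demo are illustrative only:

#include <stdio.h>
#include <string.h>

/* prev_tail has 'avail' free bytes; the encoder wrote 'len' bytes of an
 * object to the start of 'next' instead.  Commit moves the first 'avail'
 * bytes back into the gap and shifts the remainder to the page start. */
static void commit_encode(char *prev_tail, char *next, size_t avail, size_t len)
{
    memcpy(prev_tail, next, avail);                 /* fill the old page's gap */
    memmove(next, next + avail, len - avail);       /* shift rest to page start */
}

int main(void)
{
    char page1[8]  = "AAAA";        /* 4 bytes used, 4 free */
    char page2[16] = "BBBBCCCC";    /* 8-byte object encoded here temporarily */

    commit_encode(page1 + 4, page2, 4, 8);
    printf("%.8s / %.4s\n", page1, page2);   /* AAAABBBB / CCCC */
    return 0;
}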
+ * + */ +void xdr_truncate_encode(struct xdr_stream *xdr, size_t len) +{ + struct xdr_buf *buf = xdr->buf; + struct kvec *head = buf->head; + struct kvec *tail = buf->tail; + int fraglen; + int new, old; + + if (len > buf->len) { + WARN_ON_ONCE(1); + return; + } + xdr_commit_encode(xdr); + + fraglen = min_t(int, buf->len - len, tail->iov_len); + tail->iov_len -= fraglen; + buf->len -= fraglen; + if (tail->iov_len && buf->len == len) { + xdr->p = tail->iov_base + tail->iov_len; + /* xdr->end, xdr->iov should be set already */ + return; + } + WARN_ON_ONCE(fraglen); + fraglen = min_t(int, buf->len - len, buf->page_len); + buf->page_len -= fraglen; + buf->len -= fraglen; + + new = buf->page_base + buf->page_len; + old = new + fraglen; + xdr->page_ptr -= (old >> PAGE_SHIFT) - (new >> PAGE_SHIFT); + + if (buf->page_len && buf->len == len) { + xdr->p = page_address(*xdr->page_ptr); + xdr->end = (void *)xdr->p + PAGE_SIZE; + xdr->p = (void *)xdr->p + (new % PAGE_SIZE); + /* xdr->iov should already be NULL */ + return; + } + if (fraglen) { + xdr->end = head->iov_base + head->iov_len; + xdr->page_ptr--; + } + /* (otherwise assume xdr->end is already set) */ + head->iov_len = len; + buf->len = len; + xdr->p = head->iov_base + head->iov_len; + xdr->iov = buf->head; +} +EXPORT_SYMBOL(xdr_truncate_encode); + +/** + * xdr_restrict_buflen - decrease available buffer space + * @xdr: pointer to xdr_stream + * @newbuflen: new maximum number of bytes available + * + * Adjust our idea of how much space is available in the buffer. + * If we've already used too much space in the buffer, returns -1. + * If the available space is already smaller than newbuflen, returns 0 + * and does nothing. Otherwise, adjusts xdr->buf->buflen to newbuflen + * and ensures xdr->end is set at most offset newbuflen from the start + * of the buffer. + */ +int xdr_restrict_buflen(struct xdr_stream *xdr, int newbuflen) +{ + struct xdr_buf *buf = xdr->buf; + int left_in_this_buf = (void *)xdr->end - (void *)xdr->p; + int end_offset = buf->len + left_in_this_buf; + + if (newbuflen < 0 || newbuflen < buf->len) + return -1; + if (newbuflen > buf->buflen) + return 0; + if (newbuflen < end_offset) + xdr->end = (void *)xdr->end + newbuflen - end_offset; + buf->buflen = newbuflen; + return 0; +} +EXPORT_SYMBOL(xdr_restrict_buflen); + +/** * xdr_write_pages - Insert a list of pages into an XDR buffer for sending * @xdr: pointer to xdr_stream * @pages: list of pages @@ -833,8 +1003,20 @@ xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf) } EXPORT_SYMBOL_GPL(xdr_buf_from_iov); -/* Sets subbuf to the portion of buf of length len beginning base bytes - * from the start of buf. Returns -1 if base of length are out of bounds. */ +/** + * xdr_buf_subsegment - set subbuf to a portion of buf + * @buf: an xdr buffer + * @subbuf: the result buffer + * @base: beginning of range in bytes + * @len: length of range in bytes + * + * sets @subbuf to an xdr buffer representing the portion of @buf of + * length @len starting at offset @base. + * + * @buf and @subbuf may be pointers to the same struct xdr_buf. + * + * Returns -1 if base of length are out of bounds. 
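[Note on the xdr_buf_subsegment() kernel-doc above] The head/pages/tail split means a byte offset has to be consumed region by region. A toy version of that offset walk, reporting which region a requested offset starts in; region sizes and names are purely illustrative:

#include <stddef.h>
#include <stdio.h>

enum region { REG_HEAD, REG_PAGES, REG_TAIL, REG_OOB };

/* Consume 'base' against head, then pages, then tail, the way
 * xdr_buf_subsegment() walks the three regions of an xdr_buf. */
static enum region locate(size_t head_len, size_t page_len, size_t tail_len,
                          size_t base)
{
    if (base < head_len)
        return REG_HEAD;
    base -= head_len;
    if (base < page_len)
        return REG_PAGES;
    base -= page_len;
    if (base < tail_len)
        return REG_TAIL;
    return REG_OOB;
}

int main(void)
{
    /* 100-byte head, 4096 bytes of pages, 20-byte tail */
    printf("%d %d %d %d\n",
           locate(100, 4096, 20, 40),     /* REG_HEAD  (0) */
           locate(100, 4096, 20, 100),    /* REG_PAGES (1) */
           locate(100, 4096, 20, 4196),   /* REG_TAIL  (2) */
           locate(100, 4096, 20, 5000));  /* REG_OOB   (3) */
    return 0;
}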
+ */ int xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, unsigned int base, unsigned int len) @@ -847,9 +1029,8 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, len -= subbuf->head[0].iov_len; base = 0; } else { - subbuf->head[0].iov_base = NULL; - subbuf->head[0].iov_len = 0; base -= buf->head[0].iov_len; + subbuf->head[0].iov_len = 0; } if (base < buf->page_len) { @@ -871,9 +1052,8 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, len -= subbuf->tail[0].iov_len; base = 0; } else { - subbuf->tail[0].iov_base = NULL; - subbuf->tail[0].iov_len = 0; base -= buf->tail[0].iov_len; + subbuf->tail[0].iov_len = 0; } if (base || len) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 04199bc8416..c3b2b3369e5 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -71,24 +71,6 @@ static void xprt_destroy(struct rpc_xprt *xprt); static DEFINE_SPINLOCK(xprt_list_lock); static LIST_HEAD(xprt_list); -/* - * The transport code maintains an estimate on the maximum number of out- - * standing RPC requests, using a smoothed version of the congestion - * avoidance implemented in 44BSD. This is basically the Van Jacobson - * congestion algorithm: If a retransmit occurs, the congestion window is - * halved; otherwise, it is incremented by 1/cwnd when - * - * - a reply is received and - * - a full number of requests are outstanding and - * - the congestion window hasn't been updated recently. - */ -#define RPC_CWNDSHIFT (8U) -#define RPC_CWNDSCALE (1U << RPC_CWNDSHIFT) -#define RPC_INITCWND RPC_CWNDSCALE -#define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT) - -#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) - /** * xprt_register_transport - register a transport implementation * @transport: transport to register @@ -230,9 +212,9 @@ static void xprt_clear_locked(struct rpc_xprt *xprt) { xprt->snd_task = NULL; if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_LOCKED, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } else queue_work(rpciod_workqueue, &xprt->task_cleanup); } @@ -446,7 +428,15 @@ EXPORT_SYMBOL_GPL(xprt_release_rqst_cong); * @task: recently completed RPC request used to adjust window * @result: result code of completed RPC request * - * We use a time-smoothed congestion estimator to avoid heavy oscillation. + * The transport code maintains an estimate on the maximum number of out- + * standing RPC requests, using a smoothed version of the congestion + * avoidance implemented in 44BSD. This is basically the Van Jacobson + * congestion algorithm: If a retransmit occurs, the congestion window is + * halved; otherwise, it is incremented by 1/cwnd when + * + * - a reply is received and + * - a full number of requests are outstanding and + * - the congestion window hasn't been updated recently. 
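[Note on the xprt.c hunks above] The Van Jacobson congestion-avoidance comment moves from the top of the file to xprt_adjust_cwnd(), and the RPC_CWNDSHIFT/RPC_CWNDSCALE definitions are dropped from this file. As a reference, a close model of the fixed-point update that comment describes (the function body is not shown in this hunk, so treat this as a sketch, not the exact kernel code):

#include <errno.h>
#include <stdio.h>

#define RPC_CWNDSHIFT  8U
#define RPC_CWNDSCALE  (1U << RPC_CWNDSHIFT)    /* one request, fixed point */

/* Additive increase / multiplicative decrease: grow by ~1/cwnd requests per
 * reply while the pipe is full, halve on a timeout, and clamp the result
 * between one request and the supplied maximum. */
static unsigned long adjust_cwnd(unsigned long cwnd, unsigned long cong,
                                 unsigned long maxcwnd, int result)
{
    if (result >= 0 && cwnd <= cong) {
        cwnd += (RPC_CWNDSCALE * RPC_CWNDSCALE + (cwnd >> 1)) / cwnd;
        if (cwnd > maxcwnd)
            cwnd = maxcwnd;
    } else if (result == -ETIMEDOUT) {
        cwnd >>= 1;
        if (cwnd < RPC_CWNDSCALE)
            cwnd = RPC_CWNDSCALE;
    }
    return cwnd;
}

int main(void)
{
    unsigned long cwnd = RPC_CWNDSCALE;                 /* one request */

    cwnd = adjust_cwnd(cwnd, cwnd, 16 * RPC_CWNDSCALE, 0);            /* grows  */
    cwnd = adjust_cwnd(cwnd, cwnd, 16 * RPC_CWNDSCALE, -ETIMEDOUT);   /* halves */
    printf("%lu\n", cwnd / RPC_CWNDSHIFT ? cwnd >> RPC_CWNDSHIFT : 0);
    return 0;
}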
*/ void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result) { @@ -749,6 +739,11 @@ static void xprt_connect_status(struct rpc_task *task) } switch (task->tk_status) { + case -ECONNREFUSED: + case -ECONNRESET: + case -ECONNABORTED: + case -ENETUNREACH: + case -EHOSTUNREACH: case -EAGAIN: dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid); break; @@ -1188,7 +1183,7 @@ static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt) static inline void xprt_init_xid(struct rpc_xprt *xprt) { - xprt->xid = net_random(); + xprt->xid = prandom_u32(); } static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) @@ -1378,15 +1373,3 @@ void xprt_put(struct rpc_xprt *xprt) if (atomic_dec_and_test(&xprt->count)) xprt_destroy(xprt); } - -/** - * xprt_get - return a reference to an RPC transport. - * @xprt: pointer to the transport - * - */ -struct rpc_xprt *xprt_get(struct rpc_xprt *xprt) -{ - if (atomic_inc_not_zero(&xprt->count)) - return xprt; - return NULL; -} diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index 5a8f268bdd3..da5136fd569 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -1,8 +1,8 @@ -obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o +obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o xprtrdma-y := transport.o rpc_rdma.o verbs.o -obj-$(CONFIG_SUNRPC_XPRT_RDMA) += svcrdma.o +obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o svcrdma-y := svc_rdma.o svc_rdma_transport.o \ svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index e03725bfe2b..693966d3f33 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -78,8 +78,7 @@ static const char transfertypes[][12] = { * elements. Segments are then coalesced when registered, if possible * within the selected memreg mode. * - * Note, this routine is never called if the connection's memory - * registration strategy is 0 (bounce buffers). + * Returns positive number of segments converted, or a negative errno. */ static int @@ -102,10 +101,17 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, page_base = xdrbuf->page_base & ~PAGE_MASK; p = 0; while (len && n < nsegs) { + if (!ppages[p]) { + /* alloc the pagelist for receiving buffer */ + ppages[p] = alloc_page(GFP_ATOMIC); + if (!ppages[p]) + return -ENOMEM; + } seg[n].mr_page = ppages[p]; seg[n].mr_offset = (void *)(unsigned long) page_base; seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); - BUG_ON(seg[n].mr_len > PAGE_SIZE); + if (seg[n].mr_len > PAGE_SIZE) + return -EIO; len -= seg[n].mr_len; ++n; ++p; @@ -114,7 +120,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, /* Message overflows the seg array */ if (len && n == nsegs) - return 0; + return -EIO; if (xdrbuf->tail[0].iov_len) { /* the rpcrdma protocol allows us to omit any trailing @@ -123,7 +129,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, return n; if (n == nsegs) /* Tail remains, but we're out of segments */ - return 0; + return -EIO; seg[n].mr_page = NULL; seg[n].mr_offset = xdrbuf->tail[0].iov_base; seg[n].mr_len = xdrbuf->tail[0].iov_len; @@ -164,15 +170,17 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, * Reply chunk (a counted array): * N elements: * 1 - N - HLOO - HLOO - ... - HLOO + * + * Returns positive RPC/RDMA header size, or negative errno. 
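[Note on the rpc_rdma.c hunks below] rpcrdma_convert_iovs() and rpcrdma_create_chunks() switch from "0 means failure" to returning a negative errno, so callers can distinguish -ENOMEM from -EIO and a legitimately small count is no longer ambiguous. A tiny illustration of that calling convention with invented names:

#include <errno.h>
#include <stdio.h>

/* Returns the number of segments mapped, or a negative errno. */
static int map_segments(int nsegs_available, int nsegs_needed)
{
    if (nsegs_needed > nsegs_available)
        return -EIO;            /* message overflows the segment array */
    return nsegs_needed;        /* success: may legitimately be small */
}

int main(void)
{
    int n = map_segments(8, 12);

    if (n < 0)
        fprintf(stderr, "mapping failed: %d\n", n);
    else
        printf("mapped %d segments\n", n);
    return 0;
}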
*/ -static unsigned int +static ssize_t rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type) { struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); - int nsegs, nchunks = 0; + int n, nsegs, nchunks = 0; unsigned int pos; struct rpcrdma_mr_seg *seg = req->rl_segments; struct rpcrdma_read_chunk *cur_rchunk = NULL; @@ -198,12 +206,11 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, pos = target->head[0].iov_len; nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS); - if (nsegs == 0) - return 0; + if (nsegs < 0) + return nsegs; do { - /* bind/register the memory, then build chunk from result. */ - int n = rpcrdma_register_external(seg, nsegs, + n = rpcrdma_register_external(seg, nsegs, cur_wchunk != NULL, r_xprt); if (n <= 0) goto out; @@ -248,10 +255,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, /* success. all failures return above */ req->rl_nchunks = nchunks; - BUG_ON(nchunks == 0); - BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR) - && (nchunks > 3)); - /* * finish off header. If write, marshal discrim and nchunks. */ @@ -278,8 +281,8 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, out: for (pos = 0; nchunks--;) pos += rpcrdma_deregister_external( - &req->rl_segments[pos], r_xprt, NULL); - return 0; + &req->rl_segments[pos], r_xprt); + return n; } /* @@ -361,6 +364,8 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol. * [2] -- optional padding. * [3] -- if padded, header only in [1] and data here. + * + * Returns zero on success, otherwise a negative errno. */ int @@ -370,7 +375,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); char *base; - size_t hdrlen, rpclen, padlen; + size_t rpclen, padlen; + ssize_t hdrlen; enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; @@ -441,14 +447,10 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) /* The following simplification is not true forever */ if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) wtype = rpcrdma_noch; - BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch); - - if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS && - (rtype != rpcrdma_noch || wtype != rpcrdma_noch)) { - /* forced to "pure inline"? */ - dprintk("RPC: %s: too much data (%d/%d) for inline\n", - __func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len); - return -1; + if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) { + dprintk("RPC: %s: cannot marshal multiple chunk lists\n", + __func__); + return -EIO; } hdrlen = 28; /*sizeof *headerp;*/ @@ -474,8 +476,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ - BUG_ON(wtype != rpcrdma_noch); - + if (wtype != rpcrdma_noch) { + dprintk("RPC: %s: invalid chunk list\n", + __func__); + return -EIO; + } } else { headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; @@ -492,8 +497,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * on receive. Therefore, we request a reply chunk * for non-writes wherever feasible and efficient. 
*/ - if (wtype == rpcrdma_noch && - r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER) + if (wtype == rpcrdma_noch) wtype = rpcrdma_replych; } } @@ -511,9 +515,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, headerp, wtype); } - - if (hdrlen == 0) - return -1; + if (hdrlen < 0) + return hdrlen; dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" " headerp 0x%p base 0x%p lkey 0x%x\n", @@ -649,9 +652,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) break; page_base = 0; } - rqst->rq_rcv_buf.page_len = olen - copy_len; - } else - rqst->rq_rcv_buf.page_len = 0; + } if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) { curlen = copy_len; @@ -682,15 +683,11 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) rqst->rq_private_buf = rqst->rq_rcv_buf; } -/* - * This function is called when an async event is posted to - * the connection which changes the connection state. All it - * does at this point is mark the connection up/down, the rpc - * timers do the rest. - */ void -rpcrdma_conn_func(struct rpcrdma_ep *ep) +rpcrdma_connect_worker(struct work_struct *work) { + struct rpcrdma_ep *ep = + container_of(work, struct rpcrdma_ep, rep_connect_worker.work); struct rpc_xprt *xprt = ep->rep_xprt; spin_lock_bh(&xprt->transport_lock); @@ -707,13 +704,15 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep) } /* - * This function is called when memory window unbind which we are waiting - * for completes. Just use rr_func (zeroed by upcall) to signal completion. + * This function is called when an async event is posted to + * the connection which changes the connection state. All it + * does at this point is mark the connection up/down, the rpc + * timers do the rest. */ -static void -rpcrdma_unbind_func(struct rpcrdma_rep *rep) +void +rpcrdma_conn_func(struct rpcrdma_ep *ep) { - wake_up(&rep->rr_unbind); + schedule_delayed_work(&ep->rep_connect_worker, 0); } /* @@ -730,7 +729,8 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) struct rpc_xprt *xprt = rep->rr_xprt; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); __be32 *iptr; - int i, rdmalen, status; + int rdmalen, status; + unsigned long cwnd; /* Check status. If bad, signal disconnect and return rep to pool */ if (rep->rr_len == ~0U) { @@ -785,6 +785,7 @@ repost: /* from here on, the reply is no longer an orphan */ req->rl_reply = rep; + xprt->reestablish_timeout = 0; /* check for expected message types */ /* The order of some of these tests is important. */ @@ -859,26 +860,10 @@ badheader: break; } - /* If using mw bind, start the deregister process now. 
*/ - /* (Note: if mr_free(), cannot perform it here, in tasklet context) */ - if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) { - case RPCRDMA_MEMWINDOWS: - for (i = 0; req->rl_nchunks-- > 1;) - i += rpcrdma_deregister_external( - &req->rl_segments[i], r_xprt, NULL); - /* Optionally wait (not here) for unbinds to complete */ - rep->rr_func = rpcrdma_unbind_func; - (void) rpcrdma_deregister_external(&req->rl_segments[i], - r_xprt, rep); - break; - case RPCRDMA_MEMWINDOWS_ASYNC: - for (i = 0; req->rl_nchunks--;) - i += rpcrdma_deregister_external(&req->rl_segments[i], - r_xprt, NULL); - break; - default: - break; - } + cwnd = xprt->cwnd; + xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; + if (xprt->cwnd > cwnd) + xprt_release_rqst_cong(rqst->rq_task); dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", __func__, xprt, rqst, status); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 0ce75524ed2..8f92a61ee2d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -69,7 +70,8 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, /* Set up the XDR head */ rqstp->rq_arg.head[0].iov_base = page_address(page); - rqstp->rq_arg.head[0].iov_len = min(byte_count, ctxt->sge[0].length); + rqstp->rq_arg.head[0].iov_len = + min_t(size_t, byte_count, ctxt->sge[0].length); rqstp->rq_arg.len = byte_count; rqstp->rq_arg.buflen = byte_count; @@ -85,11 +87,12 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, page = ctxt->pages[sge_no]; put_page(rqstp->rq_pages[sge_no]); rqstp->rq_pages[sge_no] = page; - bc -= min(bc, ctxt->sge[sge_no].length); + bc -= min_t(u32, bc, ctxt->sge[sge_no].length); rqstp->rq_arg.buflen += ctxt->sge[sge_no].length; sge_no++; } rqstp->rq_respages = &rqstp->rq_pages[sge_no]; + rqstp->rq_next_page = rqstp->rq_respages + 1; /* We should never run out of SGE because the limit is defined to * support the max allowed RPC data length @@ -112,289 +115,265 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, rqstp->rq_arg.tail[0].iov_len = 0; } -/* Encode a read-chunk-list as an array of IB SGE - * - * Assumptions: - * - chunk[0]->position points to pages[0] at an offset of 0 - * - pages[] is not physically or virtually contiguous and consists of - * PAGE_SIZE elements. - * - * Output: - * - sge array pointing into pages[] array. 
- * - chunk_sge array specifying sge index and count for each - * chunk in the read list - * - */ -static int map_read_chunks(struct svcxprt_rdma *xprt, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head, - struct rpcrdma_msg *rmsgp, - struct svc_rdma_req_map *rpl_map, - struct svc_rdma_req_map *chl_map, - int ch_count, - int byte_count) +static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) { - int sge_no; - int sge_bytes; - int page_off; - int page_no; - int ch_bytes; - int ch_no; - struct rpcrdma_read_chunk *ch; + if (rdma_node_get_transport(xprt->sc_cm_id->device->node_type) == + RDMA_TRANSPORT_IWARP) + return 1; + else + return min_t(int, sge_count, xprt->sc_max_sge); +} - sge_no = 0; - page_no = 0; - page_off = 0; - ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; - ch_no = 0; - ch_bytes = ntohl(ch->rc_target.rs_length); - head->arg.head[0] = rqstp->rq_arg.head[0]; - head->arg.tail[0] = rqstp->rq_arg.tail[0]; - head->arg.pages = &head->pages[head->count]; - head->hdr_count = head->count; /* save count of hdr pages */ - head->arg.page_base = 0; - head->arg.page_len = ch_bytes; - head->arg.len = rqstp->rq_arg.len + ch_bytes; - head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes; - head->count++; - chl_map->ch[0].start = 0; - while (byte_count) { - rpl_map->sge[sge_no].iov_base = - page_address(rqstp->rq_arg.pages[page_no]) + page_off; - sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes); - rpl_map->sge[sge_no].iov_len = sge_bytes; - /* - * Don't bump head->count here because the same page - * may be used by multiple SGE. - */ - head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; - rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1]; +typedef int (*rdma_reader_fn)(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + int *page_no, + u32 *page_offset, + u32 rs_handle, + u32 rs_length, + u64 rs_offset, + int last); + +/* Issue an RDMA_READ using the local lkey to map the data sink */ +static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + int *page_no, + u32 *page_offset, + u32 rs_handle, + u32 rs_length, + u64 rs_offset, + int last) +{ + struct ib_send_wr read_wr; + int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; + struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt); + int ret, read, pno; + u32 pg_off = *page_offset; + u32 pg_no = *page_no; - byte_count -= sge_bytes; - ch_bytes -= sge_bytes; - sge_no++; - /* - * If all bytes for this chunk have been mapped to an - * SGE, move to the next SGE - */ - if (ch_bytes == 0) { - chl_map->ch[ch_no].count = - sge_no - chl_map->ch[ch_no].start; - ch_no++; - ch++; - chl_map->ch[ch_no].start = sge_no; - ch_bytes = ntohl(ch->rc_target.rs_length); - /* If bytes remaining account for next chunk */ - if (byte_count) { - head->arg.page_len += ch_bytes; - head->arg.len += ch_bytes; - head->arg.buflen += ch_bytes; - } + ctxt->direction = DMA_FROM_DEVICE; + ctxt->read_hdr = head; + pages_needed = + min_t(int, pages_needed, rdma_read_max_sge(xprt, pages_needed)); + read = min_t(int, pages_needed << PAGE_SHIFT, rs_length); + + for (pno = 0; pno < pages_needed; pno++) { + int len = min_t(int, rs_length, PAGE_SIZE - pg_off); + + head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; + head->arg.page_len += len; + head->arg.len += len; + if (!pg_off) + head->count++; + rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + ctxt->sge[pno].addr = + 
ib_dma_map_page(xprt->sc_cm_id->device, + head->arg.pages[pg_no], pg_off, + PAGE_SIZE - pg_off, + DMA_FROM_DEVICE); + ret = ib_dma_mapping_error(xprt->sc_cm_id->device, + ctxt->sge[pno].addr); + if (ret) + goto err; + atomic_inc(&xprt->sc_dma_used); + + /* The lkey here is either a local dma lkey or a dma_mr lkey */ + ctxt->sge[pno].lkey = xprt->sc_dma_lkey; + ctxt->sge[pno].length = len; + ctxt->count++; + + /* adjust offset and wrap to next page if needed */ + pg_off += len; + if (pg_off == PAGE_SIZE) { + pg_off = 0; + pg_no++; } - /* - * If this SGE consumed all of the page, move to the - * next page - */ - if ((sge_bytes + page_off) == PAGE_SIZE) { - page_no++; - page_off = 0; - /* - * If there are still bytes left to map, bump - * the page count - */ - if (byte_count) - head->count++; - } else - page_off += sge_bytes; + rs_length -= len; } - BUG_ON(byte_count != 0); - return sge_no; + + if (last && rs_length == 0) + set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + else + clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + + memset(&read_wr, 0, sizeof(read_wr)); + read_wr.wr_id = (unsigned long)ctxt; + read_wr.opcode = IB_WR_RDMA_READ; + ctxt->wr_op = read_wr.opcode; + read_wr.send_flags = IB_SEND_SIGNALED; + read_wr.wr.rdma.rkey = rs_handle; + read_wr.wr.rdma.remote_addr = rs_offset; + read_wr.sg_list = ctxt->sge; + read_wr.num_sge = pages_needed; + + ret = svc_rdma_send(xprt, &read_wr); + if (ret) { + pr_err("svcrdma: Error %d posting RDMA_READ\n", ret); + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + goto err; + } + + /* return current location in page array */ + *page_no = pg_no; + *page_offset = pg_off; + ret = read; + atomic_inc(&rdma_stat_read); + return ret; + err: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 0); + return ret; } -/* Map a read-chunk-list to an XDR and fast register the page-list. - * - * Assumptions: - * - chunk[0] position points to pages[0] at an offset of 0 - * - pages[] will be made physically contiguous by creating a one-off memory - * region using the fastreg verb. - * - byte_count is # of bytes in read-chunk-list - * - ch_count is # of chunks in read-chunk-list - * - * Output: - * - sge array pointing into pages[] array. 
- * - chunk_sge array specifying sge index and count for each - * chunk in the read list - */ -static int fast_reg_read_chunks(struct svcxprt_rdma *xprt, +/* Issue an RDMA_READ using an FRMR to map the data sink */ +static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head, - struct rpcrdma_msg *rmsgp, - struct svc_rdma_req_map *rpl_map, - struct svc_rdma_req_map *chl_map, - int ch_count, - int byte_count) + int *page_no, + u32 *page_offset, + u32 rs_handle, + u32 rs_length, + u64 rs_offset, + int last) { - int page_no; - int ch_no; - u32 offset; - struct rpcrdma_read_chunk *ch; - struct svc_rdma_fastreg_mr *frmr; - int ret = 0; + struct ib_send_wr read_wr; + struct ib_send_wr inv_wr; + struct ib_send_wr fastreg_wr; + u8 key; + int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; + struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt); + struct svc_rdma_fastreg_mr *frmr = svc_rdma_get_frmr(xprt); + int ret, read, pno; + u32 pg_off = *page_offset; + u32 pg_no = *page_no; - frmr = svc_rdma_get_frmr(xprt); if (IS_ERR(frmr)) return -ENOMEM; - head->frmr = frmr; - head->arg.head[0] = rqstp->rq_arg.head[0]; - head->arg.tail[0] = rqstp->rq_arg.tail[0]; - head->arg.pages = &head->pages[head->count]; - head->hdr_count = head->count; /* save count of hdr pages */ - head->arg.page_base = 0; - head->arg.page_len = byte_count; - head->arg.len = rqstp->rq_arg.len + byte_count; - head->arg.buflen = rqstp->rq_arg.buflen + byte_count; + ctxt->direction = DMA_FROM_DEVICE; + ctxt->frmr = frmr; + pages_needed = min_t(int, pages_needed, xprt->sc_frmr_pg_list_len); + read = min_t(int, pages_needed << PAGE_SHIFT, rs_length); - /* Fast register the page list */ - frmr->kva = page_address(rqstp->rq_arg.pages[0]); + frmr->kva = page_address(rqstp->rq_arg.pages[pg_no]); frmr->direction = DMA_FROM_DEVICE; frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE); - frmr->map_len = byte_count; - frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT; - for (page_no = 0; page_no < frmr->page_list_len; page_no++) { - frmr->page_list->page_list[page_no] = + frmr->map_len = pages_needed << PAGE_SHIFT; + frmr->page_list_len = pages_needed; + + for (pno = 0; pno < pages_needed; pno++) { + int len = min_t(int, rs_length, PAGE_SIZE - pg_off); + + head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; + head->arg.page_len += len; + head->arg.len += len; + if (!pg_off) + head->count++; + rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + frmr->page_list->page_list[pno] = ib_dma_map_page(xprt->sc_cm_id->device, - rqstp->rq_arg.pages[page_no], 0, + head->arg.pages[pg_no], 0, PAGE_SIZE, DMA_FROM_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - frmr->page_list->page_list[page_no])) - goto fatal_err; + ret = ib_dma_mapping_error(xprt->sc_cm_id->device, + frmr->page_list->page_list[pno]); + if (ret) + goto err; atomic_inc(&xprt->sc_dma_used); - head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; - } - head->count += page_no; - /* rq_respages points one past arg pages */ - rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; - - /* Create the reply and chunk maps */ - offset = 0; - ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; - for (ch_no = 0; ch_no < ch_count; ch_no++) { - int len = ntohl(ch->rc_target.rs_length); - rpl_map->sge[ch_no].iov_base = frmr->kva + offset; - rpl_map->sge[ch_no].iov_len = len; - chl_map->ch[ch_no].count = 1; - chl_map->ch[ch_no].start 
= ch_no; - offset += len; - ch++; + /* adjust offset and wrap to next page if needed */ + pg_off += len; + if (pg_off == PAGE_SIZE) { + pg_off = 0; + pg_no++; + } + rs_length -= len; } - ret = svc_rdma_fastreg(xprt, frmr); - if (ret) - goto fatal_err; - - return ch_no; - - fatal_err: - printk("svcrdma: error fast registering xdr for xprt %p", xprt); - svc_rdma_put_frmr(xprt, frmr); - return -EIO; -} - -static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt, - struct svc_rdma_op_ctxt *ctxt, - struct svc_rdma_fastreg_mr *frmr, - struct kvec *vec, - u64 *sgl_offset, - int count) -{ - int i; - unsigned long off; + if (last && rs_length == 0) + set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + else + clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - ctxt->count = count; - ctxt->direction = DMA_FROM_DEVICE; - for (i = 0; i < count; i++) { - ctxt->sge[i].length = 0; /* in case map fails */ - if (!frmr) { - BUG_ON(!virt_to_page(vec[i].iov_base)); - off = (unsigned long)vec[i].iov_base & ~PAGE_MASK; - ctxt->sge[i].addr = - ib_dma_map_page(xprt->sc_cm_id->device, - virt_to_page(vec[i].iov_base), - off, - vec[i].iov_len, - DMA_FROM_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - ctxt->sge[i].addr)) - return -EINVAL; - ctxt->sge[i].lkey = xprt->sc_dma_lkey; - atomic_inc(&xprt->sc_dma_used); - } else { - ctxt->sge[i].addr = (unsigned long)vec[i].iov_base; - ctxt->sge[i].lkey = frmr->mr->lkey; - } - ctxt->sge[i].length = vec[i].iov_len; - *sgl_offset = *sgl_offset + vec[i].iov_len; + /* Bump the key */ + key = (u8)(frmr->mr->lkey & 0x000000FF); + ib_update_fast_reg_key(frmr->mr, ++key); + + ctxt->sge[0].addr = (unsigned long)frmr->kva + *page_offset; + ctxt->sge[0].lkey = frmr->mr->lkey; + ctxt->sge[0].length = read; + ctxt->count = 1; + ctxt->read_hdr = head; + + /* Prepare FASTREG WR */ + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.send_flags = IB_SEND_SIGNALED; + fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva; + fastreg_wr.wr.fast_reg.page_list = frmr->page_list; + fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.length = frmr->map_len; + fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags; + fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey; + fastreg_wr.next = &read_wr; + + /* Prepare RDMA_READ */ + memset(&read_wr, 0, sizeof(read_wr)); + read_wr.send_flags = IB_SEND_SIGNALED; + read_wr.wr.rdma.rkey = rs_handle; + read_wr.wr.rdma.remote_addr = rs_offset; + read_wr.sg_list = ctxt->sge; + read_wr.num_sge = 1; + if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_READ_W_INV) { + read_wr.opcode = IB_WR_RDMA_READ_WITH_INV; + read_wr.wr_id = (unsigned long)ctxt; + read_wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey; + } else { + read_wr.opcode = IB_WR_RDMA_READ; + read_wr.next = &inv_wr; + /* Prepare invalidate */ + memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.wr_id = (unsigned long)ctxt; + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE; + inv_wr.ex.invalidate_rkey = frmr->mr->lkey; + } + ctxt->wr_op = read_wr.opcode; + + /* Post the chain */ + ret = svc_rdma_send(xprt, &fastreg_wr); + if (ret) { + pr_err("svcrdma: Error %d posting RDMA_READ\n", ret); + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + goto err; } - return 0; -} -static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) -{ - if ((rdma_node_get_transport(xprt->sc_cm_id->device->node_type) == - RDMA_TRANSPORT_IWARP) && - sge_count > 1) - 
return 1; - else - return min_t(int, sge_count, xprt->sc_max_sge); + /* return current location in page array */ + *page_no = pg_no; + *page_offset = pg_off; + ret = read; + atomic_inc(&rdma_stat_read); + return ret; + err: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 0); + svc_rdma_put_frmr(xprt, frmr); + return ret; } -/* - * Use RDMA_READ to read data from the advertised client buffer into the - * XDR stream starting at rq_arg.head[0].iov_base. - * Each chunk in the array - * contains the following fields: - * discrim - '1', This isn't used for data placement - * position - The xdr stream offset (the same for every chunk) - * handle - RMR for client memory region - * length - data transfer length - * offset - 64 bit tagged offset in remote memory region - * - * On our side, we need to read into a pagelist. The first page immediately - * follows the RPC header. - * - * This function returns: - * 0 - No error and no read-list found. - * - * 1 - Successful read-list processing. The data is not yet in - * the pagelist and therefore the RPC request must be deferred. The - * I/O completion will enqueue the transport again and - * svc_rdma_recvfrom will complete the request. - * - * <0 - Error processing/posting read-list. - * - * NOTE: The ctxt must not be touched after the last WR has been posted - * because the I/O completion processing may occur on another - * processor and free / modify the context. Ne touche pas! - */ -static int rdma_read_xdr(struct svcxprt_rdma *xprt, - struct rpcrdma_msg *rmsgp, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *hdr_ctxt) +static int rdma_read_chunks(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rmsgp, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head) { - struct ib_send_wr read_wr; - struct ib_send_wr inv_wr; - int err = 0; - int ch_no; - int ch_count; - int byte_count; - int sge_count; - u64 sgl_offset; + int page_no, ch_count, ret; struct rpcrdma_read_chunk *ch; - struct svc_rdma_op_ctxt *ctxt = NULL; - struct svc_rdma_req_map *rpl_map; - struct svc_rdma_req_map *chl_map; + u32 page_offset, byte_count; + u64 rs_offset; + rdma_reader_fn reader; /* If no read list is present, return 0 */ ch = svc_rdma_get_read_chunk(rmsgp); @@ -405,129 +384,55 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt, if (ch_count > RPCSVC_MAXPAGES) return -EINVAL; - /* Allocate temporary reply and chunk maps */ - rpl_map = svc_rdma_get_req_map(); - chl_map = svc_rdma_get_req_map(); + /* The request is completed when the RDMA_READs complete. The + * head context keeps all the pages that comprise the + * request. 
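
Aside (not part of the patch): the page accounting done by rdma_read_chunk_frmr above can be sketched in plain userspace C. It computes how many pages one FRMR-mapped READ can cover, clamps that to the device's fast-register page-list depth, and wraps the page offset as it walks the sink pages. PAGE_SIZE, PAGE_ALIGN and the helper names below are simplified stand-ins for the kernel definitions, not the real interfaces.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* How much of an rs_length-byte chunk one FRMR-mapped READ can cover. */
static unsigned long read_span(unsigned long page_offset, unsigned long rs_length,
                               unsigned long frmr_pg_list_len)
{
	unsigned long pages_needed = PAGE_ALIGN(page_offset + rs_length) >> PAGE_SHIFT;

	if (pages_needed > frmr_pg_list_len)	/* clamp to the FRMR page list */
		pages_needed = frmr_pg_list_len;
	return (pages_needed << PAGE_SHIFT) < rs_length ?
	       (pages_needed << PAGE_SHIFT) : rs_length;
}

int main(void)
{
	unsigned long pg_no = 0, pg_off = 100, remaining = 3 * PAGE_SIZE;
	unsigned long span = read_span(pg_off, remaining, 2);

	/* Walk pages the way the reader does, wrapping pg_off at page ends. */
	while (span) {
		unsigned long len = PAGE_SIZE - pg_off;

		if (len > span)
			len = span;
		pg_off += len;
		if (pg_off == PAGE_SIZE) {
			pg_off = 0;
			pg_no++;
		}
		span -= len;
	}
	printf("next page %lu, offset %lu\n", pg_no, pg_off);
	return 0;
}

The returned position is what the real function hands back through *page_no and *page_offset so the next chunk continues where this READ stopped.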
+ */ + head->arg.head[0] = rqstp->rq_arg.head[0]; + head->arg.tail[0] = rqstp->rq_arg.tail[0]; + head->arg.pages = &head->pages[head->count]; + head->hdr_count = head->count; + head->arg.page_base = 0; + head->arg.page_len = 0; + head->arg.len = rqstp->rq_arg.len; + head->arg.buflen = rqstp->rq_arg.buflen; - if (!xprt->sc_frmr_pg_list_len) - sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp, - rpl_map, chl_map, ch_count, - byte_count); + /* Use FRMR if supported */ + if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) + reader = rdma_read_chunk_frmr; else - sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp, - rpl_map, chl_map, ch_count, - byte_count); - if (sge_count < 0) { - err = -EIO; - goto out; - } - - sgl_offset = 0; - ch_no = 0; + reader = rdma_read_chunk_lcl; + page_no = 0; page_offset = 0; for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; - ch->rc_discrim != 0; ch++, ch_no++) { - u64 rs_offset; -next_sge: - ctxt = svc_rdma_get_context(xprt); - ctxt->direction = DMA_FROM_DEVICE; - ctxt->frmr = hdr_ctxt->frmr; - ctxt->read_hdr = NULL; - clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); + ch->rc_discrim != 0; ch++) { - /* Prepare READ WR */ - memset(&read_wr, 0, sizeof read_wr); - read_wr.wr_id = (unsigned long)ctxt; - read_wr.opcode = IB_WR_RDMA_READ; - ctxt->wr_op = read_wr.opcode; - read_wr.send_flags = IB_SEND_SIGNALED; - read_wr.wr.rdma.rkey = ntohl(ch->rc_target.rs_handle); xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset, &rs_offset); - read_wr.wr.rdma.remote_addr = rs_offset + sgl_offset; - read_wr.sg_list = ctxt->sge; - read_wr.num_sge = - rdma_read_max_sge(xprt, chl_map->ch[ch_no].count); - err = rdma_set_ctxt_sge(xprt, ctxt, hdr_ctxt->frmr, - &rpl_map->sge[chl_map->ch[ch_no].start], - &sgl_offset, - read_wr.num_sge); - if (err) { - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 0); - goto out; - } - if (((ch+1)->rc_discrim == 0) && - (read_wr.num_sge == chl_map->ch[ch_no].count)) { - /* - * Mark the last RDMA_READ with a bit to - * indicate all RPC data has been fetched from - * the client and the RPC needs to be enqueued. - */ - set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - if (hdr_ctxt->frmr) { - set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); - /* - * Invalidate the local MR used to map the data - * sink. 
- */ - if (xprt->sc_dev_caps & - SVCRDMA_DEVCAP_READ_W_INV) { - read_wr.opcode = - IB_WR_RDMA_READ_WITH_INV; - ctxt->wr_op = read_wr.opcode; - read_wr.ex.invalidate_rkey = - ctxt->frmr->mr->lkey; - } else { - /* Prepare INVALIDATE WR */ - memset(&inv_wr, 0, sizeof inv_wr); - inv_wr.opcode = IB_WR_LOCAL_INV; - inv_wr.send_flags = IB_SEND_SIGNALED; - inv_wr.ex.invalidate_rkey = - hdr_ctxt->frmr->mr->lkey; - read_wr.next = &inv_wr; - } - } - ctxt->read_hdr = hdr_ctxt; + byte_count = ntohl(ch->rc_target.rs_length); + + while (byte_count > 0) { + ret = reader(xprt, rqstp, head, + &page_no, &page_offset, + ntohl(ch->rc_target.rs_handle), + byte_count, rs_offset, + ((ch+1)->rc_discrim == 0) /* last */ + ); + if (ret < 0) + goto err; + byte_count -= ret; + rs_offset += ret; + head->arg.buflen += ret; } - /* Post the read */ - err = svc_rdma_send(xprt, &read_wr); - if (err) { - printk(KERN_ERR "svcrdma: Error %d posting RDMA_READ\n", - err); - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 0); - goto out; - } - atomic_inc(&rdma_stat_read); - - if (read_wr.num_sge < chl_map->ch[ch_no].count) { - chl_map->ch[ch_no].count -= read_wr.num_sge; - chl_map->ch[ch_no].start += read_wr.num_sge; - goto next_sge; - } - sgl_offset = 0; - err = 1; } - - out: - svc_rdma_put_req_map(rpl_map); - svc_rdma_put_req_map(chl_map); - + ret = 1; + err: /* Detach arg pages. svc_recv will replenish them */ - for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++) - rqstp->rq_pages[ch_no] = NULL; - - /* - * Detach res pages. If svc_release sees any it will attempt to - * put them. - */ - while (rqstp->rq_next_page != rqstp->rq_respages) - *(--rqstp->rq_next_page) = NULL; + for (page_no = 0; + &rqstp->rq_pages[page_no] < rqstp->rq_respages; page_no++) + rqstp->rq_pages[page_no] = NULL; - return err; + return ret; } static int rdma_read_complete(struct svc_rqst *rqstp, @@ -550,7 +455,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp, /* rq_respages starts after the last arg page */ rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; - rqstp->rq_next_page = &rqstp->rq_arg.pages[page_no]; + rqstp->rq_next_page = rqstp->rq_respages + 1; /* Rebuild rq_arg head and tail. */ rqstp->rq_arg.head[0] = head->arg.head[0]; @@ -599,13 +504,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) struct svc_rdma_op_ctxt, dto_q); list_del_init(&ctxt->dto_q); - } - if (ctxt) { spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); return rdma_read_complete(rqstp, ctxt); - } - - if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { + } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next, struct svc_rdma_op_ctxt, dto_q); @@ -625,7 +526,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) goto close_out; - BUG_ON(ret); goto out; } dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n", @@ -648,12 +548,11 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) } /* Read read-list data. */ - ret = rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt); + ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt); if (ret > 0) { /* read-list posted, defer until data received from client. */ goto defer; - } - if (ret < 0) { + } else if (ret < 0) { /* Post of read-list failed, free context. 
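
Aside (not part of the patch): the new rdma_read_chunks() dispatch above reduces to a small loop, one reader callback invoked repeatedly until each chunk's byte count is drained. The chunk layout and demo reader below are invented purely for illustration; the real code threads rqstp/head state through and decodes rs_handle/rs_offset from the wire.

#include <stdio.h>
#include <stdint.h>

struct chunk {
	uint32_t handle;	/* 0 terminates the list, like rc_discrim */
	uint32_t length;	/* rs_length */
	uint64_t offset;	/* rs_offset */
};

typedef int (*reader_fn)(uint32_t handle, uint32_t length, uint64_t offset, int last);

/* Pretend each call can transfer at most 4 KiB. */
static int demo_reader(uint32_t handle, uint32_t length, uint64_t offset, int last)
{
	int read = length > 4096 ? 4096 : (int)length;

	printf("READ rkey=%u off=%llu len=%d%s\n",
	       handle, (unsigned long long)offset, read, last ? " (last)" : "");
	return read;
}

int main(void)
{
	struct chunk chunks[] = { { 1, 10000, 0 }, { 2, 3000, 65536 }, { 0, 0, 0 } };
	reader_fn reader = demo_reader;

	for (int i = 0; chunks[i].handle != 0; i++) {
		uint32_t byte_count = chunks[i].length;
		uint64_t rs_offset = chunks[i].offset;

		while (byte_count > 0) {
			int ret = reader(chunks[i].handle, byte_count, rs_offset,
					 chunks[i + 1].handle == 0);
			if (ret < 0)
				return 1;
			byte_count -= ret;
			rs_offset += ret;
		}
	}
	return 0;
}

Swapping the callback between an FRMR reader and a local-lkey reader is what lets the patch delete the separate map/fast-reg code paths.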
*/ svc_rdma_put_context(ctxt, 1); return 0; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index c1d124dc772..49fd21a5c21 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -49,152 +50,6 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT -/* Encode an XDR as an array of IB SGE - * - * Assumptions: - * - head[0] is physically contiguous. - * - tail[0] is physically contiguous. - * - pages[] is not physically or virtually contiguous and consists of - * PAGE_SIZE elements. - * - * Output: - * SGE[0] reserved for RCPRDMA header - * SGE[1] data from xdr->head[] - * SGE[2..sge_count-2] data from xdr->pages[] - * SGE[sge_count-1] data from xdr->tail. - * - * The max SGE we need is the length of the XDR / pagesize + one for - * head + one for tail + one for RPCRDMA header. Since RPCSVC_MAXPAGES - * reserves a page for both the request and the reply header, and this - * array is only concerned with the reply we are assured that we have - * on extra page for the RPCRMDA header. - */ -static int fast_reg_xdr(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - struct svc_rdma_req_map *vec) -{ - int sge_no; - u32 sge_bytes; - u32 page_bytes; - u32 page_off; - int page_no = 0; - u8 *frva; - struct svc_rdma_fastreg_mr *frmr; - - frmr = svc_rdma_get_frmr(xprt); - if (IS_ERR(frmr)) - return -ENOMEM; - vec->frmr = frmr; - - /* Skip the RPCRDMA header */ - sge_no = 1; - - /* Map the head. */ - frva = (void *)((unsigned long)(xdr->head[0].iov_base) & PAGE_MASK); - vec->sge[sge_no].iov_base = xdr->head[0].iov_base; - vec->sge[sge_no].iov_len = xdr->head[0].iov_len; - vec->count = 2; - sge_no++; - - /* Map the XDR head */ - frmr->kva = frva; - frmr->direction = DMA_TO_DEVICE; - frmr->access_flags = 0; - frmr->map_len = PAGE_SIZE; - frmr->page_list_len = 1; - page_off = (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK; - frmr->page_list->page_list[page_no] = - ib_dma_map_page(xprt->sc_cm_id->device, - virt_to_page(xdr->head[0].iov_base), - page_off, - PAGE_SIZE - page_off, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - frmr->page_list->page_list[page_no])) - goto fatal_err; - atomic_inc(&xprt->sc_dma_used); - - /* Map the XDR page list */ - page_off = xdr->page_base; - page_bytes = xdr->page_len + page_off; - if (!page_bytes) - goto encode_tail; - - /* Map the pages */ - vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; - vec->sge[sge_no].iov_len = page_bytes; - sge_no++; - while (page_bytes) { - struct page *page; - - page = xdr->pages[page_no++]; - sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off)); - page_bytes -= sge_bytes; - - frmr->page_list->page_list[page_no] = - ib_dma_map_page(xprt->sc_cm_id->device, - page, page_off, - sge_bytes, DMA_TO_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - frmr->page_list->page_list[page_no])) - goto fatal_err; - - atomic_inc(&xprt->sc_dma_used); - page_off = 0; /* reset for next time through loop */ - frmr->map_len += PAGE_SIZE; - frmr->page_list_len++; - } - vec->count++; - - encode_tail: - /* Map tail */ - if (0 == xdr->tail[0].iov_len) - goto done; - - vec->count++; - vec->sge[sge_no].iov_len = xdr->tail[0].iov_len; - - if (((unsigned long)xdr->tail[0].iov_base & PAGE_MASK) == - ((unsigned 
long)xdr->head[0].iov_base & PAGE_MASK)) { - /* - * If head and tail use the same page, we don't need - * to map it again. - */ - vec->sge[sge_no].iov_base = xdr->tail[0].iov_base; - } else { - void *va; - - /* Map another page for the tail */ - page_off = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK; - va = (void *)((unsigned long)xdr->tail[0].iov_base & PAGE_MASK); - vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; - - frmr->page_list->page_list[page_no] = - ib_dma_map_page(xprt->sc_cm_id->device, virt_to_page(va), - page_off, - PAGE_SIZE, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - frmr->page_list->page_list[page_no])) - goto fatal_err; - atomic_inc(&xprt->sc_dma_used); - frmr->map_len += PAGE_SIZE; - frmr->page_list_len++; - } - - done: - if (svc_rdma_fastreg(xprt, frmr)) - goto fatal_err; - - return 0; - - fatal_err: - printk("svcrdma: Error fast registering memory for xprt %p\n", xprt); - vec->frmr = NULL; - svc_rdma_put_frmr(xprt, frmr); - return -EIO; -} - static int map_xdr(struct svcxprt_rdma *xprt, struct xdr_buf *xdr, struct svc_rdma_req_map *vec) @@ -208,9 +63,6 @@ static int map_xdr(struct svcxprt_rdma *xprt, BUG_ON(xdr->len != (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)); - if (xprt->sc_frmr_pg_list_len) - return fast_reg_xdr(xprt, xdr, vec); - /* Skip the first sge, this is for the RPCRDMA header */ sge_no = 1; @@ -265,6 +117,7 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt, xdr_off -= xdr->head[0].iov_len; if (xdr_off < xdr->page_len) { /* This offset is in the page list */ + xdr_off += xdr->page_base; page = xdr->pages[xdr_off >> PAGE_SHIFT]; xdr_off &= ~PAGE_MASK; } else { @@ -281,8 +134,6 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt, } /* Assumptions: - * - We are using FRMR - * - or - * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE */ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, @@ -326,23 +177,16 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, sge_bytes = min_t(size_t, bc, vec->sge[xdr_sge_no].iov_len-sge_off); sge[sge_no].length = sge_bytes; - if (!vec->frmr) { - sge[sge_no].addr = - dma_map_xdr(xprt, &rqstp->rq_res, xdr_off, - sge_bytes, DMA_TO_DEVICE); - xdr_off += sge_bytes; - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - sge[sge_no].addr)) - goto err; - atomic_inc(&xprt->sc_dma_used); - sge[sge_no].lkey = xprt->sc_dma_lkey; - } else { - sge[sge_no].addr = (unsigned long) - vec->sge[xdr_sge_no].iov_base + sge_off; - sge[sge_no].lkey = vec->frmr->mr->lkey; - } + sge[sge_no].addr = + dma_map_xdr(xprt, &rqstp->rq_res, xdr_off, + sge_bytes, DMA_TO_DEVICE); + xdr_off += sge_bytes; + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + sge[sge_no].addr)) + goto err; + atomic_inc(&xprt->sc_dma_used); + sge[sge_no].lkey = xprt->sc_dma_lkey; ctxt->count++; - ctxt->frmr = vec->frmr; sge_off = 0; sge_no++; xdr_sge_no++; @@ -368,7 +212,6 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, return 0; err: svc_rdma_unmap_dma(ctxt); - svc_rdma_put_frmr(xprt, vec->frmr); svc_rdma_put_context(ctxt, 0); /* Fatal error, close transport */ return -EIO; @@ -396,10 +239,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, res_ary = (struct rpcrdma_write_array *) &rdma_resp->rm_body.rm_chunks[1]; - if (vec->frmr) - max_write = vec->frmr->map_len; - else - max_write = xprt->sc_max_sge * PAGE_SIZE; + max_write = xprt->sc_max_sge * PAGE_SIZE; /* Write chunks start at the pagelist */ for 
(xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; @@ -471,10 +311,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, res_ary = (struct rpcrdma_write_array *) &rdma_resp->rm_body.rm_chunks[2]; - if (vec->frmr) - max_write = vec->frmr->map_len; - else - max_write = xprt->sc_max_sge * PAGE_SIZE; + max_write = xprt->sc_max_sge * PAGE_SIZE; /* xdr offset starts at RPC message */ nchunks = ntohl(arg_ary->wc_nchunks); @@ -544,7 +381,6 @@ static int send_reply(struct svcxprt_rdma *rdma, int byte_count) { struct ib_send_wr send_wr; - struct ib_send_wr inv_wr; int sge_no; int sge_bytes; int page_no; @@ -558,7 +394,6 @@ static int send_reply(struct svcxprt_rdma *rdma, "svcrdma: could not post a receive buffer, err=%d." "Closing transport %p.\n", ret, rdma); set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); - svc_rdma_put_frmr(rdma, vec->frmr); svc_rdma_put_context(ctxt, 0); return -ENOTCONN; } @@ -566,11 +401,6 @@ static int send_reply(struct svcxprt_rdma *rdma, /* Prepare the context */ ctxt->pages[0] = page; ctxt->count = 1; - ctxt->frmr = vec->frmr; - if (vec->frmr) - set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); - else - clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); /* Prepare the SGE for the RPCRDMA Header */ ctxt->sge[0].lkey = rdma->sc_dma_lkey; @@ -589,21 +419,15 @@ static int send_reply(struct svcxprt_rdma *rdma, int xdr_off = 0; sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); byte_count -= sge_bytes; - if (!vec->frmr) { - ctxt->sge[sge_no].addr = - dma_map_xdr(rdma, &rqstp->rq_res, xdr_off, - sge_bytes, DMA_TO_DEVICE); - xdr_off += sge_bytes; - if (ib_dma_mapping_error(rdma->sc_cm_id->device, - ctxt->sge[sge_no].addr)) - goto err; - atomic_inc(&rdma->sc_dma_used); - ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; - } else { - ctxt->sge[sge_no].addr = (unsigned long) - vec->sge[sge_no].iov_base; - ctxt->sge[sge_no].lkey = vec->frmr->mr->lkey; - } + ctxt->sge[sge_no].addr = + dma_map_xdr(rdma, &rqstp->rq_res, xdr_off, + sge_bytes, DMA_TO_DEVICE); + xdr_off += sge_bytes; + if (ib_dma_mapping_error(rdma->sc_cm_id->device, + ctxt->sge[sge_no].addr)) + goto err; + atomic_inc(&rdma->sc_dma_used); + ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; ctxt->sge[sge_no].length = sge_bytes; } BUG_ON(byte_count != 0); @@ -625,6 +449,8 @@ static int send_reply(struct svcxprt_rdma *rdma, if (page_no+1 >= sge_no) ctxt->sge[page_no+1].length = 0; } + rqstp->rq_next_page = rqstp->rq_respages + 1; + BUG_ON(sge_no > rdma->sc_max_sge); memset(&send_wr, 0, sizeof send_wr); ctxt->wr_op = IB_WR_SEND; @@ -633,15 +459,6 @@ static int send_reply(struct svcxprt_rdma *rdma, send_wr.num_sge = sge_no; send_wr.opcode = IB_WR_SEND; send_wr.send_flags = IB_SEND_SIGNALED; - if (vec->frmr) { - /* Prepare INVALIDATE WR */ - memset(&inv_wr, 0, sizeof inv_wr); - inv_wr.opcode = IB_WR_LOCAL_INV; - inv_wr.send_flags = IB_SEND_SIGNALED; - inv_wr.ex.invalidate_rkey = - vec->frmr->mr->lkey; - send_wr.next = &inv_wr; - } ret = svc_rdma_send(rdma, &send_wr); if (ret) @@ -651,7 +468,6 @@ static int send_reply(struct svcxprt_rdma *rdma, err: svc_rdma_unmap_dma(ctxt); - svc_rdma_put_frmr(rdma, vec->frmr); svc_rdma_put_context(ctxt, 1); return -EIO; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 62e4f9bcc38..e7323fbbd34 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2007 Network Appliance, Inc. 
All rights reserved. * * This software is available to you under a choice of one of two @@ -65,6 +66,7 @@ static void dto_tasklet_func(unsigned long data); static void svc_rdma_detach(struct svc_xprt *xprt); static void svc_rdma_free(struct svc_xprt *xprt); static int svc_rdma_has_wspace(struct svc_xprt *xprt); +static int svc_rdma_secure_port(struct svc_rqst *); static void rq_cq_reap(struct svcxprt_rdma *xprt); static void sq_cq_reap(struct svcxprt_rdma *xprt); @@ -82,6 +84,7 @@ static struct svc_xprt_ops svc_rdma_ops = { .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, .xpo_has_wspace = svc_rdma_has_wspace, .xpo_accept = svc_rdma_accept, + .xpo_secure_port = svc_rdma_secure_port, }; struct svc_xprt_class svc_rdma_class = { @@ -160,7 +163,6 @@ struct svc_rdma_req_map *svc_rdma_get_req_map(void) schedule_timeout_uninterruptible(msecs_to_jiffies(500)); } map->count = 0; - map->frmr = NULL; return map; } @@ -336,22 +338,21 @@ static void process_context(struct svcxprt_rdma *xprt, switch (ctxt->wr_op) { case IB_WR_SEND: - if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) - svc_rdma_put_frmr(xprt, ctxt->frmr); + BUG_ON(ctxt->frmr); svc_rdma_put_context(ctxt, 1); break; case IB_WR_RDMA_WRITE: + BUG_ON(ctxt->frmr); svc_rdma_put_context(ctxt, 0); break; case IB_WR_RDMA_READ: case IB_WR_RDMA_READ_WITH_INV: + svc_rdma_put_frmr(xprt, ctxt->frmr); if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; BUG_ON(!read_hdr); - if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) - svc_rdma_put_frmr(xprt, ctxt->frmr); spin_lock_bh(&xprt->sc_rq_dto_lock); set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); list_add_tail(&read_hdr->dto_q, @@ -363,6 +364,7 @@ static void process_context(struct svcxprt_rdma *xprt, break; default: + BUG_ON(1); printk(KERN_ERR "svcrdma: unexpected completion type, " "opcode=%d\n", ctxt->wr_op); @@ -378,29 +380,42 @@ static void process_context(struct svcxprt_rdma *xprt, static void sq_cq_reap(struct svcxprt_rdma *xprt) { struct svc_rdma_op_ctxt *ctxt = NULL; - struct ib_wc wc; + struct ib_wc wc_a[6]; + struct ib_wc *wc; struct ib_cq *cq = xprt->sc_sq_cq; int ret; + memset(wc_a, 0, sizeof(wc_a)); + if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) return; ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); atomic_inc(&rdma_stat_sq_poll); - while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { - if (wc.status != IB_WC_SUCCESS) - /* Close the transport */ - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + while ((ret = ib_poll_cq(cq, ARRAY_SIZE(wc_a), wc_a)) > 0) { + int i; - /* Decrement used SQ WR count */ - atomic_dec(&xprt->sc_sq_count); - wake_up(&xprt->sc_send_wait); + for (i = 0; i < ret; i++) { + wc = &wc_a[i]; + if (wc->status != IB_WC_SUCCESS) { + dprintk("svcrdma: sq wc err status %d\n", + wc->status); - ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; - if (ctxt) - process_context(xprt, ctxt); + /* Close the transport */ + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + } - svc_xprt_put(&xprt->sc_xprt); + /* Decrement used SQ WR count */ + atomic_dec(&xprt->sc_sq_count); + wake_up(&xprt->sc_send_wait); + + ctxt = (struct svc_rdma_op_ctxt *) + (unsigned long)wc->wr_id; + if (ctxt) + process_context(xprt, ctxt); + + svc_xprt_put(&xprt->sc_xprt); + } } if (ctxt) @@ -477,8 +492,7 @@ struct page *svc_rdma_get_page(void) while ((page = alloc_page(GFP_KERNEL)) == NULL) { /* If we can't get memory, wait a bit and try again */ - printk(KERN_INFO "svcrdma: out of memory...retrying in 1000 " - "jiffies.\n"); + printk(KERN_INFO 
"svcrdma: out of memory...retrying in 1s\n"); schedule_timeout_uninterruptible(msecs_to_jiffies(1000)); } return page; @@ -994,7 +1008,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) need_dma_mr = 0; break; case RDMA_TRANSPORT_IB: - if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { + if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) { + need_dma_mr = 1; + dma_mr_acc = IB_ACCESS_LOCAL_WRITE; + } else if (!(devattr.device_cap_flags & + IB_DEVICE_LOCAL_DMA_LKEY)) { need_dma_mr = 1; dma_mr_acc = IB_ACCESS_LOCAL_WRITE; } else @@ -1191,14 +1209,7 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) container_of(xprt, struct svcxprt_rdma, sc_xprt); /* - * If there are fewer SQ WR available than required to send a - * simple response, return false. - */ - if ((rdma->sc_sq_depth - atomic_read(&rdma->sc_sq_count) < 3)) - return 0; - - /* - * ...or there are already waiters on the SQ, + * If there are already waiters on the SQ, * return false. */ if (waitqueue_active(&rdma->sc_send_wait)) @@ -1208,6 +1219,11 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) return 1; } +static int svc_rdma_secure_port(struct svc_rqst *rqstp) +{ + return 1; +} + /* * Attempt to register the kvec representing the RPC memory with the * device. diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 285dc088411..66f91f0d071 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -149,6 +149,11 @@ static struct ctl_table sunrpc_table[] = { #endif +#define RPCRDMA_BIND_TO (60U * HZ) +#define RPCRDMA_INIT_REEST_TO (5U * HZ) +#define RPCRDMA_MAX_REEST_TO (30U * HZ) +#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) + static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ static void @@ -229,7 +234,6 @@ static void xprt_rdma_destroy(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - int rc; dprintk("RPC: %s: called\n", __func__); @@ -238,10 +242,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) xprt_clear_connected(xprt); rpcrdma_buffer_destroy(&r_xprt->rx_buf); - rc = rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); - if (rc) - dprintk("RPC: %s: rpcrdma_ep_destroy returned %i\n", - __func__, rc); + rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); rpcrdma_ia_close(&r_xprt->rx_ia); xprt_rdma_free_addresses(xprt); @@ -289,9 +290,9 @@ xprt_setup_rdma(struct xprt_create *args) /* 60 second timeout, no retries */ xprt->timeout = &xprt_rdma_default_timeout; - xprt->bind_timeout = (60U * HZ); - xprt->reestablish_timeout = (5U * HZ); - xprt->idle_timeout = (5U * 60 * HZ); + xprt->bind_timeout = RPCRDMA_BIND_TO; + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; + xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; xprt->resvport = 0; /* privileged port not needed */ xprt->tsh_size = 0; /* RPC-RDMA handles framing */ @@ -391,7 +392,7 @@ out4: xprt_rdma_free_addresses(xprt); rc = -EINVAL; out3: - (void) rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); + rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); out2: rpcrdma_ia_close(&new_xprt->rx_ia); out1: @@ -436,10 +437,10 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) schedule_delayed_work(&r_xprt->rdma_connect, xprt->reestablish_timeout); xprt->reestablish_timeout <<= 1; - if (xprt->reestablish_timeout > (30 * HZ)) - xprt->reestablish_timeout = (30 * HZ); - else if (xprt->reestablish_timeout < (5 * HZ)) - xprt->reestablish_timeout = (5 * HZ); + if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO) + xprt->reestablish_timeout = 
RPCRDMA_MAX_REEST_TO; + else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; } else { schedule_delayed_work(&r_xprt->rdma_connect, 0); if (!RPC_IS_ASYNC(task)) @@ -447,23 +448,6 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) } } -static int -xprt_rdma_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task) -{ - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - int credits = atomic_read(&r_xprt->rx_buf.rb_credits); - - /* == RPC_CWNDSCALE @ init, but *after* setup */ - if (r_xprt->rx_buf.rb_cwndscale == 0UL) { - r_xprt->rx_buf.rb_cwndscale = xprt->cwnd; - dprintk("RPC: %s: cwndscale %lu\n", __func__, - r_xprt->rx_buf.rb_cwndscale); - BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0); - } - xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale; - return xprt_reserve_xprt_cong(xprt, task); -} - /* * The RDMA allocate/free functions need the task structure as a place * to hide the struct rpcrdma_req, which is necessary for the actual send/recv @@ -479,7 +463,8 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) struct rpcrdma_req *req, *nreq; req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf); - BUG_ON(NULL == req); + if (req == NULL) + return NULL; if (size > req->rl_size) { dprintk("RPC: %s: size %zd too large for buffer[%zd]: " @@ -503,18 +488,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) * If the allocation or registration fails, the RPC framework * will (doggedly) retry. */ - if (rpcx_to_rdmax(xprt)->rx_ia.ri_memreg_strategy == - RPCRDMA_BOUNCEBUFFERS) { - /* forced to "pure inline" */ - dprintk("RPC: %s: too much data (%zd) for inline " - "(r/w max %d/%d)\n", __func__, size, - rpcx_to_rdmad(xprt).inline_rsize, - rpcx_to_rdmad(xprt).inline_wsize); - size = req->rl_size; - rpc_exit(task, -EIO); /* fail the operation */ - rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++; - goto out; - } if (task->tk_flags & RPC_TASK_SWAPPER) nreq = kmalloc(sizeof *req + size, GFP_ATOMIC); else @@ -543,7 +516,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) req = nreq; } dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); -out: req->rl_connect_cookie = 0; /* our reserved value */ return req->rl_xdr_buf; @@ -579,9 +551,7 @@ xprt_rdma_free(void *buffer) __func__, rep, (rep && rep->rr_func) ? " (with waiter)" : ""); /* - * Finish the deregistration. When using mw bind, this was - * begun in rpcrdma_reply_handler(). In all other modes, we - * do it here, in thread context. The process is considered + * Finish the deregistration. The process is considered * complete when the rr_func vector becomes NULL - this * was put in place during rpcrdma_reply_handler() - the wait * call below will not block if the dereg is "done". 
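
Aside (not part of the patch): the reconnect path in xprt_rdma_connect now clamps its exponential backoff to the named constants instead of bare numbers. A userspace model of that clamping follows; HZ and the timeout names are stand-ins for the kernel values.

#include <stdio.h>

#define HZ 1000U
#define RPCRDMA_INIT_REEST_TO (5U * HZ)
#define RPCRDMA_MAX_REEST_TO  (30U * HZ)

static unsigned int next_reestablish_timeout(unsigned int cur)
{
	cur <<= 1;				/* exponential backoff */
	if (cur > RPCRDMA_MAX_REEST_TO)
		cur = RPCRDMA_MAX_REEST_TO;
	else if (cur < RPCRDMA_INIT_REEST_TO)
		cur = RPCRDMA_INIT_REEST_TO;
	return cur;
}

int main(void)
{
	unsigned int to = RPCRDMA_INIT_REEST_TO;

	for (int i = 0; i < 5; i++) {
		printf("retry %d: %u jiffies\n", i, to);
		to = next_reestablish_timeout(to);
	}
	return 0;
}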
If @@ -590,12 +560,7 @@ xprt_rdma_free(void *buffer) for (i = 0; req->rl_nchunks;) { --req->rl_nchunks; i += rpcrdma_deregister_external( - &req->rl_segments[i], r_xprt, NULL); - } - - if (rep && wait_event_interruptible(rep->rr_unbind, !rep->rr_func)) { - rep->rr_func = NULL; /* abandon the callback */ - req->rl_reply = NULL; + &req->rl_segments[i], r_xprt); } if (req->rl_iov.length == 0) { /* see allocate above */ @@ -630,13 +595,12 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + int rc; - /* marshal the send itself */ - if (req->rl_niovs == 0 && rpcrdma_marshal_req(rqst) != 0) { - r_xprt->rx_stats.failed_marshal_count++; - dprintk("RPC: %s: rpcrdma_marshal_req failed\n", - __func__); - return -EIO; + if (req->rl_niovs == 0) { + rc = rpcrdma_marshal_req(rqst); + if (rc < 0) + goto failed_marshal; } if (req->rl_reply == NULL) /* e.g. reconnection */ @@ -660,6 +624,12 @@ xprt_rdma_send_request(struct rpc_task *task) rqst->rq_bytes_sent = 0; return 0; +failed_marshal: + r_xprt->rx_stats.failed_marshal_count++; + dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n", + __func__, rc); + if (rc == -EIO) + return -EIO; drop_connection: xprt_disconnect_done(xprt); return -ENOTCONN; /* implies disconnect */ @@ -705,7 +675,7 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) */ static struct rpc_xprt_ops xprt_rdma_procs = { - .reserve_xprt = xprt_rdma_reserve_xprt, + .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ .alloc_slot = xprt_alloc_slot, .release_request = xprt_release_rqst_cong, /* ditto */ @@ -733,7 +703,7 @@ static void __exit xprt_rdma_cleanup(void) { int rc; - dprintk(KERN_INFO "RPCRDMA Module Removed, deregister RPC RDMA transport\n"); + dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n"); #ifdef RPC_DEBUG if (sunrpc_table_header) { unregister_sysctl_table(sunrpc_table_header); @@ -755,14 +725,14 @@ static int __init xprt_rdma_init(void) if (rc) return rc; - dprintk(KERN_INFO "RPCRDMA Module Init, register RPC RDMA transport\n"); + dprintk("RPCRDMA Module Init, register RPC RDMA transport\n"); - dprintk(KERN_INFO "Defaults:\n"); - dprintk(KERN_INFO "\tSlots %d\n" + dprintk("Defaults:\n"); + dprintk("\tSlots %d\n" "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n", xprt_rdma_slot_table_entries, xprt_rdma_max_inline_read, xprt_rdma_max_inline_write); - dprintk(KERN_INFO "\tPadding %d\n\tMemreg %d\n", + dprintk("\tPadding %d\n\tMemreg %d\n", xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy); #ifdef RPC_DEBUG diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 93726560eaa..13dbd1c389f 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -48,8 +48,8 @@ */ #include <linux/interrupt.h> -#include <linux/pci.h> /* for Tavor hack below */ #include <linux/slab.h> +#include <asm/bitops.h> #include "xprt_rdma.h" @@ -142,98 +142,139 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context) } } -static inline -void rpcrdma_event_process(struct ib_wc *wc) +static void +rpcrdma_sendcq_process_wc(struct ib_wc *wc) { - struct rpcrdma_mw *frmr; - struct rpcrdma_rep *rep = - (struct rpcrdma_rep *)(unsigned long) wc->wr_id; + struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; - dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n", - __func__, rep, wc->status, wc->opcode, 
wc->byte_len); + dprintk("RPC: %s: frmr %p status %X opcode %d\n", + __func__, frmr, wc->status, wc->opcode); - if (!rep) /* send or bind completion that we don't care about */ + if (wc->wr_id == 0ULL) return; - - if (IB_WC_SUCCESS != wc->status) { - dprintk("RPC: %s: WC opcode %d status %X, connection lost\n", - __func__, wc->opcode, wc->status); - rep->rr_len = ~0U; - if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV) - rpcrdma_schedule_tasklet(rep); + if (wc->status != IB_WC_SUCCESS) return; - } - switch (wc->opcode) { - case IB_WC_FAST_REG_MR: - frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + if (wc->opcode == IB_WC_FAST_REG_MR) frmr->r.frmr.state = FRMR_IS_VALID; - break; - case IB_WC_LOCAL_INV: - frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + else if (wc->opcode == IB_WC_LOCAL_INV) frmr->r.frmr.state = FRMR_IS_INVALID; - break; - case IB_WC_RECV: - rep->rr_len = wc->byte_len; - ib_dma_sync_single_for_cpu( - rdmab_to_ia(rep->rr_buffer)->ri_id->device, - rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE); - /* Keep (only) the most recent credits, after check validity */ - if (rep->rr_len >= 16) { - struct rpcrdma_msg *p = - (struct rpcrdma_msg *) rep->rr_base; - unsigned int credits = ntohl(p->rm_credit); - if (credits == 0) { - dprintk("RPC: %s: server" - " dropped credits to 0!\n", __func__); - /* don't deadlock */ - credits = 1; - } else if (credits > rep->rr_buffer->rb_max_requests) { - dprintk("RPC: %s: server" - " over-crediting: %d (%d)\n", - __func__, credits, - rep->rr_buffer->rb_max_requests); - credits = rep->rr_buffer->rb_max_requests; - } - atomic_set(&rep->rr_buffer->rb_credits, credits); - } - /* fall through */ - case IB_WC_BIND_MW: - rpcrdma_schedule_tasklet(rep); - break; - default: - dprintk("RPC: %s: unexpected WC event %X\n", - __func__, wc->opcode); - break; - } } -static inline int -rpcrdma_cq_poll(struct ib_cq *cq) +static int +rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) { - struct ib_wc wc; - int rc; + struct ib_wc *wcs; + int budget, count, rc; - for (;;) { - rc = ib_poll_cq(cq, 1, &wc); - if (rc < 0) { - dprintk("RPC: %s: ib_poll_cq failed %i\n", - __func__, rc); + budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE; + do { + wcs = ep->rep_send_wcs; + + rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs); + if (rc <= 0) return rc; - } - if (rc == 0) - break; - rpcrdma_event_process(&wc); + count = rc; + while (count-- > 0) + rpcrdma_sendcq_process_wc(wcs++); + } while (rc == RPCRDMA_POLLSIZE && --budget); + return 0; +} + +/* + * Handle send, fast_reg_mr, and local_inv completions. + * + * Send events are typically suppressed and thus do not result + * in an upcall. Occasionally one is signaled, however. This + * prevents the provider's completion queue from wrapping and + * losing a completion. 
+ */ +static void +rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context) +{ + struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context; + int rc; + + rc = rpcrdma_sendcq_poll(cq, ep); + if (rc) { + dprintk("RPC: %s: ib_poll_cq failed: %i\n", + __func__, rc); + return; } + rc = ib_req_notify_cq(cq, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc == 0) + return; + if (rc < 0) { + dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", + __func__, rc); + return; + } + + rpcrdma_sendcq_poll(cq, ep); +} + +static void +rpcrdma_recvcq_process_wc(struct ib_wc *wc) +{ + struct rpcrdma_rep *rep = + (struct rpcrdma_rep *)(unsigned long)wc->wr_id; + + dprintk("RPC: %s: rep %p status %X opcode %X length %u\n", + __func__, rep, wc->status, wc->opcode, wc->byte_len); + + if (wc->status != IB_WC_SUCCESS) { + rep->rr_len = ~0U; + goto out_schedule; + } + if (wc->opcode != IB_WC_RECV) + return; + + rep->rr_len = wc->byte_len; + ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device, + rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE); + + if (rep->rr_len >= 16) { + struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base; + unsigned int credits = ntohl(p->rm_credit); + + if (credits == 0) + credits = 1; /* don't deadlock */ + else if (credits > rep->rr_buffer->rb_max_requests) + credits = rep->rr_buffer->rb_max_requests; + atomic_set(&rep->rr_buffer->rb_credits, credits); + } + +out_schedule: + rpcrdma_schedule_tasklet(rep); +} + +static int +rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) +{ + struct ib_wc *wcs; + int budget, count, rc; + + budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE; + do { + wcs = ep->rep_recv_wcs; + + rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs); + if (rc <= 0) + return rc; + + count = rc; + while (count-- > 0) + rpcrdma_recvcq_process_wc(wcs++); + } while (rc == RPCRDMA_POLLSIZE && --budget); return 0; } /* - * rpcrdma_cq_event_upcall + * Handle receive completions. * - * This upcall handles recv, send, bind and unbind events. * It is reentrant but processes single events in order to maintain * ordering of receives to keep server credits. * @@ -242,26 +283,31 @@ rpcrdma_cq_poll(struct ib_cq *cq) * connection shutdown. That is, the structures required for * the completion of the reply handler must remain intact until * all memory has been reclaimed. - * - * Note that send events are suppressed and do not result in an upcall. 
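
Aside (not part of the patch): both new completion handlers poll in fixed-size batches under a budget, so a single upcall drains full batches without monopolizing the CPU. A self-contained model of that loop follows; poll_cq() stands in for ib_poll_cq(), and the POLLSIZE/WC_BUDGET values are illustrative, not the kernel's.

#include <stdio.h>

#define POLLSIZE  8
#define WC_BUDGET 32

struct wc { int id; };

/* Fake completion source: pretends "pending" completions are queued. */
static int pending = 21;

static int poll_cq(struct wc *wcs, int n)
{
	int got = pending < n ? pending : n;

	for (int i = 0; i < got; i++)
		wcs[i].id = pending - i;
	pending -= got;
	return got;
}

static void process_wc(const struct wc *wc)
{
	printf("completed wr %d\n", wc->id);
}

static int cq_poll_budgeted(void)
{
	struct wc wcs[POLLSIZE];
	int budget = WC_BUDGET / POLLSIZE;
	int rc;

	do {
		rc = poll_cq(wcs, POLLSIZE);
		if (rc <= 0)
			return rc;		/* drained, or error */
		for (int i = 0; i < rc; i++)
			process_wc(&wcs[i]);
	} while (rc == POLLSIZE && --budget);	/* continue only on full batches */
	return 0;
}

int main(void)
{
	cq_poll_budgeted();
	return 0;
}

In the patch, the upcall then re-arms the CQ with IB_CQ_REPORT_MISSED_EVENTS and polls once more if the re-arm reports missed completions.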
*/ static void -rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context) +rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context) { + struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context; int rc; - rc = rpcrdma_cq_poll(cq); - if (rc) + rc = rpcrdma_recvcq_poll(cq, ep); + if (rc) { + dprintk("RPC: %s: ib_poll_cq failed: %i\n", + __func__, rc); return; + } - rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - if (rc) { - dprintk("RPC: %s: ib_req_notify_cq failed %i\n", + rc = ib_req_notify_cq(cq, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc == 0) + return; + if (rc < 0) { + dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", __func__, rc); return; } - rpcrdma_cq_poll(cq); + rpcrdma_recvcq_poll(cq, ep); } #ifdef RPC_DEBUG @@ -493,54 +539,32 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey; } - switch (memreg) { - case RPCRDMA_MEMWINDOWS: - case RPCRDMA_MEMWINDOWS_ASYNC: - if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) { - dprintk("RPC: %s: MEMWINDOWS registration " - "specified but not supported by adapter, " - "using slower RPCRDMA_REGISTER\n", - __func__); - memreg = RPCRDMA_REGISTER; - } - break; - case RPCRDMA_MTHCAFMR: - if (!ia->ri_id->device->alloc_fmr) { -#if RPCRDMA_PERSISTENT_REGISTRATION - dprintk("RPC: %s: MTHCAFMR registration " - "specified but not supported by adapter, " - "using riskier RPCRDMA_ALLPHYSICAL\n", - __func__); - memreg = RPCRDMA_ALLPHYSICAL; -#else - dprintk("RPC: %s: MTHCAFMR registration " - "specified but not supported by adapter, " - "using slower RPCRDMA_REGISTER\n", - __func__); - memreg = RPCRDMA_REGISTER; -#endif - } - break; - case RPCRDMA_FRMR: + if (memreg == RPCRDMA_FRMR) { /* Requires both frmr reg and local dma lkey */ if ((devattr.device_cap_flags & (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { -#if RPCRDMA_PERSISTENT_REGISTRATION dprintk("RPC: %s: FRMR registration " - "specified but not supported by adapter, " - "using riskier RPCRDMA_ALLPHYSICAL\n", - __func__); + "not supported by HCA\n", __func__); + memreg = RPCRDMA_MTHCAFMR; + } else { + /* Mind the ia limit on FRMR page list depth */ + ia->ri_max_frmr_depth = min_t(unsigned int, + RPCRDMA_MAX_DATA_SEGS, + devattr.max_fast_reg_page_list_len); + } + } + if (memreg == RPCRDMA_MTHCAFMR) { + if (!ia->ri_id->device->alloc_fmr) { + dprintk("RPC: %s: MTHCAFMR registration " + "not supported by HCA\n", __func__); +#if RPCRDMA_PERSISTENT_REGISTRATION memreg = RPCRDMA_ALLPHYSICAL; #else - dprintk("RPC: %s: FRMR registration " - "specified but not supported by adapter, " - "using slower RPCRDMA_REGISTER\n", - __func__); - memreg = RPCRDMA_REGISTER; + rc = -ENOMEM; + goto out2; #endif } - break; } /* @@ -552,8 +576,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) * adapter. 
*/ switch (memreg) { - case RPCRDMA_BOUNCEBUFFERS: - case RPCRDMA_REGISTER: case RPCRDMA_FRMR: break; #if RPCRDMA_PERSISTENT_REGISTRATION @@ -563,30 +585,26 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) IB_ACCESS_REMOTE_READ; goto register_setup; #endif - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - mem_priv = IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_MW_BIND; - goto register_setup; case RPCRDMA_MTHCAFMR: if (ia->ri_have_dma_lkey) break; mem_priv = IB_ACCESS_LOCAL_WRITE; +#if RPCRDMA_PERSISTENT_REGISTRATION register_setup: +#endif ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); if (IS_ERR(ia->ri_bind_mem)) { printk(KERN_ALERT "%s: ib_get_dma_mr for " - "phys register failed with %lX\n\t" - "Will continue with degraded performance\n", + "phys register failed with %lX\n", __func__, PTR_ERR(ia->ri_bind_mem)); - memreg = RPCRDMA_REGISTER; - ia->ri_bind_mem = NULL; + rc = -ENOMEM; + goto out2; } break; default: - printk(KERN_ERR "%s: invalid memory registration mode %d\n", - __func__, memreg); - rc = -EINVAL; + printk(KERN_ERR "RPC: Unsupported memory " + "registration mode: %d\n", memreg); + rc = -ENOMEM; goto out2; } dprintk("RPC: %s: memory registration strategy is %d\n", @@ -640,6 +658,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) { struct ib_device_attr devattr; + struct ib_cq *sendcq, *recvcq; int rc, err; rc = ib_query_device(ia->ri_id->device, &devattr); @@ -659,32 +678,42 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_wr = cdata->max_requests; switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: + case RPCRDMA_FRMR: { + int depth = 7; + /* Add room for frmr register and invalidate WRs. * 1. FRMR reg WR for head * 2. FRMR invalidate WR for head - * 3. FRMR reg WR for pagelist - * 4. FRMR invalidate WR for pagelist + * 3. N FRMR reg WRs for pagelist + * 4. N FRMR invalidate WRs for pagelist * 5. FRMR reg WR for tail * 6. FRMR invalidate WR for tail * 7. The RDMA_SEND WR */ - ep->rep_attr.cap.max_send_wr *= 7; + + /* Calculate N if the device max FRMR depth is smaller than + * RPCRDMA_MAX_DATA_SEGS. + */ + if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { + int delta = RPCRDMA_MAX_DATA_SEGS - + ia->ri_max_frmr_depth; + + do { + depth += 2; /* FRMR reg + invalidate */ + delta -= ia->ri_max_frmr_depth; + } while (delta > 0); + + } + ep->rep_attr.cap.max_send_wr *= depth; if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) { - cdata->max_requests = devattr.max_qp_wr / 7; + cdata->max_requests = devattr.max_qp_wr / depth; if (!cdata->max_requests) return -EINVAL; - ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7; + ep->rep_attr.cap.max_send_wr = cdata->max_requests * + depth; } break; - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - /* Add room for mw_binds+unbinds - overkill! 
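
Aside (not part of the patch): the send-queue sizing above starts from seven WRs per request and adds a register/invalidate pair for every additional FRMR needed when the device's page-list depth is smaller than RPCRDMA_MAX_DATA_SEGS, then shrinks the request count if the adapter cannot supply that many WRs. A standalone model of the arithmetic follows; the device limits used here are examples only.

#include <stdio.h>

#define RPCRDMA_MAX_DATA_SEGS 64

static int frmr_send_depth(int max_frmr_depth)
{
	int depth = 7;

	if (max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
		int delta = RPCRDMA_MAX_DATA_SEGS - max_frmr_depth;

		do {
			depth += 2;		/* one extra FRMR reg + invalidate */
			delta -= max_frmr_depth;
		} while (delta > 0);
	}
	return depth;
}

int main(void)
{
	int max_qp_wr = 16384, max_requests = 32;
	int depth = frmr_send_depth(16);	/* e.g. a device depth of 16 */
	int max_send_wr = max_requests * depth;

	if (max_send_wr > max_qp_wr) {		/* shrink the slot table to fit */
		max_requests = max_qp_wr / depth;
		max_send_wr = max_requests * depth;
	}
	printf("depth %d, %d requests, %d send WRs\n",
	       depth, max_requests, max_send_wr);
	return 0;
}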
*/ - ep->rep_attr.cap.max_send_wr++; - ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS); - if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) - return -EINVAL; - break; + } default: break; } @@ -705,46 +734,51 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.cap.max_recv_sge); /* set trigger for requesting send completion */ - ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/; - switch (ia->ri_memreg_strategy) { - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - ep->rep_cqinit -= RPCRDMA_MAX_SEGS; - break; - default: - break; - } + ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1; if (ep->rep_cqinit <= 2) ep->rep_cqinit = 0; INIT_CQCOUNT(ep); ep->rep_ia = ia; init_waitqueue_head(&ep->rep_connect_wait); + INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); - /* - * Create a single cq for receive dto and mw_bind (only ever - * care about unbind, really). Send completions are suppressed. - * Use single threaded tasklet upcalls to maintain ordering. - */ - ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall, - rpcrdma_cq_async_error_upcall, NULL, - ep->rep_attr.cap.max_recv_wr + + sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall, + rpcrdma_cq_async_error_upcall, ep, ep->rep_attr.cap.max_send_wr + 1, 0); - if (IS_ERR(ep->rep_cq)) { - rc = PTR_ERR(ep->rep_cq); - dprintk("RPC: %s: ib_create_cq failed: %i\n", + if (IS_ERR(sendcq)) { + rc = PTR_ERR(sendcq); + dprintk("RPC: %s: failed to create send CQ: %i\n", __func__, rc); goto out1; } - rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP); + rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP); + if (rc) { + dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", + __func__, rc); + goto out2; + } + + recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall, + rpcrdma_cq_async_error_upcall, ep, + ep->rep_attr.cap.max_recv_wr + 1, 0); + if (IS_ERR(recvcq)) { + rc = PTR_ERR(recvcq); + dprintk("RPC: %s: failed to create recv CQ: %i\n", + __func__, rc); + goto out2; + } + + rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP); if (rc) { dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", __func__, rc); + ib_destroy_cq(recvcq); goto out2; } - ep->rep_attr.send_cq = ep->rep_cq; - ep->rep_attr.recv_cq = ep->rep_cq; + ep->rep_attr.send_cq = sendcq; + ep->rep_attr.recv_cq = recvcq; /* Initialize cma parameters */ @@ -754,9 +788,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* Client offers RDMA Read but does not initiate */ ep->rep_remote_cma.initiator_depth = 0; - if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS) - ep->rep_remote_cma.responder_resources = 0; - else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ + if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ ep->rep_remote_cma.responder_resources = 32; else ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; @@ -768,7 +800,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, return 0; out2: - err = ib_destroy_cq(ep->rep_cq); + err = ib_destroy_cq(sendcq); if (err) dprintk("RPC: %s: ib_destroy_cq returned %i\n", __func__, err); @@ -782,11 +814,8 @@ out1: * Disconnect and destroy endpoint. After this, the only * valid operations on the ep are to free it (if dynamically * allocated) or re-create it. - * - * The caller's error handling must be sure to not leak the endpoint - * if this function fails. 
*/ -int +void rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { int rc; @@ -794,6 +823,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) dprintk("RPC: %s: entering, connected is %d\n", __func__, ep->rep_connected); + cancel_delayed_work_sync(&ep->rep_connect_worker); + if (ia->ri_id->qp) { rc = rpcrdma_ep_disconnect(ep, ia); if (rc) @@ -809,13 +840,17 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) ep->rep_pad_mr = NULL; } - rpcrdma_clean_cq(ep->rep_cq); - rc = ib_destroy_cq(ep->rep_cq); + rpcrdma_clean_cq(ep->rep_attr.recv_cq); + rc = ib_destroy_cq(ep->rep_attr.recv_cq); if (rc) dprintk("RPC: %s: ib_destroy_cq returned %i\n", __func__, rc); - return rc; + rpcrdma_clean_cq(ep->rep_attr.send_cq); + rc = ib_destroy_cq(ep->rep_attr.send_cq); + if (rc) + dprintk("RPC: %s: ib_destroy_cq returned %i\n", + __func__, rc); } /* @@ -831,17 +866,20 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) if (ep->rep_connected != 0) { struct rpcrdma_xprt *xprt; retry: + dprintk("RPC: %s: reconnecting...\n", __func__); rc = rpcrdma_ep_disconnect(ep, ia); if (rc && rc != -ENOTCONN) dprintk("RPC: %s: rpcrdma_ep_disconnect" " status %i\n", __func__, rc); - rpcrdma_clean_cq(ep->rep_cq); + + rpcrdma_clean_cq(ep->rep_attr.recv_cq); + rpcrdma_clean_cq(ep->rep_attr.send_cq); xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); id = rpcrdma_create_id(xprt, ia, (struct sockaddr *)&xprt->rx_data.addr); if (IS_ERR(id)) { - rc = PTR_ERR(id); + rc = -EHOSTUNREACH; goto out; } /* TEMP TEMP TEMP - fail if new device: @@ -855,35 +893,32 @@ retry: printk("RPC: %s: can't reconnect on " "different device!\n", __func__); rdma_destroy_id(id); - rc = -ENETDOWN; + rc = -ENETUNREACH; goto out; } /* END TEMP */ + rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); + if (rc) { + dprintk("RPC: %s: rdma_create_qp failed %i\n", + __func__, rc); + rdma_destroy_id(id); + rc = -ENETUNREACH; + goto out; + } rdma_destroy_qp(ia->ri_id); rdma_destroy_id(ia->ri_id); ia->ri_id = id; + } else { + dprintk("RPC: %s: connecting...\n", __func__); + rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); + if (rc) { + dprintk("RPC: %s: rdma_create_qp failed %i\n", + __func__, rc); + /* do not update ep->rep_connected */ + return -ENETUNREACH; + } } - rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); - if (rc) { - dprintk("RPC: %s: rdma_create_qp failed %i\n", - __func__, rc); - goto out; - } - -/* XXX Tavor device performs badly with 2K MTU! 
*/ -if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { - struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device); - if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR && - (pcid->vendor == PCI_VENDOR_ID_MELLANOX || - pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) { - struct ib_qp_attr attr = { - .path_mtu = IB_MTU_1024 - }; - rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU); - } -} - ep->rep_connected = 0; rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); @@ -944,7 +979,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { int rc; - rpcrdma_clean_cq(ep->rep_cq); + rpcrdma_clean_cq(ep->rep_attr.recv_cq); + rpcrdma_clean_cq(ep->rep_attr.send_cq); rc = rdma_disconnect(ia->ri_id); if (!rc) { /* returns without wait if not connected */ @@ -967,7 +1003,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) { char *p; - size_t len; + size_t len, rlen, wlen; int i, rc; struct rpcrdma_mw *r; @@ -997,11 +1033,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * sizeof(struct rpcrdma_mw); break; - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * - sizeof(struct rpcrdma_mw); - break; default: break; } @@ -1032,32 +1063,29 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, } p += cdata->padding; - /* - * Allocate the fmr's, or mw's for mw_bind chunk registration. - * We "cycle" the mw's in order to minimize rkey reuse, - * and also reduce unbind-to-bind collision. - */ INIT_LIST_HEAD(&buf->rb_mws); r = (struct rpcrdma_mw *)p; switch (ia->ri_memreg_strategy) { case RPCRDMA_FRMR: for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) { r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, - RPCRDMA_MAX_SEGS); + ia->ri_max_frmr_depth); if (IS_ERR(r->r.frmr.fr_mr)) { rc = PTR_ERR(r->r.frmr.fr_mr); dprintk("RPC: %s: ib_alloc_fast_reg_mr" " failed %i\n", __func__, rc); goto out; } - r->r.frmr.fr_pgl = - ib_alloc_fast_reg_page_list(ia->ri_id->device, - RPCRDMA_MAX_SEGS); + r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list( + ia->ri_id->device, + ia->ri_max_frmr_depth); if (IS_ERR(r->r.frmr.fr_pgl)) { rc = PTR_ERR(r->r.frmr.fr_pgl); dprintk("RPC: %s: " "ib_alloc_fast_reg_page_list " "failed %i\n", __func__, rc); + + ib_dereg_mr(r->r.frmr.fr_mr); goto out; } list_add(&r->mw_list, &buf->rb_mws); @@ -1082,21 +1110,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, ++r; } break; - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - /* Allocate one extra request's worth, for full cycling */ - for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { - r->r.mw = ib_alloc_mw(ia->ri_pd, IB_MW_TYPE_1); - if (IS_ERR(r->r.mw)) { - rc = PTR_ERR(r->r.mw); - dprintk("RPC: %s: ib_alloc_mw" - " failed %i\n", __func__, rc); - goto out; - } - list_add(&r->mw_list, &buf->rb_mws); - ++r; - } - break; default: break; } @@ -1105,16 +1118,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, * Allocate/init the request/reply buffers. Doing this * using kmalloc for now -- one for each buf. 
*/ + wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req)); + rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep)); + dprintk("RPC: %s: wlen = %zu, rlen = %zu\n", + __func__, wlen, rlen); + for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_req *req; struct rpcrdma_rep *rep; - len = cdata->inline_wsize + sizeof(struct rpcrdma_req); - /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */ - /* Typical ~2400b, so rounding up saves work later */ - if (len < 4096) - len = 4096; - req = kmalloc(len, GFP_KERNEL); + req = kmalloc(wlen, GFP_KERNEL); if (req == NULL) { dprintk("RPC: %s: request buffer %d alloc" " failed\n", __func__, i); @@ -1126,16 +1139,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, buf->rb_send_bufs[i]->rl_buffer = buf; rc = rpcrdma_register_internal(ia, req->rl_base, - len - offsetof(struct rpcrdma_req, rl_base), + wlen - offsetof(struct rpcrdma_req, rl_base), &buf->rb_send_bufs[i]->rl_handle, &buf->rb_send_bufs[i]->rl_iov); if (rc) goto out; - buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req); + buf->rb_send_bufs[i]->rl_size = wlen - + sizeof(struct rpcrdma_req); - len = cdata->inline_rsize + sizeof(struct rpcrdma_rep); - rep = kmalloc(len, GFP_KERNEL); + rep = kmalloc(rlen, GFP_KERNEL); if (rep == NULL) { dprintk("RPC: %s: reply buffer %d alloc failed\n", __func__, i); @@ -1145,10 +1158,9 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, memset(rep, 0, sizeof(struct rpcrdma_rep)); buf->rb_recv_bufs[i] = rep; buf->rb_recv_bufs[i]->rr_buffer = buf; - init_waitqueue_head(&rep->rr_unbind); rc = rpcrdma_register_internal(ia, rep->rr_base, - len - offsetof(struct rpcrdma_rep, rr_base), + rlen - offsetof(struct rpcrdma_rep, rr_base), &buf->rb_recv_bufs[i]->rr_handle, &buf->rb_recv_bufs[i]->rr_iov); if (rc) @@ -1179,7 +1191,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) /* clean up in reverse order from create * 1. recv mr memory (mr free, then kfree) - * 1a. bind mw memory * 2. send mr memory (mr free, then kfree) * 3. padding (if any) [moved to rpcrdma_ep_destroy] * 4. 
arrays @@ -1194,41 +1205,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) kfree(buf->rb_recv_bufs[i]); } if (buf->rb_send_bufs && buf->rb_send_bufs[i]) { - while (!list_empty(&buf->rb_mws)) { - r = list_entry(buf->rb_mws.next, - struct rpcrdma_mw, mw_list); - list_del(&r->mw_list); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rc = ib_dereg_mr(r->r.frmr.fr_mr); - if (rc) - dprintk("RPC: %s:" - " ib_dereg_mr" - " failed %i\n", - __func__, rc); - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); - break; - case RPCRDMA_MTHCAFMR: - rc = ib_dealloc_fmr(r->r.fmr); - if (rc) - dprintk("RPC: %s:" - " ib_dealloc_fmr" - " failed %i\n", - __func__, rc); - break; - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - rc = ib_dealloc_mw(r->r.mw); - if (rc) - dprintk("RPC: %s:" - " ib_dealloc_mw" - " failed %i\n", - __func__, rc); - break; - default: - break; - } - } rpcrdma_deregister_internal(ia, buf->rb_send_bufs[i]->rl_handle, &buf->rb_send_bufs[i]->rl_iov); @@ -1236,6 +1212,33 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) } } + while (!list_empty(&buf->rb_mws)) { + r = list_entry(buf->rb_mws.next, + struct rpcrdma_mw, mw_list); + list_del(&r->mw_list); + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s:" + " ib_dereg_mr" + " failed %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); + break; + case RPCRDMA_MTHCAFMR: + rc = ib_dealloc_fmr(r->r.fmr); + if (rc) + dprintk("RPC: %s:" + " ib_dealloc_fmr" + " failed %i\n", + __func__, rc); + break; + default: + break; + } + } + kfree(buf->rb_pool); } @@ -1299,21 +1302,17 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) int i; unsigned long flags; - BUG_ON(req->rl_nchunks != 0); spin_lock_irqsave(&buffers->rb_lock, flags); buffers->rb_send_bufs[--buffers->rb_send_index] = req; req->rl_niovs = 0; if (req->rl_reply) { buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply; - init_waitqueue_head(&req->rl_reply->rr_unbind); req->rl_reply->rr_func = NULL; req->rl_reply = NULL; } switch (ia->ri_memreg_strategy) { case RPCRDMA_FRMR: case RPCRDMA_MTHCAFMR: - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: /* * Cycle mw's back in reverse order, and "spin" them. * This delays and scrambles reuse as much as possible. @@ -1358,8 +1357,7 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req) /* * Put reply buffers back into pool when not attached to - * request. This happens in error conditions, and when - * aborting unbinds. Pre-decrement counter/array index. + * request. This happens in error conditions. 
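The rpcrdma_buffer_create() hunk above drops the old per-request length computation (inline_wsize plus the request header, with a 4096-byte floor) and instead sizes every send and reply buffer once, rounding the needed length up to a power of two with fls(). A minimal userspace sketch of that rounding, using a portable stand-in for the kernel's fls(); the helper name and sample sizes are illustrative only:

#include <stdio.h>

/* Position of the highest set bit, 1-based; 0 when x == 0 (like the kernel's fls()). */
static int fls_portable(unsigned int x)
{
        int r = 0;

        while (x) {
                x >>= 1;
                r++;
        }
        return r;
}

int main(void)
{
        /* e.g. inline write/read size plus the rpcrdma_req/rpcrdma_rep header */
        unsigned int sizes[] = { 1024 + 200, 4096 + 344, 100 };

        for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
                unsigned int rounded = 1u << fls_portable(sizes[i]);

                printf("%u bytes -> allocate %u\n", sizes[i], rounded);
        }
        return 0;
}

1 << fls(x) is the smallest power of two strictly greater than x, so the subsequent kmalloc() asks for a size that maps cleanly onto a power-of-two slab cache.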
*/ void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) @@ -1498,8 +1496,8 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, seg1->mr_offset -= pageoff; /* start of page */ seg1->mr_len += pageoff; len = -pageoff; - if (*nsegs > RPCRDMA_MAX_DATA_SEGS) - *nsegs = RPCRDMA_MAX_DATA_SEGS; + if (*nsegs > ia->ri_max_frmr_depth) + *nsegs = ia->ri_max_frmr_depth; for (page_no = i = 0; i < *nsegs;) { rpcrdma_map_one(ia, seg, writing); pa = seg->mr_dma; @@ -1536,10 +1534,6 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, } else post_wr = &frmr_wr; - /* Bump the key */ - key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); - ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); - /* Prepare FRMR WR */ memset(&frmr_wr, 0, sizeof frmr_wr); frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; @@ -1550,7 +1544,16 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, frmr_wr.wr.fast_reg.page_list_len = page_no; frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; - BUG_ON(frmr_wr.wr.fast_reg.length < len); + if (frmr_wr.wr.fast_reg.length < len) { + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + return -EIO; + } + + /* Bump the key */ + key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); + ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); + frmr_wr.wr.fast_reg.access_flags = (writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ); @@ -1661,135 +1664,6 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, return rc; } -static int -rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg, - int *nsegs, int writing, struct rpcrdma_ia *ia, - struct rpcrdma_xprt *r_xprt) -{ - int mem_priv = (writing ? 
IB_ACCESS_REMOTE_WRITE : - IB_ACCESS_REMOTE_READ); - struct ib_mw_bind param; - int rc; - - *nsegs = 1; - rpcrdma_map_one(ia, seg, writing); - param.bind_info.mr = ia->ri_bind_mem; - param.wr_id = 0ULL; /* no send cookie */ - param.bind_info.addr = seg->mr_dma; - param.bind_info.length = seg->mr_len; - param.send_flags = 0; - param.bind_info.mw_access_flags = mem_priv; - - DECR_CQCOUNT(&r_xprt->rx_ep); - rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param); - if (rc) { - dprintk("RPC: %s: failed ib_bind_mw " - "%u@0x%llx status %i\n", - __func__, seg->mr_len, - (unsigned long long)seg->mr_dma, rc); - rpcrdma_unmap_one(ia, seg); - } else { - seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey; - seg->mr_base = param.bind_info.addr; - seg->mr_nsegs = 1; - } - return rc; -} - -static int -rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_ia *ia, - struct rpcrdma_xprt *r_xprt, void **r) -{ - struct ib_mw_bind param; - LIST_HEAD(l); - int rc; - - BUG_ON(seg->mr_nsegs != 1); - param.bind_info.mr = ia->ri_bind_mem; - param.bind_info.addr = 0ULL; /* unbind */ - param.bind_info.length = 0; - param.bind_info.mw_access_flags = 0; - if (*r) { - param.wr_id = (u64) (unsigned long) *r; - param.send_flags = IB_SEND_SIGNALED; - INIT_CQCOUNT(&r_xprt->rx_ep); - } else { - param.wr_id = 0ULL; - param.send_flags = 0; - DECR_CQCOUNT(&r_xprt->rx_ep); - } - rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param); - rpcrdma_unmap_one(ia, seg); - if (rc) - dprintk("RPC: %s: failed ib_(un)bind_mw," - " status %i\n", __func__, rc); - else - *r = NULL; /* will upcall on completion */ - return rc; -} - -static int -rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg, - int *nsegs, int writing, struct rpcrdma_ia *ia) -{ - int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : - IB_ACCESS_REMOTE_READ); - struct rpcrdma_mr_seg *seg1 = seg; - struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS]; - int len, i, rc = 0; - - if (*nsegs > RPCRDMA_MAX_DATA_SEGS) - *nsegs = RPCRDMA_MAX_DATA_SEGS; - for (len = 0, i = 0; i < *nsegs;) { - rpcrdma_map_one(ia, seg, writing); - ipb[i].addr = seg->mr_dma; - ipb[i].size = seg->mr_len; - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < *nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) - break; - } - seg1->mr_base = seg1->mr_dma; - seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd, - ipb, i, mem_priv, &seg1->mr_base); - if (IS_ERR(seg1->mr_chunk.rl_mr)) { - rc = PTR_ERR(seg1->mr_chunk.rl_mr); - dprintk("RPC: %s: failed ib_reg_phys_mr " - "%u@0x%llx (%d)...
status %i\n", - __func__, len, - (unsigned long long)seg1->mr_dma, i, rc); - while (i--) - rpcrdma_unmap_one(ia, --seg); - } else { - seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey; - seg1->mr_nsegs = i; - seg1->mr_len = len; - } - *nsegs = i; - return rc; -} - -static int -rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_ia *ia) -{ - struct rpcrdma_mr_seg *seg1 = seg; - int rc; - - rc = ib_dereg_mr(seg1->mr_chunk.rl_mr); - seg1->mr_chunk.rl_mr = NULL; - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - if (rc) - dprintk("RPC: %s: failed ib_dereg_mr," - " status %i\n", __func__, rc); - return rc; -} - int rpcrdma_register_external(struct rpcrdma_mr_seg *seg, int nsegs, int writing, struct rpcrdma_xprt *r_xprt) @@ -1819,16 +1693,8 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); break; - /* Registration using memory windows */ - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt); - break; - - /* Default registration each time */ default: - rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia); - break; + return -1; } if (rc) return -1; @@ -1838,7 +1704,7 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, int rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_xprt *r_xprt, void *r) + struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; int nsegs = seg->mr_nsegs, rc; @@ -1847,9 +1713,7 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, #if RPCRDMA_PERSISTENT_REGISTRATION case RPCRDMA_ALLPHYSICAL: - BUG_ON(nsegs != 1); rpcrdma_unmap_one(ia, seg); - rc = 0; break; #endif @@ -1861,21 +1725,9 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, rc = rpcrdma_deregister_fmr_external(seg, ia); break; - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r); - break; - default: - rc = rpcrdma_deregister_default_external(seg, ia); break; } - if (r) { - struct rpcrdma_rep *rep = r; - void (*func)(struct rpcrdma_rep *) = rep->rr_func; - rep->rr_func = NULL; - func(rep); /* dereg done, callback now */ - } return nsegs; } @@ -1950,7 +1802,6 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, ib_dma_sync_single_for_cpu(ia->ri_id->device, rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); - DECR_CQCOUNT(ep); rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); if (rc) diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index cc1445dc1d1..89e7cd47970 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -43,6 +43,7 @@ #include <linux/wait.h> /* wait_queue_head_t, etc */ #include <linux/spinlock.h> /* spinlock_t, etc */ #include <linux/atomic.h> /* atomic_t, etc */ +#include <linux/workqueue.h> /* struct work_struct */ #include <rdma/rdma_cm.h> /* RDMA connection api */ #include <rdma/ib_verbs.h> /* RDMA verbs api */ @@ -66,18 +67,21 @@ struct rpcrdma_ia { struct completion ri_done; int ri_async_rc; enum rpcrdma_memreg ri_memreg_strategy; + unsigned int ri_max_frmr_depth; }; /* * RDMA Endpoint -- one per transport instance */ +#define RPCRDMA_WC_BUDGET (128) +#define RPCRDMA_POLLSIZE (16) + struct rpcrdma_ep { atomic_t rep_cqcount; int rep_cqinit; int rep_connected; struct rpcrdma_ia *rep_ia; - struct ib_cq *rep_cq; struct ib_qp_init_attr rep_attr; wait_queue_head_t rep_connect_wait; struct ib_sge rep_pad; /* holds zeroed 
pad */ @@ -86,6 +90,9 @@ struct rpcrdma_ep { struct rpc_xprt *rep_xprt; /* for rep_func */ struct rdma_conn_param rep_remote_cma; struct sockaddr_storage rep_remote_addr; + struct delayed_work rep_connect_worker; + struct ib_wc rep_send_wcs[RPCRDMA_POLLSIZE]; + struct ib_wc rep_recv_wcs[RPCRDMA_POLLSIZE]; }; #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) @@ -124,7 +131,6 @@ struct rpcrdma_rep { struct rpc_xprt *rr_xprt; /* needed for request/reply matching */ void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */ struct list_head rr_list; /* tasklet list */ - wait_queue_head_t rr_unbind; /* optional unbind wait */ struct ib_sge rr_iov; /* for posting */ struct ib_mr *rr_handle; /* handle for mem in rr_iov */ char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */ @@ -159,7 +165,6 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ struct ib_mr *rl_mr; /* if registered directly */ struct rpcrdma_mw { /* if registered from region */ union { - struct ib_mw *mw; struct ib_fmr *fmr; struct { struct ib_fast_reg_page_list *fr_pgl; @@ -207,7 +212,6 @@ struct rpcrdma_req { struct rpcrdma_buffer { spinlock_t rb_lock; /* protects indexes */ atomic_t rb_credits; /* most recent server credits */ - unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */ int rb_max_requests;/* client max requests */ struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ int rb_send_index; @@ -300,7 +304,7 @@ void rpcrdma_ia_close(struct rpcrdma_ia *); */ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, struct rpcrdma_create_data_internal *); -int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); +void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); @@ -330,11 +334,12 @@ int rpcrdma_deregister_internal(struct rpcrdma_ia *, int rpcrdma_register_external(struct rpcrdma_mr_seg *, int, int, struct rpcrdma_xprt *); int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, - struct rpcrdma_xprt *, void *); + struct rpcrdma_xprt *); /* * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c */ +void rpcrdma_connect_worker(struct work_struct *); void rpcrdma_conn_func(struct rpcrdma_ep *); void rpcrdma_reply_handler(struct rpcrdma_rep *); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index dd9d295813c..be8bbd5d65e 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -254,9 +254,10 @@ struct sock_xprt { /* * Saved socket callback addresses */ - void (*old_data_ready)(struct sock *, int); + void (*old_data_ready)(struct sock *); void (*old_state_change)(struct sock *); void (*old_write_space)(struct sock *); + void (*old_error_report)(struct sock *); }; /* @@ -274,6 +275,11 @@ struct sock_xprt { */ #define TCP_RPC_REPLY (1UL << 6) +static inline struct rpc_xprt *xprt_from_sock(struct sock *sk) +{ + return (struct rpc_xprt *) sk->sk_user_data; +} + static inline struct sockaddr *xs_addr(struct rpc_xprt *xprt) { return (struct sockaddr *) &xprt->addr; @@ -504,6 +510,7 @@ static int xs_nospace(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + struct sock *sk = transport->inet; int ret = -EAGAIN; dprintk("RPC: %5u xmit incomplete (%u left of %u)\n", @@ -521,7 +528,7 @@ static int xs_nospace(struct rpc_task *task) * window size */ 
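The xprt_rdma.h hunk above adds RPCRDMA_WC_BUDGET, RPCRDMA_POLLSIZE and per-endpoint rep_send_wcs[]/rep_recv_wcs[] arrays, which suggests the send and receive completion queues are now drained in fixed-size batches up to a per-invocation budget rather than one work completion at a time. The polling loop itself is not part of this hunk, so the sketch below is only a guess at the shape of that pattern; it is a standalone userspace program and fake_poll() merely stands in for ib_poll_cq():

#include <stdio.h>

#define WC_BUDGET 128        /* mirrors RPCRDMA_WC_BUDGET, illustrative */
#define POLLSIZE   16        /* mirrors RPCRDMA_POLLSIZE, illustrative */

/* Pretend completion queue: hand out up to 'max' of the 'pending' completions. */
static int fake_poll(int *pending, int *wcs, int max)
{
        int n = (*pending < max) ? *pending : max;

        for (int i = 0; i < n; i++)
                wcs[i] = i;     /* stand-ins for struct ib_wc entries */
        *pending -= n;
        return n;
}

int main(void)
{
        int wcs[POLLSIZE];
        int pending = 300;      /* completions currently queued */
        int budget = WC_BUDGET;

        while (budget > 0) {
                int n = fake_poll(&pending, wcs, POLLSIZE);

                if (n == 0)
                        break;          /* queue drained */
                budget -= n;            /* handle the batch, charge the budget */
        }
        printf("handled %d completions, %d left for the next pass\n",
               WC_BUDGET - budget, pending);
        return 0;
}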
set_bit(SOCK_NOSPACE, &transport->sock->flags); - transport->inet->sk_write_pending++; + sk->sk_write_pending++; /* ...and wait for more buffer space */ xprt_wait_for_buffer_space(task, xs_nospace_callback); } @@ -531,6 +538,9 @@ static int xs_nospace(struct rpc_task *task) } spin_unlock_bh(&xprt->transport_lock); + + /* Race breaker in case memory is freed before above code is called */ + sk->sk_write_space(sk); return ret; } @@ -799,6 +809,7 @@ static void xs_save_old_callbacks(struct sock_xprt *transport, struct sock *sk) transport->old_data_ready = sk->sk_data_ready; transport->old_state_change = sk->sk_state_change; transport->old_write_space = sk->sk_write_space; + transport->old_error_report = sk->sk_error_report; } static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *sk) @@ -806,6 +817,34 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s sk->sk_data_ready = transport->old_data_ready; sk->sk_state_change = transport->old_state_change; sk->sk_write_space = transport->old_write_space; + sk->sk_error_report = transport->old_error_report; +} + +/** + * xs_error_report - callback to handle TCP socket state errors + * @sk: socket + * + * Note: we don't call sock_error() since there may be a rpc_task + * using the socket, and so we don't want to clear sk->sk_err. + */ +static void xs_error_report(struct sock *sk) +{ + struct rpc_xprt *xprt; + int err; + + read_lock_bh(&sk->sk_callback_lock); + if (!(xprt = xprt_from_sock(sk))) + goto out; + + err = -sk->sk_err; + if (err == 0) + goto out; + dprintk("RPC: xs_error_report client %p, error=%d...\n", + xprt, -err); + trace_rpc_socket_error(xprt, sk->sk_socket, err); + xprt_wake_pending_tasks(xprt, err); + out: + read_unlock_bh(&sk->sk_callback_lock); } static void xs_reset_transport(struct sock_xprt *transport) @@ -827,8 +866,6 @@ static void xs_reset_transport(struct sock_xprt *transport) xs_restore_old_callbacks(transport, sk); write_unlock_bh(&sk->sk_callback_lock); - sk->sk_no_check = 0; - trace_rpc_socket_close(&transport->xprt, sock); sock_release(sock); } @@ -854,11 +891,11 @@ static void xs_close(struct rpc_xprt *xprt) xs_reset_transport(transport); xprt->reestablish_timeout = 0; - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); xprt_disconnect_done(xprt); } @@ -870,6 +907,12 @@ static void xs_tcp_close(struct rpc_xprt *xprt) xs_tcp_shutdown(xprt); } +static void xs_xprt_free(struct rpc_xprt *xprt) +{ + xs_free_peer_addresses(xprt); + xprt_free(xprt); +} + /** * xs_destroy - prepare to shutdown a transport * @xprt: doomed transport @@ -880,16 +923,10 @@ static void xs_destroy(struct rpc_xprt *xprt) dprintk("RPC: xs_destroy xprt %p\n", xprt); xs_close(xprt); - xs_free_peer_addresses(xprt); - xprt_free(xprt); + xs_xprt_free(xprt); module_put(THIS_MODULE); } -static inline struct rpc_xprt *xprt_from_sock(struct sock *sk) -{ - return (struct rpc_xprt *) sk->sk_user_data; -} - static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) { struct xdr_skb_reader desc = { @@ -912,7 +949,7 @@ static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) * * Currently this assumes we can read the whole reply in a single gulp. 
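The xprtsock.c changes above extend the existing save/restore of socket callbacks to sk_error_report: xs_save_old_callbacks() remembers the socket's original handler, the connect paths install xs_error_report() in its place so pending transport tasks can be woken on a socket error, and xs_restore_old_callbacks() puts the original back when the transport lets go of the socket. A self-contained userspace sketch of that save/install/restore pattern; the struct and function names here are invented for illustration, not kernel API:

#include <stdio.h>

/* Stand-in for struct sock: one replaceable callback plus an error field. */
struct fake_sock {
        void (*error_report)(struct fake_sock *sk);
        int err;
};

/* Stand-in for struct sock_xprt: remembers the callback it displaced. */
struct fake_transport {
        void (*old_error_report)(struct fake_sock *sk);
};

static void default_error_report(struct fake_sock *sk)
{
        printf("socket default handler, err=%d\n", sk->err);
}

static void xprt_error_report(struct fake_sock *sk)
{
        printf("transport handler: wake pending tasks, err=%d\n", sk->err);
}

static void save_and_install(struct fake_transport *t, struct fake_sock *sk)
{
        t->old_error_report = sk->error_report;         /* save the original */
        sk->error_report = xprt_error_report;           /* install ours */
}

static void restore_callbacks(struct fake_transport *t, struct fake_sock *sk)
{
        sk->error_report = t->old_error_report;         /* hand the socket back */
}

int main(void)
{
        struct fake_sock sk = { .error_report = default_error_report, .err = 111 };
        struct fake_transport t;

        save_and_install(&t, &sk);
        sk.error_report(&sk);           /* transport handler runs */

        restore_callbacks(&t, &sk);
        sk.error_report(&sk);           /* original handler runs again */
        return 0;
}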
*/ -static void xs_local_data_ready(struct sock *sk, int len) +static void xs_local_data_ready(struct sock *sk) { struct rpc_task *task; struct rpc_xprt *xprt; @@ -975,7 +1012,7 @@ static void xs_local_data_ready(struct sock *sk, int len) * @len: how much data to read * */ -static void xs_udp_data_ready(struct sock *sk, int len) +static void xs_udp_data_ready(struct sock *sk) { struct rpc_task *task; struct rpc_xprt *xprt; @@ -1272,41 +1309,29 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt, * If we're unable to obtain the rpc_rqst we schedule the closing of the * connection and return -1. */ -static inline int xs_tcp_read_callback(struct rpc_xprt *xprt, +static int xs_tcp_read_callback(struct rpc_xprt *xprt, struct xdr_skb_reader *desc) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct rpc_rqst *req; - req = xprt_alloc_bc_request(xprt); + /* Look up and lock the request corresponding to the given XID */ + spin_lock(&xprt->transport_lock); + req = xprt_lookup_bc_request(xprt, transport->tcp_xid); if (req == NULL) { + spin_unlock(&xprt->transport_lock); printk(KERN_WARNING "Callback slot table overflowed\n"); xprt_force_disconnect(xprt); return -1; } - req->rq_xid = transport->tcp_xid; dprintk("RPC: read callback XID %08x\n", ntohl(req->rq_xid)); xs_tcp_read_common(xprt, desc, req); - if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) { - struct svc_serv *bc_serv = xprt->bc_serv; - - /* - * Add callback request to callback list. The callback - * service sleeps on the sv_cb_waitq waiting for new - * requests. Wake it up after adding enqueing the - * request. - */ - dprintk("RPC: add callback request to list\n"); - spin_lock(&bc_serv->sv_cb_lock); - list_add(&req->rq_bc_list, &bc_serv->sv_cb_list); - spin_unlock(&bc_serv->sv_cb_lock); - wake_up(&bc_serv->sv_cb_waitq); - } - - req->rq_private_buf.len = transport->tcp_copied; + if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) + xprt_complete_bc_request(req, transport->tcp_copied); + spin_unlock(&xprt->transport_lock); return 0; } @@ -1410,7 +1435,7 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns * @bytes: how much data to read * */ -static void xs_tcp_data_ready(struct sock *sk, int bytes) +static void xs_tcp_data_ready(struct sock *sk) { struct rpc_xprt *xprt; read_descriptor_t rd_desc; @@ -1470,12 +1495,12 @@ static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt) static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static void xs_sock_mark_closed(struct rpc_xprt *xprt) @@ -1529,10 +1554,10 @@ static void xs_tcp_state_change(struct sock *sk) xprt->connect_cookie++; xprt->reestablish_timeout = 0; set_bit(XPRT_CLOSING, &xprt->state); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_CONNECTED, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout); break; case TCP_CLOSE_WAIT: @@ -1551,9 +1576,9 @@ static void xs_tcp_state_change(struct sock *sk) case TCP_LAST_ACK: set_bit(XPRT_CLOSING, &xprt->state); xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); 
clear_bit(XPRT_CONNECTED, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); break; case TCP_CLOSE: xs_tcp_cancel_linger_timeout(xprt); @@ -1674,7 +1699,7 @@ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task) static unsigned short xs_get_random_port(void) { unsigned short range = xprt_max_resvport - xprt_min_resvport; - unsigned short rand = (unsigned short) net_random() % range; + unsigned short rand = (unsigned short) prandom_u32() % range; return rand + xprt_min_resvport; } @@ -1869,6 +1894,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, sk->sk_user_data = xprt; sk->sk_data_ready = xs_local_data_ready; sk->sk_write_space = xs_udp_write_space; + sk->sk_error_report = xs_error_report; sk->sk_allocation = GFP_ATOMIC; xprt_clear_connected(xprt); @@ -2018,7 +2044,6 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_user_data = xprt; sk->sk_data_ready = xs_udp_data_ready; sk->sk_write_space = xs_udp_write_space; - sk->sk_no_check = UDP_CSUM_NORCV; sk->sk_allocation = GFP_ATOMIC; xprt_set_connected(xprt); @@ -2146,6 +2171,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_data_ready = xs_tcp_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; + sk->sk_error_report = xs_error_report; sk->sk_allocation = GFP_ATOMIC; /* socket options */ @@ -2508,6 +2534,10 @@ static void bc_close(struct rpc_xprt *xprt) static void bc_destroy(struct rpc_xprt *xprt) { + dprintk("RPC: bc_destroy xprt %p\n", xprt); + + xs_xprt_free(xprt); + module_put(THIS_MODULE); } static struct rpc_xprt_ops xs_local_ops = { @@ -2708,7 +2738,7 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args) return xprt; ret = ERR_PTR(-EINVAL); out_err: - xprt_free(xprt); + xs_xprt_free(xprt); return ret; } @@ -2786,7 +2816,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args) return xprt; ret = ERR_PTR(-EINVAL); out_err: - xprt_free(xprt); + xs_xprt_free(xprt); return ret; } @@ -2861,12 +2891,11 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PROTO]); - if (try_module_get(THIS_MODULE)) return xprt; ret = ERR_PTR(-EINVAL); out_err: - xprt_free(xprt); + xs_xprt_free(xprt); return ret; } @@ -2883,15 +2912,6 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args) struct svc_sock *bc_sock; struct rpc_xprt *ret; - if (args->bc_xprt->xpt_bc_xprt) { - /* - * This server connection already has a backchannel - * transport; we can't create a new one, as we wouldn't - * be able to match replies based on xid any more. So, - * reuse the already-existing one: - */ - return args->bc_xprt->xpt_bc_xprt; - } xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries, xprt_tcp_slot_table_entries); if (IS_ERR(xprt)) @@ -2932,10 +2952,9 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args) /* * Once we've associated a backchannel xprt with a connection, - * we want to keep it around as long as long as the connection - * lasts, in case we need to start using it for a backchannel - * again; this reference won't be dropped until bc_xprt is - * destroyed. + * we want to keep it around as long as the connection lasts, + * in case we need to start using it for a backchannel again; + * this reference won't be dropped until bc_xprt is destroyed. 
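The smp_mb__before_clear_bit()/smp_mb__after_clear_bit() calls in the hunks above become smp_mb__before_atomic()/smp_mb__after_atomic(); the macros place full memory barriers around bit operations such as set_bit() and clear_bit(), which by themselves do not order surrounding memory accesses. As a rough userspace analogy only (the kernel macros expand to architecture-specific barriers, not C11 fences), the same shape looks like this:

#include <stdatomic.h>
#include <stdio.h>

/* Loosely mirrors the XPRT_* state bits manipulated in the hunks above. */
#define XPRT_CLOSING    (1u << 0)
#define XPRT_CONNECTED  (1u << 1)

static atomic_uint state = XPRT_CONNECTED;

int main(void)
{
        /* set_bit(XPRT_CLOSING): the bit op itself is unordered ("relaxed") */
        atomic_fetch_or_explicit(&state, XPRT_CLOSING, memory_order_relaxed);

        /* smp_mb__before_atomic(): order everything above against what follows */
        atomic_thread_fence(memory_order_seq_cst);

        /* clear_bit(XPRT_CONNECTED) */
        atomic_fetch_and_explicit(&state, ~XPRT_CONNECTED, memory_order_relaxed);

        /* smp_mb__after_atomic() */
        atomic_thread_fence(memory_order_seq_cst);

        printf("state = 0x%x\n", atomic_load(&state));
        return 0;
}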
*/ xprt_get(xprt); args->bc_xprt->xpt_bc_xprt = xprt; @@ -2950,13 +2969,14 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args) */ xprt_set_connected(xprt); - if (try_module_get(THIS_MODULE)) return xprt; + + args->bc_xprt->xpt_bc_xprt = NULL; xprt_put(xprt); ret = ERR_PTR(-EINVAL); out_err: - xprt_free(xprt); + xs_xprt_free(xprt); return ret; } diff --git a/net/tipc/Makefile b/net/tipc/Makefile index b282f7130d2..a080c66d819 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_TIPC) := tipc.o tipc-y += addr.o bcast.o bearer.o config.o \ - core.o handler.o link.o discover.o msg.o \ + core.o link.o discover.o msg.o \ name_distr.o subscr.o name_table.o net.o \ netlink.o node.o node_subscr.o port.o ref.o \ socket.o log.o eth_media.o server.o diff --git a/net/tipc/addr.h b/net/tipc/addr.h index 60b00ab93d7..a74acf9ee80 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -37,6 +37,8 @@ #ifndef _TIPC_ADDR_H #define _TIPC_ADDR_H +#include "core.h" + #define TIPC_ZONE_MASK 0xff000000u #define TIPC_CLUSTER_MASK 0xfffff000u diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 4c2a80b3c01..55c6c9d3e1c 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -41,9 +41,9 @@ #include "bcast.h" #include "name_distr.h" -#define MAX_PKT_DEFAULT_MCAST 1500 /* bcast link max packet size (fixed) */ - -#define BCLINK_WIN_DEFAULT 20 /* bcast link window size (default) */ +#define MAX_PKT_DEFAULT_MCAST 1500 /* bcast link max packet size (fixed) */ +#define BCLINK_WIN_DEFAULT 20 /* bcast link window size (default) */ +#define BCBEARER MAX_BEARERS /** * struct tipc_bcbearer_pair - a pair of bearers used by broadcast link @@ -71,7 +71,7 @@ struct tipc_bcbearer_pair { * Note: The fields labelled "temporary" are incorporated into the bearer * to avoid consuming potentially limited stack space through the use of * large local variables within multicast routines. Concurrent access is - * prevented through use of the spinlock "bc_lock". + * prevented through use of the spinlock "bclink_lock". */ struct tipc_bcbearer { struct tipc_bearer bearer; @@ -84,34 +84,64 @@ struct tipc_bcbearer { /** * struct tipc_bclink - link used for broadcast messages + * @lock: spinlock governing access to structure * @link: (non-standard) broadcast link structure * @node: (non-standard) node structure representing b'cast link's peer node + * @flags: represent bclink states * @bcast_nodes: map of broadcast-capable nodes * @retransmit_to: node that most recently requested a retransmit * * Handles sequence numbering, fragmentation, bundling, etc. 
*/ struct tipc_bclink { + spinlock_t lock; struct tipc_link link; struct tipc_node node; + unsigned int flags; struct tipc_node_map bcast_nodes; struct tipc_node *retransmit_to; }; -static struct tipc_bcbearer bcast_bearer; -static struct tipc_bclink bcast_link; - -static struct tipc_bcbearer *bcbearer = &bcast_bearer; -static struct tipc_bclink *bclink = &bcast_link; -static struct tipc_link *bcl = &bcast_link.link; - -static DEFINE_SPINLOCK(bc_lock); +static struct tipc_bcbearer *bcbearer; +static struct tipc_bclink *bclink; +static struct tipc_link *bcl; const char tipc_bclink_name[] = "broadcast-link"; static void tipc_nmap_diff(struct tipc_node_map *nm_a, struct tipc_node_map *nm_b, struct tipc_node_map *nm_diff); +static void tipc_nmap_add(struct tipc_node_map *nm_ptr, u32 node); +static void tipc_nmap_remove(struct tipc_node_map *nm_ptr, u32 node); + +static void tipc_bclink_lock(void) +{ + spin_lock_bh(&bclink->lock); +} + +static void tipc_bclink_unlock(void) +{ + struct tipc_node *node = NULL; + + if (likely(!bclink->flags)) { + spin_unlock_bh(&bclink->lock); + return; + } + + if (bclink->flags & TIPC_BCLINK_RESET) { + bclink->flags &= ~TIPC_BCLINK_RESET; + node = tipc_bclink_retransmit_to(); + } + spin_unlock_bh(&bclink->lock); + + if (node) + tipc_link_reset_all(node); +} + +void tipc_bclink_set_flags(unsigned int flags) +{ + bclink->flags |= flags; +} static u32 bcbuf_acks(struct sk_buff *buf) { @@ -130,16 +160,16 @@ static void bcbuf_decr_acks(struct sk_buff *buf) void tipc_bclink_add_node(u32 addr) { - spin_lock_bh(&bc_lock); + tipc_bclink_lock(); tipc_nmap_add(&bclink->bcast_nodes, addr); - spin_unlock_bh(&bc_lock); + tipc_bclink_unlock(); } void tipc_bclink_remove_node(u32 addr) { - spin_lock_bh(&bc_lock); + tipc_bclink_lock(); tipc_nmap_remove(&bclink->bcast_nodes, addr); - spin_unlock_bh(&bc_lock); + tipc_bclink_unlock(); } static void bclink_set_last_sent(void) @@ -165,7 +195,7 @@ static void bclink_update_last_sent(struct tipc_node *node, u32 seqno) /** * tipc_bclink_retransmit_to - get most recent node to request retransmission * - * Called with bc_lock locked + * Called with bclink_lock locked */ struct tipc_node *tipc_bclink_retransmit_to(void) { @@ -177,7 +207,7 @@ struct tipc_node *tipc_bclink_retransmit_to(void) * @after: sequence number of last packet to *not* retransmit * @to: sequence number of last packet to retransmit * - * Called with bc_lock locked + * Called with bclink_lock locked */ static void bclink_retransmit_pkt(u32 after, u32 to) { @@ -194,7 +224,7 @@ static void bclink_retransmit_pkt(u32 after, u32 to) * @n_ptr: node that sent acknowledgement info * @acked: broadcast sequence # that has been acknowledged * - * Node is locked, bc_lock unlocked. + * Node is locked, bclink_lock unlocked. 
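In the bcast.c hunk above the file-scope bc_lock becomes a spinlock embedded in struct tipc_bclink, wrapped by tipc_bclink_lock()/tipc_bclink_unlock(). The unlock helper checks the new flags word: when TIPC_BCLINK_RESET is set it clears the flag and notes the node to act on while still holding the lock, then calls tipc_link_reset_all() only after the lock has been released, presumably so the reset path never runs with the broadcast lock held. A small pthread-based sketch of that "record the work under the lock, do it after unlocking" pattern; the names and the integer node stand-in are illustrative:

#include <pthread.h>
#include <stdio.h>

#define BCLINK_RESET 1          /* mirrors TIPC_BCLINK_RESET, illustrative */

static pthread_mutex_t bclink_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int bclink_flags;
static int retransmit_to = 42;  /* stand-in for the node pointer */

static void link_reset_all(int node)
{
        /* takes other locks in the real code, so it must not run under bclink_lock */
        printf("resetting all links to node %d\n", node);
}

static void demo_bclink_lock(void)
{
        pthread_mutex_lock(&bclink_lock);
}

static void demo_bclink_unlock(void)
{
        int node = -1;

        if (bclink_flags & BCLINK_RESET) {
                bclink_flags &= ~BCLINK_RESET;
                node = retransmit_to;           /* remember the deferred work */
        }
        pthread_mutex_unlock(&bclink_lock);

        if (node >= 0)
                link_reset_all(node);           /* ...and do it after dropping the lock */
}

int main(void)
{
        demo_bclink_lock();
        bclink_flags |= BCLINK_RESET;           /* something went wrong under the lock */
        demo_bclink_unlock();
        return 0;
}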
*/ void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked) { @@ -202,8 +232,7 @@ void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked) struct sk_buff *next; unsigned int released = 0; - spin_lock_bh(&bc_lock); - + tipc_bclink_lock(); /* Bail out if tx queue is empty (no clean up is required) */ crs = bcl->first_out; if (!crs) @@ -267,13 +296,13 @@ void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked) if (unlikely(released && !list_empty(&bcl->waiting_ports))) tipc_link_wakeup_ports(bcl, 0); exit: - spin_unlock_bh(&bc_lock); + tipc_bclink_unlock(); } /** * tipc_bclink_update_link_state - update broadcast link state * - * tipc_net_lock and node lock set + * RCU and node lock set */ void tipc_bclink_update_link_state(struct tipc_node *n_ptr, u32 last_sent) { @@ -320,10 +349,10 @@ void tipc_bclink_update_link_state(struct tipc_node *n_ptr, u32 last_sent) ? buf_seqno(n_ptr->bclink.deferred_head) - 1 : n_ptr->bclink.last_sent); - spin_lock_bh(&bc_lock); - tipc_bearer_send(&bcbearer->bearer, buf, NULL); + tipc_bclink_lock(); + tipc_bearer_send(MAX_BEARERS, buf, NULL); bcl->stats.sent_nacks++; - spin_unlock_bh(&bc_lock); + tipc_bclink_unlock(); kfree_skb(buf); n_ptr->bclink.oos_state++; @@ -335,8 +364,6 @@ void tipc_bclink_update_link_state(struct tipc_node *n_ptr, u32 last_sent) * * Delay any upcoming NACK by this node if another node has already * requested the first message this node is going to ask for. - * - * Only tipc_net_lock set. */ static void bclink_peek_nack(struct tipc_msg *msg) { @@ -356,13 +383,13 @@ static void bclink_peek_nack(struct tipc_msg *msg) } /* - * tipc_bclink_send_msg - broadcast a packet to all nodes in cluster + * tipc_bclink_xmit - broadcast a packet to all nodes in cluster */ -int tipc_bclink_send_msg(struct sk_buff *buf) +int tipc_bclink_xmit(struct sk_buff *buf) { int res; - spin_lock_bh(&bc_lock); + tipc_bclink_lock(); if (!bclink->bcast_nodes.count) { res = msg_data_sz(buf_msg(buf)); @@ -370,21 +397,21 @@ int tipc_bclink_send_msg(struct sk_buff *buf) goto exit; } - res = tipc_link_send_buf(bcl, buf); + res = __tipc_link_xmit(bcl, buf); if (likely(res >= 0)) { bclink_set_last_sent(); bcl->stats.queue_sz_counts++; bcl->stats.accu_queue_sz += bcl->out_queue_size; } exit: - spin_unlock_bh(&bc_lock); + tipc_bclink_unlock(); return res; } /** * bclink_accept_pkt - accept an incoming, in-sequence broadcast packet * - * Called with both sending node's lock and bc_lock taken. + * Called with both sending node's lock and bclink_lock taken. 
*/ static void bclink_accept_pkt(struct tipc_node *node, u32 seqno) { @@ -399,19 +426,18 @@ static void bclink_accept_pkt(struct tipc_node *node, u32 seqno) */ if (((seqno - tipc_own_addr) % TIPC_MIN_LINK_WIN) == 0) { - tipc_link_send_proto_msg( - node->active_links[node->addr & 1], - STATE_MSG, 0, 0, 0, 0, 0); + tipc_link_proto_xmit(node->active_links[node->addr & 1], + STATE_MSG, 0, 0, 0, 0, 0); bcl->stats.sent_acks++; } } /** - * tipc_bclink_recv_pkt - receive a broadcast packet, and deliver upwards + * tipc_bclink_rcv - receive a broadcast packet, and deliver upwards * - * tipc_net_lock is read_locked, no other locks set + * RCU is locked, no other locks set */ -void tipc_bclink_recv_pkt(struct sk_buff *buf) +void tipc_bclink_rcv(struct sk_buff *buf) { struct tipc_msg *msg = buf_msg(buf); struct tipc_node *node; @@ -440,12 +466,12 @@ void tipc_bclink_recv_pkt(struct sk_buff *buf) if (msg_destnode(msg) == tipc_own_addr) { tipc_bclink_acknowledge(node, msg_bcast_ack(msg)); tipc_node_unlock(node); - spin_lock_bh(&bc_lock); + tipc_bclink_lock(); bcl->stats.recv_nacks++; bclink->retransmit_to = node; bclink_retransmit_pkt(msg_bcgap_after(msg), msg_bcgap_to(msg)); - spin_unlock_bh(&bc_lock); + tipc_bclink_unlock(); } else { tipc_node_unlock(node); bclink_peek_nack(msg); @@ -463,51 +489,47 @@ receive: /* Deliver message to destination */ if (likely(msg_isdata(msg))) { - spin_lock_bh(&bc_lock); + tipc_bclink_lock(); bclink_accept_pkt(node, seqno); - spin_unlock_bh(&bc_lock); + tipc_bclink_unlock(); tipc_node_unlock(node); if (likely(msg_mcast(msg))) - tipc_port_recv_mcast(buf, NULL); + tipc_port_mcast_rcv(buf, NULL); else kfree_skb(buf); } else if (msg_user(msg) == MSG_BUNDLER) { - spin_lock_bh(&bc_lock); + tipc_bclink_lock(); bclink_accept_pkt(node, seqno); bcl->stats.recv_bundles++; bcl->stats.recv_bundled += msg_msgcnt(msg); - spin_unlock_bh(&bc_lock); + tipc_bclink_unlock(); tipc_node_unlock(node); - tipc_link_recv_bundle(buf); + tipc_link_bundle_rcv(buf); } else if (msg_user(msg) == MSG_FRAGMENTER) { - int ret; - ret = tipc_link_recv_fragment(&node->bclink.reasm_head, - &node->bclink.reasm_tail, - &buf); - if (ret == LINK_REASM_ERROR) + tipc_buf_append(&node->bclink.reasm_buf, &buf); + if (unlikely(!buf && !node->bclink.reasm_buf)) goto unlock; - spin_lock_bh(&bc_lock); + tipc_bclink_lock(); bclink_accept_pkt(node, seqno); bcl->stats.recv_fragments++; - if (ret == LINK_REASM_COMPLETE) { + if (buf) { bcl->stats.recv_fragmented++; - /* Point msg to inner header */ msg = buf_msg(buf); - spin_unlock_bh(&bc_lock); + tipc_bclink_unlock(); goto receive; } - spin_unlock_bh(&bc_lock); + tipc_bclink_unlock(); tipc_node_unlock(node); } else if (msg_user(msg) == NAME_DISTRIBUTOR) { - spin_lock_bh(&bc_lock); + tipc_bclink_lock(); bclink_accept_pkt(node, seqno); - spin_unlock_bh(&bc_lock); + tipc_bclink_unlock(); tipc_node_unlock(node); - tipc_named_recv(buf); + tipc_named_rcv(buf); } else { - spin_lock_bh(&bc_lock); + tipc_bclink_lock(); bclink_accept_pkt(node, seqno); - spin_unlock_bh(&bc_lock); + tipc_bclink_unlock(); tipc_node_unlock(node); kfree_skb(buf); } @@ -537,6 +559,7 @@ receive: buf = node->bclink.deferred_head; node->bclink.deferred_head = buf->next; + buf->next = NULL; node->bclink.deferred_size--; goto receive; } @@ -553,14 +576,14 @@ receive: } else deferred = 0; - spin_lock_bh(&bc_lock); + tipc_bclink_lock(); if (deferred) bcl->stats.deferred_recv++; else bcl->stats.duplicates++; - spin_unlock_bh(&bc_lock); + tipc_bclink_unlock(); unlock: tipc_node_unlock(node); @@ -628,13 +651,13 @@ 
static int tipc_bcbearer_send(struct sk_buff *buf, struct tipc_bearer *unused1, if (bp_index == 0) { /* Use original buffer for first bearer */ - tipc_bearer_send(b, buf, &b->bcast_addr); + tipc_bearer_send(b->identity, buf, &b->bcast_addr); } else { /* Avoid concurrent buffer access */ - tbuf = pskb_copy(buf, GFP_ATOMIC); + tbuf = pskb_copy_for_clone(buf, GFP_ATOMIC); if (!tbuf) break; - tipc_bearer_send(b, tbuf, &b->bcast_addr); + tipc_bearer_send(b->identity, tbuf, &b->bcast_addr); kfree_skb(tbuf); /* Bearer keeps a clone */ } @@ -656,22 +679,28 @@ static int tipc_bcbearer_send(struct sk_buff *buf, struct tipc_bearer *unused1, /** * tipc_bcbearer_sort - create sets of bearer pairs used by broadcast bearer */ -void tipc_bcbearer_sort(void) +void tipc_bcbearer_sort(struct tipc_node_map *nm_ptr, u32 node, bool action) { struct tipc_bcbearer_pair *bp_temp = bcbearer->bpairs_temp; struct tipc_bcbearer_pair *bp_curr; + struct tipc_bearer *b; int b_index; int pri; - spin_lock_bh(&bc_lock); + tipc_bclink_lock(); + + if (action) + tipc_nmap_add(nm_ptr, node); + else + tipc_nmap_remove(nm_ptr, node); /* Group bearers by priority (can assume max of two per priority) */ memset(bp_temp, 0, sizeof(bcbearer->bpairs_temp)); + rcu_read_lock(); for (b_index = 0; b_index < MAX_BEARERS; b_index++) { - struct tipc_bearer *b = &tipc_bearers[b_index]; - - if (!b->active || !b->nodes.count) + b = rcu_dereference_rtnl(bearer_list[b_index]); + if (!b || !b->nodes.count) continue; if (!bp_temp[b->priority].primary) @@ -679,6 +708,7 @@ void tipc_bcbearer_sort(void) else bp_temp[b->priority].secondary = b; } + rcu_read_unlock(); /* Create array of bearer pairs for broadcasting */ bp_curr = bcbearer->bpairs; @@ -704,7 +734,7 @@ void tipc_bcbearer_sort(void) bp_curr++; } - spin_unlock_bh(&bc_lock); + tipc_bclink_unlock(); } @@ -716,7 +746,7 @@ int tipc_bclink_stats(char *buf, const u32 buf_size) if (!bcl) return 0; - spin_lock_bh(&bc_lock); + tipc_bclink_lock(); s = &bcl->stats; @@ -745,7 +775,7 @@ int tipc_bclink_stats(char *buf, const u32 buf_size) s->queue_sz_counts ? 
(s->accu_queue_sz / s->queue_sz_counts) : 0); - spin_unlock_bh(&bc_lock); + tipc_bclink_unlock(); return ret; } @@ -754,9 +784,9 @@ int tipc_bclink_reset_stats(void) if (!bcl) return -ENOPROTOOPT; - spin_lock_bh(&bc_lock); + tipc_bclink_lock(); memset(&bcl->stats, 0, sizeof(bcl->stats)); - spin_unlock_bh(&bc_lock); + tipc_bclink_unlock(); return 0; } @@ -767,45 +797,59 @@ int tipc_bclink_set_queue_limits(u32 limit) if ((limit < TIPC_MIN_LINK_WIN) || (limit > TIPC_MAX_LINK_WIN)) return -EINVAL; - spin_lock_bh(&bc_lock); + tipc_bclink_lock(); tipc_link_set_queue_limits(bcl, limit); - spin_unlock_bh(&bc_lock); + tipc_bclink_unlock(); return 0; } -void tipc_bclink_init(void) +int tipc_bclink_init(void) { + bcbearer = kzalloc(sizeof(*bcbearer), GFP_ATOMIC); + if (!bcbearer) + return -ENOMEM; + + bclink = kzalloc(sizeof(*bclink), GFP_ATOMIC); + if (!bclink) { + kfree(bcbearer); + return -ENOMEM; + } + + bcl = &bclink->link; bcbearer->bearer.media = &bcbearer->media; bcbearer->media.send_msg = tipc_bcbearer_send; sprintf(bcbearer->media.name, "tipc-broadcast"); + spin_lock_init(&bclink->lock); INIT_LIST_HEAD(&bcl->waiting_ports); bcl->next_out_no = 1; spin_lock_init(&bclink->node.lock); bcl->owner = &bclink->node; bcl->max_pkt = MAX_PKT_DEFAULT_MCAST; tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT); - spin_lock_init(&bcbearer->bearer.lock); - bcl->b_ptr = &bcbearer->bearer; + bcl->bearer_id = MAX_BEARERS; + rcu_assign_pointer(bearer_list[MAX_BEARERS], &bcbearer->bearer); bcl->state = WORKING_WORKING; strlcpy(bcl->name, tipc_bclink_name, TIPC_MAX_LINK_NAME); + return 0; } void tipc_bclink_stop(void) { - spin_lock_bh(&bc_lock); - tipc_link_stop(bcl); - spin_unlock_bh(&bc_lock); - - memset(bclink, 0, sizeof(*bclink)); - memset(bcbearer, 0, sizeof(*bcbearer)); + tipc_bclink_lock(); + tipc_link_purge_queues(bcl); + tipc_bclink_unlock(); + + RCU_INIT_POINTER(bearer_list[BCBEARER], NULL); + synchronize_net(); + kfree(bcbearer); + kfree(bclink); } - /** * tipc_nmap_add - add a node to a node map */ -void tipc_nmap_add(struct tipc_node_map *nm_ptr, u32 node) +static void tipc_nmap_add(struct tipc_node_map *nm_ptr, u32 node) { int n = tipc_node(node); int w = n / WSIZE; @@ -820,7 +864,7 @@ void tipc_nmap_add(struct tipc_node_map *nm_ptr, u32 node) /** * tipc_nmap_remove - remove a node from a node map */ -void tipc_nmap_remove(struct tipc_node_map *nm_ptr, u32 node) +static void tipc_nmap_remove(struct tipc_node_map *nm_ptr, u32 node) { int n = tipc_node(node); int w = n / WSIZE; diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 6ee587b469f..00330c45df3 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -39,6 +39,7 @@ #define MAX_NODES 4096 #define WSIZE 32 +#define TIPC_BCLINK_RESET 1 /** * struct tipc_node_map - set of node identifiers @@ -69,9 +70,6 @@ struct tipc_node; extern const char tipc_bclink_name[]; -void tipc_nmap_add(struct tipc_node_map *nm_ptr, u32 node); -void tipc_nmap_remove(struct tipc_node_map *nm_ptr, u32 node); - /** * tipc_nmap_equal - test for equality of node maps */ @@ -84,20 +82,21 @@ static inline int tipc_nmap_equal(struct tipc_node_map *nm_a, void tipc_port_list_add(struct tipc_port_list *pl_ptr, u32 port); void tipc_port_list_free(struct tipc_port_list *pl_ptr); -void tipc_bclink_init(void); +int tipc_bclink_init(void); void tipc_bclink_stop(void); +void tipc_bclink_set_flags(unsigned int flags); void tipc_bclink_add_node(u32 addr); void tipc_bclink_remove_node(u32 addr); struct tipc_node *tipc_bclink_retransmit_to(void); void tipc_bclink_acknowledge(struct 
tipc_node *n_ptr, u32 acked); -int tipc_bclink_send_msg(struct sk_buff *buf); -void tipc_bclink_recv_pkt(struct sk_buff *buf); +int tipc_bclink_xmit(struct sk_buff *buf); +void tipc_bclink_rcv(struct sk_buff *buf); u32 tipc_bclink_get_last_sent(void); u32 tipc_bclink_acks_missing(struct tipc_node *n_ptr); void tipc_bclink_update_link_state(struct tipc_node *n_ptr, u32 last_sent); int tipc_bclink_stats(char *stats_buf, const u32 buf_size); int tipc_bclink_reset_stats(void); int tipc_bclink_set_queue_limits(u32 limit); -void tipc_bcbearer_sort(void); +void tipc_bcbearer_sort(struct tipc_node_map *nm_ptr, u32 node, bool action); #endif diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index c2101c0bfd6..264474394f9 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -1,8 +1,8 @@ /* * net/tipc/bearer.c: TIPC bearer code * - * Copyright (c) 1996-2006, Ericsson AB - * Copyright (c) 2004-2006, 2010-2011, Wind River Systems + * Copyright (c) 1996-2006, 2013, Ericsson AB + * Copyright (c) 2004-2006, 2010-2013, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -41,12 +41,17 @@ #define MAX_ADDR_STR 60 -static struct tipc_media *media_list[MAX_MEDIA]; -static u32 media_count; +static struct tipc_media * const media_info_array[] = { + &eth_media_info, +#ifdef CONFIG_TIPC_MEDIA_IB + &ib_media_info, +#endif + NULL +}; -struct tipc_bearer tipc_bearers[MAX_BEARERS]; +struct tipc_bearer __rcu *bearer_list[MAX_BEARERS + 1]; -static void bearer_disable(struct tipc_bearer *b_ptr); +static void bearer_disable(struct tipc_bearer *b_ptr, bool shutting_down); /** * tipc_media_find - locates specified media object by name @@ -55,11 +60,11 @@ struct tipc_media *tipc_media_find(const char *name) { u32 i; - for (i = 0; i < media_count; i++) { - if (!strcmp(media_list[i]->name, name)) - return media_list[i]; + for (i = 0; media_info_array[i] != NULL; i++) { + if (!strcmp(media_info_array[i]->name, name)) + break; } - return NULL; + return media_info_array[i]; } /** @@ -69,44 +74,11 @@ static struct tipc_media *media_find_id(u8 type) { u32 i; - for (i = 0; i < media_count; i++) { - if (media_list[i]->type_id == type) - return media_list[i]; + for (i = 0; media_info_array[i] != NULL; i++) { + if (media_info_array[i]->type_id == type) + break; } - return NULL; -} - -/** - * tipc_register_media - register a media type - * - * Bearers for this media type must be activated separately at a later stage.
- */ -int tipc_register_media(struct tipc_media *m_ptr) -{ - int res = -EINVAL; - - write_lock_bh(&tipc_net_lock); - - if ((strlen(m_ptr->name) + 1) > TIPC_MAX_MEDIA_NAME) - goto exit; - if (m_ptr->priority > TIPC_MAX_LINK_PRI) - goto exit; - if ((m_ptr->tolerance < TIPC_MIN_LINK_TOL) || - (m_ptr->tolerance > TIPC_MAX_LINK_TOL)) - goto exit; - if (media_count >= MAX_MEDIA) - goto exit; - if (tipc_media_find(m_ptr->name) || media_find_id(m_ptr->type_id)) - goto exit; - - media_list[media_count] = m_ptr; - media_count++; - res = 0; -exit: - write_unlock_bh(&tipc_net_lock); - if (res) - pr_warn("Media <%s> registration error\n", m_ptr->name); - return res; + return media_info_array[i]; } /** @@ -144,13 +116,11 @@ struct sk_buff *tipc_media_get_names(void) if (!buf) return NULL; - read_lock_bh(&tipc_net_lock); - for (i = 0; i < media_count; i++) { + for (i = 0; media_info_array[i] != NULL; i++) { tipc_cfg_append_tlv(buf, TIPC_TLV_MEDIA_NAME, - media_list[i]->name, - strlen(media_list[i]->name) + 1); + media_info_array[i]->name, + strlen(media_info_array[i]->name) + 1); } - read_unlock_bh(&tipc_net_lock); return buf; } @@ -207,27 +177,9 @@ struct tipc_bearer *tipc_bearer_find(const char *name) struct tipc_bearer *b_ptr; u32 i; - for (i = 0, b_ptr = tipc_bearers; i < MAX_BEARERS; i++, b_ptr++) { - if (b_ptr->active && (!strcmp(b_ptr->name, name))) - return b_ptr; - } - return NULL; -} - -/** - * tipc_bearer_find_interface - locates bearer object with matching interface name - */ -struct tipc_bearer *tipc_bearer_find_interface(const char *if_name) -{ - struct tipc_bearer *b_ptr; - char *b_if_name; - u32 i; - - for (i = 0, b_ptr = tipc_bearers; i < MAX_BEARERS; i++, b_ptr++) { - if (!b_ptr->active) - continue; - b_if_name = strchr(b_ptr->name, ':') + 1; - if (!strcmp(b_if_name, if_name)) + for (i = 0; i < MAX_BEARERS; i++) { + b_ptr = rtnl_dereference(bearer_list[i]); + if (b_ptr && (!strcmp(b_ptr->name, name))) return b_ptr; } return NULL; @@ -239,40 +191,52 @@ struct tipc_bearer *tipc_bearer_find_interface(const char *if_name) struct sk_buff *tipc_bearer_get_names(void) { struct sk_buff *buf; - struct tipc_bearer *b_ptr; + struct tipc_bearer *b; int i, j; buf = tipc_cfg_reply_alloc(MAX_BEARERS * TLV_SPACE(TIPC_MAX_BEARER_NAME)); if (!buf) return NULL; - read_lock_bh(&tipc_net_lock); - for (i = 0; i < media_count; i++) { + for (i = 0; media_info_array[i] != NULL; i++) { for (j = 0; j < MAX_BEARERS; j++) { - b_ptr = &tipc_bearers[j]; - if (b_ptr->active && (b_ptr->media == media_list[i])) { + b = rtnl_dereference(bearer_list[j]); + if (!b) + continue; + if (b->media == media_info_array[i]) { tipc_cfg_append_tlv(buf, TIPC_TLV_BEARER_NAME, - b_ptr->name, - strlen(b_ptr->name) + 1); + b->name, + strlen(b->name) + 1); } } } - read_unlock_bh(&tipc_net_lock); return buf; } -void tipc_bearer_add_dest(struct tipc_bearer *b_ptr, u32 dest) +void tipc_bearer_add_dest(u32 bearer_id, u32 dest) { - tipc_nmap_add(&b_ptr->nodes, dest); - tipc_bcbearer_sort(); - tipc_disc_add_dest(b_ptr->link_req); + struct tipc_bearer *b_ptr; + + rcu_read_lock(); + b_ptr = rcu_dereference_rtnl(bearer_list[bearer_id]); + if (b_ptr) { + tipc_bcbearer_sort(&b_ptr->nodes, dest, true); + tipc_disc_add_dest(b_ptr->link_req); + } + rcu_read_unlock(); } -void tipc_bearer_remove_dest(struct tipc_bearer *b_ptr, u32 dest) +void tipc_bearer_remove_dest(u32 bearer_id, u32 dest) { - tipc_nmap_remove(&b_ptr->nodes, dest); - tipc_bcbearer_sort(); - tipc_disc_remove_dest(b_ptr->link_req); + struct tipc_bearer *b_ptr; + + rcu_read_lock(); + b_ptr 
= rcu_dereference_rtnl(bearer_list[bearer_id]); + if (b_ptr) { + tipc_bcbearer_sort(&b_ptr->nodes, dest, false); + tipc_disc_remove_dest(b_ptr->link_req); + } + rcu_read_unlock(); } /** @@ -317,13 +281,11 @@ int tipc_enable_bearer(const char *name, u32 disc_domain, u32 priority) return -EINVAL; } - write_lock_bh(&tipc_net_lock); - m_ptr = tipc_media_find(b_names.media_name); if (!m_ptr) { pr_warn("Bearer <%s> rejected, media <%s> not registered\n", name, b_names.media_name); - goto exit; + return -EINVAL; } if (priority == TIPC_MEDIA_LINK_PRI) @@ -333,21 +295,22 @@ restart: bearer_id = MAX_BEARERS; with_this_prio = 1; for (i = MAX_BEARERS; i-- != 0; ) { - if (!tipc_bearers[i].active) { + b_ptr = rtnl_dereference(bearer_list[i]); + if (!b_ptr) { bearer_id = i; continue; } - if (!strcmp(name, tipc_bearers[i].name)) { + if (!strcmp(name, b_ptr->name)) { pr_warn("Bearer <%s> rejected, already enabled\n", name); - goto exit; + return -EINVAL; } - if ((tipc_bearers[i].priority == priority) && + if ((b_ptr->priority == priority) && (++with_this_prio > 2)) { if (priority-- == 0) { pr_warn("Bearer <%s> rejected, duplicate priority\n", name); - goto exit; + return -EINVAL; } pr_warn("Bearer <%s> priority adjustment required %u->%u\n", name, priority + 1, priority); @@ -357,91 +320,79 @@ restart: if (bearer_id >= MAX_BEARERS) { pr_warn("Bearer <%s> rejected, bearer limit reached (%u)\n", name, MAX_BEARERS); - goto exit; + return -EINVAL; } - b_ptr = &tipc_bearers[bearer_id]; + b_ptr = kzalloc(sizeof(*b_ptr), GFP_ATOMIC); + if (!b_ptr) + return -ENOMEM; + strcpy(b_ptr->name, name); + b_ptr->media = m_ptr; res = m_ptr->enable_media(b_ptr); if (res) { pr_warn("Bearer <%s> rejected, enable failure (%d)\n", name, -res); - goto exit; + return -EINVAL; } b_ptr->identity = bearer_id; - b_ptr->media = m_ptr; b_ptr->tolerance = m_ptr->tolerance; b_ptr->window = m_ptr->window; + b_ptr->domain = disc_domain; b_ptr->net_plane = bearer_id + 'A'; - b_ptr->active = 1; b_ptr->priority = priority; - INIT_LIST_HEAD(&b_ptr->links); - spin_lock_init(&b_ptr->lock); - res = tipc_disc_create(b_ptr, &b_ptr->bcast_addr, disc_domain); + res = tipc_disc_create(b_ptr, &b_ptr->bcast_addr); if (res) { - bearer_disable(b_ptr); + bearer_disable(b_ptr, false); pr_warn("Bearer <%s> rejected, discovery object creation failed\n", name); - goto exit; + return -EINVAL; } + + rcu_assign_pointer(bearer_list[bearer_id], b_ptr); + pr_info("Enabled bearer <%s>, discovery domain %s, priority %u\n", name, tipc_addr_string_fill(addr_string, disc_domain), priority); -exit: - write_unlock_bh(&tipc_net_lock); return res; } /** * tipc_reset_bearer - Reset all links established over this bearer */ -int tipc_reset_bearer(struct tipc_bearer *b_ptr) +static int tipc_reset_bearer(struct tipc_bearer *b_ptr) { - struct tipc_link *l_ptr; - struct tipc_link *temp_l_ptr; - - read_lock_bh(&tipc_net_lock); pr_info("Resetting bearer <%s>\n", b_ptr->name); - spin_lock_bh(&b_ptr->lock); - list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) { - struct tipc_node *n_ptr = l_ptr->owner; - - spin_lock_bh(&n_ptr->lock); - tipc_link_reset(l_ptr); - spin_unlock_bh(&n_ptr->lock); - } - spin_unlock_bh(&b_ptr->lock); - read_unlock_bh(&tipc_net_lock); + tipc_link_reset_list(b_ptr->identity); + tipc_disc_reset(b_ptr); return 0; } /** * bearer_disable * - * Note: This routine assumes caller holds tipc_net_lock. + * Note: This routine assumes caller holds RTNL lock. 
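The bearer.c rework above replaces the static tipc_bearers[] array with bearer_list[], an array of __rcu pointers: tipc_enable_bearer() now kzalloc()s a bearer, fills it in completely and only then publishes it with rcu_assign_pointer(), while readers such as tipc_bearer_add_dest() fetch the slot under rcu_read_lock() with rcu_dereference_rtnl() and simply skip a NULL entry. The C11 sketch below mimics only the publish/read ordering of that scheme with release/acquire atomics; it does not model grace periods or kfree_rcu(), and all names are illustrative:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_BEARERS 2

struct bearer {
        char name[16];
        int priority;
};

/* One atomic pointer slot per bearer, analogous to the __rcu bearer_list[] array. */
static _Atomic(struct bearer *) bearer_list[MAX_BEARERS + 1];

/* Publish: initialise the object fully, then release-store the pointer
 * (the ordering rcu_assign_pointer() provides). */
static void publish_bearer(int id, struct bearer *b)
{
        atomic_store_explicit(&bearer_list[id], b, memory_order_release);
}

/* Lookup: acquire-load the pointer and tolerate an empty slot, as the
 * rcu_dereference() readers in the hunk above do. */
static struct bearer *lookup_bearer(int id)
{
        return atomic_load_explicit(&bearer_list[id], memory_order_acquire);
}

int main(void)
{
        struct bearer *b = malloc(sizeof(*b));

        if (!b)
                return 1;
        strcpy(b->name, "eth:eth0");
        b->priority = 10;
        publish_bearer(0, b);

        struct bearer *found = lookup_bearer(0);
        if (found)
                printf("bearer %s, priority %d\n", found->name, found->priority);
        printf("slot 1 is %s\n", lookup_bearer(1) ? "set" : "empty");

        free(b);
        return 0;
}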
*/ -static void bearer_disable(struct tipc_bearer *b_ptr) +static void bearer_disable(struct tipc_bearer *b_ptr, bool shutting_down) { - struct tipc_link *l_ptr; - struct tipc_link *temp_l_ptr; - struct tipc_link_req *temp_req; + u32 i; pr_info("Disabling bearer <%s>\n", b_ptr->name); - spin_lock_bh(&b_ptr->lock); b_ptr->media->disable_media(b_ptr); - list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) { - tipc_link_delete(l_ptr); - } - temp_req = b_ptr->link_req; - b_ptr->link_req = NULL; - spin_unlock_bh(&b_ptr->lock); - if (temp_req) - tipc_disc_delete(temp_req); + tipc_link_delete_list(b_ptr->identity, shutting_down); + if (b_ptr->link_req) + tipc_disc_delete(b_ptr->link_req); - memset(b_ptr, 0, sizeof(struct tipc_bearer)); + for (i = 0; i < MAX_BEARERS; i++) { + if (b_ptr == rtnl_dereference(bearer_list[i])) { + RCU_INIT_POINTER(bearer_list[i], NULL); + break; + } + } + kfree_rcu(b_ptr, rcu); } int tipc_disable_bearer(const char *name) @@ -449,28 +400,230 @@ int tipc_disable_bearer(const char *name) struct tipc_bearer *b_ptr; int res; - write_lock_bh(&tipc_net_lock); b_ptr = tipc_bearer_find(name); if (b_ptr == NULL) { pr_warn("Attempt to disable unknown bearer <%s>\n", name); res = -EINVAL; } else { - bearer_disable(b_ptr); + bearer_disable(b_ptr, false); res = 0; } - write_unlock_bh(&tipc_net_lock); return res; } +int tipc_enable_l2_media(struct tipc_bearer *b) +{ + struct net_device *dev; + char *driver_name = strchr((const char *)b->name, ':') + 1; + + /* Find device with specified name */ + dev = dev_get_by_name(&init_net, driver_name); + if (!dev) + return -ENODEV; + + /* Associate TIPC bearer with L2 bearer */ + rcu_assign_pointer(b->media_ptr, dev); + memset(&b->bcast_addr, 0, sizeof(b->bcast_addr)); + memcpy(b->bcast_addr.value, dev->broadcast, b->media->hwaddr_len); + b->bcast_addr.media_id = b->media->type_id; + b->bcast_addr.broadcast = 1; + b->mtu = dev->mtu; + b->media->raw2addr(b, &b->addr, (char *)dev->dev_addr); + rcu_assign_pointer(dev->tipc_ptr, b); + return 0; +} +/* tipc_disable_l2_media - detach TIPC bearer from an L2 interface + * + * Mark L2 bearer as inactive so that incoming buffers are thrown away, + * then get worker thread to complete bearer cleanup. (Can't do cleanup + * here because cleanup code needs to sleep and caller holds spinlocks.) 
+ */ +void tipc_disable_l2_media(struct tipc_bearer *b) +{ + struct net_device *dev; + + dev = (struct net_device *)rtnl_dereference(b->media_ptr); + RCU_INIT_POINTER(b->media_ptr, NULL); + RCU_INIT_POINTER(dev->tipc_ptr, NULL); + synchronize_net(); + dev_put(dev); +} + +/** + * tipc_l2_send_msg - send a TIPC packet out over an L2 interface + * @buf: the packet to be sent + * @b_ptr: the bearer through which the packet is to be sent + * @dest: peer destination address + */ +int tipc_l2_send_msg(struct sk_buff *buf, struct tipc_bearer *b, + struct tipc_media_addr *dest) +{ + struct sk_buff *clone; + struct net_device *dev; + int delta; + + dev = (struct net_device *)rcu_dereference_rtnl(b->media_ptr); + if (!dev) + return 0; + + clone = skb_clone(buf, GFP_ATOMIC); + if (!clone) + return 0; + + delta = dev->hard_header_len - skb_headroom(buf); + if ((delta > 0) && + pskb_expand_head(clone, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) { + kfree_skb(clone); + return 0; + } + + skb_reset_network_header(clone); + clone->dev = dev; + clone->protocol = htons(ETH_P_TIPC); + dev_hard_header(clone, dev, ETH_P_TIPC, dest->value, + dev->dev_addr, clone->len); + dev_queue_xmit(clone); + return 0; +} + +/* tipc_bearer_send- sends buffer to destination over bearer + * + * IMPORTANT: + * The media send routine must not alter the buffer being passed in + * as it may be needed for later retransmission! + */ +void tipc_bearer_send(u32 bearer_id, struct sk_buff *buf, + struct tipc_media_addr *dest) +{ + struct tipc_bearer *b_ptr; + + rcu_read_lock(); + b_ptr = rcu_dereference_rtnl(bearer_list[bearer_id]); + if (likely(b_ptr)) + b_ptr->media->send_msg(buf, b_ptr, dest); + rcu_read_unlock(); +} + +/** + * tipc_l2_rcv_msg - handle incoming TIPC message from an interface + * @buf: the received packet + * @dev: the net device that the packet was received on + * @pt: the packet_type structure which was used to register this handler + * @orig_dev: the original receive net device in case the device is a bond + * + * Accept only packets explicitly sent to this node, or broadcast packets; + * ignores packets sent using interface multicast, and traffic sent to other + * nodes (which can happen if interface is running in promiscuous mode). + */ +static int tipc_l2_rcv_msg(struct sk_buff *buf, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct tipc_bearer *b_ptr; + + if (!net_eq(dev_net(dev), &init_net)) { + kfree_skb(buf); + return NET_RX_DROP; + } + + rcu_read_lock(); + b_ptr = rcu_dereference_rtnl(dev->tipc_ptr); + if (likely(b_ptr)) { + if (likely(buf->pkt_type <= PACKET_BROADCAST)) { + buf->next = NULL; + tipc_rcv(buf, b_ptr); + rcu_read_unlock(); + return NET_RX_SUCCESS; + } + } + rcu_read_unlock(); + + kfree_skb(buf); + return NET_RX_DROP; +} + +/** + * tipc_l2_device_event - handle device events from network device + * @nb: the context of the notification + * @evt: the type of event + * @ptr: the net device that the event was on + * + * This function is called by the Ethernet driver in case of link + * change event. 
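tipc_l2_send_msg() above clones the buffer, compares the device's hard_header_len with the available headroom, expands the head via pskb_expand_head() only when the difference is positive, and then lets dev_hard_header() prepend the L2 header into the reserved space. A toy userspace version of the same "ensure headroom, then push a header in front of the payload" idea; struct pkt and its helpers are invented for illustration and are not the sk_buff API:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy packet buffer: 'data' points into 'head' with some spare room in front,
 * loosely mirroring how an sk_buff keeps headroom for headers to be prepended. */
struct pkt {
        unsigned char *head;
        unsigned char *data;
        size_t len;
        size_t headroom;
};

/* Grow the headroom to at least 'needed' bytes (cf. the pskb_expand_head() call). */
static int ensure_headroom(struct pkt *p, size_t needed)
{
        if (p->headroom >= needed)
                return 0;

        unsigned char *nhead = malloc(needed + p->len);
        if (!nhead)
                return -1;

        memcpy(nhead + needed, p->data, p->len);
        free(p->head);
        p->head = nhead;
        p->data = nhead + needed;
        p->headroom = needed;
        return 0;
}

/* Prepend a header into the headroom (cf. dev_hard_header()). */
static void push_header(struct pkt *p, const void *hdr, size_t hlen)
{
        p->data -= hlen;
        p->headroom -= hlen;
        memcpy(p->data, hdr, hlen);
        p->len += hlen;
}

int main(void)
{
        struct pkt p = { .len = 4 };
        const unsigned char l2_hdr[14] = { 0 };         /* pretend Ethernet header */

        p.headroom = 2;                                 /* deliberately too small */
        p.head = malloc(p.headroom + p.len);
        p.data = p.head + p.headroom;
        memcpy(p.data, "TIPC", 4);

        if (ensure_headroom(&p, sizeof(l2_hdr)) == 0) {
                push_header(&p, l2_hdr, sizeof(l2_hdr));
                printf("frame is now %zu bytes (header + payload)\n", p.len);
        }
        free(p.head);
        return 0;
}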
+ */ +static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, + void *ptr) +{ + struct tipc_bearer *b_ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + b_ptr = rtnl_dereference(dev->tipc_ptr); + if (!b_ptr) + return NOTIFY_DONE; + + b_ptr->mtu = dev->mtu; + + switch (evt) { + case NETDEV_CHANGE: + if (netif_carrier_ok(dev)) + break; + case NETDEV_DOWN: + case NETDEV_CHANGEMTU: + tipc_reset_bearer(b_ptr); + break; + case NETDEV_CHANGEADDR: + b_ptr->media->raw2addr(b_ptr, &b_ptr->addr, + (char *)dev->dev_addr); + tipc_reset_bearer(b_ptr); + break; + case NETDEV_UNREGISTER: + case NETDEV_CHANGENAME: + bearer_disable(b_ptr, false); + break; + } + return NOTIFY_OK; +} + +static struct packet_type tipc_packet_type __read_mostly = { + .type = htons(ETH_P_TIPC), + .func = tipc_l2_rcv_msg, +}; + +static struct notifier_block notifier = { + .notifier_call = tipc_l2_device_event, + .priority = 0, +}; + +int tipc_bearer_setup(void) +{ + int err; + + err = register_netdevice_notifier(&notifier); + if (err) + return err; + dev_add_pack(&tipc_packet_type); + return 0; +} + +void tipc_bearer_cleanup(void) +{ + unregister_netdevice_notifier(&notifier); + dev_remove_pack(&tipc_packet_type); +} void tipc_bearer_stop(void) { + struct tipc_bearer *b_ptr; u32 i; for (i = 0; i < MAX_BEARERS; i++) { - if (tipc_bearers[i].active) - bearer_disable(&tipc_bearers[i]); + b_ptr = rtnl_dereference(bearer_list[i]); + if (b_ptr) { + bearer_disable(b_ptr, true); + bearer_list[i] = NULL; + } } - media_count = 0; } diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index cc780bbedbb..78fccc49de2 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -1,7 +1,7 @@ /* * net/tipc/bearer.h: Include file for TIPC bearer code * - * Copyright (c) 1996-2006, Ericsson AB + * Copyright (c) 1996-2006, 2013, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved.
* @@ -42,14 +42,12 @@ #define MAX_BEARERS 2 #define MAX_MEDIA 2 -/* - * Identifiers associated with TIPC message header media address info - * - * - address info field is 20 bytes long - * - media type identifier located at offset 3 - * - remaining bytes vary according to media type +/* Identifiers associated with TIPC message header media address info + * - address info field is 32 bytes long + * - the field's actual content and length is defined per media + * - remaining unused bytes in the field are set to zero */ -#define TIPC_MEDIA_ADDR_SIZE 20 +#define TIPC_MEDIA_ADDR_SIZE 32 #define TIPC_MEDIA_TYPE_OFFSET 3 /* @@ -73,18 +71,19 @@ struct tipc_media_addr { struct tipc_bearer; /** - * struct tipc_media - TIPC media information available to internal users + * struct tipc_media - Media specific info exposed to generic bearer layer * @send_msg: routine which handles buffer transmission * @enable_media: routine which enables a media * @disable_media: routine which disables a media - * @addr2str: routine which converts media address to string - * @addr2msg: routine which converts media address to protocol message area - * @msg2addr: routine which converts media address from protocol message area - * @bcast_addr: media address used in broadcasting + * @addr2str: convert media address format to string + * @addr2msg: convert from media addr format to discovery msg addr format + * @msg2addr: convert from discovery msg addr format to media addr format + * @raw2addr: convert from raw addr format to media addr format * @priority: default link (and bearer) priority * @tolerance: default time (in ms) before declaring link failure * @window: default window (in packets) before declaring link congestion * @type_id: TIPC media identifier + * @hwaddr_len: TIPC media address len * @name: media name */ struct tipc_media { @@ -93,32 +92,39 @@ struct tipc_media { struct tipc_media_addr *dest); int (*enable_media)(struct tipc_bearer *b_ptr); void (*disable_media)(struct tipc_bearer *b_ptr); - int (*addr2str)(struct tipc_media_addr *a, char *str_buf, int str_size); - int (*addr2msg)(struct tipc_media_addr *a, char *msg_area); - int (*msg2addr)(const struct tipc_bearer *b_ptr, - struct tipc_media_addr *a, char *msg_area); + int (*addr2str)(struct tipc_media_addr *addr, + char *strbuf, + int bufsz); + int (*addr2msg)(char *msg, struct tipc_media_addr *addr); + int (*msg2addr)(struct tipc_bearer *b, + struct tipc_media_addr *addr, + char *msg); + int (*raw2addr)(struct tipc_bearer *b, + struct tipc_media_addr *addr, + char *raw); u32 priority; u32 tolerance; u32 window; u32 type_id; + u32 hwaddr_len; char name[TIPC_MAX_MEDIA_NAME]; }; /** - * struct tipc_bearer - TIPC bearer structure - * @usr_handle: pointer to additional media-specific information about bearer + * struct tipc_bearer - Generic TIPC bearer structure + * @media_ptr: pointer to additional media-specific information about bearer * @mtu: max packet size bearer can support - * @lock: spinlock for controlling access to bearer * @addr: media-specific address associated with bearer * @name: bearer name (format = media:interface) * @media: ptr to media structure associated with bearer + * @bcast_addr: media address used in broadcasting + * @rcu: rcu struct for tipc_bearer * @priority: default link priority for bearer * @window: default window size for bearer * @tolerance: default link tolerance for bearer + * @domain: network domain to which links can be established * @identity: array index of this bearer within TIPC bearer array * @link_req: ptr to 
(optional) structure making periodic link setup requests - * @links: list of non-congested links associated with bearer - * @active: non-zero if bearer structure is represents a bearer * @net_plane: network plane ('A' through 'H') currently associated with bearer * @nodes: indicates which nodes in cluster can be reached through bearer * @@ -127,20 +133,19 @@ struct tipc_media { * care of initializing all other fields. */ struct tipc_bearer { - void *usr_handle; /* initialized by media */ + void __rcu *media_ptr; /* initialized by media */ u32 mtu; /* initialized by media */ struct tipc_media_addr addr; /* initialized by media */ char name[TIPC_MAX_BEARER_NAME]; - spinlock_t lock; struct tipc_media *media; struct tipc_media_addr bcast_addr; + struct rcu_head rcu; u32 priority; u32 window; u32 tolerance; + u32 domain; u32 identity; struct tipc_link_req *link_req; - struct list_head links; - int active; char net_plane; struct tipc_node_map nodes; }; @@ -152,58 +157,43 @@ struct tipc_bearer_names { struct tipc_link; -extern struct tipc_bearer tipc_bearers[]; +extern struct tipc_bearer __rcu *bearer_list[]; /* * TIPC routines available to supported media types */ -int tipc_register_media(struct tipc_media *m_ptr); - -void tipc_recv_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr); - -int tipc_reset_bearer(struct tipc_bearer *b_ptr); +void tipc_rcv(struct sk_buff *buf, struct tipc_bearer *tb_ptr); int tipc_enable_bearer(const char *bearer_name, u32 disc_domain, u32 priority); int tipc_disable_bearer(const char *name); /* * Routines made available to TIPC by supported media types */ -int tipc_eth_media_start(void); -void tipc_eth_media_stop(void); +extern struct tipc_media eth_media_info; #ifdef CONFIG_TIPC_MEDIA_IB -int tipc_ib_media_start(void); -void tipc_ib_media_stop(void); -#else -static inline int tipc_ib_media_start(void) { return 0; } -static inline void tipc_ib_media_stop(void) { return; } +extern struct tipc_media ib_media_info; #endif int tipc_media_set_priority(const char *name, u32 new_value); int tipc_media_set_window(const char *name, u32 new_value); void tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a); struct sk_buff *tipc_media_get_names(void); +int tipc_enable_l2_media(struct tipc_bearer *b); +void tipc_disable_l2_media(struct tipc_bearer *b); +int tipc_l2_send_msg(struct sk_buff *buf, struct tipc_bearer *b, + struct tipc_media_addr *dest); struct sk_buff *tipc_bearer_get_names(void); -void tipc_bearer_add_dest(struct tipc_bearer *b_ptr, u32 dest); -void tipc_bearer_remove_dest(struct tipc_bearer *b_ptr, u32 dest); +void tipc_bearer_add_dest(u32 bearer_id, u32 dest); +void tipc_bearer_remove_dest(u32 bearer_id, u32 dest); struct tipc_bearer *tipc_bearer_find(const char *name); -struct tipc_bearer *tipc_bearer_find_interface(const char *if_name); struct tipc_media *tipc_media_find(const char *name); +int tipc_bearer_setup(void); +void tipc_bearer_cleanup(void); void tipc_bearer_stop(void); - -/** - * tipc_bearer_send- sends buffer to destination over bearer - * - * IMPORTANT: - * The media send routine must not alter the buffer being passed in - * as it may be needed for later retransmission! 
- */ -static inline void tipc_bearer_send(struct tipc_bearer *b, struct sk_buff *buf, - struct tipc_media_addr *dest) -{ - b->media->send_msg(buf, b, dest); -} +void tipc_bearer_send(u32 bearer_id, struct sk_buff *buf, + struct tipc_media_addr *dest); #endif /* _TIPC_BEARER_H */ diff --git a/net/tipc/config.c b/net/tipc/config.c index c301a9a592d..2b42403ad33 100644 --- a/net/tipc/config.c +++ b/net/tipc/config.c @@ -42,14 +42,10 @@ #define REPLY_TRUNCATED "<truncated>\n" -static DEFINE_MUTEX(config_mutex); -static struct tipc_server cfgsrv; - static const void *req_tlv_area; /* request message TLV area */ static int req_tlv_space; /* request message TLV area size */ static int rep_headroom; /* reply message headroom to use */ - struct sk_buff *tipc_cfg_reply_alloc(int payload_size) { struct sk_buff *buf; @@ -181,20 +177,10 @@ static struct sk_buff *cfg_set_own_addr(void) if (tipc_own_addr) return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED " (cannot change node address once assigned)"); - tipc_core_start_net(addr); - return tipc_cfg_reply_none(); -} - -static struct sk_buff *cfg_set_remote_mng(void) -{ - u32 value; - - if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED)) - return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + if (!tipc_net_start(addr)) + return tipc_cfg_reply_none(); - value = ntohl(*(__be32 *)TLV_DATA(req_tlv_area)); - tipc_remote_management = (value != 0); - return tipc_cfg_reply_none(); + return tipc_cfg_reply_error_string("cannot change to network mode"); } static struct sk_buff *cfg_set_max_ports(void) @@ -237,7 +223,7 @@ struct sk_buff *tipc_cfg_do_cmd(u32 orig_node, u16 cmd, const void *request_area { struct sk_buff *rep_tlv_buf; - mutex_lock(&config_mutex); + rtnl_lock(); /* Save request and reply details in a well-known location */ req_tlv_area = request_area; @@ -247,21 +233,10 @@ struct sk_buff *tipc_cfg_do_cmd(u32 orig_node, u16 cmd, const void *request_area /* Check command authorization */ if (likely(in_own_node(orig_node))) { /* command is permitted */ - } else if (cmd >= 0x8000) { + } else { rep_tlv_buf = tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED " (cannot be done remotely)"); goto exit; - } else if (!tipc_remote_management) { - rep_tlv_buf = tipc_cfg_reply_error_string(TIPC_CFG_NO_REMOTE); - goto exit; - } else if (cmd >= 0x4000) { - u32 domain = 0; - - if ((tipc_nametbl_translate(TIPC_ZM_SRV, 0, &domain) == 0) || - (domain != orig_node)) { - rep_tlv_buf = tipc_cfg_reply_error_string(TIPC_CFG_NOT_ZONE_MSTR); - goto exit; - } } /* Call appropriate processing routine */ @@ -310,18 +285,12 @@ struct sk_buff *tipc_cfg_do_cmd(u32 orig_node, u16 cmd, const void *request_area case TIPC_CMD_SET_NODE_ADDR: rep_tlv_buf = cfg_set_own_addr(); break; - case TIPC_CMD_SET_REMOTE_MNG: - rep_tlv_buf = cfg_set_remote_mng(); - break; case TIPC_CMD_SET_MAX_PORTS: rep_tlv_buf = cfg_set_max_ports(); break; case TIPC_CMD_SET_NETID: rep_tlv_buf = cfg_set_netid(); break; - case TIPC_CMD_GET_REMOTE_MNG: - rep_tlv_buf = tipc_cfg_reply_unsigned(tipc_remote_management); - break; case TIPC_CMD_GET_MAX_PORTS: rep_tlv_buf = tipc_cfg_reply_unsigned(tipc_max_ports); break; @@ -345,6 +314,8 @@ struct sk_buff *tipc_cfg_do_cmd(u32 orig_node, u16 cmd, const void *request_area case TIPC_CMD_SET_MAX_PUBL: case TIPC_CMD_GET_MAX_PUBL: case TIPC_CMD_SET_LOG_SIZE: + case TIPC_CMD_SET_REMOTE_MNG: + case TIPC_CMD_GET_REMOTE_MNG: case TIPC_CMD_DUMP_LOG: rep_tlv_buf = tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED " (obsolete command)"); @@ -366,83 +337,6 @@ struct 
sk_buff *tipc_cfg_do_cmd(u32 orig_node, u16 cmd, const void *request_area /* Return reply buffer */ exit: - mutex_unlock(&config_mutex); + rtnl_unlock(); return rep_tlv_buf; } - -static void cfg_conn_msg_event(int conid, struct sockaddr_tipc *addr, - void *usr_data, void *buf, size_t len) -{ - struct tipc_cfg_msg_hdr *req_hdr; - struct tipc_cfg_msg_hdr *rep_hdr; - struct sk_buff *rep_buf; - int ret; - - /* Validate configuration message header (ignore invalid message) */ - req_hdr = (struct tipc_cfg_msg_hdr *)buf; - if ((len < sizeof(*req_hdr)) || - (len != TCM_ALIGN(ntohl(req_hdr->tcm_len))) || - (ntohs(req_hdr->tcm_flags) != TCM_F_REQUEST)) { - pr_warn("Invalid configuration message discarded\n"); - return; - } - - /* Generate reply for request (if can't, return request) */ - rep_buf = tipc_cfg_do_cmd(addr->addr.id.node, ntohs(req_hdr->tcm_type), - buf + sizeof(*req_hdr), - len - sizeof(*req_hdr), - BUF_HEADROOM + MAX_H_SIZE + sizeof(*rep_hdr)); - if (rep_buf) { - skb_push(rep_buf, sizeof(*rep_hdr)); - rep_hdr = (struct tipc_cfg_msg_hdr *)rep_buf->data; - memcpy(rep_hdr, req_hdr, sizeof(*rep_hdr)); - rep_hdr->tcm_len = htonl(rep_buf->len); - rep_hdr->tcm_flags &= htons(~TCM_F_REQUEST); - - ret = tipc_conn_sendmsg(&cfgsrv, conid, addr, rep_buf->data, - rep_buf->len); - if (ret < 0) - pr_err("Sending cfg reply message failed, no memory\n"); - - kfree_skb(rep_buf); - } -} - -static struct sockaddr_tipc cfgsrv_addr __read_mostly = { - .family = AF_TIPC, - .addrtype = TIPC_ADDR_NAMESEQ, - .addr.nameseq.type = TIPC_CFG_SRV, - .addr.nameseq.lower = 0, - .addr.nameseq.upper = 0, - .scope = TIPC_ZONE_SCOPE -}; - -static struct tipc_server cfgsrv __read_mostly = { - .saddr = &cfgsrv_addr, - .imp = TIPC_CRITICAL_IMPORTANCE, - .type = SOCK_RDM, - .max_rcvbuf_size = 64 * 1024, - .name = "cfg_server", - .tipc_conn_recvmsg = cfg_conn_msg_event, - .tipc_conn_new = NULL, - .tipc_conn_shutdown = NULL -}; - -int tipc_cfg_init(void) -{ - return tipc_server_start(&cfgsrv); -} - -void tipc_cfg_reinit(void) -{ - tipc_server_stop(&cfgsrv); - - cfgsrv_addr.addr.nameseq.lower = tipc_own_addr; - cfgsrv_addr.addr.nameseq.upper = tipc_own_addr; - tipc_server_start(&cfgsrv); -} - -void tipc_cfg_stop(void) -{ - tipc_server_stop(&cfgsrv); -} diff --git a/net/tipc/config.h b/net/tipc/config.h index 1f252f3fa05..47b1bf18161 100644 --- a/net/tipc/config.h +++ b/net/tipc/config.h @@ -64,9 +64,4 @@ static inline struct sk_buff *tipc_cfg_reply_ultra_string(char *string) struct sk_buff *tipc_cfg_do_cmd(u32 orig_node, u16 cmd, const void *req_tlv_area, int req_tlv_space, int headroom); - -int tipc_cfg_init(void); -void tipc_cfg_reinit(void); -void tipc_cfg_stop(void); - #endif diff --git a/net/tipc/core.c b/net/tipc/core.c index fd4eeeaa972..676d18015dd 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -1,7 +1,7 @@ /* * net/tipc/core.c: TIPC module code * - * Copyright (c) 2003-2006, Ericsson AB + * Copyright (c) 2003-2006, 2013, Ericsson AB * Copyright (c) 2005-2006, 2010-2013, Wind River Systems * All rights reserved. 
* @@ -50,7 +50,6 @@ int tipc_random __read_mostly; u32 tipc_own_addr __read_mostly; int tipc_max_ports __read_mostly; int tipc_net_id __read_mostly; -int tipc_remote_management __read_mostly; int sysctl_tipc_rmem[3] __read_mostly; /* min/default/max */ /** @@ -77,44 +76,13 @@ struct sk_buff *tipc_buf_acquire(u32 size) } /** - * tipc_core_stop_net - shut down TIPC networking sub-systems - */ -static void tipc_core_stop_net(void) -{ - tipc_net_stop(); - tipc_eth_media_stop(); - tipc_ib_media_stop(); -} - -/** - * start_net - start TIPC networking sub-systems - */ -int tipc_core_start_net(unsigned long addr) -{ - int res; - - tipc_net_start(addr); - res = tipc_eth_media_start(); - if (res < 0) - goto err; - res = tipc_ib_media_start(); - if (res < 0) - goto err; - return res; - -err: - tipc_core_stop_net(); - return res; -} - -/** * tipc_core_stop - switch TIPC from SINGLE NODE to NOT RUNNING mode */ static void tipc_core_stop(void) { + tipc_net_stop(); + tipc_bearer_cleanup(); tipc_netlink_stop(); - tipc_handler_stop(); - tipc_cfg_stop(); tipc_subscr_stop(); tipc_nametbl_stop(); tipc_ref_table_stop(); @@ -127,29 +95,53 @@ static void tipc_core_stop(void) */ static int tipc_core_start(void) { - int res; + int err; get_random_bytes(&tipc_random, sizeof(tipc_random)); - res = tipc_handler_start(); - if (!res) - res = tipc_ref_table_init(tipc_max_ports, tipc_random); - if (!res) - res = tipc_nametbl_init(); - if (!res) - res = tipc_netlink_start(); - if (!res) - res = tipc_socket_init(); - if (!res) - res = tipc_register_sysctl(); - if (!res) - res = tipc_subscr_start(); - if (!res) - res = tipc_cfg_init(); - if (res) - tipc_core_stop(); + err = tipc_ref_table_init(tipc_max_ports, tipc_random); + if (err) + goto out_reftbl; - return res; + err = tipc_nametbl_init(); + if (err) + goto out_nametbl; + + err = tipc_netlink_start(); + if (err) + goto out_netlink; + + err = tipc_socket_init(); + if (err) + goto out_socket; + + err = tipc_register_sysctl(); + if (err) + goto out_sysctl; + + err = tipc_subscr_start(); + if (err) + goto out_subscr; + + err = tipc_bearer_setup(); + if (err) + goto out_bearer; + + return 0; +out_bearer: + tipc_subscr_stop(); +out_subscr: + tipc_unregister_sysctl(); +out_sysctl: + tipc_socket_stop(); +out_socket: + tipc_netlink_stop(); +out_netlink: + tipc_nametbl_stop(); +out_nametbl: + tipc_ref_table_stop(); +out_reftbl: + return err; } static int __init tipc_init(void) @@ -159,14 +151,14 @@ static int __init tipc_init(void) pr_info("Activated (version " TIPC_MOD_VER ")\n"); tipc_own_addr = 0; - tipc_remote_management = 1; tipc_max_ports = CONFIG_TIPC_PORTS; tipc_net_id = 4711; - sysctl_tipc_rmem[0] = CONN_OVERLOAD_LIMIT >> 4 << TIPC_LOW_IMPORTANCE; - sysctl_tipc_rmem[1] = CONN_OVERLOAD_LIMIT >> 4 << + sysctl_tipc_rmem[0] = TIPC_CONN_OVERLOAD_LIMIT >> 4 << + TIPC_LOW_IMPORTANCE; + sysctl_tipc_rmem[1] = TIPC_CONN_OVERLOAD_LIMIT >> 4 << TIPC_CRITICAL_IMPORTANCE; - sysctl_tipc_rmem[2] = CONN_OVERLOAD_LIMIT; + sysctl_tipc_rmem[2] = TIPC_CONN_OVERLOAD_LIMIT; res = tipc_core_start(); if (res) @@ -178,7 +170,6 @@ static int __init tipc_init(void) static void __exit tipc_exit(void) { - tipc_core_stop_net(); tipc_core_stop(); pr_info("Deactivated\n"); } diff --git a/net/tipc/core.h b/net/tipc/core.h index 94895d4e86a..bb26ed1ee96 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -47,7 +47,7 @@ #include <linux/mm.h> #include <linux/timer.h> #include <linux/string.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/interrupt.h> #include <linux/atomic.h> 
#include <asm/hardirq.h> @@ -56,7 +56,8 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/vmalloc.h> - +#include <linux/rtnetlink.h> +#include <linux/etherdevice.h> #define TIPC_MOD_VER "2.0.0" @@ -79,7 +80,6 @@ int tipc_snprintf(char *buf, int len, const char *fmt, ...); extern u32 tipc_own_addr __read_mostly; extern int tipc_max_ports __read_mostly; extern int tipc_net_id __read_mostly; -extern int tipc_remote_management __read_mostly; extern int sysctl_tipc_rmem[3] __read_mostly; /* @@ -90,9 +90,6 @@ extern int tipc_random __read_mostly; /* * Routines available to privileged subsystems */ -int tipc_core_start_net(unsigned long); -int tipc_handler_start(void); -void tipc_handler_stop(void); int tipc_netlink_start(void); void tipc_netlink_stop(void); int tipc_socket_init(void); @@ -111,12 +108,10 @@ void tipc_unregister_sysctl(void); #endif /* - * TIPC timer and signal code + * TIPC timer code */ typedef void (*Handler) (unsigned long); -u32 tipc_k_signal(Handler routine, unsigned long argument); - /** * k_init_timer - initialize a timer * @timer: pointer to timer structure @@ -192,6 +187,8 @@ static inline void k_term_timer(struct timer_list *timer) struct tipc_skb_cb { void *handle; + bool deferred; + struct sk_buff *tail; }; #define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0])) diff --git a/net/tipc/discover.c b/net/tipc/discover.c index bc849f1efa1..aa722a42ef8 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -1,7 +1,7 @@ /* * net/tipc/discover.c * - * Copyright (c) 2003-2006, Ericsson AB + * Copyright (c) 2003-2006, 2014, Ericsson AB * Copyright (c) 2005-2006, 2010-2011, Wind River Systems * All rights reserved. * @@ -46,19 +46,21 @@ /** * struct tipc_link_req - information about an ongoing link setup request - * @bearer: bearer issuing requests + * @bearer_id: identity of bearer issuing requests * @dest: destination address for request messages * @domain: network domain to which links can be established * @num_nodes: number of nodes currently discovered (i.e. 
with an active link) + * @lock: spinlock for controlling access to requests * @buf: request message to be (repeatedly) sent * @timer: timer governing period between requests * @timer_intv: current interval between requests (in ms) */ struct tipc_link_req { - struct tipc_bearer *bearer; + u32 bearer_id; struct tipc_media_addr dest; u32 domain; int num_nodes; + spinlock_t lock; struct sk_buff *buf; struct timer_list timer; unsigned int timer_intv; @@ -67,25 +69,21 @@ struct tipc_link_req { /** * tipc_disc_init_msg - initialize a link setup message * @type: message type (request or response) - * @dest_domain: network domain of node(s) which should respond to message * @b_ptr: ptr to bearer issuing message */ -static struct sk_buff *tipc_disc_init_msg(u32 type, u32 dest_domain, - struct tipc_bearer *b_ptr) +static void tipc_disc_init_msg(struct sk_buff *buf, u32 type, + struct tipc_bearer *b_ptr) { - struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE); struct tipc_msg *msg; - - if (buf) { - msg = buf_msg(buf); - tipc_msg_init(msg, LINK_CONFIG, type, INT_H_SIZE, dest_domain); - msg_set_non_seq(msg, 1); - msg_set_node_sig(msg, tipc_random); - msg_set_dest_domain(msg, dest_domain); - msg_set_bc_netid(msg, tipc_net_id); - b_ptr->media->addr2msg(&b_ptr->addr, msg_media_addr(msg)); - } - return buf; + u32 dest_domain = b_ptr->domain; + + msg = buf_msg(buf); + tipc_msg_init(msg, LINK_CONFIG, type, INT_H_SIZE, dest_domain); + msg_set_non_seq(msg, 1); + msg_set_node_sig(msg, tipc_random); + msg_set_dest_domain(msg, dest_domain); + msg_set_bc_netid(msg, tipc_net_id); + b_ptr->media->addr2msg(msg_media_addr(msg), &b_ptr->addr); } /** @@ -108,146 +106,150 @@ static void disc_dupl_alert(struct tipc_bearer *b_ptr, u32 node_addr, } /** - * tipc_disc_recv_msg - handle incoming link setup message (request or response) + * tipc_disc_rcv - handle incoming discovery message (request or response) * @buf: buffer containing message - * @b_ptr: bearer that message arrived on + * @bearer: bearer that message arrived on */ -void tipc_disc_recv_msg(struct sk_buff *buf, struct tipc_bearer *b_ptr) +void tipc_disc_rcv(struct sk_buff *buf, struct tipc_bearer *bearer) { - struct tipc_node *n_ptr; + struct tipc_node *node; struct tipc_link *link; - struct tipc_media_addr media_addr; + struct tipc_media_addr maddr; struct sk_buff *rbuf; struct tipc_msg *msg = buf_msg(buf); - u32 dest = msg_dest_domain(msg); - u32 orig = msg_prevnode(msg); + u32 ddom = msg_dest_domain(msg); + u32 onode = msg_prevnode(msg); u32 net_id = msg_bc_netid(msg); - u32 type = msg_type(msg); + u32 mtyp = msg_type(msg); u32 signature = msg_node_sig(msg); - int addr_mismatch; - int link_fully_up; - - media_addr.broadcast = 1; - b_ptr->media->msg2addr(b_ptr, &media_addr, msg_media_addr(msg)); + bool addr_match = false; + bool sign_match = false; + bool link_up = false; + bool accept_addr = false; + bool accept_sign = false; + bool respond = false; + + bearer->media->msg2addr(bearer, &maddr, msg_media_addr(msg)); kfree_skb(buf); /* Ensure message from node is valid and communication is permitted */ if (net_id != tipc_net_id) return; - if (media_addr.broadcast) + if (maddr.broadcast) return; - if (!tipc_addr_domain_valid(dest)) + if (!tipc_addr_domain_valid(ddom)) return; - if (!tipc_addr_node_valid(orig)) + if (!tipc_addr_node_valid(onode)) return; - if (orig == tipc_own_addr) { - if (memcmp(&media_addr, &b_ptr->addr, sizeof(media_addr))) - disc_dupl_alert(b_ptr, tipc_own_addr, &media_addr); + + if (in_own_node(onode)) { + if (memcmp(&maddr, &bearer->addr, 
sizeof(maddr))) + disc_dupl_alert(bearer, tipc_own_addr, &maddr); return; } - if (!tipc_in_scope(dest, tipc_own_addr)) + if (!tipc_in_scope(ddom, tipc_own_addr)) return; - if (!tipc_in_scope(b_ptr->link_req->domain, orig)) + if (!tipc_in_scope(bearer->domain, onode)) return; - /* Locate structure corresponding to requesting node */ - n_ptr = tipc_node_find(orig); - if (!n_ptr) { - n_ptr = tipc_node_create(orig); - if (!n_ptr) - return; - } - tipc_node_lock(n_ptr); + /* Locate, or if necessary, create, node: */ + node = tipc_node_find(onode); + if (!node) + node = tipc_node_create(onode); + if (!node) + return; - /* Prepare to validate requesting node's signature and media address */ - link = n_ptr->links[b_ptr->identity]; - addr_mismatch = (link != NULL) && - memcmp(&link->media_addr, &media_addr, sizeof(media_addr)); + tipc_node_lock(node); + link = node->links[bearer->identity]; - /* - * Ensure discovery message's signature is correct - * - * If signature is incorrect and there is no working link to the node, - * accept the new signature but invalidate all existing links to the - * node so they won't re-activate without a new discovery message. - * - * If signature is incorrect and the requested link to the node is - * working, accept the new signature. (This is an instance of delayed - * rediscovery, where a link endpoint was able to re-establish contact - * with its peer endpoint on a node that rebooted before receiving a - * discovery message from that node.) - * - * If signature is incorrect and there is a working link to the node - * that is not the requested link, reject the request (must be from - * a duplicate node). - */ - if (signature != n_ptr->signature) { - if (n_ptr->working_links == 0) { - struct tipc_link *curr_link; - int i; - - for (i = 0; i < MAX_BEARERS; i++) { - curr_link = n_ptr->links[i]; - if (curr_link) { - memset(&curr_link->media_addr, 0, - sizeof(media_addr)); - tipc_link_reset(curr_link); - } - } - addr_mismatch = (link != NULL); - } else if (tipc_link_is_up(link) && !addr_mismatch) { - /* delayed rediscovery */ - } else { - disc_dupl_alert(b_ptr, orig, &media_addr); - tipc_node_unlock(n_ptr); - return; - } - n_ptr->signature = signature; + /* Prepare to validate requesting node's signature and media address */ + sign_match = (signature == node->signature); + addr_match = link && !memcmp(&link->media_addr, &maddr, sizeof(maddr)); + link_up = link && tipc_link_is_up(link); + + + /* These three flags give us eight permutations: */ + + if (sign_match && addr_match && link_up) { + /* All is fine. Do nothing. */ + } else if (sign_match && addr_match && !link_up) { + /* Respond. The link will come up in due time */ + respond = true; + } else if (sign_match && !addr_match && link_up) { + /* Peer has changed i/f address without rebooting. + * If so, the link will reset soon, and the next + * discovery will be accepted. So we can ignore it. + * It may also be a cloned or malicious peer having + * chosen the same node address and signature as an + * existing one. + * Ignore requests until the link goes down, if ever. + */ + disc_dupl_alert(bearer, onode, &maddr); + } else if (sign_match && !addr_match && !link_up) { + /* Peer link has changed i/f address without rebooting. + * It may also be a cloned or malicious peer; we can't + * distinguish between the two. + * The signature is correct, so we must accept. + */ + accept_addr = true; + respond = true; + } else if (!sign_match && addr_match && link_up) { + /* Peer node rebooted. 
Two possibilities: + * - Delayed re-discovery; this link endpoint has already + * reset and re-established contact with the peer, before + * receiving a discovery message from that node. + * (The peer happened to receive one from this node first). + * - The peer came back so fast that our side has not + * discovered it yet. Probing from this side will soon + * reset the link, since there can be no working link + * endpoint at the peer end, and the link will re-establish. + * Accept the signature, since it comes from a known peer. + */ + accept_sign = true; + } else if (!sign_match && addr_match && !link_up) { + /* The peer node has rebooted. + * Accept signature, since it is a known peer. + */ + accept_sign = true; + respond = true; + } else if (!sign_match && !addr_match && link_up) { + /* Peer rebooted with new address, or a new/duplicate peer. + * Ignore until the link goes down, if ever. + */ + disc_dupl_alert(bearer, onode, &maddr); + } else if (!sign_match && !addr_match && !link_up) { + /* Peer rebooted with new address, or it is a new peer. + * Accept signature and address. + */ + accept_sign = true; + accept_addr = true; + respond = true; } - /* - * Ensure requesting node's media address is correct - * - * If media address doesn't match and the link is working, reject the - * request (must be from a duplicate node). - * - * If media address doesn't match and the link is not working, accept - * the new media address and reset the link to ensure it starts up - * cleanly. - */ - if (addr_mismatch) { - if (tipc_link_is_up(link)) { - disc_dupl_alert(b_ptr, orig, &media_addr); - tipc_node_unlock(n_ptr); - return; - } else { - memcpy(&link->media_addr, &media_addr, - sizeof(media_addr)); - tipc_link_reset(link); - } - } + if (accept_sign) + node->signature = signature; - /* Create a link endpoint for this bearer, if necessary */ - if (!link) { - link = tipc_link_create(n_ptr, b_ptr, &media_addr); - if (!link) { - tipc_node_unlock(n_ptr); - return; + if (accept_addr) { + if (!link) + link = tipc_link_create(node, bearer, &maddr); + if (link) { + memcpy(&link->media_addr, &maddr, sizeof(maddr)); + tipc_link_reset(link); + } else { + respond = false; } } - /* Accept discovery message & send response, if necessary */ - link_fully_up = link_working_working(link); - - if ((type == DSC_REQ_MSG) && !link_fully_up) { - rbuf = tipc_disc_init_msg(DSC_RESP_MSG, orig, b_ptr); + /* Send response, if necessary */ + if (respond && (mtyp == DSC_REQ_MSG)) { + rbuf = tipc_buf_acquire(INT_H_SIZE); if (rbuf) { - tipc_bearer_send(b_ptr, rbuf, &media_addr); + tipc_disc_init_msg(rbuf, DSC_RESP_MSG, bearer); + tipc_bearer_send(bearer->identity, rbuf, &maddr); kfree_skb(rbuf); } } - - tipc_node_unlock(n_ptr); + tipc_node_unlock(node); } /** @@ -274,7 +276,9 @@ static void disc_update(struct tipc_link_req *req) */ void tipc_disc_add_dest(struct tipc_link_req *req) { + spin_lock_bh(&req->lock); req->num_nodes++; + spin_unlock_bh(&req->lock); } /** @@ -283,8 +287,10 @@ void tipc_disc_add_dest(struct tipc_link_req *req) */ void tipc_disc_remove_dest(struct tipc_link_req *req) { + spin_lock_bh(&req->lock); req->num_nodes--; disc_update(req); + spin_unlock_bh(&req->lock); } /** @@ -297,7 +303,7 @@ static void disc_timeout(struct tipc_link_req *req) { int max_delay; - spin_lock_bh(&req->bearer->lock); + spin_lock_bh(&req->lock); /* Stop searching if only desired node has been found */ if (tipc_node(req->domain) && req->num_nodes) { @@ -312,7 +318,7 @@ static void disc_timeout(struct tipc_link_req *req) * hold at fast 
polling rate if don't have any associated nodes, * otherwise hold at slow polling rate */ - tipc_bearer_send(req->bearer, req->buf, &req->dest); + tipc_bearer_send(req->bearer_id, req->buf, &req->dest); req->timer_intv *= 2; @@ -325,7 +331,7 @@ static void disc_timeout(struct tipc_link_req *req) k_start_timer(&req->timer, req->timer_intv); exit: - spin_unlock_bh(&req->bearer->lock); + spin_unlock_bh(&req->lock); } /** @@ -336,8 +342,7 @@ exit: * * Returns 0 if successful, otherwise -errno. */ -int tipc_disc_create(struct tipc_bearer *b_ptr, struct tipc_media_addr *dest, - u32 dest_domain) +int tipc_disc_create(struct tipc_bearer *b_ptr, struct tipc_media_addr *dest) { struct tipc_link_req *req; @@ -345,21 +350,23 @@ int tipc_disc_create(struct tipc_bearer *b_ptr, struct tipc_media_addr *dest, if (!req) return -ENOMEM; - req->buf = tipc_disc_init_msg(DSC_REQ_MSG, dest_domain, b_ptr); + req->buf = tipc_buf_acquire(INT_H_SIZE); if (!req->buf) { kfree(req); - return -ENOMSG; + return -ENOMEM; } + tipc_disc_init_msg(req->buf, DSC_REQ_MSG, b_ptr); memcpy(&req->dest, dest, sizeof(*dest)); - req->bearer = b_ptr; - req->domain = dest_domain; + req->bearer_id = b_ptr->identity; + req->domain = b_ptr->domain; req->num_nodes = 0; req->timer_intv = TIPC_LINK_REQ_INIT; + spin_lock_init(&req->lock); k_init_timer(&req->timer, (Handler)disc_timeout, (unsigned long)req); k_start_timer(&req->timer, req->timer_intv); b_ptr->link_req = req; - tipc_bearer_send(req->bearer, req->buf, &req->dest); + tipc_bearer_send(req->bearer_id, req->buf, &req->dest); return 0; } @@ -374,3 +381,23 @@ void tipc_disc_delete(struct tipc_link_req *req) kfree_skb(req->buf); kfree(req); } + +/** + * tipc_disc_reset - reset object to send periodic link setup requests + * @b_ptr: ptr to bearer issuing requests + * @dest_domain: network domain to which links can be established + */ +void tipc_disc_reset(struct tipc_bearer *b_ptr) +{ + struct tipc_link_req *req = b_ptr->link_req; + + spin_lock_bh(&req->lock); + tipc_disc_init_msg(req->buf, DSC_REQ_MSG, b_ptr); + req->bearer_id = b_ptr->identity; + req->domain = b_ptr->domain; + req->num_nodes = 0; + req->timer_intv = TIPC_LINK_REQ_INIT; + k_start_timer(&req->timer, req->timer_intv); + tipc_bearer_send(req->bearer_id, req->buf, &req->dest); + spin_unlock_bh(&req->lock); +} diff --git a/net/tipc/discover.h b/net/tipc/discover.h index 75b67c403aa..515b57392f4 100644 --- a/net/tipc/discover.h +++ b/net/tipc/discover.h @@ -39,11 +39,11 @@ struct tipc_link_req; -int tipc_disc_create(struct tipc_bearer *b_ptr, struct tipc_media_addr *dest, - u32 dest_domain); +int tipc_disc_create(struct tipc_bearer *b_ptr, struct tipc_media_addr *dest); void tipc_disc_delete(struct tipc_link_req *req); +void tipc_disc_reset(struct tipc_bearer *b_ptr); void tipc_disc_add_dest(struct tipc_link_req *req); void tipc_disc_remove_dest(struct tipc_link_req *req); -void tipc_disc_recv_msg(struct sk_buff *buf, struct tipc_bearer *b_ptr); +void tipc_disc_rcv(struct sk_buff *buf, struct tipc_bearer *b_ptr); #endif diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index 1e3c33250db..5e1426f1751 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -1,7 +1,7 @@ /* * net/tipc/eth_media.c: Ethernet bearer support for TIPC * - * Copyright (c) 2001-2007, Ericsson AB + * Copyright (c) 2001-2007, 2013-2014, Ericsson AB * Copyright (c) 2005-2008, 2011-2013, Wind River Systems * All rights reserved. 
* @@ -37,330 +37,65 @@ #include "core.h" #include "bearer.h" -#define MAX_ETH_MEDIA MAX_BEARERS +#define ETH_ADDR_OFFSET 4 /* MAC addr position inside address field */ -#define ETH_ADDR_OFFSET 4 /* message header offset of MAC address */ - -/** - * struct eth_media - Ethernet bearer data structure - * @bearer: ptr to associated "generic" bearer structure - * @dev: ptr to associated Ethernet network device - * @tipc_packet_type: used in binding TIPC to Ethernet driver - * @setup: work item used when enabling bearer - * @cleanup: work item used when disabling bearer - */ -struct eth_media { - struct tipc_bearer *bearer; - struct net_device *dev; - struct packet_type tipc_packet_type; - struct work_struct setup; - struct work_struct cleanup; -}; - -static struct tipc_media eth_media_info; -static struct eth_media eth_media_array[MAX_ETH_MEDIA]; -static int eth_started; - -static int recv_notification(struct notifier_block *nb, unsigned long evt, - void *dv); -/* - * Network device notifier info - */ -static struct notifier_block notifier = { - .notifier_call = recv_notification, - .priority = 0 -}; - -/** - * eth_media_addr_set - initialize Ethernet media address structure - * - * Media-dependent "value" field stores MAC address in first 6 bytes - * and zeroes out the remaining bytes. - */ -static void eth_media_addr_set(const struct tipc_bearer *tb_ptr, - struct tipc_media_addr *a, char *mac) +/* Convert Ethernet address (media address format) to string */ +static int tipc_eth_addr2str(struct tipc_media_addr *addr, + char *strbuf, int bufsz) { - memcpy(a->value, mac, ETH_ALEN); - memset(a->value + ETH_ALEN, 0, sizeof(a->value) - ETH_ALEN); - a->media_id = TIPC_MEDIA_TYPE_ETH; - a->broadcast = !memcmp(mac, tb_ptr->bcast_addr.value, ETH_ALEN); -} - -/** - * send_msg - send a TIPC message out over an Ethernet interface - */ -static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr, - struct tipc_media_addr *dest) -{ - struct sk_buff *clone; - struct net_device *dev; - int delta; - - clone = skb_clone(buf, GFP_ATOMIC); - if (!clone) - return 0; - - dev = ((struct eth_media *)(tb_ptr->usr_handle))->dev; - delta = dev->hard_header_len - skb_headroom(buf); - - if ((delta > 0) && - pskb_expand_head(clone, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) { - kfree_skb(clone); - return 0; - } + if (bufsz < 18) /* 18 = strlen("aa:bb:cc:dd:ee:ff\0") */ + return 1; - skb_reset_network_header(clone); - clone->dev = dev; - clone->protocol = htons(ETH_P_TIPC); - dev_hard_header(clone, dev, ETH_P_TIPC, dest->value, - dev->dev_addr, clone->len); - dev_queue_xmit(clone); + sprintf(strbuf, "%pM", addr->value); return 0; } -/** - * recv_msg - handle incoming TIPC message from an Ethernet interface - * - * Accept only packets explicitly sent to this node, or broadcast packets; - * ignores packets sent using Ethernet multicast, and traffic sent to other - * nodes (which can happen if interface is running in promiscuous mode). 
- */ -static int recv_msg(struct sk_buff *buf, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) -{ - struct eth_media *eb_ptr = (struct eth_media *)pt->af_packet_priv; - - if (!net_eq(dev_net(dev), &init_net)) { - kfree_skb(buf); - return NET_RX_DROP; - } - - if (likely(eb_ptr->bearer)) { - if (likely(buf->pkt_type <= PACKET_BROADCAST)) { - buf->next = NULL; - tipc_recv_msg(buf, eb_ptr->bearer); - return NET_RX_SUCCESS; - } - } - kfree_skb(buf); - return NET_RX_DROP; -} - -/** - * setup_media - setup association between Ethernet bearer and interface - */ -static void setup_media(struct work_struct *work) -{ - struct eth_media *eb_ptr = - container_of(work, struct eth_media, setup); - - dev_add_pack(&eb_ptr->tipc_packet_type); -} - -/** - * enable_media - attach TIPC bearer to an Ethernet interface - */ -static int enable_media(struct tipc_bearer *tb_ptr) +/* Convert from media address format to discovery message addr format */ +static int tipc_eth_addr2msg(char *msg, struct tipc_media_addr *addr) { - struct net_device *dev; - struct eth_media *eb_ptr = ð_media_array[0]; - struct eth_media *stop = ð_media_array[MAX_ETH_MEDIA]; - char *driver_name = strchr((const char *)tb_ptr->name, ':') + 1; - int pending_dev = 0; - - /* Find unused Ethernet bearer structure */ - while (eb_ptr->dev) { - if (!eb_ptr->bearer) - pending_dev++; - if (++eb_ptr == stop) - return pending_dev ? -EAGAIN : -EDQUOT; - } - - /* Find device with specified name */ - dev = dev_get_by_name(&init_net, driver_name); - if (!dev) - return -ENODEV; - - /* Create Ethernet bearer for device */ - eb_ptr->dev = dev; - eb_ptr->tipc_packet_type.type = htons(ETH_P_TIPC); - eb_ptr->tipc_packet_type.dev = dev; - eb_ptr->tipc_packet_type.func = recv_msg; - eb_ptr->tipc_packet_type.af_packet_priv = eb_ptr; - INIT_LIST_HEAD(&(eb_ptr->tipc_packet_type.list)); - INIT_WORK(&eb_ptr->setup, setup_media); - schedule_work(&eb_ptr->setup); - - /* Associate TIPC bearer with Ethernet bearer */ - eb_ptr->bearer = tb_ptr; - tb_ptr->usr_handle = (void *)eb_ptr; - memset(tb_ptr->bcast_addr.value, 0, sizeof(tb_ptr->bcast_addr.value)); - memcpy(tb_ptr->bcast_addr.value, dev->broadcast, ETH_ALEN); - tb_ptr->bcast_addr.media_id = TIPC_MEDIA_TYPE_ETH; - tb_ptr->bcast_addr.broadcast = 1; - tb_ptr->mtu = dev->mtu; - eth_media_addr_set(tb_ptr, &tb_ptr->addr, (char *)dev->dev_addr); + memset(msg, 0, TIPC_MEDIA_ADDR_SIZE); + msg[TIPC_MEDIA_TYPE_OFFSET] = TIPC_MEDIA_TYPE_ETH; + memcpy(msg + ETH_ADDR_OFFSET, addr->value, ETH_ALEN); return 0; } -/** - * cleanup_media - break association between Ethernet bearer and interface - * - * This routine must be invoked from a work queue because it can sleep. - */ -static void cleanup_media(struct work_struct *work) +/* Convert raw mac address format to media addr format */ +static int tipc_eth_raw2addr(struct tipc_bearer *b, + struct tipc_media_addr *addr, + char *msg) { - struct eth_media *eb_ptr = - container_of(work, struct eth_media, cleanup); - - dev_remove_pack(&eb_ptr->tipc_packet_type); - dev_put(eb_ptr->dev); - eb_ptr->dev = NULL; -} + char bcast_mac[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; -/** - * disable_media - detach TIPC bearer from an Ethernet interface - * - * Mark Ethernet bearer as inactive so that incoming buffers are thrown away, - * then get worker thread to complete bearer cleanup. (Can't do cleanup - * here because cleanup code needs to sleep and caller holds spinlocks.) 
- */ -static void disable_media(struct tipc_bearer *tb_ptr) -{ - struct eth_media *eb_ptr = (struct eth_media *)tb_ptr->usr_handle; - - eb_ptr->bearer = NULL; - INIT_WORK(&eb_ptr->cleanup, cleanup_media); - schedule_work(&eb_ptr->cleanup); -} - -/** - * recv_notification - handle device updates from OS - * - * Change the state of the Ethernet bearer (if any) associated with the - * specified device. - */ -static int recv_notification(struct notifier_block *nb, unsigned long evt, - void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct eth_media *eb_ptr = ð_media_array[0]; - struct eth_media *stop = ð_media_array[MAX_ETH_MEDIA]; - - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - - while ((eb_ptr->dev != dev)) { - if (++eb_ptr == stop) - return NOTIFY_DONE; /* couldn't find device */ - } - if (!eb_ptr->bearer) - return NOTIFY_DONE; /* bearer had been disabled */ - - eb_ptr->bearer->mtu = dev->mtu; - - switch (evt) { - case NETDEV_CHANGE: - if (netif_carrier_ok(dev)) - break; - case NETDEV_DOWN: - case NETDEV_CHANGEMTU: - case NETDEV_CHANGEADDR: - tipc_reset_bearer(eb_ptr->bearer); - break; - case NETDEV_UNREGISTER: - case NETDEV_CHANGENAME: - tipc_disable_bearer(eb_ptr->bearer->name); - break; - } - return NOTIFY_OK; -} - -/** - * eth_addr2str - convert Ethernet address to string - */ -static int eth_addr2str(struct tipc_media_addr *a, char *str_buf, int str_size) -{ - if (str_size < 18) /* 18 = strlen("aa:bb:cc:dd:ee:ff\0") */ - return 1; - - sprintf(str_buf, "%pM", a->value); + memset(addr, 0, sizeof(*addr)); + ether_addr_copy(addr->value, msg); + addr->media_id = TIPC_MEDIA_TYPE_ETH; + addr->broadcast = !memcmp(addr->value, bcast_mac, ETH_ALEN); return 0; } -/** - * eth_str2addr - convert Ethernet address format to message header format - */ -static int eth_addr2msg(struct tipc_media_addr *a, char *msg_area) +/* Convert discovery msg addr format to Ethernet media addr format */ +static int tipc_eth_msg2addr(struct tipc_bearer *b, + struct tipc_media_addr *addr, + char *msg) { - memset(msg_area, 0, TIPC_MEDIA_ADDR_SIZE); - msg_area[TIPC_MEDIA_TYPE_OFFSET] = TIPC_MEDIA_TYPE_ETH; - memcpy(msg_area + ETH_ADDR_OFFSET, a->value, ETH_ALEN); - return 0; + /* Skip past preamble: */ + msg += ETH_ADDR_OFFSET; + return tipc_eth_raw2addr(b, addr, msg); } -/** - * eth_str2addr - convert message header address format to Ethernet format - */ -static int eth_msg2addr(const struct tipc_bearer *tb_ptr, - struct tipc_media_addr *a, char *msg_area) -{ - if (msg_area[TIPC_MEDIA_TYPE_OFFSET] != TIPC_MEDIA_TYPE_ETH) - return 1; - - eth_media_addr_set(tb_ptr, a, msg_area + ETH_ADDR_OFFSET); - return 0; -} - -/* - * Ethernet media registration info - */ -static struct tipc_media eth_media_info = { - .send_msg = send_msg, - .enable_media = enable_media, - .disable_media = disable_media, - .addr2str = eth_addr2str, - .addr2msg = eth_addr2msg, - .msg2addr = eth_msg2addr, +/* Ethernet media registration info */ +struct tipc_media eth_media_info = { + .send_msg = tipc_l2_send_msg, + .enable_media = tipc_enable_l2_media, + .disable_media = tipc_disable_l2_media, + .addr2str = tipc_eth_addr2str, + .addr2msg = tipc_eth_addr2msg, + .msg2addr = tipc_eth_msg2addr, + .raw2addr = tipc_eth_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, .window = TIPC_DEF_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_ETH, + .hwaddr_len = ETH_ALEN, .name = "eth" }; - -/** - * tipc_eth_media_start - activate Ethernet bearer support - * - * Register Ethernet media type with TIPC bearer 
code. Also register - * with OS for notifications about device state changes. - */ -int tipc_eth_media_start(void) -{ - int res; - - if (eth_started) - return -EINVAL; - - res = tipc_register_media(ð_media_info); - if (res) - return res; - - res = register_netdevice_notifier(¬ifier); - if (!res) - eth_started = 1; - return res; -} - -/** - * tipc_eth_media_stop - deactivate Ethernet bearer support - */ -void tipc_eth_media_stop(void) -{ - if (!eth_started) - return; - - flush_scheduled_work(); - unregister_netdevice_notifier(¬ifier); - eth_started = 0; -} diff --git a/net/tipc/handler.c b/net/tipc/handler.c deleted file mode 100644 index b36f0fcd9bd..00000000000 --- a/net/tipc/handler.c +++ /dev/null @@ -1,130 +0,0 @@ -/* - * net/tipc/handler.c: TIPC signal handling - * - * Copyright (c) 2000-2006, Ericsson AB - * Copyright (c) 2005, Wind River Systems - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the names of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "core.h" - -struct queue_item { - struct list_head next_signal; - void (*handler) (unsigned long); - unsigned long data; -}; - -static struct kmem_cache *tipc_queue_item_cache; -static struct list_head signal_queue_head; -static DEFINE_SPINLOCK(qitem_lock); -static int handler_enabled __read_mostly; - -static void process_signal_queue(unsigned long dummy); - -static DECLARE_TASKLET_DISABLED(tipc_tasklet, process_signal_queue, 0); - - -unsigned int tipc_k_signal(Handler routine, unsigned long argument) -{ - struct queue_item *item; - - if (!handler_enabled) { - pr_err("Signal request ignored by handler\n"); - return -ENOPROTOOPT; - } - - spin_lock_bh(&qitem_lock); - item = kmem_cache_alloc(tipc_queue_item_cache, GFP_ATOMIC); - if (!item) { - pr_err("Signal queue out of memory\n"); - spin_unlock_bh(&qitem_lock); - return -ENOMEM; - } - item->handler = routine; - item->data = argument; - list_add_tail(&item->next_signal, &signal_queue_head); - spin_unlock_bh(&qitem_lock); - tasklet_schedule(&tipc_tasklet); - return 0; -} - -static void process_signal_queue(unsigned long dummy) -{ - struct queue_item *__volatile__ item; - struct list_head *l, *n; - - spin_lock_bh(&qitem_lock); - list_for_each_safe(l, n, &signal_queue_head) { - item = list_entry(l, struct queue_item, next_signal); - list_del(&item->next_signal); - spin_unlock_bh(&qitem_lock); - item->handler(item->data); - spin_lock_bh(&qitem_lock); - kmem_cache_free(tipc_queue_item_cache, item); - } - spin_unlock_bh(&qitem_lock); -} - -int tipc_handler_start(void) -{ - tipc_queue_item_cache = - kmem_cache_create("tipc_queue_items", sizeof(struct queue_item), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (!tipc_queue_item_cache) - return -ENOMEM; - - INIT_LIST_HEAD(&signal_queue_head); - tasklet_enable(&tipc_tasklet); - handler_enabled = 1; - return 0; -} - -void tipc_handler_stop(void) -{ - struct list_head *l, *n; - struct queue_item *item; - - if (!handler_enabled) - return; - - handler_enabled = 0; - tasklet_kill(&tipc_tasklet); - - spin_lock_bh(&qitem_lock); - list_for_each_safe(l, n, &signal_queue_head) { - item = list_entry(l, struct queue_item, next_signal); - list_del(&item->next_signal); - kmem_cache_free(tipc_queue_item_cache, item); - } - spin_unlock_bh(&qitem_lock); - - kmem_cache_destroy(tipc_queue_item_cache); -} diff --git a/net/tipc/ib_media.c b/net/tipc/ib_media.c index cbe7fe15cc7..8522eef9c13 100644 --- a/net/tipc/ib_media.c +++ b/net/tipc/ib_media.c @@ -42,242 +42,9 @@ #include "core.h" #include "bearer.h" -#define MAX_IB_MEDIA MAX_BEARERS - -/** - * struct ib_media - Infiniband media data structure - * @bearer: ptr to associated "generic" bearer structure - * @dev: ptr to associated Infiniband network device - * @tipc_packet_type: used in binding TIPC to Infiniband driver - * @cleanup: work item used when disabling bearer - */ - -struct ib_media { - struct tipc_bearer *bearer; - struct net_device *dev; - struct packet_type tipc_packet_type; - struct work_struct setup; - struct work_struct cleanup; -}; - -static struct tipc_media ib_media_info; -static struct ib_media ib_media_array[MAX_IB_MEDIA]; -static int ib_started; - -/** - * ib_media_addr_set - initialize Infiniband media address structure - * - * Media-dependent "value" field stores MAC address in first 6 bytes - * and zeroes out the remaining bytes. 
- */ -static void ib_media_addr_set(const struct tipc_bearer *tb_ptr, - struct tipc_media_addr *a, char *mac) -{ - BUILD_BUG_ON(sizeof(a->value) < INFINIBAND_ALEN); - memcpy(a->value, mac, INFINIBAND_ALEN); - a->media_id = TIPC_MEDIA_TYPE_IB; - a->broadcast = !memcmp(mac, tb_ptr->bcast_addr.value, INFINIBAND_ALEN); -} - -/** - * send_msg - send a TIPC message out over an InfiniBand interface - */ -static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr, - struct tipc_media_addr *dest) -{ - struct sk_buff *clone; - struct net_device *dev; - int delta; - - clone = skb_clone(buf, GFP_ATOMIC); - if (!clone) - return 0; - - dev = ((struct ib_media *)(tb_ptr->usr_handle))->dev; - delta = dev->hard_header_len - skb_headroom(buf); - - if ((delta > 0) && - pskb_expand_head(clone, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) { - kfree_skb(clone); - return 0; - } - - skb_reset_network_header(clone); - clone->dev = dev; - clone->protocol = htons(ETH_P_TIPC); - dev_hard_header(clone, dev, ETH_P_TIPC, dest->value, - dev->dev_addr, clone->len); - dev_queue_xmit(clone); - return 0; -} - -/** - * recv_msg - handle incoming TIPC message from an InfiniBand interface - * - * Accept only packets explicitly sent to this node, or broadcast packets; - * ignores packets sent using InfiniBand multicast, and traffic sent to other - * nodes (which can happen if interface is running in promiscuous mode). - */ -static int recv_msg(struct sk_buff *buf, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) -{ - struct ib_media *ib_ptr = (struct ib_media *)pt->af_packet_priv; - - if (!net_eq(dev_net(dev), &init_net)) { - kfree_skb(buf); - return NET_RX_DROP; - } - - if (likely(ib_ptr->bearer)) { - if (likely(buf->pkt_type <= PACKET_BROADCAST)) { - buf->next = NULL; - tipc_recv_msg(buf, ib_ptr->bearer); - return NET_RX_SUCCESS; - } - } - kfree_skb(buf); - return NET_RX_DROP; -} - -/** - * setup_bearer - setup association between InfiniBand bearer and interface - */ -static void setup_media(struct work_struct *work) -{ - struct ib_media *ib_ptr = - container_of(work, struct ib_media, setup); - - dev_add_pack(&ib_ptr->tipc_packet_type); -} - -/** - * enable_media - attach TIPC bearer to an InfiniBand interface - */ -static int enable_media(struct tipc_bearer *tb_ptr) -{ - struct net_device *dev; - struct ib_media *ib_ptr = &ib_media_array[0]; - struct ib_media *stop = &ib_media_array[MAX_IB_MEDIA]; - char *driver_name = strchr((const char *)tb_ptr->name, ':') + 1; - int pending_dev = 0; - - /* Find unused InfiniBand bearer structure */ - while (ib_ptr->dev) { - if (!ib_ptr->bearer) - pending_dev++; - if (++ib_ptr == stop) - return pending_dev ? 
-EAGAIN : -EDQUOT; - } - - /* Find device with specified name */ - dev = dev_get_by_name(&init_net, driver_name); - if (!dev) - return -ENODEV; - - /* Create InfiniBand bearer for device */ - ib_ptr->dev = dev; - ib_ptr->tipc_packet_type.type = htons(ETH_P_TIPC); - ib_ptr->tipc_packet_type.dev = dev; - ib_ptr->tipc_packet_type.func = recv_msg; - ib_ptr->tipc_packet_type.af_packet_priv = ib_ptr; - INIT_LIST_HEAD(&(ib_ptr->tipc_packet_type.list)); - INIT_WORK(&ib_ptr->setup, setup_media); - schedule_work(&ib_ptr->setup); - - /* Associate TIPC bearer with InfiniBand bearer */ - ib_ptr->bearer = tb_ptr; - tb_ptr->usr_handle = (void *)ib_ptr; - memset(tb_ptr->bcast_addr.value, 0, sizeof(tb_ptr->bcast_addr.value)); - memcpy(tb_ptr->bcast_addr.value, dev->broadcast, INFINIBAND_ALEN); - tb_ptr->bcast_addr.media_id = TIPC_MEDIA_TYPE_IB; - tb_ptr->bcast_addr.broadcast = 1; - tb_ptr->mtu = dev->mtu; - ib_media_addr_set(tb_ptr, &tb_ptr->addr, (char *)dev->dev_addr); - return 0; -} - -/** - * cleanup_bearer - break association between InfiniBand bearer and interface - * - * This routine must be invoked from a work queue because it can sleep. - */ -static void cleanup_bearer(struct work_struct *work) -{ - struct ib_media *ib_ptr = - container_of(work, struct ib_media, cleanup); - - dev_remove_pack(&ib_ptr->tipc_packet_type); - dev_put(ib_ptr->dev); - ib_ptr->dev = NULL; -} - -/** - * disable_media - detach TIPC bearer from an InfiniBand interface - * - * Mark InfiniBand bearer as inactive so that incoming buffers are thrown away, - * then get worker thread to complete bearer cleanup. (Can't do cleanup - * here because cleanup code needs to sleep and caller holds spinlocks.) - */ -static void disable_media(struct tipc_bearer *tb_ptr) -{ - struct ib_media *ib_ptr = (struct ib_media *)tb_ptr->usr_handle; - - ib_ptr->bearer = NULL; - INIT_WORK(&ib_ptr->cleanup, cleanup_bearer); - schedule_work(&ib_ptr->cleanup); -} - -/** - * recv_notification - handle device updates from OS - * - * Change the state of the InfiniBand bearer (if any) associated with the - * specified device. 
- */ -static int recv_notification(struct notifier_block *nb, unsigned long evt, - void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct ib_media *ib_ptr = &ib_media_array[0]; - struct ib_media *stop = &ib_media_array[MAX_IB_MEDIA]; - - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - - while ((ib_ptr->dev != dev)) { - if (++ib_ptr == stop) - return NOTIFY_DONE; /* couldn't find device */ - } - if (!ib_ptr->bearer) - return NOTIFY_DONE; /* bearer had been disabled */ - - ib_ptr->bearer->mtu = dev->mtu; - - switch (evt) { - case NETDEV_CHANGE: - if (netif_carrier_ok(dev)) - break; - case NETDEV_DOWN: - case NETDEV_CHANGEMTU: - case NETDEV_CHANGEADDR: - tipc_reset_bearer(ib_ptr->bearer); - break; - case NETDEV_UNREGISTER: - case NETDEV_CHANGENAME: - tipc_disable_bearer(ib_ptr->bearer->name); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block notifier = { - .notifier_call = recv_notification, - .priority = 0, -}; - -/** - * ib_addr2str - convert InfiniBand address to string - */ -static int ib_addr2str(struct tipc_media_addr *a, char *str_buf, int str_size) +/* convert InfiniBand address (media address format) media address to string */ +static int tipc_ib_addr2str(struct tipc_media_addr *a, char *str_buf, + int str_size) { if (str_size < 60) /* 60 = 19 * strlen("xx:") + strlen("xx\0") */ return 1; @@ -287,76 +54,48 @@ static int ib_addr2str(struct tipc_media_addr *a, char *str_buf, int str_size) return 0; } -/** - * ib_addr2msg - convert InfiniBand address format to message header format - */ -static int ib_addr2msg(struct tipc_media_addr *a, char *msg_area) +/* Convert from media address format to discovery message addr format */ +static int tipc_ib_addr2msg(char *msg, struct tipc_media_addr *addr) { - memset(msg_area, 0, TIPC_MEDIA_ADDR_SIZE); - msg_area[TIPC_MEDIA_TYPE_OFFSET] = TIPC_MEDIA_TYPE_IB; - memcpy(msg_area, a->value, INFINIBAND_ALEN); + memset(msg, 0, TIPC_MEDIA_ADDR_SIZE); + memcpy(msg, addr->value, INFINIBAND_ALEN); return 0; } -/** - * ib_msg2addr - convert message header address format to InfiniBand format - */ -static int ib_msg2addr(const struct tipc_bearer *tb_ptr, - struct tipc_media_addr *a, char *msg_area) +/* Convert raw InfiniBand address format to media addr format */ +static int tipc_ib_raw2addr(struct tipc_bearer *b, + struct tipc_media_addr *addr, + char *msg) { - ib_media_addr_set(tb_ptr, a, msg_area); + memset(addr, 0, sizeof(*addr)); + memcpy(addr->value, msg, INFINIBAND_ALEN); + addr->media_id = TIPC_MEDIA_TYPE_IB; + addr->broadcast = !memcmp(msg, b->bcast_addr.value, + INFINIBAND_ALEN); return 0; } -/* - * InfiniBand media registration info - */ -static struct tipc_media ib_media_info = { - .send_msg = send_msg, - .enable_media = enable_media, - .disable_media = disable_media, - .addr2str = ib_addr2str, - .addr2msg = ib_addr2msg, - .msg2addr = ib_msg2addr, +/* Convert discovery msg addr format to InfiniBand media addr format */ +static int tipc_ib_msg2addr(struct tipc_bearer *b, + struct tipc_media_addr *addr, + char *msg) +{ + return tipc_ib_raw2addr(b, addr, msg); +} + +/* InfiniBand media registration info */ +struct tipc_media ib_media_info = { + .send_msg = tipc_l2_send_msg, + .enable_media = tipc_enable_l2_media, + .disable_media = tipc_disable_l2_media, + .addr2str = tipc_ib_addr2str, + .addr2msg = tipc_ib_addr2msg, + .msg2addr = tipc_ib_msg2addr, + .raw2addr = tipc_ib_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, .window = TIPC_DEF_LINK_WIN, .type_id = 
TIPC_MEDIA_TYPE_IB, + .hwaddr_len = INFINIBAND_ALEN, .name = "ib" }; - -/** - * tipc_ib_media_start - activate InfiniBand bearer support - * - * Register InfiniBand media type with TIPC bearer code. Also register - * with OS for notifications about device state changes. - */ -int tipc_ib_media_start(void) -{ - int res; - - if (ib_started) - return -EINVAL; - - res = tipc_register_media(&ib_media_info); - if (res) - return res; - - res = register_netdevice_notifier(¬ifier); - if (!res) - ib_started = 1; - return res; -} - -/** - * tipc_ib_media_stop - deactivate InfiniBand bearer support - */ -void tipc_ib_media_stop(void) -{ - if (!ib_started) - return; - - flush_scheduled_work(); - unregister_netdevice_notifier(¬ifier); - ib_started = 0; -} diff --git a/net/tipc/link.c b/net/tipc/link.c index fd340ad742e..ad2c57f5868 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1,7 +1,7 @@ /* * net/tipc/link.c: TIPC link code * - * Copyright (c) 1996-2007, 2012, Ericsson AB + * Copyright (c) 1996-2007, 2012-2014, Ericsson AB * Copyright (c) 2004-2007, 2010-2013, Wind River Systems * All rights reserved. * @@ -37,6 +37,7 @@ #include "core.h" #include "link.h" #include "port.h" +#include "socket.h" #include "name_distr.h" #include "discover.h" #include "config.h" @@ -77,20 +78,19 @@ static const char *link_unk_evt = "Unknown link event "; static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr, struct sk_buff *buf); -static void link_recv_proto_msg(struct tipc_link *l_ptr, struct sk_buff *buf); -static int link_recv_changeover_msg(struct tipc_link **l_ptr, - struct sk_buff **buf); +static void tipc_link_proto_rcv(struct tipc_link *l_ptr, struct sk_buff *buf); +static int tipc_link_tunnel_rcv(struct tipc_node *n_ptr, + struct sk_buff **buf); static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tolerance); -static int link_send_sections_long(struct tipc_port *sender, - struct iovec const *msg_sect, - unsigned int len, u32 destnode); +static int tipc_link_iovec_long_xmit(struct tipc_port *sender, + struct iovec const *msg_sect, + unsigned int len, u32 destnode); static void link_state_event(struct tipc_link *l_ptr, u32 event); static void link_reset_statistics(struct tipc_link *l_ptr); static void link_print(struct tipc_link *l_ptr, const char *str); -static void link_start(struct tipc_link *l_ptr); -static int link_send_long_buf(struct tipc_link *l_ptr, struct sk_buff *buf); -static void tipc_link_send_sync(struct tipc_link *l); -static void tipc_link_recv_sync(struct tipc_node *n, struct sk_buff *buf); +static int tipc_link_frag_xmit(struct tipc_link *l_ptr, struct sk_buff *buf); +static void tipc_link_sync_xmit(struct tipc_link *l); +static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf); /* * Simple link routines @@ -102,9 +102,18 @@ static unsigned int align(unsigned int i) static void link_init_max_pkt(struct tipc_link *l_ptr) { + struct tipc_bearer *b_ptr; u32 max_pkt; - max_pkt = (l_ptr->b_ptr->mtu & ~3); + rcu_read_lock(); + b_ptr = rcu_dereference_rtnl(bearer_list[l_ptr->bearer_id]); + if (!b_ptr) { + rcu_read_unlock(); + return; + } + max_pkt = (b_ptr->mtu & ~3); + rcu_read_unlock(); + if (max_pkt > MAX_MSG_SIZE) max_pkt = MAX_MSG_SIZE; @@ -148,11 +157,6 @@ int tipc_link_is_active(struct tipc_link *l_ptr) /** * link_timeout - handle expiration of link timer * @l_ptr: pointer to link - * - * This routine must not grab "tipc_net_lock" to avoid a potential deadlock conflict - * with tipc_link_delete(). 
(There is no risk that the node will be deleted by - * another thread because tipc_link_delete() always cancels the link timer before - * tipc_node_delete() is called.) */ static void link_timeout(struct tipc_link *l_ptr) { @@ -214,8 +218,8 @@ static void link_set_timer(struct tipc_link *l_ptr, u32 time) * Returns pointer to link. */ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, - struct tipc_bearer *b_ptr, - const struct tipc_media_addr *media_addr) + struct tipc_bearer *b_ptr, + const struct tipc_media_addr *media_addr) { struct tipc_link *l_ptr; struct tipc_msg *msg; @@ -254,7 +258,7 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, l_ptr->owner = n_ptr; l_ptr->checkpoint = 1; l_ptr->peer_session = INVALID_SESSION; - l_ptr->b_ptr = b_ptr; + l_ptr->bearer_id = b_ptr->identity; link_set_supervision_props(l_ptr, b_ptr->tolerance); l_ptr->state = RESET_UNKNOWN; @@ -269,6 +273,7 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, l_ptr->priority = b_ptr->priority; tipc_link_set_queue_limits(l_ptr, b_ptr->window); + l_ptr->net_plane = b_ptr->net_plane; link_init_max_pkt(l_ptr); l_ptr->next_out_no = 1; @@ -278,45 +283,44 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, tipc_node_attach_link(n_ptr, l_ptr); - k_init_timer(&l_ptr->timer, (Handler)link_timeout, (unsigned long)l_ptr); - list_add_tail(&l_ptr->link_list, &b_ptr->links); - tipc_k_signal((Handler)link_start, (unsigned long)l_ptr); + k_init_timer(&l_ptr->timer, (Handler)link_timeout, + (unsigned long)l_ptr); + + link_state_event(l_ptr, STARTING_EVT); return l_ptr; } -/** - * tipc_link_delete - delete a link - * @l_ptr: pointer to link - * - * Note: 'tipc_net_lock' is write_locked, bearer is locked. - * This routine must not grab the node lock until after link timer cancellation - * to avoid a potential deadlock situation. 
- */ -void tipc_link_delete(struct tipc_link *l_ptr) +void tipc_link_delete_list(unsigned int bearer_id, bool shutting_down) { - if (!l_ptr) { - pr_err("Attempt to delete non-existent link\n"); - return; - } - - k_cancel_timer(&l_ptr->timer); + struct tipc_link *l_ptr; + struct tipc_node *n_ptr; - tipc_node_lock(l_ptr->owner); - tipc_link_reset(l_ptr); - tipc_node_detach_link(l_ptr->owner, l_ptr); - tipc_link_stop(l_ptr); - list_del_init(&l_ptr->link_list); - tipc_node_unlock(l_ptr->owner); - k_term_timer(&l_ptr->timer); - kfree(l_ptr); -} + rcu_read_lock(); + list_for_each_entry_rcu(n_ptr, &tipc_node_list, list) { + tipc_node_lock(n_ptr); + l_ptr = n_ptr->links[bearer_id]; + if (l_ptr) { + tipc_link_reset(l_ptr); + if (shutting_down || !tipc_node_is_up(n_ptr)) { + tipc_node_detach_link(l_ptr->owner, l_ptr); + tipc_link_reset_fragments(l_ptr); + tipc_node_unlock(n_ptr); -static void link_start(struct tipc_link *l_ptr) -{ - tipc_node_lock(l_ptr->owner); - link_state_event(l_ptr, STARTING_EVT); - tipc_node_unlock(l_ptr->owner); + /* Nobody else can access this link now: */ + del_timer_sync(&l_ptr->timer); + kfree(l_ptr); + } else { + /* Detach/delete when failover is finished: */ + l_ptr->flags |= LINK_STOPPED; + tipc_node_unlock(n_ptr); + del_timer_sync(&l_ptr->timer); + } + continue; + } + tipc_node_unlock(n_ptr); + } + rcu_read_unlock(); } /** @@ -335,8 +339,6 @@ static int link_schedule_port(struct tipc_link *l_ptr, u32 origport, u32 sz) spin_lock_bh(&tipc_port_list_lock); p_ptr = tipc_port_lock(origport); if (p_ptr) { - if (!p_ptr->wakeup) - goto exit; if (!list_empty(&p_ptr->wait_list)) goto exit; p_ptr->congested = 1; @@ -371,7 +373,7 @@ void tipc_link_wakeup_ports(struct tipc_link *l_ptr, int all) list_del_init(&p_ptr->wait_list); spin_lock_bh(p_ptr->lock); p_ptr->congested = 0; - p_ptr->wakeup(p_ptr); + tipc_port_wakeup(p_ptr); win -= p_ptr->waiting_pkts; spin_unlock_bh(p_ptr->lock); } @@ -386,14 +388,7 @@ exit: */ static void link_release_outqueue(struct tipc_link *l_ptr) { - struct sk_buff *buf = l_ptr->first_out; - struct sk_buff *next; - - while (buf) { - next = buf->next; - kfree_skb(buf); - buf = next; - } + kfree_skb_list(l_ptr->first_out); l_ptr->first_out = NULL; l_ptr->out_queue_size = 0; } @@ -404,43 +399,25 @@ static void link_release_outqueue(struct tipc_link *l_ptr) */ void tipc_link_reset_fragments(struct tipc_link *l_ptr) { - kfree_skb(l_ptr->reasm_head); - l_ptr->reasm_head = NULL; - l_ptr->reasm_tail = NULL; + kfree_skb(l_ptr->reasm_buf); + l_ptr->reasm_buf = NULL; } /** - * tipc_link_stop - purge all inbound and outbound messages associated with link + * tipc_link_purge_queues - purge all pkt queues associated with link * @l_ptr: pointer to link */ -void tipc_link_stop(struct tipc_link *l_ptr) +void tipc_link_purge_queues(struct tipc_link *l_ptr) { - struct sk_buff *buf; - struct sk_buff *next; - - buf = l_ptr->oldest_deferred_in; - while (buf) { - next = buf->next; - kfree_skb(buf); - buf = next; - } - - buf = l_ptr->first_out; - while (buf) { - next = buf->next; - kfree_skb(buf); - buf = next; - } - + kfree_skb_list(l_ptr->oldest_deferred_in); + kfree_skb_list(l_ptr->first_out); tipc_link_reset_fragments(l_ptr); - kfree_skb(l_ptr->proto_msg_queue); l_ptr->proto_msg_queue = NULL; } void tipc_link_reset(struct tipc_link *l_ptr) { - struct sk_buff *buf; u32 prev_state = l_ptr->state; u32 checkpoint = l_ptr->next_in_no; int was_active_link = tipc_link_is_active(l_ptr); @@ -459,10 +436,9 @@ void tipc_link_reset(struct tipc_link *l_ptr) return; 
tipc_node_link_down(l_ptr->owner, l_ptr); - tipc_bearer_remove_dest(l_ptr->b_ptr, l_ptr->addr); + tipc_bearer_remove_dest(l_ptr->bearer_id, l_ptr->addr); - if (was_active_link && tipc_node_active_links(l_ptr->owner) && - l_ptr->owner->permit_changeover) { + if (was_active_link && tipc_node_active_links(l_ptr->owner)) { l_ptr->reset_checkpoint = checkpoint; l_ptr->exp_msg_count = START_CHANGEOVER; } @@ -471,12 +447,7 @@ void tipc_link_reset(struct tipc_link *l_ptr) link_release_outqueue(l_ptr); kfree_skb(l_ptr->proto_msg_queue); l_ptr->proto_msg_queue = NULL; - buf = l_ptr->oldest_deferred_in; - while (buf) { - struct sk_buff *next = buf->next; - kfree_skb(buf); - buf = next; - } + kfree_skb_list(l_ptr->oldest_deferred_in); if (!list_empty(&l_ptr->waiting_ports)) tipc_link_wakeup_ports(l_ptr, 1); @@ -496,12 +467,27 @@ void tipc_link_reset(struct tipc_link *l_ptr) link_reset_statistics(l_ptr); } +void tipc_link_reset_list(unsigned int bearer_id) +{ + struct tipc_link *l_ptr; + struct tipc_node *n_ptr; + + rcu_read_lock(); + list_for_each_entry_rcu(n_ptr, &tipc_node_list, list) { + tipc_node_lock(n_ptr); + l_ptr = n_ptr->links[bearer_id]; + if (l_ptr) + tipc_link_reset(l_ptr); + tipc_node_unlock(n_ptr); + } + rcu_read_unlock(); +} static void link_activate(struct tipc_link *l_ptr) { l_ptr->next_in_no = l_ptr->stats.recv_info = 1; tipc_node_link_up(l_ptr->owner, l_ptr); - tipc_bearer_add_dest(l_ptr->b_ptr, l_ptr->addr); + tipc_bearer_add_dest(l_ptr->bearer_id, l_ptr->addr); } /** @@ -514,13 +500,17 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) struct tipc_link *other; u32 cont_intv = l_ptr->continuity_interval; - if (!l_ptr->started && (event != STARTING_EVT)) + if (l_ptr->flags & LINK_STOPPED) + return; + + if (!(l_ptr->flags & LINK_STARTED) && (event != STARTING_EVT)) return; /* Not yet. 
*/ - if (link_blocked(l_ptr)) { + /* Check whether changeover is going on */ + if (l_ptr->exp_msg_count) { if (event == TIMEOUT_EVT) link_set_timer(l_ptr, cont_intv); - return; /* Changeover going on */ + return; } switch (l_ptr->state) { @@ -533,12 +523,12 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) if (l_ptr->next_in_no != l_ptr->checkpoint) { l_ptr->checkpoint = l_ptr->next_in_no; if (tipc_bclink_acks_missing(l_ptr->owner)) { - tipc_link_send_proto_msg(l_ptr, STATE_MSG, - 0, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, STATE_MSG, + 0, 0, 0, 0, 0); l_ptr->fsm_msg_cnt++; } else if (l_ptr->max_pkt < l_ptr->max_pkt_target) { - tipc_link_send_proto_msg(l_ptr, STATE_MSG, - 1, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, STATE_MSG, + 1, 0, 0, 0, 0); l_ptr->fsm_msg_cnt++; } link_set_timer(l_ptr, cont_intv); @@ -546,7 +536,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) } l_ptr->state = WORKING_UNKNOWN; l_ptr->fsm_msg_cnt = 0; - tipc_link_send_proto_msg(l_ptr, STATE_MSG, 1, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0, 0); l_ptr->fsm_msg_cnt++; link_set_timer(l_ptr, cont_intv / 4); break; @@ -556,7 +546,8 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) tipc_link_reset(l_ptr); l_ptr->state = RESET_RESET; l_ptr->fsm_msg_cnt = 0; - tipc_link_send_proto_msg(l_ptr, ACTIVATE_MSG, 0, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, + 0, 0, 0, 0, 0); l_ptr->fsm_msg_cnt++; link_set_timer(l_ptr, cont_intv); break; @@ -578,7 +569,8 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) tipc_link_reset(l_ptr); l_ptr->state = RESET_RESET; l_ptr->fsm_msg_cnt = 0; - tipc_link_send_proto_msg(l_ptr, ACTIVATE_MSG, 0, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, + 0, 0, 0, 0, 0); l_ptr->fsm_msg_cnt++; link_set_timer(l_ptr, cont_intv); break; @@ -588,14 +580,14 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) l_ptr->fsm_msg_cnt = 0; l_ptr->checkpoint = l_ptr->next_in_no; if (tipc_bclink_acks_missing(l_ptr->owner)) { - tipc_link_send_proto_msg(l_ptr, STATE_MSG, - 0, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, STATE_MSG, + 0, 0, 0, 0, 0); l_ptr->fsm_msg_cnt++; } link_set_timer(l_ptr, cont_intv); } else if (l_ptr->fsm_msg_cnt < l_ptr->abort_limit) { - tipc_link_send_proto_msg(l_ptr, STATE_MSG, - 1, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, STATE_MSG, + 1, 0, 0, 0, 0); l_ptr->fsm_msg_cnt++; link_set_timer(l_ptr, cont_intv / 4); } else { /* Link has failed */ @@ -604,8 +596,8 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) tipc_link_reset(l_ptr); l_ptr->state = RESET_UNKNOWN; l_ptr->fsm_msg_cnt = 0; - tipc_link_send_proto_msg(l_ptr, RESET_MSG, - 0, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, RESET_MSG, + 0, 0, 0, 0, 0); l_ptr->fsm_msg_cnt++; link_set_timer(l_ptr, cont_intv); } @@ -625,24 +617,25 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) l_ptr->state = WORKING_WORKING; l_ptr->fsm_msg_cnt = 0; link_activate(l_ptr); - tipc_link_send_proto_msg(l_ptr, STATE_MSG, 1, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0, 0); l_ptr->fsm_msg_cnt++; if (l_ptr->owner->working_links == 1) - tipc_link_send_sync(l_ptr); + tipc_link_sync_xmit(l_ptr); link_set_timer(l_ptr, cont_intv); break; case RESET_MSG: l_ptr->state = RESET_RESET; l_ptr->fsm_msg_cnt = 0; - tipc_link_send_proto_msg(l_ptr, ACTIVATE_MSG, 1, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, + 1, 0, 0, 0, 0); 
l_ptr->fsm_msg_cnt++; link_set_timer(l_ptr, cont_intv); break; case STARTING_EVT: - l_ptr->started = 1; + l_ptr->flags |= LINK_STARTED; /* fall through */ case TIMEOUT_EVT: - tipc_link_send_proto_msg(l_ptr, RESET_MSG, 0, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, RESET_MSG, 0, 0, 0, 0, 0); l_ptr->fsm_msg_cnt++; link_set_timer(l_ptr, cont_intv); break; @@ -660,16 +653,17 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) l_ptr->state = WORKING_WORKING; l_ptr->fsm_msg_cnt = 0; link_activate(l_ptr); - tipc_link_send_proto_msg(l_ptr, STATE_MSG, 1, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0, 0); l_ptr->fsm_msg_cnt++; if (l_ptr->owner->working_links == 1) - tipc_link_send_sync(l_ptr); + tipc_link_sync_xmit(l_ptr); link_set_timer(l_ptr, cont_intv); break; case RESET_MSG: break; case TIMEOUT_EVT: - tipc_link_send_proto_msg(l_ptr, ACTIVATE_MSG, 0, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, + 0, 0, 0, 0, 0); l_ptr->fsm_msg_cnt++; link_set_timer(l_ptr, cont_intv); break; @@ -755,11 +749,11 @@ static void link_add_chain_to_outqueue(struct tipc_link *l_ptr, } /* - * tipc_link_send_buf() is the 'full path' for messages, called from - * inside TIPC when the 'fast path' in tipc_send_buf + * tipc_link_xmit() is the 'full path' for messages, called from + * inside TIPC when the 'fast path' in tipc_send_xmit * has failed, and from link_send() */ -int tipc_link_send_buf(struct tipc_link *l_ptr, struct sk_buff *buf) +int __tipc_link_xmit(struct tipc_link *l_ptr, struct sk_buff *buf) { struct tipc_msg *msg = buf_msg(buf); u32 size = msg_size(msg); @@ -787,13 +781,13 @@ int tipc_link_send_buf(struct tipc_link *l_ptr, struct sk_buff *buf) /* Fragmentation needed ? */ if (size > max_packet) - return link_send_long_buf(l_ptr, buf); + return tipc_link_frag_xmit(l_ptr, buf); /* Packet can be queued or sent. */ if (likely(!link_congested(l_ptr))) { link_add_to_outqueue(l_ptr, buf, msg); - tipc_bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr); + tipc_bearer_send(l_ptr->bearer_id, buf, &l_ptr->media_addr); l_ptr->unacked_window = 0; return dsz; } @@ -831,42 +825,40 @@ } /* - * tipc_link_send(): same as tipc_link_send_buf(), but the link to use has - * not been selected yet, and the the owner node is not locked + * tipc_link_xmit(): same as __tipc_link_xmit(), but the link to use + * has not been selected yet, and the owner node is not locked * Called by TIPC internal users, e.g. the name distributor */ -int tipc_link_send(struct sk_buff *buf, u32 dest, u32 selector) +int tipc_link_xmit(struct sk_buff *buf, u32 dest, u32 selector) { struct tipc_link *l_ptr; struct tipc_node *n_ptr; int res = -ELINKCONG; - read_lock_bh(&tipc_net_lock); n_ptr = tipc_node_find(dest); if (n_ptr) { tipc_node_lock(n_ptr); l_ptr = n_ptr->active_links[selector & 1]; if (l_ptr) - res = tipc_link_send_buf(l_ptr, buf); + res = __tipc_link_xmit(l_ptr, buf); else kfree_skb(buf); tipc_node_unlock(n_ptr); } else { kfree_skb(buf); } - read_unlock_bh(&tipc_net_lock); return res; } /* - * tipc_link_send_sync - synchronize broadcast link endpoints. + * tipc_link_sync_xmit - synchronize broadcast link endpoints. * * Give a newly added peer node the sequence number where it should * start receiving and acking broadcast packets. 
* * Called with node locked */ -static void tipc_link_send_sync(struct tipc_link *l) +static void tipc_link_sync_xmit(struct tipc_link *l) { struct sk_buff *buf; struct tipc_msg *msg; @@ -883,14 +875,14 @@ static void tipc_link_send_sync(struct tipc_link *l) } /* - * tipc_link_recv_sync - synchronize broadcast link endpoints. + * tipc_link_sync_rcv - synchronize broadcast link endpoints. * Receive the sequence number where we should start receiving and * acking broadcast packets from a newly added peer node, and open * up for reception of such packets. * * Called with node locked */ -static void tipc_link_recv_sync(struct tipc_node *n, struct sk_buff *buf) +static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf) { struct tipc_msg *msg = buf_msg(buf); @@ -900,7 +892,7 @@ static void tipc_link_recv_sync(struct tipc_node *n, struct sk_buff *buf) } /* - * tipc_link_send_names - send name table entries to new neighbor + * tipc_link_names_xmit - send name table entries to new neighbor * * Send routine for bulk delivery of name table messages when contact * with a new neighbor occurs. No link congestion checking is performed @@ -908,7 +900,7 @@ static void tipc_link_recv_sync(struct tipc_node *n, struct sk_buff *buf) * small enough not to require fragmentation. * Called without any locks held. */ -void tipc_link_send_names(struct list_head *message_list, u32 dest) +void tipc_link_names_xmit(struct list_head *message_list, u32 dest) { struct tipc_node *n_ptr; struct tipc_link *l_ptr; @@ -918,7 +910,6 @@ void tipc_link_send_names(struct list_head *message_list, u32 dest) if (list_empty(message_list)) return; - read_lock_bh(&tipc_net_lock); n_ptr = tipc_node_find(dest); if (n_ptr) { tipc_node_lock(n_ptr); @@ -933,7 +924,6 @@ void tipc_link_send_names(struct list_head *message_list, u32 dest) } tipc_node_unlock(n_ptr); } - read_unlock_bh(&tipc_net_lock); /* discard the messages if they couldn't be sent */ list_for_each_safe(buf, temp_buf, ((struct sk_buff *)message_list)) { @@ -943,13 +933,13 @@ void tipc_link_send_names(struct list_head *message_list, u32 dest) } /* - * link_send_buf_fast: Entry for data messages where the + * tipc_link_xmit_fast: Entry for data messages where the * destination link is known and the header is complete, * inclusive total message length. Very time critical. * Link is locked. Returns user data length. */ -static int link_send_buf_fast(struct tipc_link *l_ptr, struct sk_buff *buf, - u32 *used_max_pkt) +static int tipc_link_xmit_fast(struct tipc_link *l_ptr, struct sk_buff *buf, + u32 *used_max_pkt) { struct tipc_msg *msg = buf_msg(buf); int res = msg_data_sz(msg); @@ -957,7 +947,7 @@ static int link_send_buf_fast(struct tipc_link *l_ptr, struct sk_buff *buf, if (likely(!link_congested(l_ptr))) { if (likely(msg_size(msg) <= l_ptr->max_pkt)) { link_add_to_outqueue(l_ptr, buf, msg); - tipc_bearer_send(l_ptr->b_ptr, buf, + tipc_bearer_send(l_ptr->bearer_id, buf, &l_ptr->media_addr); l_ptr->unacked_window = 0; return res; @@ -965,18 +955,18 @@ static int link_send_buf_fast(struct tipc_link *l_ptr, struct sk_buff *buf, else *used_max_pkt = l_ptr->max_pkt; } - return tipc_link_send_buf(l_ptr, buf); /* All other cases */ + return __tipc_link_xmit(l_ptr, buf); /* All other cases */ } /* - * tipc_link_send_sections_fast: Entry for messages where the + * tipc_link_iovec_xmit_fast: Entry for messages where the * destination processor is known and the header is complete, * except for total message length. * Returns user data length or errno. 
*/ -int tipc_link_send_sections_fast(struct tipc_port *sender, - struct iovec const *msg_sect, - unsigned int len, u32 destaddr) +int tipc_link_iovec_xmit_fast(struct tipc_port *sender, + struct iovec const *msg_sect, + unsigned int len, u32 destaddr) { struct tipc_msg *hdr = &sender->phdr; struct tipc_link *l_ptr; @@ -995,18 +985,16 @@ again: if (unlikely(res < 0)) return res; - read_lock_bh(&tipc_net_lock); node = tipc_node_find(destaddr); if (likely(node)) { tipc_node_lock(node); l_ptr = node->active_links[selector]; if (likely(l_ptr)) { if (likely(buf)) { - res = link_send_buf_fast(l_ptr, buf, - &sender->max_pkt); + res = tipc_link_xmit_fast(l_ptr, buf, + &sender->max_pkt); exit: tipc_node_unlock(node); - read_unlock_bh(&tipc_net_lock); return res; } @@ -1023,30 +1011,25 @@ exit: */ sender->max_pkt = l_ptr->max_pkt; tipc_node_unlock(node); - read_unlock_bh(&tipc_net_lock); if ((msg_hdr_sz(hdr) + res) <= sender->max_pkt) goto again; - return link_send_sections_long(sender, msg_sect, len, - destaddr); + return tipc_link_iovec_long_xmit(sender, msg_sect, + len, destaddr); } tipc_node_unlock(node); } - read_unlock_bh(&tipc_net_lock); /* Couldn't find a link to the destination node */ - if (buf) - return tipc_reject_msg(buf, TIPC_ERR_NO_NODE); - if (res >= 0) - return tipc_port_reject_sections(sender, hdr, msg_sect, - len, TIPC_ERR_NO_NODE); - return res; + kfree_skb(buf); + tipc_port_iovec_reject(sender, hdr, msg_sect, len, TIPC_ERR_NO_NODE); + return -ENETUNREACH; } /* - * link_send_sections_long(): Entry for long messages where the + * tipc_link_iovec_long_xmit(): Entry for long messages where the * destination node is known and the header is complete, * inclusive total message length. * Link and bearer congestion status have been checked to be ok, @@ -1059,9 +1042,9 @@ exit: * * Returns user data length or errno. 
*/ -static int link_send_sections_long(struct tipc_port *sender, - struct iovec const *msg_sect, - unsigned int len, u32 destaddr) +static int tipc_link_iovec_long_xmit(struct tipc_port *sender, + struct iovec const *msg_sect, + unsigned int len, u32 destaddr) { struct tipc_link *l_ptr; struct tipc_node *node; @@ -1124,10 +1107,7 @@ again: if (copy_from_user(buf->data + fragm_crs, sect_crs, sz)) { res = -EFAULT; error: - for (; buf_chain; buf_chain = buf) { - buf = buf_chain->next; - kfree_skb(buf_chain); - } + kfree_skb_list(buf_chain); return res; } sect_crs += sz; @@ -1177,20 +1157,15 @@ error: if (l_ptr->max_pkt < max_pkt) { sender->max_pkt = l_ptr->max_pkt; tipc_node_unlock(node); - for (; buf_chain; buf_chain = buf) { - buf = buf_chain->next; - kfree_skb(buf_chain); - } + kfree_skb_list(buf_chain); goto again; } } else { reject: - for (; buf_chain; buf_chain = buf) { - buf = buf_chain->next; - kfree_skb(buf_chain); - } - return tipc_port_reject_sections(sender, hdr, msg_sect, - len, TIPC_ERR_NO_NODE); + kfree_skb_list(buf_chain); + tipc_port_iovec_reject(sender, hdr, msg_sect, len, + TIPC_ERR_NO_NODE); + return -ENETUNREACH; } /* Append chain of fragments to send queue & send them */ @@ -1206,7 +1181,7 @@ reject: /* * tipc_link_push_packet: Push one unsent packet to the media */ -u32 tipc_link_push_packet(struct tipc_link *l_ptr) +static u32 tipc_link_push_packet(struct tipc_link *l_ptr) { struct sk_buff *buf = l_ptr->first_out; u32 r_q_size = l_ptr->retransm_queue_size; @@ -1231,7 +1206,7 @@ u32 tipc_link_push_packet(struct tipc_link *l_ptr) if (r_q_size && buf) { msg_set_ack(buf_msg(buf), mod(l_ptr->next_in_no - 1)); msg_set_bcast_ack(buf_msg(buf), l_ptr->owner->bclink.last_in); - tipc_bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr); + tipc_bearer_send(l_ptr->bearer_id, buf, &l_ptr->media_addr); l_ptr->retransm_queue_head = mod(++r_q_head); l_ptr->retransm_queue_size = --r_q_size; l_ptr->stats.retransmitted++; @@ -1243,7 +1218,7 @@ u32 tipc_link_push_packet(struct tipc_link *l_ptr) if (buf) { msg_set_ack(buf_msg(buf), mod(l_ptr->next_in_no - 1)); msg_set_bcast_ack(buf_msg(buf), l_ptr->owner->bclink.last_in); - tipc_bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr); + tipc_bearer_send(l_ptr->bearer_id, buf, &l_ptr->media_addr); l_ptr->unacked_window = 0; kfree_skb(buf); l_ptr->proto_msg_queue = NULL; @@ -1260,7 +1235,8 @@ u32 tipc_link_push_packet(struct tipc_link *l_ptr) if (mod(next - first) < l_ptr->queue_limit[0]) { msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); - tipc_bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr); + tipc_bearer_send(l_ptr->bearer_id, buf, + &l_ptr->media_addr); if (msg_user(msg) == MSG_BUNDLER) msg_set_type(msg, CLOSED_MSG); l_ptr->next_out = buf->next; @@ -1283,33 +1259,24 @@ void tipc_link_push_queue(struct tipc_link *l_ptr) } while (!res); } -static void link_reset_all(unsigned long addr) +void tipc_link_reset_all(struct tipc_node *node) { - struct tipc_node *n_ptr; char addr_string[16]; u32 i; - read_lock_bh(&tipc_net_lock); - n_ptr = tipc_node_find((u32)addr); - if (!n_ptr) { - read_unlock_bh(&tipc_net_lock); - return; /* node no longer exists */ - } - - tipc_node_lock(n_ptr); + tipc_node_lock(node); pr_warn("Resetting all links to %s\n", - tipc_addr_string_fill(addr_string, n_ptr->addr)); + tipc_addr_string_fill(addr_string, node->addr)); for (i = 0; i < MAX_BEARERS; i++) { - if (n_ptr->links[i]) { - link_print(n_ptr->links[i], "Resetting link\n"); - tipc_link_reset(n_ptr->links[i]); + if 
(node->links[i]) { + link_print(node->links[i], "Resetting link\n"); + tipc_link_reset(node->links[i]); } } - tipc_node_unlock(n_ptr); - read_unlock_bh(&tipc_net_lock); + tipc_node_unlock(node); } static void link_retransmit_failure(struct tipc_link *l_ptr, @@ -1346,10 +1313,9 @@ static void link_retransmit_failure(struct tipc_link *l_ptr, n_ptr->bclink.oos_state, n_ptr->bclink.last_sent); - tipc_k_signal((Handler)link_reset_all, (unsigned long)n_ptr->addr); - tipc_node_unlock(n_ptr); + tipc_bclink_set_flags(TIPC_BCLINK_RESET); l_ptr->stale_count = 0; } } @@ -1379,7 +1345,7 @@ void tipc_link_retransmit(struct tipc_link *l_ptr, struct sk_buff *buf, msg = buf_msg(buf); msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); - tipc_bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr); + tipc_bearer_send(l_ptr->bearer_id, buf, &l_ptr->media_addr); buf = buf->next; retransmits--; l_ptr->stats.retransmitted++; @@ -1434,6 +1400,12 @@ static int link_recv_buf_validate(struct sk_buff *buf) u32 hdr_size; u32 min_hdr_size; + /* If this packet comes from the defer queue, the skb has already + * been validated + */ + if (unlikely(TIPC_SKB_CB(buf)->deferred)) + return 1; + if (unlikely(buf->len < MIN_H_SIZE)) return 0; @@ -1459,16 +1431,15 @@ static int link_recv_buf_validate(struct sk_buff *buf) } /** - * tipc_recv_msg - process TIPC messages arriving from off-node + * tipc_rcv - process TIPC packets/messages arriving from off-node * @head: pointer to message buffer chain - * @tb_ptr: pointer to bearer message arrived on + * @b_ptr: pointer to bearer message arrived on * * Invoked with no locks held. Bearer pointer must point to a valid bearer * structure (i.e. cannot be NULL), but bearer can be inactive. */ -void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *b_ptr) +void tipc_rcv(struct sk_buff *head, struct tipc_bearer *b_ptr) { - read_lock_bh(&tipc_net_lock); while (head) { struct tipc_node *n_ptr; struct tipc_link *l_ptr; @@ -1478,13 +1449,9 @@ void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *b_ptr) u32 seq_no; u32 ackd; u32 released = 0; - int type; head = head->next; - - /* Ensure bearer is still enabled */ - if (unlikely(!b_ptr->active)) - goto discard; + buf->next = NULL; /* Ensure message is well-formed */ if (unlikely(!link_recv_buf_validate(buf))) @@ -1499,9 +1466,9 @@ void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *b_ptr) if (unlikely(msg_non_seq(msg))) { if (msg_user(msg) == LINK_CONFIG) - tipc_disc_recv_msg(buf, b_ptr); + tipc_disc_rcv(buf, b_ptr); else - tipc_bclink_recv_pkt(buf); + tipc_bclink_rcv(buf); continue; } @@ -1522,14 +1489,14 @@ void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *b_ptr) goto unlock_discard; /* Verify that communication with node is currently allowed */ - if ((n_ptr->block_setup & WAIT_PEER_DOWN) && - msg_user(msg) == LINK_PROTOCOL && - (msg_type(msg) == RESET_MSG || - msg_type(msg) == ACTIVATE_MSG) && - !msg_redundant_link(msg)) - n_ptr->block_setup &= ~WAIT_PEER_DOWN; - - if (n_ptr->block_setup) + if ((n_ptr->action_flags & TIPC_WAIT_PEER_LINKS_DOWN) && + msg_user(msg) == LINK_PROTOCOL && + (msg_type(msg) == RESET_MSG || + msg_type(msg) == ACTIVATE_MSG) && + !msg_redundant_link(msg)) + n_ptr->action_flags &= ~TIPC_WAIT_PEER_LINKS_DOWN; + + if (tipc_node_blocked(n_ptr)) goto unlock_discard; /* Validate message sequence number info */ @@ -1544,7 +1511,6 @@ void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *b_ptr) while ((crs != l_ptr->next_out) && 
less_eq(buf_seqno(crs), ackd)) { struct sk_buff *next = crs->next; - kfree_skb(crs); crs = next; released++; @@ -1557,18 +1523,19 @@ void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *b_ptr) /* Try sending any messages link endpoint has pending */ if (unlikely(l_ptr->next_out)) tipc_link_push_queue(l_ptr); + if (unlikely(!list_empty(&l_ptr->waiting_ports))) tipc_link_wakeup_ports(l_ptr, 0); + if (unlikely(++l_ptr->unacked_window >= TIPC_MIN_LINK_WIN)) { l_ptr->stats.sent_acks++; - tipc_link_send_proto_msg(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); } - /* Now (finally!) process the incoming message */ -protocol_check: + /* Process the incoming packet */ if (unlikely(!link_working_working(l_ptr))) { if (msg_user(msg) == LINK_PROTOCOL) { - link_recv_proto_msg(l_ptr, buf); + tipc_link_proto_rcv(l_ptr, buf); head = link_insert_deferred_queue(l_ptr, head); tipc_node_unlock(n_ptr); continue; @@ -1597,72 +1564,64 @@ protocol_check: l_ptr->next_in_no++; if (unlikely(l_ptr->oldest_deferred_in)) head = link_insert_deferred_queue(l_ptr, head); -deliver: - if (likely(msg_isdata(msg))) { - tipc_node_unlock(n_ptr); - tipc_port_recv_msg(buf); - continue; + + /* Deliver packet/message to correct user: */ + if (unlikely(msg_user(msg) == CHANGEOVER_PROTOCOL)) { + if (!tipc_link_tunnel_rcv(n_ptr, &buf)) { + tipc_node_unlock(n_ptr); + continue; + } + msg = buf_msg(buf); + } else if (msg_user(msg) == MSG_FRAGMENTER) { + l_ptr->stats.recv_fragments++; + if (tipc_buf_append(&l_ptr->reasm_buf, &buf)) { + l_ptr->stats.recv_fragmented++; + msg = buf_msg(buf); + } else { + if (!l_ptr->reasm_buf) + tipc_link_reset(l_ptr); + tipc_node_unlock(n_ptr); + continue; + } } + switch (msg_user(msg)) { - int ret; + case TIPC_LOW_IMPORTANCE: + case TIPC_MEDIUM_IMPORTANCE: + case TIPC_HIGH_IMPORTANCE: + case TIPC_CRITICAL_IMPORTANCE: + tipc_node_unlock(n_ptr); + tipc_sk_rcv(buf); + continue; case MSG_BUNDLER: l_ptr->stats.recv_bundles++; l_ptr->stats.recv_bundled += msg_msgcnt(msg); tipc_node_unlock(n_ptr); - tipc_link_recv_bundle(buf); + tipc_link_bundle_rcv(buf); continue; case NAME_DISTRIBUTOR: n_ptr->bclink.recv_permitted = true; tipc_node_unlock(n_ptr); - tipc_named_recv(buf); - continue; - case BCAST_PROTOCOL: - tipc_link_recv_sync(n_ptr, buf); - tipc_node_unlock(n_ptr); + tipc_named_rcv(buf); continue; case CONN_MANAGER: tipc_node_unlock(n_ptr); - tipc_port_recv_proto_msg(buf); - continue; - case MSG_FRAGMENTER: - l_ptr->stats.recv_fragments++; - ret = tipc_link_recv_fragment(&l_ptr->reasm_head, - &l_ptr->reasm_tail, - &buf); - if (ret == LINK_REASM_COMPLETE) { - l_ptr->stats.recv_fragmented++; - msg = buf_msg(buf); - goto deliver; - } - if (ret == LINK_REASM_ERROR) - tipc_link_reset(l_ptr); - tipc_node_unlock(n_ptr); + tipc_port_proto_rcv(buf); continue; - case CHANGEOVER_PROTOCOL: - type = msg_type(msg); - if (link_recv_changeover_msg(&l_ptr, &buf)) { - msg = buf_msg(buf); - seq_no = msg_seqno(msg); - if (type == ORIGINAL_MSG) - goto deliver; - goto protocol_check; - } + case BCAST_PROTOCOL: + tipc_link_sync_rcv(n_ptr, buf); break; default: kfree_skb(buf); - buf = NULL; break; } tipc_node_unlock(n_ptr); - tipc_net_route_msg(buf); continue; unlock_discard: - tipc_node_unlock(n_ptr); discard: kfree_skb(buf); } - read_unlock_bh(&tipc_net_lock); } /** @@ -1724,7 +1683,7 @@ static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr, u32 seq_no = buf_seqno(buf); if (likely(msg_user(buf_msg(buf)) == LINK_PROTOCOL)) { - link_recv_proto_msg(l_ptr, buf); + 
tipc_link_proto_rcv(l_ptr, buf); return; } @@ -1745,8 +1704,9 @@ static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr, &l_ptr->newest_deferred_in, buf)) { l_ptr->deferred_inqueue_sz++; l_ptr->stats.deferred_recv++; + TIPC_SKB_CB(buf)->deferred = true; if ((l_ptr->deferred_inqueue_sz % 16) == 1) - tipc_link_send_proto_msg(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); } else l_ptr->stats.duplicates++; } @@ -1754,9 +1714,8 @@ static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr, /* * Send protocol message to the other endpoint. */ -void tipc_link_send_proto_msg(struct tipc_link *l_ptr, u32 msg_typ, - int probe_msg, u32 gap, u32 tolerance, - u32 priority, u32 ack_mtu) +void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, + u32 gap, u32 tolerance, u32 priority, u32 ack_mtu) { struct sk_buff *buf = NULL; struct tipc_msg *msg = l_ptr->pmsg; @@ -1769,16 +1728,17 @@ void tipc_link_send_proto_msg(struct tipc_link *l_ptr, u32 msg_typ, l_ptr->proto_msg_queue = NULL; } - if (link_blocked(l_ptr)) + /* Don't send protocol message during link changeover */ + if (l_ptr->exp_msg_count) return; /* Abort non-RESET send if communication with node is prohibited */ - if ((l_ptr->owner->block_setup) && (msg_typ != RESET_MSG)) + if ((tipc_node_blocked(l_ptr->owner)) && (msg_typ != RESET_MSG)) return; /* Create protocol message with "out-of-sequence" sequence number */ msg_set_type(msg, msg_typ); - msg_set_net_plane(msg, l_ptr->b_ptr->net_plane); + msg_set_net_plane(msg, l_ptr->net_plane); msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); msg_set_last_bcast(msg, tipc_bclink_get_last_sent()); @@ -1844,7 +1804,7 @@ void tipc_link_send_proto_msg(struct tipc_link *l_ptr, u32 msg_typ, skb_copy_to_linear_data(buf, msg, sizeof(l_ptr->proto_msg)); buf->priority = TC_PRIO_CONTROL; - tipc_bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr); + tipc_bearer_send(l_ptr->bearer_id, buf, &l_ptr->media_addr); l_ptr->unacked_window = 0; kfree_skb(buf); } @@ -1854,7 +1814,7 @@ void tipc_link_send_proto_msg(struct tipc_link *l_ptr, u32 msg_typ, * Note that network plane id propagates through the network, and may * change at any time. 
The node with lowest address rules */ -static void link_recv_proto_msg(struct tipc_link *l_ptr, struct sk_buff *buf) +static void tipc_link_proto_rcv(struct tipc_link *l_ptr, struct sk_buff *buf) { u32 rec_gap = 0; u32 max_pkt_info; @@ -1862,17 +1822,13 @@ static void link_recv_proto_msg(struct tipc_link *l_ptr, struct sk_buff *buf) u32 msg_tol; struct tipc_msg *msg = buf_msg(buf); - if (link_blocked(l_ptr)) + /* Discard protocol message during link changeover */ + if (l_ptr->exp_msg_count) goto exit; - /* record unnumbered packet arrival (force mismatch on next timeout) */ - l_ptr->checkpoint--; - - if (l_ptr->b_ptr->net_plane != msg_net_plane(msg)) + if (l_ptr->net_plane != msg_net_plane(msg)) if (tipc_own_addr > msg_prevnode(msg)) - l_ptr->b_ptr->net_plane = msg_net_plane(msg); - - l_ptr->owner->permit_changeover = msg_redundant_link(msg); + l_ptr->net_plane = msg_net_plane(msg); switch (msg_type(msg)) { @@ -1889,7 +1845,7 @@ static void link_recv_proto_msg(struct tipc_link *l_ptr, struct sk_buff *buf) * peer has lost contact -- don't allow peer's links * to reactivate before we recognize loss & clean up */ - l_ptr->owner->block_setup = WAIT_NODE_DOWN; + l_ptr->owner->action_flags |= TIPC_WAIT_OWN_LINKS_DOWN; } link_state_event(l_ptr, RESET_MSG); @@ -1945,6 +1901,10 @@ static void link_recv_proto_msg(struct tipc_link *l_ptr, struct sk_buff *buf) tipc_link_reset(l_ptr); /* Enforce change to take effect */ break; } + + /* Record reception; force mismatch at next timeout: */ + l_ptr->checkpoint--; + link_state_event(l_ptr, TRAFFIC_MSG_EVT); l_ptr->stats.recv_states++; if (link_reset_unknown(l_ptr)) @@ -1974,8 +1934,8 @@ static void link_recv_proto_msg(struct tipc_link *l_ptr, struct sk_buff *buf) msg_last_bcast(msg)); if (rec_gap || (msg_probe(msg))) { - tipc_link_send_proto_msg(l_ptr, STATE_MSG, - 0, rec_gap, 0, 0, max_pkt_ack); + tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, rec_gap, 0, + 0, max_pkt_ack); } if (msg_seq_gap(msg)) { l_ptr->stats.recv_nacks++; @@ -1989,13 +1949,13 @@ exit: } -/* - * tipc_link_tunnel(): Send one message via a link belonging to - * another bearer. Owner node is locked. +/* tipc_link_tunnel_xmit(): Tunnel one packet via a link belonging to + * a different bearer. Owner node is locked. */ -static void tipc_link_tunnel(struct tipc_link *l_ptr, - struct tipc_msg *tunnel_hdr, struct tipc_msg *msg, - u32 selector) +static void tipc_link_tunnel_xmit(struct tipc_link *l_ptr, + struct tipc_msg *tunnel_hdr, + struct tipc_msg *msg, + u32 selector) { struct tipc_link *tunnel; struct sk_buff *buf; @@ -2014,16 +1974,17 @@ static void tipc_link_tunnel(struct tipc_link *l_ptr, } skb_copy_to_linear_data(buf, tunnel_hdr, INT_H_SIZE); skb_copy_to_linear_data_offset(buf, INT_H_SIZE, msg, length); - tipc_link_send_buf(tunnel, buf); + __tipc_link_xmit(tunnel, buf); } - -/* - * changeover(): Send whole message queue via the remaining link - * Owner node is locked. +/* tipc_link_failover_send_queue(): A link has gone down, but a second + * link is still active. We can do failover. Tunnel the failing link's + * whole send queue via the remaining link. This way, we don't lose + * any packets, and sequence order is preserved for subsequent traffic + * sent over the remaining link. Owner node is locked. 
*/ -void tipc_link_changeover(struct tipc_link *l_ptr) +void tipc_link_failover_send_queue(struct tipc_link *l_ptr) { u32 msgcount = l_ptr->out_queue_size; struct sk_buff *crs = l_ptr->first_out; @@ -2034,11 +1995,6 @@ void tipc_link_changeover(struct tipc_link *l_ptr) if (!tunnel) return; - if (!l_ptr->owner->permit_changeover) { - pr_warn("%speer did not permit changeover\n", link_co_err); - return; - } - tipc_msg_init(&tunnel_hdr, CHANGEOVER_PROTOCOL, ORIGINAL_MSG, INT_H_SIZE, l_ptr->addr); msg_set_bearer_id(&tunnel_hdr, l_ptr->peer_bearer_id); @@ -2051,7 +2007,7 @@ void tipc_link_changeover(struct tipc_link *l_ptr) if (buf) { skb_copy_to_linear_data(buf, &tunnel_hdr, INT_H_SIZE); msg_set_size(&tunnel_hdr, INT_H_SIZE); - tipc_link_send_buf(tunnel, buf); + __tipc_link_xmit(tunnel, buf); } else { pr_warn("%sunable to send changeover msg\n", link_co_err); @@ -2072,20 +2028,30 @@ void tipc_link_changeover(struct tipc_link *l_ptr) msgcount = msg_msgcnt(msg); while (msgcount--) { msg_set_seqno(m, msg_seqno(msg)); - tipc_link_tunnel(l_ptr, &tunnel_hdr, m, - msg_link_selector(m)); + tipc_link_tunnel_xmit(l_ptr, &tunnel_hdr, m, + msg_link_selector(m)); pos += align(msg_size(m)); m = (struct tipc_msg *)pos; } } else { - tipc_link_tunnel(l_ptr, &tunnel_hdr, msg, - msg_link_selector(msg)); + tipc_link_tunnel_xmit(l_ptr, &tunnel_hdr, msg, + msg_link_selector(msg)); } crs = crs->next; } } -void tipc_link_send_duplicate(struct tipc_link *l_ptr, struct tipc_link *tunnel) +/* tipc_link_dup_queue_xmit(): A second link has become active. Tunnel a + * duplicate of the first link's send queue via the new link. This way, we + * are guaranteed that currently queued packets from a socket are delivered + * before future traffic from the same socket, even if this is using the + * new link. The last arriving copy of each duplicate packet is dropped at + * the receiving end by the regular protocol check, so packet cardinality + * and sequence order is preserved per sender/receiver socket pair. + * Owner node is locked. + */ +void tipc_link_dup_queue_xmit(struct tipc_link *l_ptr, + struct tipc_link *tunnel) { struct sk_buff *iter; struct tipc_msg tunnel_hdr; @@ -2114,7 +2080,7 @@ void tipc_link_send_duplicate(struct tipc_link *l_ptr, struct tipc_link *tunnel) skb_copy_to_linear_data(outbuf, &tunnel_hdr, INT_H_SIZE); skb_copy_to_linear_data_offset(outbuf, INT_H_SIZE, iter->data, length); - tipc_link_send_buf(tunnel, outbuf); + __tipc_link_xmit(tunnel, outbuf); if (!tipc_link_is_up(l_ptr)) return; iter = iter->next; @@ -2141,87 +2107,112 @@ static struct sk_buff *buf_extract(struct sk_buff *skb, u32 from_pos) return eb; } -/* - * link_recv_changeover_msg(): Receive tunneled packet sent - * via other link. Node is locked. Return extracted buffer. + + +/* tipc_link_dup_rcv(): Receive a tunnelled DUPLICATE_MSG packet. + * Owner node is locked. 
*/ -static int link_recv_changeover_msg(struct tipc_link **l_ptr, - struct sk_buff **buf) +static void tipc_link_dup_rcv(struct tipc_link *l_ptr, + struct sk_buff *t_buf) { - struct sk_buff *tunnel_buf = *buf; - struct tipc_link *dest_link; - struct tipc_msg *msg; - struct tipc_msg *tunnel_msg = buf_msg(tunnel_buf); - u32 msg_typ = msg_type(tunnel_msg); - u32 msg_count = msg_msgcnt(tunnel_msg); - u32 bearer_id = msg_bearer_id(tunnel_msg); + struct sk_buff *buf; - if (bearer_id >= MAX_BEARERS) - goto exit; - dest_link = (*l_ptr)->owner->links[bearer_id]; - if (!dest_link) - goto exit; - if (dest_link == *l_ptr) { - pr_err("Unexpected changeover message on link <%s>\n", - (*l_ptr)->name); - goto exit; + if (!tipc_link_is_up(l_ptr)) + return; + + buf = buf_extract(t_buf, INT_H_SIZE); + if (buf == NULL) { + pr_warn("%sfailed to extract inner dup pkt\n", link_co_err); + return; } - *l_ptr = dest_link; - msg = msg_get_wrapped(tunnel_msg); - if (msg_typ == DUPLICATE_MSG) { - if (less(msg_seqno(msg), mod(dest_link->next_in_no))) - goto exit; - *buf = buf_extract(tunnel_buf, INT_H_SIZE); - if (*buf == NULL) { - pr_warn("%sduplicate msg dropped\n", link_co_err); + /* Add buffer to deferred queue, if applicable: */ + link_handle_out_of_seq_msg(l_ptr, buf); +} + +/* tipc_link_failover_rcv(): Receive a tunnelled ORIGINAL_MSG packet + * Owner node is locked. + */ +static struct sk_buff *tipc_link_failover_rcv(struct tipc_link *l_ptr, + struct sk_buff *t_buf) +{ + struct tipc_msg *t_msg = buf_msg(t_buf); + struct sk_buff *buf = NULL; + struct tipc_msg *msg; + + if (tipc_link_is_up(l_ptr)) + tipc_link_reset(l_ptr); + + /* First failover packet? */ + if (l_ptr->exp_msg_count == START_CHANGEOVER) + l_ptr->exp_msg_count = msg_msgcnt(t_msg); + + /* Should there be an inner packet? */ + if (l_ptr->exp_msg_count) { + l_ptr->exp_msg_count--; + buf = buf_extract(t_buf, INT_H_SIZE); + if (buf == NULL) { + pr_warn("%sno inner failover pkt\n", link_co_err); goto exit; } - kfree_skb(tunnel_buf); - return 1; - } + msg = buf_msg(buf); - /* First original message ?: */ - if (tipc_link_is_up(dest_link)) { - pr_info("%s<%s>, changeover initiated by peer\n", link_rst_msg, - dest_link->name); - tipc_link_reset(dest_link); - dest_link->exp_msg_count = msg_count; - if (!msg_count) - goto exit; - } else if (dest_link->exp_msg_count == START_CHANGEOVER) { - dest_link->exp_msg_count = msg_count; - if (!msg_count) + if (less(msg_seqno(msg), l_ptr->reset_checkpoint)) { + kfree_skb(buf); + buf = NULL; goto exit; + } + if (msg_user(msg) == MSG_FRAGMENTER) { + l_ptr->stats.recv_fragments++; + tipc_buf_append(&l_ptr->reasm_buf, &buf); + } } +exit: + if ((l_ptr->exp_msg_count == 0) && (l_ptr->flags & LINK_STOPPED)) { + tipc_node_detach_link(l_ptr->owner, l_ptr); + kfree(l_ptr); + } + return buf; +} + +/* tipc_link_tunnel_rcv(): Receive a tunnelled packet, sent + * via other link as result of a failover (ORIGINAL_MSG) or + * a new active link (DUPLICATE_MSG). Failover packets are + * returned to the active link for delivery upwards. + * Owner node is locked. 
+ */ +static int tipc_link_tunnel_rcv(struct tipc_node *n_ptr, + struct sk_buff **buf) +{ + struct sk_buff *t_buf = *buf; + struct tipc_link *l_ptr; + struct tipc_msg *t_msg = buf_msg(t_buf); + u32 bearer_id = msg_bearer_id(t_msg); - /* Receive original message */ - if (dest_link->exp_msg_count == 0) { - pr_warn("%sgot too many tunnelled messages\n", link_co_err); + *buf = NULL; + + if (bearer_id >= MAX_BEARERS) goto exit; - } - dest_link->exp_msg_count--; - if (less(msg_seqno(msg), dest_link->reset_checkpoint)) { + + l_ptr = n_ptr->links[bearer_id]; + if (!l_ptr) goto exit; - } else { - *buf = buf_extract(tunnel_buf, INT_H_SIZE); - if (*buf != NULL) { - kfree_skb(tunnel_buf); - return 1; - } else { - pr_warn("%soriginal msg dropped\n", link_co_err); - } - } + + if (msg_type(t_msg) == DUPLICATE_MSG) + tipc_link_dup_rcv(l_ptr, t_buf); + else if (msg_type(t_msg) == ORIGINAL_MSG) + *buf = tipc_link_failover_rcv(l_ptr, t_buf); + else + pr_warn("%sunknown tunnel pkt received\n", link_co_err); exit: - *buf = NULL; - kfree_skb(tunnel_buf); - return 0; + kfree_skb(t_buf); + return *buf != NULL; } /* * Bundler functionality: */ -void tipc_link_recv_bundle(struct sk_buff *buf) +void tipc_link_bundle_rcv(struct sk_buff *buf) { u32 msgcount = msg_msgcnt(buf_msg(buf)); u32 pos = INT_H_SIZE; @@ -2244,11 +2235,11 @@ void tipc_link_recv_bundle(struct sk_buff *buf) */ /* - * link_send_long_buf: Entry for buffers needing fragmentation. + * tipc_link_frag_xmit: Entry for buffers needing fragmentation. * The buffer is complete, inclusive total message length. * Returns user data length. */ -static int link_send_long_buf(struct tipc_link *l_ptr, struct sk_buff *buf) +static int tipc_link_frag_xmit(struct tipc_link *l_ptr, struct sk_buff *buf) { struct sk_buff *buf_chain = NULL; struct sk_buff *buf_chain_tail = (struct sk_buff *)&buf_chain; @@ -2283,11 +2274,7 @@ static int link_send_long_buf(struct tipc_link *l_ptr, struct sk_buff *buf) fragm = tipc_buf_acquire(fragm_sz + INT_H_SIZE); if (fragm == NULL) { kfree_skb(buf); - while (buf_chain) { - buf = buf_chain; - buf_chain = buf_chain->next; - kfree_skb(buf); - } + kfree_skb_list(buf_chain); return -ENOMEM; } msg_set_size(&fragm_hdr, fragm_sz + INT_H_SIZE); @@ -2315,51 +2302,6 @@ static int link_send_long_buf(struct tipc_link *l_ptr, struct sk_buff *buf) return dsz; } -/* - * tipc_link_recv_fragment(): Called with node lock on. Returns - * the reassembled buffer if message is complete. 
- */ -int tipc_link_recv_fragment(struct sk_buff **head, struct sk_buff **tail, - struct sk_buff **fbuf) -{ - struct sk_buff *frag = *fbuf; - struct tipc_msg *msg = buf_msg(frag); - u32 fragid = msg_type(msg); - bool headstolen; - int delta; - - skb_pull(frag, msg_hdr_sz(msg)); - if (fragid == FIRST_FRAGMENT) { - if (*head || skb_unclone(frag, GFP_ATOMIC)) - goto out_free; - *head = frag; - skb_frag_list_init(*head); - return 0; - } else if (*head && - skb_try_coalesce(*head, frag, &headstolen, &delta)) { - kfree_skb_partial(frag, headstolen); - } else { - if (!*head) - goto out_free; - if (!skb_has_frag_list(*head)) - skb_shinfo(*head)->frag_list = frag; - else - (*tail)->next = frag; - *tail = frag; - (*head)->truesize += frag->truesize; - } - if (fragid == LAST_FRAGMENT) { - *fbuf = *head; - *tail = *head = NULL; - return LINK_REASM_COMPLETE; - } - return 0; -out_free: - pr_warn_ratelimited("Link unable to reassemble fragmented message\n"); - kfree_skb(*fbuf); - return LINK_REASM_ERROR; -} - static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tolerance) { if ((tolerance < TIPC_MIN_LINK_TOL) || (tolerance > TIPC_MAX_LINK_TOL)) @@ -2390,35 +2332,39 @@ void tipc_link_set_queue_limits(struct tipc_link *l_ptr, u32 window) l_ptr->queue_limit[MSG_FRAGMENTER] = 4000; } -/** - * link_find_link - locate link by name - * @name: ptr to link name string - * @node: ptr to area to be filled with ptr to associated node +/* tipc_link_find_owner - locate owner node of link by link's name + * @name: pointer to link name string + * @bearer_id: pointer to index in 'node->links' array where the link was found. * - * Caller must hold 'tipc_net_lock' to ensure node and bearer are not deleted; - * this also prevents link deletion. - * - * Returns pointer to link (or 0 if invalid link name). + * Returns pointer to node owning the link, or 0 if no matching link is found. */ -static struct tipc_link *link_find_link(const char *name, - struct tipc_node **node) +static struct tipc_node *tipc_link_find_owner(const char *link_name, + unsigned int *bearer_id) { struct tipc_link *l_ptr; struct tipc_node *n_ptr; + struct tipc_node *found_node = 0; int i; - list_for_each_entry(n_ptr, &tipc_node_list, list) { + *bearer_id = 0; + rcu_read_lock(); + list_for_each_entry_rcu(n_ptr, &tipc_node_list, list) { + tipc_node_lock(n_ptr); for (i = 0; i < MAX_BEARERS; i++) { l_ptr = n_ptr->links[i]; - if (l_ptr && !strcmp(l_ptr->name, name)) - goto found; + if (l_ptr && !strcmp(l_ptr->name, link_name)) { + *bearer_id = i; + found_node = n_ptr; + break; + } } + tipc_node_unlock(n_ptr); + if (found_node) + break; } - l_ptr = NULL; - n_ptr = NULL; -found: - *node = n_ptr; - return l_ptr; + rcu_read_unlock(); + + return found_node; } /** @@ -2450,7 +2396,7 @@ static int link_value_is_valid(u16 cmd, u32 new_value) * @new_value: new value of link, bearer, or media setting * @cmd: which link, bearer, or media attribute to set (TIPC_CMD_SET_LINK_*) * - * Caller must hold 'tipc_net_lock' to ensure link/bearer/media is not deleted. + * Caller must hold RTNL lock to ensure link/bearer/media is not deleted. * * Returns 0 if value updated and negative value on error. */ @@ -2460,32 +2406,33 @@ static int link_cmd_set_value(const char *name, u32 new_value, u16 cmd) struct tipc_link *l_ptr; struct tipc_bearer *b_ptr; struct tipc_media *m_ptr; + int bearer_id; int res = 0; - l_ptr = link_find_link(name, &node); - if (l_ptr) { - /* - * acquire node lock for tipc_link_send_proto_msg(). - * see "TIPC locking policy" in net.c. 
- */ + node = tipc_link_find_owner(name, &bearer_id); + if (node) { tipc_node_lock(node); - switch (cmd) { - case TIPC_CMD_SET_LINK_TOL: - link_set_supervision_props(l_ptr, new_value); - tipc_link_send_proto_msg(l_ptr, - STATE_MSG, 0, 0, new_value, 0, 0); - break; - case TIPC_CMD_SET_LINK_PRI: - l_ptr->priority = new_value; - tipc_link_send_proto_msg(l_ptr, - STATE_MSG, 0, 0, 0, new_value, 0); - break; - case TIPC_CMD_SET_LINK_WINDOW: - tipc_link_set_queue_limits(l_ptr, new_value); - break; - default: - res = -EINVAL; - break; + l_ptr = node->links[bearer_id]; + + if (l_ptr) { + switch (cmd) { + case TIPC_CMD_SET_LINK_TOL: + link_set_supervision_props(l_ptr, new_value); + tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, + new_value, 0, 0); + break; + case TIPC_CMD_SET_LINK_PRI: + l_ptr->priority = new_value; + tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, + 0, new_value, 0); + break; + case TIPC_CMD_SET_LINK_WINDOW: + tipc_link_set_queue_limits(l_ptr, new_value); + break; + default: + res = -EINVAL; + break; + } } tipc_node_unlock(node); return res; @@ -2555,9 +2502,7 @@ struct sk_buff *tipc_link_cmd_config(const void *req_tlv_area, int req_tlv_space " (cannot change setting on broadcast link)"); } - read_lock_bh(&tipc_net_lock); res = link_cmd_set_value(args->name, new_value, cmd); - read_unlock_bh(&tipc_net_lock); if (res) return tipc_cfg_reply_error_string("cannot change link setting"); @@ -2580,6 +2525,7 @@ struct sk_buff *tipc_link_cmd_reset_stats(const void *req_tlv_area, int req_tlv_ char *link_name; struct tipc_link *l_ptr; struct tipc_node *node; + unsigned int bearer_id; if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_LINK_NAME)) return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); @@ -2590,18 +2536,18 @@ struct sk_buff *tipc_link_cmd_reset_stats(const void *req_tlv_area, int req_tlv_ return tipc_cfg_reply_error_string("link not found"); return tipc_cfg_reply_none(); } + node = tipc_link_find_owner(link_name, &bearer_id); + if (!node) + return tipc_cfg_reply_error_string("link not found"); - read_lock_bh(&tipc_net_lock); - l_ptr = link_find_link(link_name, &node); + tipc_node_lock(node); + l_ptr = node->links[bearer_id]; if (!l_ptr) { - read_unlock_bh(&tipc_net_lock); + tipc_node_unlock(node); return tipc_cfg_reply_error_string("link not found"); } - - tipc_node_lock(node); link_reset_statistics(l_ptr); tipc_node_unlock(node); - read_unlock_bh(&tipc_net_lock); return tipc_cfg_reply_none(); } @@ -2628,18 +2574,24 @@ static int tipc_link_stats(const char *name, char *buf, const u32 buf_size) struct tipc_node *node; char *status; u32 profile_total = 0; + unsigned int bearer_id; int ret; if (!strcmp(name, tipc_bclink_name)) return tipc_bclink_stats(buf, buf_size); - read_lock_bh(&tipc_net_lock); - l = link_find_link(name, &node); + node = tipc_link_find_owner(name, &bearer_id); + if (!node) + return 0; + + tipc_node_lock(node); + + l = node->links[bearer_id]; if (!l) { - read_unlock_bh(&tipc_net_lock); + tipc_node_unlock(node); return 0; } - tipc_node_lock(node); + s = &l->stats; if (tipc_link_is_active(l)) @@ -2702,7 +2654,6 @@ static int tipc_link_stats(const char *name, char *buf, const u32 buf_size) (s->accu_queue_sz / s->queue_sz_counts) : 0); tipc_node_unlock(node); - read_unlock_bh(&tipc_net_lock); return ret; } @@ -2753,7 +2704,6 @@ u32 tipc_link_get_max_pkt(u32 dest, u32 selector) if (dest == tipc_own_addr) return MAX_MSG_SIZE; - read_lock_bh(&tipc_net_lock); n_ptr = tipc_node_find(dest); if (n_ptr) { tipc_node_lock(n_ptr); @@ -2762,13 +2712,18 @@ u32 
tipc_link_get_max_pkt(u32 dest, u32 selector) res = l_ptr->max_pkt; tipc_node_unlock(n_ptr); } - read_unlock_bh(&tipc_net_lock); return res; } static void link_print(struct tipc_link *l_ptr, const char *str) { - pr_info("%s Link %x<%s>:", str, l_ptr->addr, l_ptr->b_ptr->name); + struct tipc_bearer *b_ptr; + + rcu_read_lock(); + b_ptr = rcu_dereference_rtnl(bearer_list[l_ptr->bearer_id]); + if (b_ptr) + pr_info("%s Link %x<%s>:", str, l_ptr->addr, b_ptr->name); + rcu_read_unlock(); if (link_working_unknown(l_ptr)) pr_cont(":WU\n"); diff --git a/net/tipc/link.h b/net/tipc/link.h index 8a6c1026644..200d518b218 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -1,7 +1,7 @@ /* * net/tipc/link.h: Include file for TIPC link code * - * Copyright (c) 1995-2006, Ericsson AB + * Copyright (c) 1995-2006, 2013, Ericsson AB * Copyright (c) 2004-2005, 2010-2011, Wind River Systems * All rights reserved. * @@ -40,27 +40,23 @@ #include "msg.h" #include "node.h" -/* - * Link reassembly status codes - */ -#define LINK_REASM_ERROR -1 -#define LINK_REASM_COMPLETE 1 - -/* - * Out-of-range value for link sequence numbers +/* Out-of-range value for link sequence numbers */ #define INVALID_LINK_SEQ 0x10000 -/* - * Link states +/* Link working states */ #define WORKING_WORKING 560810u #define WORKING_UNKNOWN 560811u #define RESET_UNKNOWN 560812u #define RESET_RESET 560813u -/* - * Starting value for maximum packet size negotiation on unicast links +/* Link endpoint execution states + */ +#define LINK_STARTED 0x0001 +#define LINK_STOPPED 0x0002 + +/* Starting value for maximum packet size negotiation on unicast links * (unless bearer MTU is less) */ #define MAX_PKT_DEFAULT 1500 @@ -102,21 +98,20 @@ struct tipc_stats { * @media_addr: media address to use when sending messages over link * @timer: link timer * @owner: pointer to peer node - * @link_list: adjacent links in bearer's list of links - * @started: indicates if link has been started + * @flags: execution state flags for link endpoint instance * @checkpoint: reference point for triggering link continuity checking * @peer_session: link session # being used by peer end of link * @peer_bearer_id: bearer id used by link's peer endpoint - * @b_ptr: pointer to bearer used by link + * @bearer_id: local bearer id used by link * @tolerance: minimum link continuity loss needed to reset link [in ms] * @continuity_interval: link continuity testing interval [in ms] * @abort_limit: # of unacknowledged continuity probes needed to reset link * @state: current state of link FSM - * @blocked: indicates if link has been administratively blocked * @fsm_msg_cnt: # of protocol messages link FSM has sent in current state * @proto_msg: template for control messages generated by link * @pmsg: convenience pointer to "proto_msg" field * @priority: current link priority + * @net_plane: current link network plane ('A' through 'H') * @queue_limit: outbound message queue congestion thresholds (indexed by user) * @exp_msg_count: # of tunnelled messages expected during link changeover * @reset_checkpoint: seq # of last acknowledged message at time of link reset @@ -140,8 +135,7 @@ struct tipc_stats { * @next_out: ptr to first unsent outbound message in queue * @waiting_ports: linked list of ports waiting for link congestion to abate * @long_msg_seq_no: next identifier to use for outbound fragmented messages - * @reasm_head: list head of partially reassembled inbound message fragments - * @reasm_tail: last fragment received + * @reasm_buf: head of partially reassembled inbound message 
fragments * @stats: collects statistics regarding link activity */ struct tipc_link { @@ -150,19 +144,17 @@ struct tipc_link { struct tipc_media_addr media_addr; struct timer_list timer; struct tipc_node *owner; - struct list_head link_list; /* Management and link supervision data */ - int started; + unsigned int flags; u32 checkpoint; u32 peer_session; u32 peer_bearer_id; - struct tipc_bearer *b_ptr; + u32 bearer_id; u32 tolerance; u32 continuity_interval; u32 abort_limit; int state; - int blocked; u32 fsm_msg_cnt; struct { unchar hdr[INT_H_SIZE]; @@ -170,6 +162,7 @@ struct tipc_link { } proto_msg; struct tipc_msg *pmsg; u32 priority; + char net_plane; u32 queue_limit[15]; /* queue_limit[0]==window limit */ /* Changeover */ @@ -205,8 +198,7 @@ struct tipc_link { /* Fragmentation/reassembly */ u32 long_msg_seq_no; - struct sk_buff *reasm_head; - struct sk_buff *reasm_tail; + struct sk_buff *reasm_buf; /* Statistics */ struct tipc_stats stats; @@ -217,35 +209,37 @@ struct tipc_port; struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, struct tipc_bearer *b_ptr, const struct tipc_media_addr *media_addr); -void tipc_link_delete(struct tipc_link *l_ptr); -void tipc_link_changeover(struct tipc_link *l_ptr); -void tipc_link_send_duplicate(struct tipc_link *l_ptr, struct tipc_link *dest); +void tipc_link_delete_list(unsigned int bearer_id, bool shutting_down); +void tipc_link_failover_send_queue(struct tipc_link *l_ptr); +void tipc_link_dup_queue_xmit(struct tipc_link *l_ptr, struct tipc_link *dest); void tipc_link_reset_fragments(struct tipc_link *l_ptr); int tipc_link_is_up(struct tipc_link *l_ptr); int tipc_link_is_active(struct tipc_link *l_ptr); -u32 tipc_link_push_packet(struct tipc_link *l_ptr); -void tipc_link_stop(struct tipc_link *l_ptr); -struct sk_buff *tipc_link_cmd_config(const void *req_tlv_area, int req_tlv_space, u16 cmd); -struct sk_buff *tipc_link_cmd_show_stats(const void *req_tlv_area, int req_tlv_space); -struct sk_buff *tipc_link_cmd_reset_stats(const void *req_tlv_area, int req_tlv_space); +void tipc_link_purge_queues(struct tipc_link *l_ptr); +struct sk_buff *tipc_link_cmd_config(const void *req_tlv_area, + int req_tlv_space, + u16 cmd); +struct sk_buff *tipc_link_cmd_show_stats(const void *req_tlv_area, + int req_tlv_space); +struct sk_buff *tipc_link_cmd_reset_stats(const void *req_tlv_area, + int req_tlv_space); +void tipc_link_reset_all(struct tipc_node *node); void tipc_link_reset(struct tipc_link *l_ptr); -int tipc_link_send(struct sk_buff *buf, u32 dest, u32 selector); -void tipc_link_send_names(struct list_head *message_list, u32 dest); +void tipc_link_reset_list(unsigned int bearer_id); +int tipc_link_xmit(struct sk_buff *buf, u32 dest, u32 selector); +void tipc_link_names_xmit(struct list_head *message_list, u32 dest); +int __tipc_link_xmit(struct tipc_link *l_ptr, struct sk_buff *buf); int tipc_link_send_buf(struct tipc_link *l_ptr, struct sk_buff *buf); u32 tipc_link_get_max_pkt(u32 dest, u32 selector); -int tipc_link_send_sections_fast(struct tipc_port *sender, - struct iovec const *msg_sect, - unsigned int len, u32 destnode); -void tipc_link_recv_bundle(struct sk_buff *buf); -int tipc_link_recv_fragment(struct sk_buff **reasm_head, - struct sk_buff **reasm_tail, - struct sk_buff **fbuf); -void tipc_link_send_proto_msg(struct tipc_link *l_ptr, u32 msg_typ, int prob, - u32 gap, u32 tolerance, u32 priority, - u32 acked_mtu); +int tipc_link_iovec_xmit_fast(struct tipc_port *sender, + struct iovec const *msg_sect, + unsigned int len, u32 destnode); +void 
tipc_link_bundle_rcv(struct sk_buff *buf); +void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int prob, + u32 gap, u32 tolerance, u32 priority, u32 acked_mtu); void tipc_link_push_queue(struct tipc_link *l_ptr); u32 tipc_link_defer_pkt(struct sk_buff **head, struct sk_buff **tail, - struct sk_buff *buf); + struct sk_buff *buf); void tipc_link_wakeup_ports(struct tipc_link *l_ptr, int all); void tipc_link_set_queue_limits(struct tipc_link *l_ptr, u32 window); void tipc_link_retransmit(struct tipc_link *l_ptr, @@ -312,11 +306,6 @@ static inline int link_reset_reset(struct tipc_link *l_ptr) return l_ptr->state == RESET_RESET; } -static inline int link_blocked(struct tipc_link *l_ptr) -{ - return l_ptr->exp_msg_count || l_ptr->blocked; -} - static inline int link_congested(struct tipc_link *l_ptr) { return l_ptr->out_queue_size >= l_ptr->queue_limit[0]; diff --git a/net/tipc/msg.c b/net/tipc/msg.c index e525f8ce1de..0a37a472c29 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -1,7 +1,7 @@ /* * net/tipc/msg.c: TIPC message header routines * - * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2000-2006, 2014, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * @@ -99,3 +99,61 @@ int tipc_msg_build(struct tipc_msg *hdr, struct iovec const *msg_sect, } return dsz; } + +/* tipc_buf_append(): Append a buffer to the fragment list of another buffer + * @*headbuf: in: NULL for first frag, otherwise value returned from prev call + * out: set when successful non-complete reassembly, otherwise NULL + * @*buf: in: the buffer to append. Always defined + * out: head buf after sucessful complete reassembly, otherwise NULL + * Returns 1 when reassembly complete, otherwise 0 + */ +int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) +{ + struct sk_buff *head = *headbuf; + struct sk_buff *frag = *buf; + struct sk_buff *tail; + struct tipc_msg *msg = buf_msg(frag); + u32 fragid = msg_type(msg); + bool headstolen; + int delta; + + skb_pull(frag, msg_hdr_sz(msg)); + + if (fragid == FIRST_FRAGMENT) { + if (head || skb_unclone(frag, GFP_ATOMIC)) + goto out_free; + head = *headbuf = frag; + skb_frag_list_init(head); + *buf = NULL; + return 0; + } + if (!head) + goto out_free; + tail = TIPC_SKB_CB(head)->tail; + if (skb_try_coalesce(head, frag, &headstolen, &delta)) { + kfree_skb_partial(frag, headstolen); + } else { + if (!skb_has_frag_list(head)) + skb_shinfo(head)->frag_list = frag; + else + tail->next = frag; + head->truesize += frag->truesize; + head->data_len += frag->len; + head->len += frag->len; + TIPC_SKB_CB(head)->tail = frag; + } + if (fragid == LAST_FRAGMENT) { + *buf = head; + TIPC_SKB_CB(head)->tail = NULL; + *headbuf = NULL; + return 1; + } + *buf = NULL; + return 0; +out_free: + pr_warn_ratelimited("Unable to build fragment list\n"); + kfree_skb(*buf); + kfree_skb(*headbuf); + *buf = *headbuf = NULL; + return 0; +} diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 76d1269b944..503511903d1 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -1,7 +1,7 @@ /* * net/tipc/msg.h: Include file for TIPC message header routines * - * Copyright (c) 2000-2007, Ericsson AB + * Copyright (c) 2000-2007, 2014, Ericsson AB * Copyright (c) 2005-2008, 2010-2011, Wind River Systems * All rights reserved. 
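The new tipc_buf_append() above replaces the old reasm_head/reasm_tail pair with a single reassembly pointer. A minimal caller sketch follows; the wrapper name link_frag_rcv() is hypothetical and only illustrates the contract documented in msg.c (return 1 with the complete buffer in *buf on the last fragment, 0 otherwise), using the reasm_buf field introduced in struct tipc_link and the existing tipc_net_route_msg() for delivery.

/* Illustrative only: one fragment-receive step built on tipc_buf_append().
 * l_ptr->reasm_buf is the single reassembly pointer that replaces the old
 * reasm_head/reasm_tail pair.
 */
static void link_frag_rcv(struct tipc_link *l_ptr, struct sk_buff *buf)
{
	/* Both pointers may be updated: *headbuf tracks the partial message,
	 * *buf becomes the complete message (or NULL) on return.
	 */
	if (tipc_buf_append(&l_ptr->reasm_buf, &buf)) {
		/* whole message rebuilt; hand it upwards */
		tipc_net_route_msg(buf);
		return;
	}
	/* return 0: the fragment was absorbed into reasm_buf, or the
	 * sequence was invalid and both buffers were already freed; either
	 * way there is nothing left to do here.
	 */
}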
* @@ -711,4 +711,7 @@ void tipc_msg_init(struct tipc_msg *m, u32 user, u32 type, u32 hsize, u32 destnode); int tipc_msg_build(struct tipc_msg *hdr, struct iovec const *msg_sect, unsigned int len, int max_size, struct sk_buff **buf); + +int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf); + #endif diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index e0d08055754..8ce730984aa 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -38,34 +38,6 @@ #include "link.h" #include "name_distr.h" -#define ITEM_SIZE sizeof(struct distr_item) - -/** - * struct distr_item - publication info distributed to other nodes - * @type: name sequence type - * @lower: name sequence lower bound - * @upper: name sequence upper bound - * @ref: publishing port reference - * @key: publication key - * - * ===> All fields are stored in network byte order. <=== - * - * First 3 fields identify (name or) name sequence being published. - * Reference field uniquely identifies port that published name sequence. - * Key field uniquely identifies publication, in the event a port has - * multiple publications of the same name sequence. - * - * Note: There is no field that identifies the publishing node because it is - * the same for all items contained within a publication message. - */ -struct distr_item { - __be32 type; - __be32 lower; - __be32 upper; - __be32 ref; - __be32 key; -}; - /** * struct publ_list - list of publications made by this node * @list: circular list of publications @@ -127,20 +99,28 @@ static struct sk_buff *named_prepare_buf(u32 type, u32 size, u32 dest) return buf; } -static void named_cluster_distribute(struct sk_buff *buf) +void named_cluster_distribute(struct sk_buff *buf) { struct sk_buff *buf_copy; struct tipc_node *n_ptr; + struct tipc_link *l_ptr; - list_for_each_entry(n_ptr, &tipc_node_list, list) { - if (tipc_node_active_links(n_ptr)) { + rcu_read_lock(); + list_for_each_entry_rcu(n_ptr, &tipc_node_list, list) { + tipc_node_lock(n_ptr); + l_ptr = n_ptr->active_links[n_ptr->addr & 1]; + if (l_ptr) { buf_copy = skb_copy(buf, GFP_ATOMIC); - if (!buf_copy) + if (!buf_copy) { + tipc_node_unlock(n_ptr); break; + } msg_set_destnode(buf_msg(buf_copy), n_ptr->addr); - tipc_link_send(buf_copy, n_ptr->addr, n_ptr->addr); + __tipc_link_xmit(l_ptr, buf_copy); } + tipc_node_unlock(n_ptr); } + rcu_read_unlock(); kfree_skb(buf); } @@ -148,7 +128,7 @@ static void named_cluster_distribute(struct sk_buff *buf) /** * tipc_named_publish - tell other nodes about a new publication by this node */ -void tipc_named_publish(struct publication *publ) +struct sk_buff *tipc_named_publish(struct publication *publ) { struct sk_buff *buf; struct distr_item *item; @@ -157,23 +137,23 @@ void tipc_named_publish(struct publication *publ) publ_lists[publ->scope]->size++; if (publ->scope == TIPC_NODE_SCOPE) - return; + return NULL; buf = named_prepare_buf(PUBLICATION, ITEM_SIZE, 0); if (!buf) { pr_warn("Publication distribution failure\n"); - return; + return NULL; } item = (struct distr_item *)msg_data(buf_msg(buf)); publ_to_item(item, publ); - named_cluster_distribute(buf); + return buf; } /** * tipc_named_withdraw - tell other nodes about a withdrawn publication by this node */ -void tipc_named_withdraw(struct publication *publ) +struct sk_buff *tipc_named_withdraw(struct publication *publ) { struct sk_buff *buf; struct distr_item *item; @@ -182,17 +162,17 @@ void tipc_named_withdraw(struct publication *publ) publ_lists[publ->scope]->size--; if (publ->scope == TIPC_NODE_SCOPE) - return; + 
return NULL; buf = named_prepare_buf(WITHDRAWAL, ITEM_SIZE, 0); if (!buf) { pr_warn("Withdrawal distribution failure\n"); - return; + return NULL; } item = (struct distr_item *)msg_data(buf_msg(buf)); publ_to_item(item, publ); - named_cluster_distribute(buf); + return buf; } /* @@ -231,38 +211,16 @@ static void named_distribute(struct list_head *message_list, u32 node, /** * tipc_named_node_up - tell specified node about all publications by this node */ -void tipc_named_node_up(unsigned long nodearg) +void tipc_named_node_up(u32 max_item_buf, u32 node) { - struct tipc_node *n_ptr; - struct tipc_link *l_ptr; - struct list_head message_list; - u32 node = (u32)nodearg; - u32 max_item_buf = 0; - - /* compute maximum amount of publication data to send per message */ - read_lock_bh(&tipc_net_lock); - n_ptr = tipc_node_find(node); - if (n_ptr) { - tipc_node_lock(n_ptr); - l_ptr = n_ptr->active_links[0]; - if (l_ptr) - max_item_buf = ((l_ptr->max_pkt - INT_H_SIZE) / - ITEM_SIZE) * ITEM_SIZE; - tipc_node_unlock(n_ptr); - } - read_unlock_bh(&tipc_net_lock); - if (!max_item_buf) - return; - - /* create list of publication messages, then send them as a unit */ - INIT_LIST_HEAD(&message_list); + LIST_HEAD(message_list); read_lock_bh(&tipc_nametbl_lock); named_distribute(&message_list, node, &publ_cluster, max_item_buf); named_distribute(&message_list, node, &publ_zone, max_item_buf); read_unlock_bh(&tipc_nametbl_lock); - tipc_link_send_names(&message_list, node); + tipc_link_names_xmit(&message_list, node); } /** @@ -293,9 +251,9 @@ static void named_purge_publ(struct publication *publ) } /** - * tipc_named_recv - process name table update message sent by another node + * tipc_named_rcv - process name table update message sent by another node */ -void tipc_named_recv(struct sk_buff *buf) +void tipc_named_rcv(struct sk_buff *buf) { struct publication *publ; struct tipc_msg *msg = buf_msg(buf); diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h index 1e41bdd4f25..b2eed4ec152 100644 --- a/net/tipc/name_distr.h +++ b/net/tipc/name_distr.h @@ -39,10 +39,39 @@ #include "name_table.h" -void tipc_named_publish(struct publication *publ); -void tipc_named_withdraw(struct publication *publ); -void tipc_named_node_up(unsigned long node); -void tipc_named_recv(struct sk_buff *buf); +#define ITEM_SIZE sizeof(struct distr_item) + +/** + * struct distr_item - publication info distributed to other nodes + * @type: name sequence type + * @lower: name sequence lower bound + * @upper: name sequence upper bound + * @ref: publishing port reference + * @key: publication key + * + * ===> All fields are stored in network byte order. <=== + * + * First 3 fields identify (name or) name sequence being published. + * Reference field uniquely identifies port that published name sequence. + * Key field uniquely identifies publication, in the event a port has + * multiple publications of the same name sequence. + * + * Note: There is no field that identifies the publishing node because it is + * the same for all items contained within a publication message. 
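The distr_item fields documented above are all carried in network byte order. The in-tree conversion helper is publ_to_item(), which is called but not shown in this series; the stand-alone sketch below only illustrates the documented layout and assumes struct publication carries matching host-order type/lower/upper/ref/key fields.

/* Sketch of the publication -> wire-item conversion implied by the
 * struct distr_item documentation (all fields in network byte order).
 */
static void publ_to_item_sketch(struct distr_item *i,
				const struct publication *p)
{
	i->type  = htonl(p->type);	/* name sequence type */
	i->lower = htonl(p->lower);	/* name sequence lower bound */
	i->upper = htonl(p->upper);	/* name sequence upper bound */
	i->ref   = htonl(p->ref);	/* publishing port reference */
	i->key   = htonl(p->key);	/* publication key */
	/* no node field: the receiver takes it from the message header */
}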
+ */ +struct distr_item { + __be32 type; + __be32 lower; + __be32 upper; + __be32 ref; + __be32 key; +}; + +struct sk_buff *tipc_named_publish(struct publication *publ); +struct sk_buff *tipc_named_withdraw(struct publication *publ); +void named_cluster_distribute(struct sk_buff *buf); +void tipc_named_node_up(u32 max_item_buf, u32 node); +void tipc_named_rcv(struct sk_buff *buf); void tipc_named_reinit(void); #endif diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 09dcd54b04e..9d7d37d9518 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -148,8 +148,7 @@ static struct publication *publ_create(u32 type, u32 lower, u32 upper, */ static struct sub_seq *tipc_subseq_alloc(u32 cnt) { - struct sub_seq *sseq = kcalloc(cnt, sizeof(struct sub_seq), GFP_ATOMIC); - return sseq; + return kcalloc(cnt, sizeof(struct sub_seq), GFP_ATOMIC); } /** @@ -665,6 +664,7 @@ struct publication *tipc_nametbl_publish(u32 type, u32 lower, u32 upper, u32 scope, u32 port_ref, u32 key) { struct publication *publ; + struct sk_buff *buf = NULL; if (table.local_publ_count >= TIPC_MAX_PUBLICATIONS) { pr_warn("Publication failed, local publication limit reached (%u)\n", @@ -677,9 +677,12 @@ struct publication *tipc_nametbl_publish(u32 type, u32 lower, u32 upper, tipc_own_addr, port_ref, key); if (likely(publ)) { table.local_publ_count++; - tipc_named_publish(publ); + buf = tipc_named_publish(publ); } write_unlock_bh(&tipc_nametbl_lock); + + if (buf) + named_cluster_distribute(buf); return publ; } @@ -689,15 +692,19 @@ struct publication *tipc_nametbl_publish(u32 type, u32 lower, u32 upper, int tipc_nametbl_withdraw(u32 type, u32 lower, u32 ref, u32 key) { struct publication *publ; + struct sk_buff *buf; write_lock_bh(&tipc_nametbl_lock); publ = tipc_nametbl_remove_publ(type, lower, tipc_own_addr, ref, key); if (likely(publ)) { table.local_publ_count--; - tipc_named_withdraw(publ); + buf = tipc_named_withdraw(publ); write_unlock_bh(&tipc_nametbl_lock); list_del_init(&publ->pport_list); kfree(publ); + + if (buf) + named_cluster_distribute(buf); return 1; } write_unlock_bh(&tipc_nametbl_lock); @@ -942,20 +949,48 @@ int tipc_nametbl_init(void) return 0; } -void tipc_nametbl_stop(void) +/** + * tipc_purge_publications - remove all publications for a given type + * + * tipc_nametbl_lock must be held when calling this function + */ +static void tipc_purge_publications(struct name_seq *seq) { - u32 i; + struct publication *publ, *safe; + struct sub_seq *sseq; + struct name_info *info; - if (!table.types) + if (!seq->sseqs) { + nameseq_delete_empty(seq); return; + } + sseq = seq->sseqs; + info = sseq->info; + list_for_each_entry_safe(publ, safe, &info->zone_list, zone_list) { + tipc_nametbl_remove_publ(publ->type, publ->lower, publ->node, + publ->ref, publ->key); + kfree(publ); + } +} + +void tipc_nametbl_stop(void) +{ + u32 i; + struct name_seq *seq; + struct hlist_head *seq_head; + struct hlist_node *safe; - /* Verify name table is empty, then release it */ + /* Verify name table is empty and purge any lingering + * publications, then release the name table + */ write_lock_bh(&tipc_nametbl_lock); for (i = 0; i < TIPC_NAMETBL_SIZE; i++) { if (hlist_empty(&table.types[i])) continue; - pr_err("nametbl_stop(): orphaned hash chain detected\n"); - break; + seq_head = &table.types[i]; + hlist_for_each_entry_safe(seq, safe, seq_head, ns_list) { + tipc_purge_publications(seq); + } } kfree(table.types); table.types = NULL; diff --git a/net/tipc/net.c b/net/tipc/net.c index 7d305ecc09c..f64375e7f99 100644 --- 
a/net/tipc/net.c +++ b/net/tipc/net.c @@ -39,45 +39,41 @@ #include "name_distr.h" #include "subscr.h" #include "port.h" +#include "socket.h" #include "node.h" #include "config.h" /* * The TIPC locking policy is designed to ensure a very fine locking * granularity, permitting complete parallel access to individual - * port and node/link instances. The code consists of three major + * port and node/link instances. The code consists of four major * locking domains, each protected with their own disjunct set of locks. * - * 1: The routing hierarchy. - * Comprises the structures 'zone', 'cluster', 'node', 'link' - * and 'bearer'. The whole hierarchy is protected by a big - * read/write lock, tipc_net_lock, to enssure that nothing is added - * or removed while code is accessing any of these structures. - * This layer must not be called from the two others while they - * hold any of their own locks. - * Neither must it itself do any upcalls to the other two before - * it has released tipc_net_lock and other protective locks. + * 1: The bearer level. + * RTNL lock is used to serialize the process of configuring bearer + * on update side, and RCU lock is applied on read side to make + * bearer instance valid on both paths of message transmission and + * reception. * - * Within the tipc_net_lock domain there are two sub-domains;'node' and - * 'bearer', where local write operations are permitted, - * provided that those are protected by individual spin_locks - * per instance. Code holding tipc_net_lock(read) and a node spin_lock - * is permitted to poke around in both the node itself and its - * subordinate links. I.e, it can update link counters and queues, - * change link state, send protocol messages, and alter the - * "active_links" array in the node; but it can _not_ remove a link - * or a node from the overall structure. - * Correspondingly, individual bearers may change status within a - * tipc_net_lock(read), protected by an individual spin_lock ber bearer - * instance, but it needs tipc_net_lock(write) to remove/add any bearers. + * 2: The node and link level. + * All node instances are saved into two tipc_node_list and node_htable + * lists. The two lists are protected by node_list_lock on write side, + * and they are guarded with RCU lock on read side. Especially node + * instance is destroyed only when TIPC module is removed, and we can + * confirm that there has no any user who is accessing the node at the + * moment. Therefore, Except for iterating the two lists within RCU + * protection, it's no needed to hold RCU that we access node instance + * in other places. * + * In addition, all members in node structure including link instances + * are protected by node spin lock. * - * 2: The transport level of the protocol. - * This consists of the structures port, (and its user level - * representations, such as user_port and tipc_sock), reference and - * tipc_user (port.c, reg.c, socket.c). + * 3: The transport level of the protocol. + * This consists of the structures port, (and its user level + * representations, such as user_port and tipc_sock), reference and + * tipc_user (port.c, reg.c, socket.c). * - * This layer has four different locks: + * This layer has four different locks: * - The tipc_port spin_lock. This is protecting each port instance * from parallel data access and removal. 
Since we can not place * this lock in the port itself, it has been placed in the @@ -96,7 +92,7 @@ * There are two such lists; 'port_list', which is used for management, * and 'wait_list', which is used to queue ports during congestion. * - * 3: The name table (name_table.c, name_distr.c, subscription.c) + * 4: The name table (name_table.c, name_distr.c, subscription.c) * - There is one big read/write-lock (tipc_nametbl_lock) protecting the * overall name table structure. Nothing must be added/removed to * this structure without holding write access to it. @@ -108,8 +104,6 @@ * - A local spin_lock protecting the queue of subscriber events. */ -DEFINE_RWLOCK(tipc_net_lock); - static void net_route_named_msg(struct sk_buff *buf) { struct tipc_msg *msg = buf_msg(buf); @@ -146,19 +140,19 @@ void tipc_net_route_msg(struct sk_buff *buf) if (tipc_in_scope(dnode, tipc_own_addr)) { if (msg_isdata(msg)) { if (msg_mcast(msg)) - tipc_port_recv_mcast(buf, NULL); + tipc_port_mcast_rcv(buf, NULL); else if (msg_destport(msg)) - tipc_port_recv_msg(buf); + tipc_sk_rcv(buf); else net_route_named_msg(buf); return; } switch (msg_user(msg)) { case NAME_DISTRIBUTOR: - tipc_named_recv(buf); + tipc_named_rcv(buf); break; case CONN_MANAGER: - tipc_port_recv_proto_msg(buf); + tipc_port_proto_rcv(buf); break; default: kfree_skb(buf); @@ -168,38 +162,41 @@ void tipc_net_route_msg(struct sk_buff *buf) /* Handle message for another node */ skb_trim(buf, msg_size(msg)); - tipc_link_send(buf, dnode, msg_link_selector(msg)); + tipc_link_xmit(buf, dnode, msg_link_selector(msg)); } -void tipc_net_start(u32 addr) +int tipc_net_start(u32 addr) { char addr_string[16]; + int res; - write_lock_bh(&tipc_net_lock); tipc_own_addr = addr; tipc_named_reinit(); tipc_port_reinit(); - tipc_bclink_init(); - write_unlock_bh(&tipc_net_lock); + res = tipc_bclink_init(); + if (res) + return res; - tipc_cfg_reinit(); + tipc_nametbl_publish(TIPC_CFG_SRV, tipc_own_addr, tipc_own_addr, + TIPC_ZONE_SCOPE, 0, tipc_own_addr); pr_info("Started in network mode\n"); pr_info("Own node address %s, network identity %u\n", tipc_addr_string_fill(addr_string, tipc_own_addr), tipc_net_id); + return 0; } void tipc_net_stop(void) { - struct tipc_node *node, *t_node; - if (!tipc_own_addr) return; - write_lock_bh(&tipc_net_lock); + + tipc_nametbl_withdraw(TIPC_CFG_SRV, tipc_own_addr, 0, tipc_own_addr); + rtnl_lock(); tipc_bearer_stop(); tipc_bclink_stop(); - list_for_each_entry_safe(node, t_node, &tipc_node_list, list) - tipc_node_delete(node); - write_unlock_bh(&tipc_net_lock); + tipc_node_stop(); + rtnl_unlock(); + pr_info("Left network mode\n"); } diff --git a/net/tipc/net.h b/net/tipc/net.h index 079daadb3f7..c6c2b46f7c2 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -37,11 +37,9 @@ #ifndef _TIPC_NET_H #define _TIPC_NET_H -extern rwlock_t tipc_net_lock; - void tipc_net_route_msg(struct sk_buff *buf); -void tipc_net_start(u32 addr); +int tipc_net_start(u32 addr); void tipc_net_stop(void); #endif diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 9f72a637636..ad844d36534 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -47,7 +47,7 @@ static int handle_cmd(struct sk_buff *skb, struct genl_info *info) int hdr_space = nlmsg_total_size(GENL_HDRLEN + TIPC_GENL_HDRLEN); u16 cmd; - if ((req_userhdr->cmd & 0xC000) && (!capable(CAP_NET_ADMIN))) + if ((req_userhdr->cmd & 0xC000) && (!netlink_capable(skb, CAP_NET_ADMIN))) cmd = TIPC_CMD_NOT_NET_ADMIN; else cmd = req_userhdr->cmd; @@ -83,8 +83,6 @@ static struct genl_ops tipc_genl_ops[] = { }, }; 
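The node-level locking described in the reworked net.c comment (RCU for readers, node_list_lock only for list updates, a per-node spinlock for the node's own fields) reduces to the read-side pattern already used by named_cluster_distribute() above. A minimal sketch of that pattern, using only symbols visible in this series (tipc_node_list, tipc_node_lock/unlock); the function name is hypothetical.

/* Read-side walk over the cluster's nodes under the new locking scheme:
 * RCU pins list membership, the per-node spinlock protects the node's
 * own fields while we look at them.
 */
static void for_each_cluster_node_sketch(void)
{
	struct tipc_node *n_ptr;

	rcu_read_lock();
	list_for_each_entry_rcu(n_ptr, &tipc_node_list, list) {
		tipc_node_lock(n_ptr);
		/* safe to inspect/update n_ptr->links[], active_links[],
		 * action_flags etc.; adding or removing list entries still
		 * requires node_list_lock on the write side
		 */
		tipc_node_unlock(n_ptr);
	}
	rcu_read_unlock();
}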
-static int tipc_genl_family_registered; - int tipc_netlink_start(void) { int res; @@ -94,16 +92,10 @@ int tipc_netlink_start(void) pr_err("Failed to register netlink interface\n"); return res; } - - tipc_genl_family_registered = 1; return 0; } void tipc_netlink_stop(void) { - if (!tipc_genl_family_registered) - return; - genl_unregister_family(&tipc_genl_family); - tipc_genl_family_registered = 0; } diff --git a/net/tipc/node.c b/net/tipc/node.c index 25100c0a6fe..5b44c3041be 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -2,7 +2,7 @@ * net/tipc/node.c: TIPC node management routines * * Copyright (c) 2000-2006, 2012 Ericsson AB - * Copyright (c) 2005-2006, 2010-2011, Wind River Systems + * Copyright (c) 2005-2006, 2010-2014, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -44,13 +44,11 @@ static void node_lost_contact(struct tipc_node *n_ptr); static void node_established_contact(struct tipc_node *n_ptr); -static DEFINE_SPINLOCK(node_create_lock); - static struct hlist_head node_htable[NODE_HTABLE_SIZE]; LIST_HEAD(tipc_node_list); static u32 tipc_num_nodes; - -static atomic_t tipc_num_links = ATOMIC_INIT(0); +static u32 tipc_num_links; +static DEFINE_SPINLOCK(node_list_lock); /* * A trivial power-of-two bitmask technique is used for speed, since this @@ -73,37 +71,26 @@ struct tipc_node *tipc_node_find(u32 addr) if (unlikely(!in_own_cluster_exact(addr))) return NULL; - hlist_for_each_entry(node, &node_htable[tipc_hashfn(addr)], hash) { - if (node->addr == addr) + rcu_read_lock(); + hlist_for_each_entry_rcu(node, &node_htable[tipc_hashfn(addr)], hash) { + if (node->addr == addr) { + rcu_read_unlock(); return node; + } } + rcu_read_unlock(); return NULL; } -/** - * tipc_node_create - create neighboring node - * - * Currently, this routine is called by neighbor discovery code, which holds - * net_lock for reading only. We must take node_create_lock to ensure a node - * isn't created twice if two different bearers discover the node at the same - * time. (It would be preferable to switch to holding net_lock in write mode, - * but this is a non-trivial change.) 
- */ struct tipc_node *tipc_node_create(u32 addr) { struct tipc_node *n_ptr, *temp_node; - spin_lock_bh(&node_create_lock); - - n_ptr = tipc_node_find(addr); - if (n_ptr) { - spin_unlock_bh(&node_create_lock); - return n_ptr; - } + spin_lock_bh(&node_list_lock); n_ptr = kzalloc(sizeof(*n_ptr), GFP_ATOMIC); if (!n_ptr) { - spin_unlock_bh(&node_create_lock); + spin_unlock_bh(&node_list_lock); pr_warn("Node creation failed, no memory\n"); return NULL; } @@ -114,31 +101,41 @@ struct tipc_node *tipc_node_create(u32 addr) INIT_LIST_HEAD(&n_ptr->list); INIT_LIST_HEAD(&n_ptr->nsub); - hlist_add_head(&n_ptr->hash, &node_htable[tipc_hashfn(addr)]); + hlist_add_head_rcu(&n_ptr->hash, &node_htable[tipc_hashfn(addr)]); - list_for_each_entry(temp_node, &tipc_node_list, list) { + list_for_each_entry_rcu(temp_node, &tipc_node_list, list) { if (n_ptr->addr < temp_node->addr) break; } - list_add_tail(&n_ptr->list, &temp_node->list); - n_ptr->block_setup = WAIT_PEER_DOWN; + list_add_tail_rcu(&n_ptr->list, &temp_node->list); + n_ptr->action_flags = TIPC_WAIT_PEER_LINKS_DOWN; n_ptr->signature = INVALID_NODE_SIG; tipc_num_nodes++; - spin_unlock_bh(&node_create_lock); + spin_unlock_bh(&node_list_lock); return n_ptr; } -void tipc_node_delete(struct tipc_node *n_ptr) +static void tipc_node_delete(struct tipc_node *n_ptr) { - list_del(&n_ptr->list); - hlist_del(&n_ptr->hash); - kfree(n_ptr); + list_del_rcu(&n_ptr->list); + hlist_del_rcu(&n_ptr->hash); + kfree_rcu(n_ptr, rcu); tipc_num_nodes--; } +void tipc_node_stop(void) +{ + struct tipc_node *node, *t_node; + + spin_lock_bh(&node_list_lock); + list_for_each_entry_safe(node, t_node, &tipc_node_list, list) + tipc_node_delete(node); + spin_unlock_bh(&node_list_lock); +} + /** * tipc_node_link_up - handle addition of link * @@ -147,11 +144,13 @@ void tipc_node_delete(struct tipc_node *n_ptr) void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr) { struct tipc_link **active = &n_ptr->active_links[0]; + u32 addr = n_ptr->addr; n_ptr->working_links++; - + tipc_nametbl_publish(TIPC_LINK_STATE, addr, addr, TIPC_NODE_SCOPE, + l_ptr->bearer_id, addr); pr_info("Established link <%s> on network plane %c\n", - l_ptr->name, l_ptr->b_ptr->net_plane); + l_ptr->name, l_ptr->net_plane); if (!active[0]) { active[0] = active[1] = l_ptr; @@ -162,7 +161,7 @@ void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr) pr_info("New link <%s> becomes standby\n", l_ptr->name); return; } - tipc_link_send_duplicate(active[0], l_ptr); + tipc_link_dup_queue_xmit(active[0], l_ptr); if (l_ptr->priority == active[0]->priority) { active[0] = l_ptr; return; @@ -206,16 +205,18 @@ static void node_select_active_links(struct tipc_node *n_ptr) void tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr) { struct tipc_link **active; + u32 addr = n_ptr->addr; n_ptr->working_links--; + tipc_nametbl_withdraw(TIPC_LINK_STATE, addr, l_ptr->bearer_id, addr); if (!tipc_link_is_active(l_ptr)) { pr_info("Lost standby link <%s> on network plane %c\n", - l_ptr->name, l_ptr->b_ptr->net_plane); + l_ptr->name, l_ptr->net_plane); return; } pr_info("Lost link <%s> on network plane %c\n", - l_ptr->name, l_ptr->b_ptr->net_plane); + l_ptr->name, l_ptr->net_plane); active = &n_ptr->active_links[0]; if (active[0] == l_ptr) @@ -225,7 +226,7 @@ void tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr) if (active[0] == l_ptr) node_select_active_links(n_ptr); if (tipc_node_is_up(n_ptr)) - tipc_link_changeover(l_ptr); + tipc_link_failover_send_queue(l_ptr); else 
node_lost_contact(n_ptr); } @@ -235,11 +236,6 @@ int tipc_node_active_links(struct tipc_node *n_ptr) return n_ptr->active_links[0] != NULL; } -int tipc_node_redundant_links(struct tipc_node *n_ptr) -{ - return n_ptr->working_links > 1; -} - int tipc_node_is_up(struct tipc_node *n_ptr) { return tipc_node_active_links(n_ptr); @@ -247,40 +243,36 @@ int tipc_node_is_up(struct tipc_node *n_ptr) void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr) { - n_ptr->links[l_ptr->b_ptr->identity] = l_ptr; - atomic_inc(&tipc_num_links); + n_ptr->links[l_ptr->bearer_id] = l_ptr; + spin_lock_bh(&node_list_lock); + tipc_num_links++; + spin_unlock_bh(&node_list_lock); n_ptr->link_cnt++; } void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr) { - n_ptr->links[l_ptr->b_ptr->identity] = NULL; - atomic_dec(&tipc_num_links); - n_ptr->link_cnt--; + int i; + + for (i = 0; i < MAX_BEARERS; i++) { + if (l_ptr != n_ptr->links[i]) + continue; + n_ptr->links[i] = NULL; + spin_lock_bh(&node_list_lock); + tipc_num_links--; + spin_unlock_bh(&node_list_lock); + n_ptr->link_cnt--; + } } static void node_established_contact(struct tipc_node *n_ptr) { - tipc_k_signal((Handler)tipc_named_node_up, n_ptr->addr); + n_ptr->action_flags |= TIPC_NOTIFY_NODE_UP; n_ptr->bclink.oos_state = 0; n_ptr->bclink.acked = tipc_bclink_get_last_sent(); tipc_bclink_add_node(n_ptr->addr); } -static void node_name_purge_complete(unsigned long node_addr) -{ - struct tipc_node *n_ptr; - - read_lock_bh(&tipc_net_lock); - n_ptr = tipc_node_find(node_addr); - if (n_ptr) { - tipc_node_lock(n_ptr); - n_ptr->block_setup &= ~WAIT_NAMES_GONE; - tipc_node_unlock(n_ptr); - } - read_unlock_bh(&tipc_net_lock); -} - static void node_lost_contact(struct tipc_node *n_ptr) { char addr_string[16]; @@ -291,17 +283,12 @@ static void node_lost_contact(struct tipc_node *n_ptr) /* Flush broadcast link info associated with lost node */ if (n_ptr->bclink.recv_permitted) { - while (n_ptr->bclink.deferred_head) { - struct sk_buff *buf = n_ptr->bclink.deferred_head; - n_ptr->bclink.deferred_head = buf->next; - kfree_skb(buf); - } + kfree_skb_list(n_ptr->bclink.deferred_head); n_ptr->bclink.deferred_size = 0; - if (n_ptr->bclink.reasm_head) { - kfree_skb(n_ptr->bclink.reasm_head); - n_ptr->bclink.reasm_head = NULL; - n_ptr->bclink.reasm_tail = NULL; + if (n_ptr->bclink.reasm_buf) { + kfree_skb(n_ptr->bclink.reasm_buf); + n_ptr->bclink.reasm_buf = NULL; } tipc_bclink_remove_node(n_ptr->addr); @@ -320,12 +307,13 @@ static void node_lost_contact(struct tipc_node *n_ptr) tipc_link_reset_fragments(l_ptr); } - /* Notify subscribers */ - tipc_nodesub_notify(n_ptr); + n_ptr->action_flags &= ~TIPC_WAIT_OWN_LINKS_DOWN; - /* Prevent re-contact with node until cleanup is done */ - n_ptr->block_setup = WAIT_PEER_DOWN | WAIT_NAMES_GONE; - tipc_k_signal((Handler)node_name_purge_complete, n_ptr->addr); + /* Notify subscribers and prevent re-contact with node until + * cleanup is done. 
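The TIPC_NOTIFY_*/TIPC_WAIT_* bits set here are not acted on immediately: the new tipc_node_unlock() (further down in node.c) inspects action_flags and performs the name-table and subscription work only after the node spinlock has been dropped, replacing the old tipc_k_signal() deferral. A sketch of the shape of that pattern, with a hypothetical caller name; the flag values are the ones defined in node.h in this series.

/* Deferred-action pattern: under node->lock we only record what has to
 * happen; tipc_node_unlock() carries it out after spin_unlock_bh(), so no
 * name-table or subscriber code runs with the node lock held.
 */
static void node_event_sketch(struct tipc_node *n_ptr, bool up)
{
	tipc_node_lock(n_ptr);
	if (up)
		n_ptr->action_flags |= TIPC_NOTIFY_NODE_UP;
	else
		n_ptr->action_flags |= TIPC_NOTIFY_NODE_DOWN |
				       TIPC_WAIT_PEER_LINKS_DOWN;
	/* ...update links, bclink state, counters under the lock... */
	tipc_node_unlock(n_ptr);	/* notifications happen in here */
}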
+ */ + n_ptr->action_flags |= TIPC_WAIT_PEER_LINKS_DOWN | + TIPC_NOTIFY_NODE_DOWN; } struct sk_buff *tipc_node_get_nodes(const void *req_tlv_area, int req_tlv_space) @@ -344,27 +332,28 @@ struct sk_buff *tipc_node_get_nodes(const void *req_tlv_area, int req_tlv_space) return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE " (network address)"); - read_lock_bh(&tipc_net_lock); + spin_lock_bh(&node_list_lock); if (!tipc_num_nodes) { - read_unlock_bh(&tipc_net_lock); + spin_unlock_bh(&node_list_lock); return tipc_cfg_reply_none(); } /* For now, get space for all other nodes */ payload_size = TLV_SPACE(sizeof(node_info)) * tipc_num_nodes; if (payload_size > 32768u) { - read_unlock_bh(&tipc_net_lock); + spin_unlock_bh(&node_list_lock); return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED " (too many nodes)"); } + spin_unlock_bh(&node_list_lock); + buf = tipc_cfg_reply_alloc(payload_size); - if (!buf) { - read_unlock_bh(&tipc_net_lock); + if (!buf) return NULL; - } /* Add TLVs for all nodes in scope */ - list_for_each_entry(n_ptr, &tipc_node_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(n_ptr, &tipc_node_list, list) { if (!tipc_in_scope(domain, n_ptr->addr)) continue; node_info.addr = htonl(n_ptr->addr); @@ -372,8 +361,7 @@ struct sk_buff *tipc_node_get_nodes(const void *req_tlv_area, int req_tlv_space) tipc_cfg_append_tlv(buf, TIPC_TLV_NODE_INFO, &node_info, sizeof(node_info)); } - - read_unlock_bh(&tipc_net_lock); + rcu_read_unlock(); return buf; } @@ -396,21 +384,19 @@ struct sk_buff *tipc_node_get_links(const void *req_tlv_area, int req_tlv_space) if (!tipc_own_addr) return tipc_cfg_reply_none(); - read_lock_bh(&tipc_net_lock); - + spin_lock_bh(&node_list_lock); /* Get space for all unicast links + broadcast link */ - payload_size = TLV_SPACE(sizeof(link_info)) * - (atomic_read(&tipc_num_links) + 1); + payload_size = TLV_SPACE((sizeof(link_info)) * (tipc_num_links + 1)); if (payload_size > 32768u) { - read_unlock_bh(&tipc_net_lock); + spin_unlock_bh(&node_list_lock); return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED " (too many links)"); } + spin_unlock_bh(&node_list_lock); + buf = tipc_cfg_reply_alloc(payload_size); - if (!buf) { - read_unlock_bh(&tipc_net_lock); + if (!buf) return NULL; - } /* Add TLV for broadcast link */ link_info.dest = htonl(tipc_cluster_mask(tipc_own_addr)); @@ -419,7 +405,8 @@ struct sk_buff *tipc_node_get_links(const void *req_tlv_area, int req_tlv_space) tipc_cfg_append_tlv(buf, TIPC_TLV_LINK_INFO, &link_info, sizeof(link_info)); /* Add TLVs for any other links in scope */ - list_for_each_entry(n_ptr, &tipc_node_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(n_ptr, &tipc_node_list, list) { u32 i; if (!tipc_in_scope(domain, n_ptr->addr)) @@ -436,7 +423,66 @@ struct sk_buff *tipc_node_get_links(const void *req_tlv_area, int req_tlv_space) } tipc_node_unlock(n_ptr); } - - read_unlock_bh(&tipc_net_lock); + rcu_read_unlock(); return buf; } + +/** + * tipc_node_get_linkname - get the name of a link + * + * @bearer_id: id of the bearer + * @node: peer node address + * @linkname: link name output buffer + * + * Returns 0 on success + */ +int tipc_node_get_linkname(u32 bearer_id, u32 addr, char *linkname, size_t len) +{ + struct tipc_link *link; + struct tipc_node *node = tipc_node_find(addr); + + if ((bearer_id >= MAX_BEARERS) || !node) + return -EINVAL; + tipc_node_lock(node); + link = node->links[bearer_id]; + if (link) { + strncpy(linkname, link->name, len); + tipc_node_unlock(node); + return 0; + } + tipc_node_unlock(node); + 
return -EINVAL; +} + +void tipc_node_unlock(struct tipc_node *node) +{ + LIST_HEAD(nsub_list); + struct tipc_link *link; + int pkt_sz = 0; + u32 addr = 0; + + if (likely(!node->action_flags)) { + spin_unlock_bh(&node->lock); + return; + } + + if (node->action_flags & TIPC_NOTIFY_NODE_DOWN) { + list_replace_init(&node->nsub, &nsub_list); + node->action_flags &= ~TIPC_NOTIFY_NODE_DOWN; + } + if (node->action_flags & TIPC_NOTIFY_NODE_UP) { + link = node->active_links[0]; + node->action_flags &= ~TIPC_NOTIFY_NODE_UP; + if (link) { + pkt_sz = ((link->max_pkt - INT_H_SIZE) / ITEM_SIZE) * + ITEM_SIZE; + addr = node->addr; + } + } + spin_unlock_bh(&node->lock); + + if (!list_empty(&nsub_list)) + tipc_nodesub_notify(&nsub_list); + if (pkt_sz) + tipc_named_node_up(pkt_sz, addr); +} diff --git a/net/tipc/node.h b/net/tipc/node.h index e5e96c04e16..9087063793f 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -2,7 +2,7 @@ * net/tipc/node.h: Include file for TIPC node management routines * * Copyright (c) 2000-2006, Ericsson AB - * Copyright (c) 2005, 2010-2011, Wind River Systems + * Copyright (c) 2005, 2010-2014, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -47,87 +47,100 @@ */ #define INVALID_NODE_SIG 0x10000 -/* Flags used to block (re)establishment of contact with a neighboring node */ -#define WAIT_PEER_DOWN 0x0001 /* wait to see that peer's links are down */ -#define WAIT_NAMES_GONE 0x0002 /* wait for peer's publications to be purged */ -#define WAIT_NODE_DOWN 0x0004 /* wait until peer node is declared down */ +/* Flags used to take different actions according to flag type + * TIPC_WAIT_PEER_LINKS_DOWN: wait to see that peer's links are down + * TIPC_WAIT_OWN_LINKS_DOWN: wait until peer node is declared down + * TIPC_NOTIFY_NODE_DOWN: notify node is down + * TIPC_NOTIFY_NODE_UP: notify node is up + */ +enum { + TIPC_WAIT_PEER_LINKS_DOWN = (1 << 1), + TIPC_WAIT_OWN_LINKS_DOWN = (1 << 2), + TIPC_NOTIFY_NODE_DOWN = (1 << 3), + TIPC_NOTIFY_NODE_UP = (1 << 4) +}; + +/** + * struct tipc_node_bclink - TIPC node bclink structure + * @acked: sequence # of last outbound b'cast message acknowledged by node + * @last_in: sequence # of last in-sequence b'cast message received from node + * @last_sent: sequence # of last b'cast message sent by node + * @oos_state: state tracker for handling OOS b'cast messages + * @deferred_size: number of OOS b'cast messages in deferred queue + * @deferred_head: oldest OOS b'cast message received from node + * @deferred_tail: newest OOS b'cast message received from node + * @reasm_buf: broadcast reassembly queue head from node + * @recv_permitted: true if node is allowed to receive b'cast messages + */ +struct tipc_node_bclink { + u32 acked; + u32 last_in; + u32 last_sent; + u32 oos_state; + u32 deferred_size; + struct sk_buff *deferred_head; + struct sk_buff *deferred_tail; + struct sk_buff *reasm_buf; + bool recv_permitted; +}; /** * struct tipc_node - TIPC node structure * @addr: network address of node * @lock: spinlock governing access to structure * @hash: links to adjacent nodes in unsorted hash chain - * @list: links to adjacent nodes in sorted list of cluster's nodes - * @nsub: list of "node down" subscriptions monitoring node * @active_links: pointers to active links to node * @links: pointers to all links to node + * @action_flags: bit mask of different types of node actions + * @bclink: broadcast-related info + * @list: links to adjacent nodes in sorted list of cluster's nodes * 
@working_links: number of working links to node (both active and standby) - * @block_setup: bit mask of conditions preventing link establishment to node * @link_cnt: number of links to node - * @permit_changeover: non-zero if node has redundant links to this system * @signature: node instance identifier - * @bclink: broadcast-related info - * @acked: sequence # of last outbound b'cast message acknowledged by node - * @last_in: sequence # of last in-sequence b'cast message received from node - * @last_sent: sequence # of last b'cast message sent by node - * @oos_state: state tracker for handling OOS b'cast messages - * @deferred_size: number of OOS b'cast messages in deferred queue - * @deferred_head: oldest OOS b'cast message received from node - * @deferred_tail: newest OOS b'cast message received from node - * @reasm_head: broadcast reassembly queue head from node - * @reasm_tail: last broadcast fragment received from node - * @recv_permitted: true if node is allowed to receive b'cast messages + * @nsub: list of "node down" subscriptions monitoring node + * @rcu: rcu struct for tipc_node */ struct tipc_node { u32 addr; spinlock_t lock; struct hlist_node hash; - struct list_head list; - struct list_head nsub; struct tipc_link *active_links[2]; struct tipc_link *links[MAX_BEARERS]; + unsigned int action_flags; + struct tipc_node_bclink bclink; + struct list_head list; int link_cnt; int working_links; - int block_setup; - int permit_changeover; u32 signature; - struct { - u32 acked; - u32 last_in; - u32 last_sent; - u32 oos_state; - u32 deferred_size; - struct sk_buff *deferred_head; - struct sk_buff *deferred_tail; - struct sk_buff *reasm_head; - struct sk_buff *reasm_tail; - bool recv_permitted; - } bclink; + struct list_head nsub; + struct rcu_head rcu; }; extern struct list_head tipc_node_list; struct tipc_node *tipc_node_find(u32 addr); struct tipc_node *tipc_node_create(u32 addr); -void tipc_node_delete(struct tipc_node *n_ptr); +void tipc_node_stop(void); void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr); void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr); void tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr); void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr); int tipc_node_active_links(struct tipc_node *n_ptr); -int tipc_node_redundant_links(struct tipc_node *n_ptr); int tipc_node_is_up(struct tipc_node *n_ptr); struct sk_buff *tipc_node_get_links(const void *req_tlv_area, int req_tlv_space); struct sk_buff *tipc_node_get_nodes(const void *req_tlv_area, int req_tlv_space); +int tipc_node_get_linkname(u32 bearer_id, u32 node, char *linkname, size_t len); +void tipc_node_unlock(struct tipc_node *node); -static inline void tipc_node_lock(struct tipc_node *n_ptr) +static inline void tipc_node_lock(struct tipc_node *node) { - spin_lock_bh(&n_ptr->lock); + spin_lock_bh(&node->lock); } -static inline void tipc_node_unlock(struct tipc_node *n_ptr) +static inline bool tipc_node_blocked(struct tipc_node *node) { - spin_unlock_bh(&n_ptr->lock); + return (node->action_flags & (TIPC_WAIT_PEER_LINKS_DOWN | + TIPC_NOTIFY_NODE_DOWN | TIPC_WAIT_OWN_LINKS_DOWN)); } #endif diff --git a/net/tipc/node_subscr.c b/net/tipc/node_subscr.c index 8a7384c04ad..7c59ab1d6ec 100644 --- a/net/tipc/node_subscr.c +++ b/net/tipc/node_subscr.c @@ -81,14 +81,13 @@ void tipc_nodesub_unsubscribe(struct tipc_node_subscr *node_sub) * * Note: node is locked by caller */ -void tipc_nodesub_notify(struct tipc_node *node) +void 
tipc_nodesub_notify(struct list_head *nsub_list) { - struct tipc_node_subscr *ns; + struct tipc_node_subscr *ns, *safe; - list_for_each_entry(ns, &node->nsub, nodesub_list) { + list_for_each_entry_safe(ns, safe, nsub_list, nodesub_list) { if (ns->handle_node_down) { - tipc_k_signal((Handler)ns->handle_node_down, - (unsigned long)ns->usr_handle); + ns->handle_node_down(ns->usr_handle); ns->handle_node_down = NULL; } } diff --git a/net/tipc/node_subscr.h b/net/tipc/node_subscr.h index c95d20727de..d91b8cc81e3 100644 --- a/net/tipc/node_subscr.h +++ b/net/tipc/node_subscr.h @@ -58,6 +58,6 @@ struct tipc_node_subscr { void tipc_nodesub_subscribe(struct tipc_node_subscr *node_sub, u32 addr, void *usr_handle, net_ev_handler handle_down); void tipc_nodesub_unsubscribe(struct tipc_node_subscr *node_sub); -void tipc_nodesub_notify(struct tipc_node *node); +void tipc_nodesub_notify(struct list_head *nsub_list); #endif diff --git a/net/tipc/port.c b/net/tipc/port.c index c081a763230..5fd7acce01e 100644 --- a/net/tipc/port.c +++ b/net/tipc/port.c @@ -1,7 +1,7 @@ /* * net/tipc/port.c: TIPC port code * - * Copyright (c) 1992-2007, Ericsson AB + * Copyright (c) 1992-2007, 2014, Ericsson AB * Copyright (c) 2004-2008, 2010-2013, Wind River Systems * All rights reserved. * @@ -38,6 +38,7 @@ #include "config.h" #include "port.h" #include "name_table.h" +#include "socket.h" /* Connection management: */ #define PROBING_INTERVAL 3600000 /* [ms] => 1 h */ @@ -54,17 +55,6 @@ static struct sk_buff *port_build_self_abort_msg(struct tipc_port *, u32 err); static struct sk_buff *port_build_peer_abort_msg(struct tipc_port *, u32 err); static void port_timeout(unsigned long ref); - -static u32 port_peernode(struct tipc_port *p_ptr) -{ - return msg_destnode(&p_ptr->phdr); -} - -static u32 port_peerport(struct tipc_port *p_ptr) -{ - return msg_destport(&p_ptr->phdr); -} - /** * tipc_port_peer_msg - verify message was sent by connected port's peer * @@ -76,33 +66,32 @@ int tipc_port_peer_msg(struct tipc_port *p_ptr, struct tipc_msg *msg) u32 peernode; u32 orignode; - if (msg_origport(msg) != port_peerport(p_ptr)) + if (msg_origport(msg) != tipc_port_peerport(p_ptr)) return 0; orignode = msg_orignode(msg); - peernode = port_peernode(p_ptr); + peernode = tipc_port_peernode(p_ptr); return (orignode == peernode) || (!orignode && (peernode == tipc_own_addr)) || (!peernode && (orignode == tipc_own_addr)); } /** - * tipc_multicast - send a multicast message to local and remote destinations + * tipc_port_mcast_xmit - send a multicast message to local and remote + * destinations */ -int tipc_multicast(u32 ref, struct tipc_name_seq const *seq, - struct iovec const *msg_sect, unsigned int len) +int tipc_port_mcast_xmit(struct tipc_port *oport, + struct tipc_name_seq const *seq, + struct iovec const *msg_sect, + unsigned int len) { struct tipc_msg *hdr; struct sk_buff *buf; struct sk_buff *ibuf = NULL; struct tipc_port_list dports = {0, NULL, }; - struct tipc_port *oport = tipc_port_deref(ref); int ext_targets; int res; - if (unlikely(!oport)) - return -EINVAL; - /* Create multicast message */ hdr = &oport->phdr; msg_set_type(hdr, TIPC_MCAST_MSG); @@ -131,7 +120,7 @@ int tipc_multicast(u32 ref, struct tipc_name_seq const *seq, return -ENOMEM; } } - res = tipc_bclink_send_msg(buf); + res = tipc_bclink_xmit(buf); if ((res < 0) && (dports.count != 0)) kfree_skb(ibuf); } else { @@ -140,7 +129,7 @@ int tipc_multicast(u32 ref, struct tipc_name_seq const *seq, if (res >= 0) { if (ibuf) - tipc_port_recv_mcast(ibuf, &dports); + 
tipc_port_mcast_rcv(ibuf, &dports); } else { tipc_port_list_free(&dports); } @@ -148,11 +137,11 @@ int tipc_multicast(u32 ref, struct tipc_name_seq const *seq, } /** - * tipc_port_recv_mcast - deliver multicast message to all destination ports + * tipc_port_mcast_rcv - deliver multicast message to all destination ports * * If there is no port list, perform a lookup to create one */ -void tipc_port_recv_mcast(struct sk_buff *buf, struct tipc_port_list *dp) +void tipc_port_mcast_rcv(struct sk_buff *buf, struct tipc_port_list *dp) { struct tipc_msg *msg; struct tipc_port_list dports = {0, NULL, }; @@ -176,7 +165,7 @@ void tipc_port_recv_mcast(struct sk_buff *buf, struct tipc_port_list *dp) msg_set_destnode(msg, tipc_own_addr); if (dp->count == 1) { msg_set_destport(msg, dp->ports[0]); - tipc_port_recv_msg(buf); + tipc_sk_rcv(buf); tipc_port_list_free(dp); return; } @@ -191,7 +180,7 @@ void tipc_port_recv_mcast(struct sk_buff *buf, struct tipc_port_list *dp) if ((index == 0) && (cnt != 0)) item = item->next; msg_set_destport(buf_msg(b), item->ports[index]); - tipc_port_recv_msg(b); + tipc_sk_rcv(b); } } exit: @@ -199,40 +188,32 @@ exit: tipc_port_list_free(dp); } -/** - * tipc_createport - create a generic TIPC port + +void tipc_port_wakeup(struct tipc_port *port) +{ + tipc_sock_wakeup(tipc_port_to_sock(port)); +} + +/* tipc_port_init - intiate TIPC port and lock it * - * Returns pointer to (locked) TIPC port, or NULL if unable to create it + * Returns obtained reference if initialization is successful, zero otherwise */ -struct tipc_port *tipc_createport(struct sock *sk, - u32 (*dispatcher)(struct tipc_port *, - struct sk_buff *), - void (*wakeup)(struct tipc_port *), - const u32 importance) +u32 tipc_port_init(struct tipc_port *p_ptr, + const unsigned int importance) { - struct tipc_port *p_ptr; struct tipc_msg *msg; u32 ref; - p_ptr = kzalloc(sizeof(*p_ptr), GFP_ATOMIC); - if (!p_ptr) { - pr_warn("Port creation failed, no memory\n"); - return NULL; - } ref = tipc_ref_acquire(p_ptr, &p_ptr->lock); if (!ref) { - pr_warn("Port creation failed, ref. table exhausted\n"); - kfree(p_ptr); - return NULL; + pr_warn("Port registration failed, ref. 
table exhausted\n"); + return 0; } - p_ptr->sk = sk; p_ptr->max_pkt = MAX_PKT_DEFAULT; p_ptr->ref = ref; INIT_LIST_HEAD(&p_ptr->wait_list); INIT_LIST_HEAD(&p_ptr->subscription.nodesub_list); - p_ptr->dispatcher = dispatcher; - p_ptr->wakeup = wakeup; k_init_timer(&p_ptr->timer, (Handler)port_timeout, ref); INIT_LIST_HEAD(&p_ptr->publications); INIT_LIST_HEAD(&p_ptr->port_list); @@ -248,21 +229,18 @@ struct tipc_port *tipc_createport(struct sock *sk, msg_set_origport(msg, ref); list_add_tail(&p_ptr->port_list, &ports); spin_unlock_bh(&tipc_port_list_lock); - return p_ptr; + return ref; } -int tipc_deleteport(u32 ref) +void tipc_port_destroy(struct tipc_port *p_ptr) { - struct tipc_port *p_ptr; struct sk_buff *buf = NULL; - tipc_withdraw(ref, 0, NULL); - p_ptr = tipc_port_lock(ref); - if (!p_ptr) - return -EINVAL; + tipc_withdraw(p_ptr, 0, NULL); - tipc_ref_discard(ref); - tipc_port_unlock(p_ptr); + spin_lock_bh(p_ptr->lock); + tipc_ref_discard(p_ptr->ref); + spin_unlock_bh(p_ptr->lock); k_cancel_timer(&p_ptr->timer); if (p_ptr->connected) { @@ -275,67 +253,7 @@ int tipc_deleteport(u32 ref) list_del(&p_ptr->wait_list); spin_unlock_bh(&tipc_port_list_lock); k_term_timer(&p_ptr->timer); - kfree(p_ptr); tipc_net_route_msg(buf); - return 0; -} - -static int port_unreliable(struct tipc_port *p_ptr) -{ - return msg_src_droppable(&p_ptr->phdr); -} - -int tipc_portunreliable(u32 ref, unsigned int *isunreliable) -{ - struct tipc_port *p_ptr; - - p_ptr = tipc_port_lock(ref); - if (!p_ptr) - return -EINVAL; - *isunreliable = port_unreliable(p_ptr); - tipc_port_unlock(p_ptr); - return 0; -} - -int tipc_set_portunreliable(u32 ref, unsigned int isunreliable) -{ - struct tipc_port *p_ptr; - - p_ptr = tipc_port_lock(ref); - if (!p_ptr) - return -EINVAL; - msg_set_src_droppable(&p_ptr->phdr, (isunreliable != 0)); - tipc_port_unlock(p_ptr); - return 0; -} - -static int port_unreturnable(struct tipc_port *p_ptr) -{ - return msg_dest_droppable(&p_ptr->phdr); -} - -int tipc_portunreturnable(u32 ref, unsigned int *isunrejectable) -{ - struct tipc_port *p_ptr; - - p_ptr = tipc_port_lock(ref); - if (!p_ptr) - return -EINVAL; - *isunrejectable = port_unreturnable(p_ptr); - tipc_port_unlock(p_ptr); - return 0; -} - -int tipc_set_portunreturnable(u32 ref, unsigned int isunrejectable) -{ - struct tipc_port *p_ptr; - - p_ptr = tipc_port_lock(ref); - if (!p_ptr) - return -EINVAL; - msg_set_dest_droppable(&p_ptr->phdr, (isunrejectable != 0)); - tipc_port_unlock(p_ptr); - return 0; } /* @@ -353,8 +271,8 @@ static struct sk_buff *port_build_proto_msg(struct tipc_port *p_ptr, if (buf) { msg = buf_msg(buf); tipc_msg_init(msg, CONN_MANAGER, type, INT_H_SIZE, - port_peernode(p_ptr)); - msg_set_destport(msg, port_peerport(p_ptr)); + tipc_port_peernode(p_ptr)); + msg_set_destport(msg, tipc_port_peerport(p_ptr)); msg_set_origport(msg, p_ptr->ref); msg_set_msgcnt(msg, ack); } @@ -425,17 +343,17 @@ int tipc_reject_msg(struct sk_buff *buf, u32 err) /* send returned message & dispose of rejected message */ src_node = msg_prevnode(msg); if (in_own_node(src_node)) - tipc_port_recv_msg(rbuf); + tipc_sk_rcv(rbuf); else - tipc_link_send(rbuf, src_node, msg_link_selector(rmsg)); + tipc_link_xmit(rbuf, src_node, msg_link_selector(rmsg)); exit: kfree_skb(buf); return data_sz; } -int tipc_port_reject_sections(struct tipc_port *p_ptr, struct tipc_msg *hdr, - struct iovec const *msg_sect, unsigned int len, - int err) +int tipc_port_iovec_reject(struct tipc_port *p_ptr, struct tipc_msg *hdr, + struct iovec const *msg_sect, unsigned int len, + int 
err) { struct sk_buff *buf; int res; @@ -522,7 +440,7 @@ static struct sk_buff *port_build_peer_abort_msg(struct tipc_port *p_ptr, u32 er return buf; } -void tipc_port_recv_proto_msg(struct sk_buff *buf) +void tipc_port_proto_rcv(struct sk_buff *buf) { struct tipc_msg *msg = buf_msg(buf); struct tipc_port *p_ptr; @@ -550,13 +468,12 @@ void tipc_port_recv_proto_msg(struct sk_buff *buf) /* Process protocol message sent by peer */ switch (msg_type(msg)) { case CONN_ACK: - wakeable = tipc_port_congested(p_ptr) && p_ptr->congested && - p_ptr->wakeup; + wakeable = tipc_port_congested(p_ptr) && p_ptr->congested; p_ptr->acked += msg_msgcnt(msg); if (!tipc_port_congested(p_ptr)) { p_ptr->congested = 0; if (wakeable) - p_ptr->wakeup(p_ptr); + tipc_port_wakeup(p_ptr); } break; case CONN_PROBE: @@ -587,8 +504,8 @@ static int port_print(struct tipc_port *p_ptr, char *buf, int len, int full_id) ret = tipc_snprintf(buf, len, "%-10u:", p_ptr->ref); if (p_ptr->connected) { - u32 dport = port_peerport(p_ptr); - u32 destnode = port_peernode(p_ptr); + u32 dport = tipc_port_peerport(p_ptr); + u32 destnode = tipc_port_peernode(p_ptr); ret += tipc_snprintf(buf + ret, len - ret, " connected to <%u.%u.%u:%u>", @@ -676,75 +593,36 @@ void tipc_acknowledge(u32 ref, u32 ack) tipc_net_route_msg(buf); } -int tipc_portimportance(u32 ref, unsigned int *importance) -{ - struct tipc_port *p_ptr; - - p_ptr = tipc_port_lock(ref); - if (!p_ptr) - return -EINVAL; - *importance = (unsigned int)msg_importance(&p_ptr->phdr); - tipc_port_unlock(p_ptr); - return 0; -} - -int tipc_set_portimportance(u32 ref, unsigned int imp) -{ - struct tipc_port *p_ptr; - - if (imp > TIPC_CRITICAL_IMPORTANCE) - return -EINVAL; - - p_ptr = tipc_port_lock(ref); - if (!p_ptr) - return -EINVAL; - msg_set_importance(&p_ptr->phdr, (u32)imp); - tipc_port_unlock(p_ptr); - return 0; -} - - -int tipc_publish(u32 ref, unsigned int scope, struct tipc_name_seq const *seq) +int tipc_publish(struct tipc_port *p_ptr, unsigned int scope, + struct tipc_name_seq const *seq) { - struct tipc_port *p_ptr; struct publication *publ; u32 key; - int res = -EINVAL; - p_ptr = tipc_port_lock(ref); - if (!p_ptr) + if (p_ptr->connected) return -EINVAL; + key = p_ptr->ref + p_ptr->pub_count + 1; + if (key == p_ptr->ref) + return -EADDRINUSE; - if (p_ptr->connected) - goto exit; - key = ref + p_ptr->pub_count + 1; - if (key == ref) { - res = -EADDRINUSE; - goto exit; - } publ = tipc_nametbl_publish(seq->type, seq->lower, seq->upper, scope, p_ptr->ref, key); if (publ) { list_add(&publ->pport_list, &p_ptr->publications); p_ptr->pub_count++; p_ptr->published = 1; - res = 0; + return 0; } -exit: - tipc_port_unlock(p_ptr); - return res; + return -EINVAL; } -int tipc_withdraw(u32 ref, unsigned int scope, struct tipc_name_seq const *seq) +int tipc_withdraw(struct tipc_port *p_ptr, unsigned int scope, + struct tipc_name_seq const *seq) { - struct tipc_port *p_ptr; struct publication *publ; struct publication *tpubl; int res = -EINVAL; - p_ptr = tipc_port_lock(ref); - if (!p_ptr) - return -EINVAL; if (!seq) { list_for_each_entry_safe(publ, tpubl, &p_ptr->publications, pport_list) { @@ -771,11 +649,10 @@ int tipc_withdraw(u32 ref, unsigned int scope, struct tipc_name_seq const *seq) } if (list_empty(&p_ptr->publications)) p_ptr->published = 0; - tipc_port_unlock(p_ptr); return res; } -int tipc_connect(u32 ref, struct tipc_portid const *peer) +int tipc_port_connect(u32 ref, struct tipc_portid const *peer) { struct tipc_port *p_ptr; int res; @@ -783,17 +660,17 @@ int tipc_connect(u32 ref, 
struct tipc_portid const *peer) p_ptr = tipc_port_lock(ref); if (!p_ptr) return -EINVAL; - res = __tipc_connect(ref, p_ptr, peer); + res = __tipc_port_connect(ref, p_ptr, peer); tipc_port_unlock(p_ptr); return res; } /* - * __tipc_connect - connect to a remote peer + * __tipc_port_connect - connect to a remote peer * * Port must be locked. */ -int __tipc_connect(u32 ref, struct tipc_port *p_ptr, +int __tipc_port_connect(u32 ref, struct tipc_port *p_ptr, struct tipc_portid const *peer) { struct tipc_msg *msg; @@ -830,26 +707,23 @@ exit: * * Port must be locked. */ -int __tipc_disconnect(struct tipc_port *tp_ptr) +int __tipc_port_disconnect(struct tipc_port *tp_ptr) { - int res; - if (tp_ptr->connected) { tp_ptr->connected = 0; /* let timer expire on it's own to avoid deadlock! */ tipc_nodesub_unsubscribe(&tp_ptr->subscription); - res = 0; - } else { - res = -ENOTCONN; + return 0; } - return res; + + return -ENOTCONN; } /* - * tipc_disconnect(): Disconnect port form peer. + * tipc_port_disconnect(): Disconnect port form peer. * This is a node local operation. */ -int tipc_disconnect(u32 ref) +int tipc_port_disconnect(u32 ref) { struct tipc_port *p_ptr; int res; @@ -857,15 +731,15 @@ int tipc_disconnect(u32 ref) p_ptr = tipc_port_lock(ref); if (!p_ptr) return -EINVAL; - res = __tipc_disconnect(p_ptr); + res = __tipc_port_disconnect(p_ptr); tipc_port_unlock(p_ptr); return res; } /* - * tipc_shutdown(): Send a SHUTDOWN msg to peer and disconnect + * tipc_port_shutdown(): Send a SHUTDOWN msg to peer and disconnect */ -int tipc_shutdown(u32 ref) +int tipc_port_shutdown(u32 ref) { struct tipc_port *p_ptr; struct sk_buff *buf = NULL; @@ -877,78 +751,47 @@ int tipc_shutdown(u32 ref) buf = port_build_peer_abort_msg(p_ptr, TIPC_CONN_SHUTDOWN); tipc_port_unlock(p_ptr); tipc_net_route_msg(buf); - return tipc_disconnect(ref); -} - -/** - * tipc_port_recv_msg - receive message from lower layer and deliver to port user - */ -int tipc_port_recv_msg(struct sk_buff *buf) -{ - struct tipc_port *p_ptr; - struct tipc_msg *msg = buf_msg(buf); - u32 destport = msg_destport(msg); - u32 dsz = msg_data_sz(msg); - u32 err; - - /* forward unresolved named message */ - if (unlikely(!destport)) { - tipc_net_route_msg(buf); - return dsz; - } - - /* validate destination & pass to port, otherwise reject message */ - p_ptr = tipc_port_lock(destport); - if (likely(p_ptr)) { - err = p_ptr->dispatcher(p_ptr, buf); - tipc_port_unlock(p_ptr); - if (likely(!err)) - return dsz; - } else { - err = TIPC_ERR_NO_PORT; - } - - return tipc_reject_msg(buf, err); + return tipc_port_disconnect(ref); } /* - * tipc_port_recv_sections(): Concatenate and deliver sectioned - * message for this node. + * tipc_port_iovec_rcv: Concatenate and deliver sectioned + * message for this node. 
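The renamed connection calls keep their by-reference interface: tipc_port_connect() takes and releases the port lock around __tipc_port_connect(), and tipc_port_shutdown() sends a SHUTDOWN abort message to the peer before disconnecting locally. A caller-side sketch, assuming an already created port reference; the peer identity below is a placeholder, not taken from this patch.

/* Illustrative connection lifecycle against the renamed port API. */
static int port_connect_shutdown_sketch(u32 ref)
{
	struct tipc_portid peer = {
		.ref  = 42,		/* hypothetical peer port reference */
		.node = tipc_own_addr,	/* hypothetical peer node address   */
	};
	int err;

	err = tipc_port_connect(ref, &peer);	/* locks, connects, unlocks */
	if (err)
		return err;

	/* ...exchange payload via tipc_send() on the connected port... */

	/* sends SHUTDOWN to the peer, then disconnects the local port */
	return tipc_port_shutdown(ref);
}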
*/ -static int tipc_port_recv_sections(struct tipc_port *sender, - struct iovec const *msg_sect, - unsigned int len) +static int tipc_port_iovec_rcv(struct tipc_port *sender, + struct iovec const *msg_sect, + unsigned int len) { struct sk_buff *buf; int res; res = tipc_msg_build(&sender->phdr, msg_sect, len, MAX_MSG_SIZE, &buf); if (likely(buf)) - tipc_port_recv_msg(buf); + tipc_sk_rcv(buf); return res; } /** * tipc_send - send message sections on connection */ -int tipc_send(u32 ref, struct iovec const *msg_sect, unsigned int len) +int tipc_send(struct tipc_port *p_ptr, + struct iovec const *msg_sect, + unsigned int len) { - struct tipc_port *p_ptr; u32 destnode; int res; - p_ptr = tipc_port_deref(ref); - if (!p_ptr || !p_ptr->connected) + if (!p_ptr->connected) return -EINVAL; p_ptr->congested = 1; if (!tipc_port_congested(p_ptr)) { - destnode = port_peernode(p_ptr); + destnode = tipc_port_peernode(p_ptr); if (likely(!in_own_node(destnode))) - res = tipc_link_send_sections_fast(p_ptr, msg_sect, - len, destnode); + res = tipc_link_iovec_xmit_fast(p_ptr, msg_sect, len, + destnode); else - res = tipc_port_recv_sections(p_ptr, msg_sect, len); + res = tipc_port_iovec_rcv(p_ptr, msg_sect, len); if (likely(res != -ELINKCONG)) { p_ptr->congested = 0; @@ -957,7 +800,7 @@ int tipc_send(u32 ref, struct iovec const *msg_sect, unsigned int len) return res; } } - if (port_unreliable(p_ptr)) { + if (tipc_port_unreliable(p_ptr)) { p_ptr->congested = 0; return len; } @@ -967,17 +810,18 @@ int tipc_send(u32 ref, struct iovec const *msg_sect, unsigned int len) /** * tipc_send2name - send message sections to port name */ -int tipc_send2name(u32 ref, struct tipc_name const *name, unsigned int domain, - struct iovec const *msg_sect, unsigned int len) +int tipc_send2name(struct tipc_port *p_ptr, + struct tipc_name const *name, + unsigned int domain, + struct iovec const *msg_sect, + unsigned int len) { - struct tipc_port *p_ptr; struct tipc_msg *msg; u32 destnode = domain; u32 destport; int res; - p_ptr = tipc_port_deref(ref); - if (!p_ptr || p_ptr->connected) + if (p_ptr->connected) return -EINVAL; msg = &p_ptr->phdr; @@ -992,39 +836,39 @@ int tipc_send2name(u32 ref, struct tipc_name const *name, unsigned int domain, if (likely(destport || destnode)) { if (likely(in_own_node(destnode))) - res = tipc_port_recv_sections(p_ptr, msg_sect, len); + res = tipc_port_iovec_rcv(p_ptr, msg_sect, len); else if (tipc_own_addr) - res = tipc_link_send_sections_fast(p_ptr, msg_sect, - len, destnode); + res = tipc_link_iovec_xmit_fast(p_ptr, msg_sect, len, + destnode); else - res = tipc_port_reject_sections(p_ptr, msg, msg_sect, - len, TIPC_ERR_NO_NODE); + res = tipc_port_iovec_reject(p_ptr, msg, msg_sect, + len, TIPC_ERR_NO_NODE); if (likely(res != -ELINKCONG)) { if (res > 0) p_ptr->sent++; return res; } - if (port_unreliable(p_ptr)) { + if (tipc_port_unreliable(p_ptr)) return len; - } + return -ELINKCONG; } - return tipc_port_reject_sections(p_ptr, msg, msg_sect, len, - TIPC_ERR_NO_NAME); + return tipc_port_iovec_reject(p_ptr, msg, msg_sect, len, + TIPC_ERR_NO_NAME); } /** * tipc_send2port - send message sections to port identity */ -int tipc_send2port(u32 ref, struct tipc_portid const *dest, - struct iovec const *msg_sect, unsigned int len) +int tipc_send2port(struct tipc_port *p_ptr, + struct tipc_portid const *dest, + struct iovec const *msg_sect, + unsigned int len) { - struct tipc_port *p_ptr; struct tipc_msg *msg; int res; - p_ptr = tipc_port_deref(ref); - if (!p_ptr || p_ptr->connected) + if (p_ptr->connected) return 
-EINVAL; msg = &p_ptr->phdr; @@ -1035,20 +879,20 @@ int tipc_send2port(u32 ref, struct tipc_portid const *dest, msg_set_hdr_sz(msg, BASIC_H_SIZE); if (in_own_node(dest->node)) - res = tipc_port_recv_sections(p_ptr, msg_sect, len); + res = tipc_port_iovec_rcv(p_ptr, msg_sect, len); else if (tipc_own_addr) - res = tipc_link_send_sections_fast(p_ptr, msg_sect, len, - dest->node); + res = tipc_link_iovec_xmit_fast(p_ptr, msg_sect, len, + dest->node); else - res = tipc_port_reject_sections(p_ptr, msg, msg_sect, len, + res = tipc_port_iovec_reject(p_ptr, msg, msg_sect, len, TIPC_ERR_NO_NODE); if (likely(res != -ELINKCONG)) { if (res > 0) p_ptr->sent++; return res; } - if (port_unreliable(p_ptr)) { + if (tipc_port_unreliable(p_ptr)) return len; - } + return -ELINKCONG; } diff --git a/net/tipc/port.h b/net/tipc/port.h index 91225359734..cf4ca5b1d9a 100644 --- a/net/tipc/port.h +++ b/net/tipc/port.h @@ -1,7 +1,7 @@ /* * net/tipc/port.h: Include file for TIPC port code * - * Copyright (c) 1994-2007, Ericsson AB + * Copyright (c) 1994-2007, 2014, Ericsson AB * Copyright (c) 2004-2007, 2010-2013, Wind River Systems * All rights reserved. * @@ -42,13 +42,13 @@ #include "msg.h" #include "node_subscr.h" -#define TIPC_FLOW_CONTROL_WIN 512 -#define CONN_OVERLOAD_LIMIT ((TIPC_FLOW_CONTROL_WIN * 2 + 1) * \ - SKB_TRUESIZE(TIPC_MAX_USER_MSG_SIZE)) +#define TIPC_CONNACK_INTV 256 +#define TIPC_FLOWCTRL_WIN (TIPC_CONNACK_INTV * 2) +#define TIPC_CONN_OVERLOAD_LIMIT ((TIPC_FLOWCTRL_WIN * 2 + 1) * \ + SKB_TRUESIZE(TIPC_MAX_USER_MSG_SIZE)) /** * struct tipc_port - TIPC port structure - * @sk: pointer to socket handle * @lock: pointer to spinlock for controlling access to port * @connected: non-zero if port is currently connected to a peer port * @conn_type: TIPC type used when connection was established @@ -60,8 +60,6 @@ * @ref: unique reference to port in TIPC object registry * @phdr: preformatted message header used when sending messages * @port_list: adjacent ports in TIPC's global list of ports - * @dispatcher: ptr to routine which handles received messages - * @wakeup: ptr to routine to call when port is no longer congested * @wait_list: adjacent ports in list of ports waiting on link congestion * @waiting_pkts: * @sent: # of non-empty messages sent by port @@ -74,7 +72,6 @@ * @subscription: "node down" subscription used to terminate failed connections */ struct tipc_port { - struct sock *sk; spinlock_t *lock; int connected; u32 conn_type; @@ -86,8 +83,6 @@ struct tipc_port { u32 ref; struct tipc_msg phdr; struct list_head port_list; - u32 (*dispatcher)(struct tipc_port *, struct sk_buff *); - void (*wakeup)(struct tipc_port *); struct list_head wait_list; u32 waiting_pkts; u32 sent; @@ -106,68 +101,70 @@ struct tipc_port_list; /* * TIPC port manipulation routines */ -struct tipc_port *tipc_createport(struct sock *sk, - u32 (*dispatcher)(struct tipc_port *, - struct sk_buff *), - void (*wakeup)(struct tipc_port *), - const u32 importance); +u32 tipc_port_init(struct tipc_port *p_ptr, + const unsigned int importance); int tipc_reject_msg(struct sk_buff *buf, u32 err); void tipc_acknowledge(u32 port_ref, u32 ack); -int tipc_deleteport(u32 portref); +void tipc_port_destroy(struct tipc_port *p_ptr); -int tipc_portimportance(u32 portref, unsigned int *importance); -int tipc_set_portimportance(u32 portref, unsigned int importance); - -int tipc_portunreliable(u32 portref, unsigned int *isunreliable); -int tipc_set_portunreliable(u32 portref, unsigned int isunreliable); - -int tipc_portunreturnable(u32 portref, unsigned 
int *isunreturnable); -int tipc_set_portunreturnable(u32 portref, unsigned int isunreturnable); - -int tipc_publish(u32 portref, unsigned int scope, +int tipc_publish(struct tipc_port *p_ptr, unsigned int scope, struct tipc_name_seq const *name_seq); -int tipc_withdraw(u32 portref, unsigned int scope, + +int tipc_withdraw(struct tipc_port *p_ptr, unsigned int scope, struct tipc_name_seq const *name_seq); -int tipc_connect(u32 portref, struct tipc_portid const *port); +int tipc_port_connect(u32 portref, struct tipc_portid const *port); -int tipc_disconnect(u32 portref); +int tipc_port_disconnect(u32 portref); -int tipc_shutdown(u32 ref); +int tipc_port_shutdown(u32 ref); +void tipc_port_wakeup(struct tipc_port *port); /* * The following routines require that the port be locked on entry */ -int __tipc_disconnect(struct tipc_port *tp_ptr); -int __tipc_connect(u32 ref, struct tipc_port *p_ptr, +int __tipc_port_disconnect(struct tipc_port *tp_ptr); +int __tipc_port_connect(u32 ref, struct tipc_port *p_ptr, struct tipc_portid const *peer); int tipc_port_peer_msg(struct tipc_port *p_ptr, struct tipc_msg *msg); /* * TIPC messaging routines */ -int tipc_port_recv_msg(struct sk_buff *buf); -int tipc_send(u32 portref, struct iovec const *msg_sect, unsigned int len); -int tipc_send2name(u32 portref, struct tipc_name const *name, u32 domain, - struct iovec const *msg_sect, unsigned int len); +int tipc_send(struct tipc_port *port, + struct iovec const *msg_sect, + unsigned int len); -int tipc_send2port(u32 portref, struct tipc_portid const *dest, - struct iovec const *msg_sect, unsigned int len); +int tipc_send2name(struct tipc_port *port, + struct tipc_name const *name, + u32 domain, + struct iovec const *msg_sect, + unsigned int len); -int tipc_multicast(u32 portref, struct tipc_name_seq const *seq, - struct iovec const *msg, unsigned int len); +int tipc_send2port(struct tipc_port *port, + struct tipc_portid const *dest, + struct iovec const *msg_sect, + unsigned int len); + +int tipc_port_mcast_xmit(struct tipc_port *port, + struct tipc_name_seq const *seq, + struct iovec const *msg, + unsigned int len); + +int tipc_port_iovec_reject(struct tipc_port *p_ptr, + struct tipc_msg *hdr, + struct iovec const *msg_sect, + unsigned int len, + int err); -int tipc_port_reject_sections(struct tipc_port *p_ptr, struct tipc_msg *hdr, - struct iovec const *msg_sect, unsigned int len, - int err); struct sk_buff *tipc_port_get_ports(void); -void tipc_port_recv_proto_msg(struct sk_buff *buf); -void tipc_port_recv_mcast(struct sk_buff *buf, struct tipc_port_list *dp); +void tipc_port_proto_rcv(struct sk_buff *buf); +void tipc_port_mcast_rcv(struct sk_buff *buf, struct tipc_port_list *dp); void tipc_port_reinit(void); /** @@ -188,14 +185,53 @@ static inline void tipc_port_unlock(struct tipc_port *p_ptr) spin_unlock_bh(p_ptr->lock); } -static inline struct tipc_port *tipc_port_deref(u32 ref) +static inline int tipc_port_congested(struct tipc_port *p_ptr) { - return (struct tipc_port *)tipc_ref_deref(ref); + return ((p_ptr->sent - p_ptr->acked) >= TIPC_FLOWCTRL_WIN); } -static inline int tipc_port_congested(struct tipc_port *p_ptr) + +static inline u32 tipc_port_peernode(struct tipc_port *p_ptr) +{ + return msg_destnode(&p_ptr->phdr); +} + +static inline u32 tipc_port_peerport(struct tipc_port *p_ptr) +{ + return msg_destport(&p_ptr->phdr); +} + +static inline bool tipc_port_unreliable(struct tipc_port *port) +{ + return msg_src_droppable(&port->phdr) != 0; +} + +static inline void tipc_port_set_unreliable(struct 
tipc_port *port, + bool unreliable) +{ + msg_set_src_droppable(&port->phdr, unreliable ? 1 : 0); +} + +static inline bool tipc_port_unreturnable(struct tipc_port *port) +{ + return msg_dest_droppable(&port->phdr) != 0; +} + +static inline void tipc_port_set_unreturnable(struct tipc_port *port, + bool unreturnable) +{ + msg_set_dest_droppable(&port->phdr, unreturnable ? 1 : 0); +} + + +static inline int tipc_port_importance(struct tipc_port *port) +{ + return msg_importance(&port->phdr); +} + +static inline void tipc_port_set_importance(struct tipc_port *port, int imp) { - return (p_ptr->sent - p_ptr->acked) >= (TIPC_FLOW_CONTROL_WIN * 2); + msg_set_importance(&port->phdr, (u32)imp); } #endif diff --git a/net/tipc/ref.c b/net/tipc/ref.c index 2a2a938dc22..3d4ecd754ee 100644 --- a/net/tipc/ref.c +++ b/net/tipc/ref.c @@ -89,7 +89,7 @@ struct ref_table { static struct ref_table tipc_ref_table; -static DEFINE_RWLOCK(ref_table_lock); +static DEFINE_SPINLOCK(ref_table_lock); /** * tipc_ref_table_init - create reference table for objects @@ -126,9 +126,6 @@ int tipc_ref_table_init(u32 requested_size, u32 start) */ void tipc_ref_table_stop(void) { - if (!tipc_ref_table.entries) - return; - vfree(tipc_ref_table.entries); tipc_ref_table.entries = NULL; } @@ -162,7 +159,7 @@ u32 tipc_ref_acquire(void *object, spinlock_t **lock) } /* take a free entry, if available; otherwise initialize a new entry */ - write_lock_bh(&ref_table_lock); + spin_lock_bh(&ref_table_lock); if (tipc_ref_table.first_free) { index = tipc_ref_table.first_free; entry = &(tipc_ref_table.entries[index]); @@ -178,7 +175,7 @@ u32 tipc_ref_acquire(void *object, spinlock_t **lock) } else { ref = 0; } - write_unlock_bh(&ref_table_lock); + spin_unlock_bh(&ref_table_lock); /* * Grab the lock so no one else can modify this entry @@ -219,7 +216,7 @@ void tipc_ref_discard(u32 ref) index = ref & index_mask; entry = &(tipc_ref_table.entries[index]); - write_lock_bh(&ref_table_lock); + spin_lock_bh(&ref_table_lock); if (!entry->object) { pr_err("Attempt to discard ref. 
to non-existent obj\n"); @@ -245,7 +242,7 @@ void tipc_ref_discard(u32 ref) tipc_ref_table.last_free = index; exit: - write_unlock_bh(&ref_table_lock); + spin_unlock_bh(&ref_table_lock); } /** @@ -267,20 +264,3 @@ void *tipc_ref_lock(u32 ref) } return NULL; } - - -/** - * tipc_ref_deref - return pointer referenced object (without locking it) - */ -void *tipc_ref_deref(u32 ref) -{ - if (likely(tipc_ref_table.entries)) { - struct reference *entry; - - entry = &tipc_ref_table.entries[ref & - tipc_ref_table.index_mask]; - if (likely(entry->ref == ref)) - return entry->object; - } - return NULL; -} diff --git a/net/tipc/ref.h b/net/tipc/ref.h index 5bc8e7ab84d..d01aa1df63b 100644 --- a/net/tipc/ref.h +++ b/net/tipc/ref.h @@ -44,6 +44,5 @@ u32 tipc_ref_acquire(void *object, spinlock_t **lock); void tipc_ref_discard(u32 ref); void *tipc_ref_lock(u32 ref); -void *tipc_ref_deref(u32 ref); #endif diff --git a/net/tipc/server.c b/net/tipc/server.c index fd3fa57a410..a538a02f869 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -55,7 +55,7 @@ * @usr_data: user-specified field * @rx_action: what to do when connection socket is active * @outqueue: pointer to first outbound message in queue - * @outqueue_lock: controll access to the outqueue + * @outqueue_lock: control access to the outqueue * @outqueue: list of connection objects for its server * @swork: send work item */ @@ -87,7 +87,6 @@ static void tipc_clean_outqueues(struct tipc_conn *con); static void tipc_conn_kref_release(struct kref *kref) { struct tipc_conn *con = container_of(kref, struct tipc_conn, kref); - struct tipc_server *s = con->server; if (con->sock) { tipc_sock_release_local(con->sock); @@ -95,10 +94,6 @@ static void tipc_conn_kref_release(struct kref *kref) } tipc_clean_outqueues(con); - - if (con->conid) - s->tipc_conn_shutdown(con->conid, con->usr_data); - kfree(con); } @@ -124,7 +119,7 @@ static struct tipc_conn *tipc_conn_lookup(struct tipc_server *s, int conid) return con; } -static void sock_data_ready(struct sock *sk, int unused) +static void sock_data_ready(struct sock *sk) { struct tipc_conn *con; @@ -181,6 +176,9 @@ static void tipc_close_conn(struct tipc_conn *con) struct tipc_server *s = con->server; if (test_and_clear_bit(CF_CONNECTED, &con->flags)) { + if (con->conid) + s->tipc_conn_shutdown(con->conid, con->usr_data); + spin_lock_bh(&s->idr_lock); idr_remove(&s->conn_idr, con->conid); s->idr_in_use--; @@ -299,7 +297,7 @@ static int tipc_accept_from_sock(struct tipc_conn *con) newcon->usr_data = s->tipc_conn_new(newcon->conid); /* Wake up receive process in case of 'SYN+' message */ - newsock->sk->sk_data_ready(newsock->sk, 0); + newsock->sk->sk_data_ready(newsock->sk); return ret; } @@ -429,10 +427,12 @@ int tipc_conn_sendmsg(struct tipc_server *s, int conid, list_add_tail(&e->list, &con->outqueue); spin_unlock_bh(&con->outqueue_lock); - if (test_bit(CF_CONNECTED, &con->flags)) + if (test_bit(CF_CONNECTED, &con->flags)) { if (!queue_work(s->send_wq, &con->swork)) conn_put(con); - + } else { + conn_put(con); + } return 0; } @@ -573,7 +573,6 @@ int tipc_server_start(struct tipc_server *s) kmem_cache_destroy(s->rcvbuf_cache); return ret; } - s->enabled = 1; return ret; } @@ -583,10 +582,6 @@ void tipc_server_stop(struct tipc_server *s) int total = 0; int id; - if (!s->enabled) - return; - - s->enabled = 0; spin_lock_bh(&s->idr_lock); for (id = 0; total < s->idr_in_use; id++) { con = idr_find(&s->conn_idr, id); diff --git a/net/tipc/server.h b/net/tipc/server.h index 98b23f20bc0..be817b0b547 100644 --- 
a/net/tipc/server.h +++ b/net/tipc/server.h @@ -56,7 +56,6 @@ * @name: server name * @imp: message importance * @type: socket type - * @enabled: identify whether server is launched or not */ struct tipc_server { struct idr conn_idr; @@ -74,7 +73,6 @@ struct tipc_server { const char name[TIPC_SERVER_NAME_LEN]; int imp; int type; - int enabled; }; int tipc_conn_sendmsg(struct tipc_server *s, int conid, diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3b61851bb92..ef0475568f9 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1,7 +1,7 @@ /* * net/tipc/socket.c: TIPC socket API * - * Copyright (c) 2001-2007, 2012 Ericsson AB + * Copyright (c) 2001-2007, 2012-2014, Ericsson AB * Copyright (c) 2004-2008, 2010-2013, Wind River Systems * All rights reserved. * @@ -36,35 +36,20 @@ #include "core.h" #include "port.h" +#include "node.h" #include <linux/export.h> -#include <net/sock.h> #define SS_LISTENING -1 /* socket is listening */ #define SS_READY -2 /* socket is connectionless */ #define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ -struct tipc_sock { - struct sock sk; - struct tipc_port *p; - struct tipc_portid peer_name; - unsigned int conn_timeout; -}; - -#define tipc_sk(sk) ((struct tipc_sock *)(sk)) -#define tipc_sk_port(sk) (tipc_sk(sk)->p) - -#define tipc_rx_ready(sock) (!skb_queue_empty(&sock->sk->sk_receive_queue) || \ - (sock->state == SS_DISCONNECTING)) - -static int backlog_rcv(struct sock *sk, struct sk_buff *skb); -static u32 dispatch(struct tipc_port *tport, struct sk_buff *buf); -static void wakeupdispatch(struct tipc_port *tport); -static void tipc_data_ready(struct sock *sk, int len); +static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb); +static void tipc_data_ready(struct sock *sk); static void tipc_write_space(struct sock *sk); -static int release(struct socket *sock); -static int accept(struct socket *sock, struct socket *new_sock, int flags); +static int tipc_release(struct socket *sock); +static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags); static const struct proto_ops packet_ops; static const struct proto_ops stream_ops; @@ -73,8 +58,6 @@ static const struct proto_ops msg_ops; static struct proto tipc_proto; static struct proto tipc_proto_kern; -static int sockets_enabled; - /* * Revised TIPC socket locking policy: * @@ -120,6 +103,8 @@ static int sockets_enabled; * - port reference */ +#include "socket.h" + /** * advance_rx_queue - discard first buffer in socket receive queue * @@ -155,13 +140,15 @@ static void reject_rx_queue(struct sock *sk) * * Returns 0 on success, errno otherwise */ -static int tipc_sk_create(struct net *net, struct socket *sock, int protocol, - int kern) +static int tipc_sk_create(struct net *net, struct socket *sock, + int protocol, int kern) { const struct proto_ops *ops; socket_state state; struct sock *sk; - struct tipc_port *tp_ptr; + struct tipc_sock *tsk; + struct tipc_port *port; + u32 ref; /* Validate arguments */ if (unlikely(protocol != 0)) @@ -194,10 +181,12 @@ static int tipc_sk_create(struct net *net, struct socket *sock, int protocol, if (sk == NULL) return -ENOMEM; - /* Allocate TIPC port for socket to use */ - tp_ptr = tipc_createport(sk, &dispatch, &wakeupdispatch, - TIPC_LOW_IMPORTANCE); - if (unlikely(!tp_ptr)) { + tsk = tipc_sk(sk); + port = &tsk->port; + + ref = tipc_port_init(port, TIPC_LOW_IMPORTANCE); + if (!ref) { + pr_warn("Socket registration failed, ref. 
table exhausted\n"); sk_free(sk); return -ENOMEM; } @@ -207,21 +196,19 @@ static int tipc_sk_create(struct net *net, struct socket *sock, int protocol, sock->state = state; sock_init_data(sock, sk); - sk->sk_backlog_rcv = backlog_rcv; + sk->sk_backlog_rcv = tipc_backlog_rcv; sk->sk_rcvbuf = sysctl_tipc_rmem[1]; sk->sk_data_ready = tipc_data_ready; sk->sk_write_space = tipc_write_space; - tipc_sk(sk)->p = tp_ptr; - tipc_sk(sk)->conn_timeout = CONN_TIMEOUT_DEFAULT; - - spin_unlock_bh(tp_ptr->lock); + tsk->conn_timeout = CONN_TIMEOUT_DEFAULT; + atomic_set(&tsk->dupl_rcvcnt, 0); + tipc_port_unlock(port); if (sock->state == SS_READY) { - tipc_set_portunreturnable(tp_ptr->ref, 1); + tipc_port_set_unreturnable(port, true); if (sock->type == SOCK_DGRAM) - tipc_set_portunreliable(tp_ptr->ref, 1); + tipc_port_set_unreliable(port, true); } - return 0; } @@ -239,7 +226,6 @@ static int tipc_sk_create(struct net *net, struct socket *sock, int protocol, int tipc_sock_create_local(int type, struct socket **res) { int rc; - struct sock *sk; rc = sock_create_lite(AF_TIPC, type, 0, res); if (rc < 0) { @@ -248,8 +234,6 @@ int tipc_sock_create_local(int type, struct socket **res) } tipc_sk_create(&init_net, *res, 0, 1); - sk = (*res)->sk; - return 0; } @@ -262,7 +246,7 @@ int tipc_sock_create_local(int type, struct socket **res) */ void tipc_sock_release_local(struct socket *sock) { - release(sock); + tipc_release(sock); sock->ops = NULL; sock_release(sock); } @@ -288,7 +272,7 @@ int tipc_sock_accept_local(struct socket *sock, struct socket **newsock, if (ret < 0) return ret; - ret = accept(sock, *newsock, flags); + ret = tipc_accept(sock, *newsock, flags); if (ret < 0) { sock_release(*newsock); return ret; @@ -298,7 +282,7 @@ int tipc_sock_accept_local(struct socket *sock, struct socket **newsock, } /** - * release - destroy a TIPC socket + * tipc_release - destroy a TIPC socket * @sock: socket to destroy * * This routine cleans up any messages that are still queued on the socket. @@ -313,12 +297,12 @@ int tipc_sock_accept_local(struct socket *sock, struct socket **newsock, * * Returns 0 on success, errno otherwise */ -static int release(struct socket *sock) +static int tipc_release(struct socket *sock) { struct sock *sk = sock->sk; - struct tipc_port *tport; + struct tipc_sock *tsk; + struct tipc_port *port; struct sk_buff *buf; - int res; /* * Exit if socket isn't fully initialized (occurs when a failed accept() @@ -327,7 +311,8 @@ static int release(struct socket *sock) if (sk == NULL) return 0; - tport = tipc_sk_port(sk); + tsk = tipc_sk(sk); + port = &tsk->port; lock_sock(sk); /* @@ -344,17 +329,16 @@ static int release(struct socket *sock) if ((sock->state == SS_CONNECTING) || (sock->state == SS_CONNECTED)) { sock->state = SS_DISCONNECTING; - tipc_disconnect(tport->ref); + tipc_port_disconnect(port->ref); } tipc_reject_msg(buf, TIPC_ERR_NO_PORT); } } - /* - * Delete TIPC port; this ensures no more messages are queued - * (also disconnects an active connection & sends a 'FIN-' to peer) + /* Destroy TIPC port; also disconnects an active connection and + * sends a 'FIN-' to peer. 
*/ - res = tipc_deleteport(tport->ref); + tipc_port_destroy(port); /* Discard any remaining (connection-based) messages in receive queue */ __skb_queue_purge(&sk->sk_receive_queue); @@ -366,11 +350,11 @@ static int release(struct socket *sock) sock_put(sk); sock->sk = NULL; - return res; + return 0; } /** - * bind - associate or disassocate TIPC name(s) with a socket + * tipc_bind - associate or disassocate TIPC name(s) with a socket * @sock: socket structure * @uaddr: socket address describing name(s) and desired operation * @uaddr_len: size of socket address data structure @@ -384,36 +368,53 @@ static int release(struct socket *sock) * NOTE: This routine doesn't need to take the socket lock since it doesn't * access any non-constant socket information. */ -static int bind(struct socket *sock, struct sockaddr *uaddr, int uaddr_len) +static int tipc_bind(struct socket *sock, struct sockaddr *uaddr, + int uaddr_len) { + struct sock *sk = sock->sk; struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr; - u32 portref = tipc_sk_port(sock->sk)->ref; + struct tipc_sock *tsk = tipc_sk(sk); + int res = -EINVAL; - if (unlikely(!uaddr_len)) - return tipc_withdraw(portref, 0, NULL); + lock_sock(sk); + if (unlikely(!uaddr_len)) { + res = tipc_withdraw(&tsk->port, 0, NULL); + goto exit; + } - if (uaddr_len < sizeof(struct sockaddr_tipc)) - return -EINVAL; - if (addr->family != AF_TIPC) - return -EAFNOSUPPORT; + if (uaddr_len < sizeof(struct sockaddr_tipc)) { + res = -EINVAL; + goto exit; + } + if (addr->family != AF_TIPC) { + res = -EAFNOSUPPORT; + goto exit; + } if (addr->addrtype == TIPC_ADDR_NAME) addr->addr.nameseq.upper = addr->addr.nameseq.lower; - else if (addr->addrtype != TIPC_ADDR_NAMESEQ) - return -EAFNOSUPPORT; + else if (addr->addrtype != TIPC_ADDR_NAMESEQ) { + res = -EAFNOSUPPORT; + goto exit; + } if ((addr->addr.nameseq.type < TIPC_RESERVED_TYPES) && (addr->addr.nameseq.type != TIPC_TOP_SRV) && - (addr->addr.nameseq.type != TIPC_CFG_SRV)) - return -EACCES; + (addr->addr.nameseq.type != TIPC_CFG_SRV)) { + res = -EACCES; + goto exit; + } - return (addr->scope > 0) ? - tipc_publish(portref, addr->scope, &addr->addr.nameseq) : - tipc_withdraw(portref, -addr->scope, &addr->addr.nameseq); + res = (addr->scope > 0) ? + tipc_publish(&tsk->port, addr->scope, &addr->addr.nameseq) : + tipc_withdraw(&tsk->port, -addr->scope, &addr->addr.nameseq); +exit: + release_sock(sk); + return res; } /** - * get_name - get port ID of socket or peer socket + * tipc_getname - get port ID of socket or peer socket * @sock: socket structure * @uaddr: area for returned socket address * @uaddr_len: area for returned length of socket address @@ -425,21 +426,21 @@ static int bind(struct socket *sock, struct sockaddr *uaddr, int uaddr_len) * accesses socket information that is unchanging (or which changes in * a completely predictable manner). 
*/ -static int get_name(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) +static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) { struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr; - struct tipc_sock *tsock = tipc_sk(sock->sk); + struct tipc_sock *tsk = tipc_sk(sock->sk); memset(addr, 0, sizeof(*addr)); if (peer) { if ((sock->state != SS_CONNECTED) && ((peer != 2) || (sock->state != SS_DISCONNECTING))) return -ENOTCONN; - addr->addr.id.ref = tsock->peer_name.ref; - addr->addr.id.node = tsock->peer_name.node; + addr->addr.id.ref = tipc_port_peerport(&tsk->port); + addr->addr.id.node = tipc_port_peernode(&tsk->port); } else { - addr->addr.id.ref = tsock->p->ref; + addr->addr.id.ref = tsk->port.ref; addr->addr.id.node = tipc_own_addr; } @@ -453,7 +454,7 @@ static int get_name(struct socket *sock, struct sockaddr *uaddr, } /** - * poll - read and possibly block on pollmask + * tipc_poll - read and possibly block on pollmask * @file: file structure associated with the socket * @sock: socket for which to calculate the poll bits * @wait: ??? @@ -492,22 +493,23 @@ static int get_name(struct socket *sock, struct sockaddr *uaddr, * imply that the operation will succeed, merely that it should be performed * and will not block. */ -static unsigned int poll(struct file *file, struct socket *sock, - poll_table *wait) +static unsigned int tipc_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; + struct tipc_sock *tsk = tipc_sk(sk); u32 mask = 0; sock_poll_wait(file, sk_sleep(sk), wait); switch ((int)sock->state) { case SS_UNCONNECTED: - if (!tipc_sk_port(sk)->congested) + if (!tsk->port.congested) mask |= POLLOUT; break; case SS_READY: case SS_CONNECTED: - if (!tipc_sk_port(sk)->congested) + if (!tsk->port.congested) mask |= POLLOUT; /* fall thru' */ case SS_CONNECTING: @@ -554,8 +556,34 @@ static int dest_name_check(struct sockaddr_tipc *dest, struct msghdr *m) return 0; } +static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p) +{ + struct sock *sk = sock->sk; + struct tipc_sock *tsk = tipc_sk(sk); + DEFINE_WAIT(wait); + int done; + + do { + int err = sock_error(sk); + if (err) + return err; + if (sock->state == SS_DISCONNECTING) + return -EPIPE; + if (!*timeo_p) + return -EAGAIN; + if (signal_pending(current)) + return sock_intr_errno(*timeo_p); + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + done = sk_wait_event(sk, timeo_p, !tsk->port.congested); + finish_wait(sk_sleep(sk), &wait); + } while (!done); + return 0; +} + + /** - * send_msg - send message in connectionless manner + * tipc_sendmsg - send message in connectionless manner * @iocb: if NULL, indicates that socket lock is already held * @sock: socket structure * @m: message to send @@ -568,14 +596,15 @@ static int dest_name_check(struct sockaddr_tipc *dest, struct msghdr *m) * * Returns the number of bytes sent on success, or errno otherwise */ -static int send_msg(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t total_len) +static int tipc_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t total_len) { struct sock *sk = sock->sk; - struct tipc_port *tport = tipc_sk_port(sk); - struct sockaddr_tipc *dest = (struct sockaddr_tipc *)m->msg_name; + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_port *port = &tsk->port; + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); int needs_conn; - long timeout_val; + long timeo; int res = -EINVAL; if 
(unlikely(!dest)) @@ -599,33 +628,32 @@ static int send_msg(struct kiocb *iocb, struct socket *sock, res = -EISCONN; goto exit; } - if (tport->published) { + if (tsk->port.published) { res = -EOPNOTSUPP; goto exit; } if (dest->addrtype == TIPC_ADDR_NAME) { - tport->conn_type = dest->addr.name.name.type; - tport->conn_instance = dest->addr.name.name.instance; + tsk->port.conn_type = dest->addr.name.name.type; + tsk->port.conn_instance = dest->addr.name.name.instance; } /* Abort any pending connection attempts (very unlikely) */ reject_rx_queue(sk); } - timeout_val = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); - + timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); do { if (dest->addrtype == TIPC_ADDR_NAME) { res = dest_name_check(dest, m); if (res) break; - res = tipc_send2name(tport->ref, + res = tipc_send2name(port, &dest->addr.name.name, dest->addr.name.domain, m->msg_iov, total_len); } else if (dest->addrtype == TIPC_ADDR_ID) { - res = tipc_send2port(tport->ref, + res = tipc_send2port(port, &dest->addr.id, m->msg_iov, total_len); @@ -637,24 +665,19 @@ static int send_msg(struct kiocb *iocb, struct socket *sock, res = dest_name_check(dest, m); if (res) break; - res = tipc_multicast(tport->ref, - &dest->addr.nameseq, - m->msg_iov, - total_len); + res = tipc_port_mcast_xmit(port, + &dest->addr.nameseq, + m->msg_iov, + total_len); } if (likely(res != -ELINKCONG)) { if (needs_conn && (res >= 0)) sock->state = SS_CONNECTING; break; } - if (timeout_val <= 0L) { - res = timeout_val ? timeout_val : -EWOULDBLOCK; + res = tipc_wait_for_sndmsg(sock, &timeo); + if (res) break; - } - release_sock(sk); - timeout_val = wait_event_interruptible_timeout(*sk_sleep(sk), - !tport->congested, timeout_val); - lock_sock(sk); } while (1); exit: @@ -663,8 +686,37 @@ exit: return res; } +static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p) +{ + struct sock *sk = sock->sk; + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_port *port = &tsk->port; + DEFINE_WAIT(wait); + int done; + + do { + int err = sock_error(sk); + if (err) + return err; + if (sock->state == SS_DISCONNECTING) + return -EPIPE; + else if (sock->state != SS_CONNECTED) + return -ENOTCONN; + if (!*timeo_p) + return -EAGAIN; + if (signal_pending(current)) + return sock_intr_errno(*timeo_p); + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + done = sk_wait_event(sk, timeo_p, + (!port->congested || !port->connected)); + finish_wait(sk_sleep(sk), &wait); + } while (!done); + return 0; +} + /** - * send_packet - send a connection-oriented message + * tipc_send_packet - send a connection-oriented message * @iocb: if NULL, indicates that socket lock is already held * @sock: socket structure * @m: message to send @@ -674,18 +726,18 @@ exit: * * Returns the number of bytes sent on success, or errno otherwise */ -static int send_packet(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t total_len) +static int tipc_send_packet(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t total_len) { struct sock *sk = sock->sk; - struct tipc_port *tport = tipc_sk_port(sk); - struct sockaddr_tipc *dest = (struct sockaddr_tipc *)m->msg_name; - long timeout_val; - int res; + struct tipc_sock *tsk = tipc_sk(sk); + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); + int res = -EINVAL; + long timeo; /* Handle implied connection establishment */ if (unlikely(dest)) - return send_msg(iocb, sock, m, total_len); + return tipc_sendmsg(iocb, sock, m, total_len); if (total_len > TIPC_MAX_USER_MSG_SIZE) 
return -EMSGSIZE; @@ -693,37 +745,31 @@ static int send_packet(struct kiocb *iocb, struct socket *sock, if (iocb) lock_sock(sk); - timeout_val = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); + if (unlikely(sock->state != SS_CONNECTED)) { + if (sock->state == SS_DISCONNECTING) + res = -EPIPE; + else + res = -ENOTCONN; + goto exit; + } + timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); do { - if (unlikely(sock->state != SS_CONNECTED)) { - if (sock->state == SS_DISCONNECTING) - res = -EPIPE; - else - res = -ENOTCONN; - break; - } - - res = tipc_send(tport->ref, m->msg_iov, total_len); + res = tipc_send(&tsk->port, m->msg_iov, total_len); if (likely(res != -ELINKCONG)) break; - if (timeout_val <= 0L) { - res = timeout_val ? timeout_val : -EWOULDBLOCK; + res = tipc_wait_for_sndpkt(sock, &timeo); + if (res) break; - } - release_sock(sk); - timeout_val = wait_event_interruptible_timeout(*sk_sleep(sk), - (!tport->congested || !tport->connected), timeout_val); - lock_sock(sk); } while (1); - +exit: if (iocb) release_sock(sk); return res; } /** - * send_stream - send stream-oriented data + * tipc_send_stream - send stream-oriented data * @iocb: (unused) * @sock: socket structure * @m: data to send @@ -734,11 +780,11 @@ static int send_packet(struct kiocb *iocb, struct socket *sock, * Returns the number of bytes sent on success (or partial success), * or errno if no data sent */ -static int send_stream(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t total_len) +static int tipc_send_stream(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t total_len) { struct sock *sk = sock->sk; - struct tipc_port *tport = tipc_sk_port(sk); + struct tipc_sock *tsk = tipc_sk(sk); struct msghdr my_msg; struct iovec my_iov; struct iovec *curr_iov; @@ -754,16 +800,11 @@ static int send_stream(struct kiocb *iocb, struct socket *sock, /* Handle special cases where there is no connection */ if (unlikely(sock->state != SS_CONNECTED)) { - if (sock->state == SS_UNCONNECTED) { - res = send_packet(NULL, sock, m, total_len); - goto exit; - } else if (sock->state == SS_DISCONNECTING) { - res = -EPIPE; - goto exit; - } else { - res = -ENOTCONN; - goto exit; - } + if (sock->state == SS_UNCONNECTED) + res = tipc_send_packet(NULL, sock, m, total_len); + else + res = sock->state == SS_DISCONNECTING ? 
-EPIPE : -ENOTCONN; + goto exit; } if (unlikely(m->msg_name)) { @@ -791,21 +832,22 @@ static int send_stream(struct kiocb *iocb, struct socket *sock, my_msg.msg_name = NULL; bytes_sent = 0; - hdr_size = msg_hdr_sz(&tport->phdr); + hdr_size = msg_hdr_sz(&tsk->port.phdr); while (curr_iovlen--) { curr_start = curr_iov->iov_base; curr_left = curr_iov->iov_len; while (curr_left) { - bytes_to_send = tport->max_pkt - hdr_size; + bytes_to_send = tsk->port.max_pkt - hdr_size; if (bytes_to_send > TIPC_MAX_USER_MSG_SIZE) bytes_to_send = TIPC_MAX_USER_MSG_SIZE; if (curr_left < bytes_to_send) bytes_to_send = curr_left; my_iov.iov_base = curr_start; my_iov.iov_len = bytes_to_send; - res = send_packet(NULL, sock, &my_msg, bytes_to_send); + res = tipc_send_packet(NULL, sock, &my_msg, + bytes_to_send); if (res < 0) { if (bytes_sent) res = bytes_sent; @@ -826,27 +868,25 @@ exit: /** * auto_connect - complete connection setup to a remote port - * @sock: socket structure + * @tsk: tipc socket structure * @msg: peer's response message * * Returns 0 on success, errno otherwise */ -static int auto_connect(struct socket *sock, struct tipc_msg *msg) +static int auto_connect(struct tipc_sock *tsk, struct tipc_msg *msg) { - struct tipc_sock *tsock = tipc_sk(sock->sk); - struct tipc_port *p_ptr; + struct tipc_port *port = &tsk->port; + struct socket *sock = tsk->sk.sk_socket; + struct tipc_portid peer; - tsock->peer_name.ref = msg_origport(msg); - tsock->peer_name.node = msg_orignode(msg); - p_ptr = tipc_port_deref(tsock->p->ref); - if (!p_ptr) - return -EINVAL; + peer.ref = msg_origport(msg); + peer.node = msg_orignode(msg); - __tipc_connect(tsock->p->ref, p_ptr, &tsock->peer_name); + __tipc_port_connect(port->ref, port, &peer); if (msg_importance(msg) > TIPC_CRITICAL_IMPORTANCE) return -EINVAL; - msg_set_importance(&p_ptr->phdr, (u32)msg_importance(msg)); + msg_set_importance(&port->phdr, (u32)msg_importance(msg)); sock->state = SS_CONNECTED; return 0; } @@ -860,7 +900,7 @@ static int auto_connect(struct socket *sock, struct tipc_msg *msg) */ static void set_orig_addr(struct msghdr *m, struct tipc_msg *msg) { - struct sockaddr_tipc *addr = (struct sockaddr_tipc *)m->msg_name; + DECLARE_SOCKADDR(struct sockaddr_tipc *, addr, m->msg_name); if (addr) { addr->family = AF_TIPC; @@ -945,8 +985,41 @@ static int anc_data_recv(struct msghdr *m, struct tipc_msg *msg, return 0; } +static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) +{ + struct sock *sk = sock->sk; + DEFINE_WAIT(wait); + long timeo = *timeop; + int err; + + for (;;) { + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + if (timeo && skb_queue_empty(&sk->sk_receive_queue)) { + if (sock->state == SS_DISCONNECTING) { + err = -ENOTCONN; + break; + } + release_sock(sk); + timeo = schedule_timeout(timeo); + lock_sock(sk); + } + err = 0; + if (!skb_queue_empty(&sk->sk_receive_queue)) + break; + err = sock_intr_errno(timeo); + if (signal_pending(current)) + break; + err = -EAGAIN; + if (!timeo) + break; + } + finish_wait(sk_sleep(sk), &wait); + *timeop = timeo; + return err; +} + /** - * recv_msg - receive packet-oriented message + * tipc_recvmsg - receive packet-oriented message * @iocb: (unused) * @m: descriptor for message info * @buf_len: total size of user buffer area @@ -957,14 +1030,15 @@ static int anc_data_recv(struct msghdr *m, struct tipc_msg *msg, * * Returns size of returned message data, errno otherwise */ -static int recv_msg(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t buf_len, int flags) +static int 
tipc_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t buf_len, int flags) { struct sock *sk = sock->sk; - struct tipc_port *tport = tipc_sk_port(sk); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_port *port = &tsk->port; struct sk_buff *buf; struct tipc_msg *msg; - long timeout; + long timeo; unsigned int sz; u32 err; int res; @@ -980,25 +1054,13 @@ static int recv_msg(struct kiocb *iocb, struct socket *sock, goto exit; } - timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); restart: /* Look for a message in receive queue; wait if necessary */ - while (skb_queue_empty(&sk->sk_receive_queue)) { - if (sock->state == SS_DISCONNECTING) { - res = -ENOTCONN; - goto exit; - } - if (timeout <= 0L) { - res = timeout ? timeout : -EWOULDBLOCK; - goto exit; - } - release_sock(sk); - timeout = wait_event_interruptible_timeout(*sk_sleep(sk), - tipc_rx_ready(sock), - timeout); - lock_sock(sk); - } + res = tipc_wait_for_rcvmsg(sock, &timeo); + if (res) + goto exit; /* Look at first message in receive queue */ buf = skb_peek(&sk->sk_receive_queue); @@ -1016,7 +1078,7 @@ restart: set_orig_addr(m, msg); /* Capture ancillary data (optional) */ - res = anc_data_recv(m, msg, tport); + res = anc_data_recv(m, msg, port); if (res) goto exit; @@ -1042,8 +1104,8 @@ restart: /* Consume received message (optional) */ if (likely(!(flags & MSG_PEEK))) { if ((sock->state != SS_READY) && - (++tport->conn_unacked >= TIPC_FLOW_CONTROL_WIN)) - tipc_acknowledge(tport->ref, tport->conn_unacked); + (++port->conn_unacked >= TIPC_CONNACK_INTV)) + tipc_acknowledge(port->ref, port->conn_unacked); advance_rx_queue(sk); } exit: @@ -1052,7 +1114,7 @@ exit: } /** - * recv_stream - receive stream-oriented data + * tipc_recv_stream - receive stream-oriented data * @iocb: (unused) * @m: descriptor for message info * @buf_len: total size of user buffer area @@ -1063,14 +1125,15 @@ exit: * * Returns size of returned message data, errno otherwise */ -static int recv_stream(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t buf_len, int flags) +static int tipc_recv_stream(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t buf_len, int flags) { struct sock *sk = sock->sk; - struct tipc_port *tport = tipc_sk_port(sk); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_port *port = &tsk->port; struct sk_buff *buf; struct tipc_msg *msg; - long timeout; + long timeo; unsigned int sz; int sz_to_copy, target, needed; int sz_copied = 0; @@ -1083,31 +1146,19 @@ static int recv_stream(struct kiocb *iocb, struct socket *sock, lock_sock(sk); - if (unlikely((sock->state == SS_UNCONNECTED))) { + if (unlikely(sock->state == SS_UNCONNECTED)) { res = -ENOTCONN; goto exit; } target = sock_rcvlowat(sk, flags & MSG_WAITALL, buf_len); - timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); restart: /* Look for a message in receive queue; wait if necessary */ - while (skb_queue_empty(&sk->sk_receive_queue)) { - if (sock->state == SS_DISCONNECTING) { - res = -ENOTCONN; - goto exit; - } - if (timeout <= 0L) { - res = timeout ? 
timeout : -EWOULDBLOCK; - goto exit; - } - release_sock(sk); - timeout = wait_event_interruptible_timeout(*sk_sleep(sk), - tipc_rx_ready(sock), - timeout); - lock_sock(sk); - } + res = tipc_wait_for_rcvmsg(sock, &timeo); + if (res) + goto exit; /* Look at first message in receive queue */ buf = skb_peek(&sk->sk_receive_queue); @@ -1124,7 +1175,7 @@ restart: /* Optionally capture sender's address & ancillary data of first msg */ if (sz_copied == 0) { set_orig_addr(m, msg); - res = anc_data_recv(m, msg, tport); + res = anc_data_recv(m, msg, port); if (res) goto exit; } @@ -1162,8 +1213,8 @@ restart: /* Consume received message (optional) */ if (likely(!(flags & MSG_PEEK))) { - if (unlikely(++tport->conn_unacked >= TIPC_FLOW_CONTROL_WIN)) - tipc_acknowledge(tport->ref, tport->conn_unacked); + if (unlikely(++port->conn_unacked >= TIPC_CONNACK_INTV)) + tipc_acknowledge(port->ref, port->conn_unacked); advance_rx_queue(sk); } @@ -1201,7 +1252,7 @@ static void tipc_write_space(struct sock *sk) * @sk: socket * @len: the length of messages */ -static void tipc_data_ready(struct sock *sk, int len) +static void tipc_data_ready(struct sock *sk) { struct socket_wq *wq; @@ -1215,17 +1266,19 @@ static void tipc_data_ready(struct sock *sk, int len) /** * filter_connect - Handle all incoming messages for a connection-based socket - * @tsock: TIPC socket + * @tsk: TIPC socket * @msg: message * * Returns TIPC error status code and socket error status code * once it encounters some errors */ -static u32 filter_connect(struct tipc_sock *tsock, struct sk_buff **buf) +static u32 filter_connect(struct tipc_sock *tsk, struct sk_buff **buf) { - struct socket *sock = tsock->sk.sk_socket; + struct sock *sk = &tsk->sk; + struct tipc_port *port = &tsk->port; + struct socket *sock = sk->sk_socket; struct tipc_msg *msg = buf_msg(*buf); - struct sock *sk = &tsock->sk; + u32 retval = TIPC_ERR_NO_PORT; int res; @@ -1235,10 +1288,10 @@ static u32 filter_connect(struct tipc_sock *tsock, struct sk_buff **buf) switch ((int)sock->state) { case SS_CONNECTED: /* Accept only connection-based messages sent by peer */ - if (msg_connected(msg) && tipc_port_peer_msg(tsock->p, msg)) { + if (msg_connected(msg) && tipc_port_peer_msg(port, msg)) { if (unlikely(msg_errcode(msg))) { sock->state = SS_DISCONNECTING; - __tipc_disconnect(tsock->p); + __tipc_port_disconnect(port); } retval = TIPC_OK; } @@ -1255,7 +1308,7 @@ static u32 filter_connect(struct tipc_sock *tsock, struct sk_buff **buf) if (unlikely(!msg_connected(msg))) break; - res = auto_connect(sock, msg); + res = auto_connect(tsk, msg); if (res) { sock->state = SS_DISCONNECTING; sk->sk_err = -res; @@ -1311,14 +1364,12 @@ static u32 filter_connect(struct tipc_sock *tsock, struct sk_buff **buf) static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf) { struct tipc_msg *msg = buf_msg(buf); - unsigned int limit; if (msg_connected(msg)) - limit = sysctl_tipc_rmem[2]; - else - limit = sk->sk_rcvbuf >> TIPC_CRITICAL_IMPORTANCE << - msg_importance(msg); - return limit; + return sysctl_tipc_rmem[2]; + + return sk->sk_rcvbuf >> TIPC_CRITICAL_IMPORTANCE << + msg_importance(msg); } /** @@ -1336,6 +1387,7 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf) static u32 filter_rcv(struct sock *sk, struct sk_buff *buf) { struct socket *sock = sk->sk_socket; + struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *msg = buf_msg(buf); unsigned int limit = rcvbuf_limit(sk, buf); u32 res = TIPC_OK; @@ -1348,7 +1400,7 @@ static u32 filter_rcv(struct sock *sk, struct 
sk_buff *buf) if (msg_connected(msg)) return TIPC_ERR_NO_PORT; } else { - res = filter_connect(tipc_sk(sk), &buf); + res = filter_connect(tsk, &buf); if (res != TIPC_OK || buf == NULL) return res; } @@ -1362,12 +1414,12 @@ static u32 filter_rcv(struct sock *sk, struct sk_buff *buf) __skb_queue_tail(&sk->sk_receive_queue, buf); skb_set_owner_r(buf, sk); - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); return TIPC_OK; } /** - * backlog_rcv - handle incoming message from backlog queue + * tipc_backlog_rcv - handle incoming message from backlog queue * @sk: socket * @buf: message * @@ -1375,65 +1427,100 @@ static u32 filter_rcv(struct sock *sk, struct sk_buff *buf) * * Returns 0 */ -static int backlog_rcv(struct sock *sk, struct sk_buff *buf) +static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *buf) { u32 res; + struct tipc_sock *tsk = tipc_sk(sk); + uint truesize = buf->truesize; res = filter_rcv(sk, buf); - if (res) + if (unlikely(res)) tipc_reject_msg(buf, res); + + if (atomic_read(&tsk->dupl_rcvcnt) < TIPC_CONN_OVERLOAD_LIMIT) + atomic_add(truesize, &tsk->dupl_rcvcnt); + return 0; } /** - * dispatch - handle incoming message - * @tport: TIPC port that received message - * @buf: message - * - * Called with port lock already taken. - * - * Returns TIPC error status code (TIPC_OK if message is not to be rejected) + * tipc_sk_rcv - handle incoming message + * @buf: buffer containing arriving message + * Consumes buffer + * Returns 0 if success, or errno: -EHOSTUNREACH */ -static u32 dispatch(struct tipc_port *tport, struct sk_buff *buf) +int tipc_sk_rcv(struct sk_buff *buf) { - struct sock *sk = tport->sk; - u32 res; + struct tipc_sock *tsk; + struct tipc_port *port; + struct sock *sk; + u32 dport = msg_destport(buf_msg(buf)); + int err = TIPC_OK; + uint limit; - /* - * Process message if socket is unlocked; otherwise add to backlog queue - * - * This code is based on sk_receive_skb(), but must be distinct from it - * since a TIPC-specific filter/reject mechanism is utilized - */ + /* Forward unresolved named message */ + if (unlikely(!dport)) { + tipc_net_route_msg(buf); + return 0; + } + + /* Validate destination */ + port = tipc_port_lock(dport); + if (unlikely(!port)) { + err = TIPC_ERR_NO_PORT; + goto exit; + } + + tsk = tipc_port_to_sock(port); + sk = &tsk->sk; + + /* Queue message */ bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { - res = filter_rcv(sk, buf); + err = filter_rcv(sk, buf); } else { - if (sk_add_backlog(sk, buf, rcvbuf_limit(sk, buf))) - res = TIPC_ERR_OVERLOAD; - else - res = TIPC_OK; + if (sk->sk_backlog.len == 0) + atomic_set(&tsk->dupl_rcvcnt, 0); + limit = rcvbuf_limit(sk, buf) + atomic_read(&tsk->dupl_rcvcnt); + if (sk_add_backlog(sk, buf, limit)) + err = TIPC_ERR_OVERLOAD; } + bh_unlock_sock(sk); + tipc_port_unlock(port); - return res; + if (likely(!err)) + return 0; +exit: + tipc_reject_msg(buf, err); + return -EHOSTUNREACH; } -/** - * wakeupdispatch - wake up port after congestion - * @tport: port to wakeup - * - * Called with port lock already taken. 
- */ -static void wakeupdispatch(struct tipc_port *tport) +static int tipc_wait_for_connect(struct socket *sock, long *timeo_p) { - struct sock *sk = tport->sk; + struct sock *sk = sock->sk; + DEFINE_WAIT(wait); + int done; - sk->sk_write_space(sk); + do { + int err = sock_error(sk); + if (err) + return err; + if (!*timeo_p) + return -ETIMEDOUT; + if (signal_pending(current)) + return sock_intr_errno(*timeo_p); + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + done = sk_wait_event(sk, timeo_p, sock->state != SS_CONNECTING); + finish_wait(sk_sleep(sk), &wait); + } while (!done); + return 0; } /** - * connect - establish a connection to another TIPC port + * tipc_connect - establish a connection to another TIPC port * @sock: socket structure * @dest: socket address for destination port * @destlen: size of socket address data structure @@ -1441,13 +1528,14 @@ static void wakeupdispatch(struct tipc_port *tport) * * Returns 0 on success, errno otherwise */ -static int connect(struct socket *sock, struct sockaddr *dest, int destlen, - int flags) +static int tipc_connect(struct socket *sock, struct sockaddr *dest, + int destlen, int flags) { struct sock *sk = sock->sk; struct sockaddr_tipc *dst = (struct sockaddr_tipc *)dest; struct msghdr m = {NULL,}; - unsigned int timeout; + long timeout = (flags & O_NONBLOCK) ? 0 : tipc_sk(sk)->conn_timeout; + socket_state previous; int res; lock_sock(sk); @@ -1469,8 +1557,7 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen, goto exit; } - timeout = (flags & O_NONBLOCK) ? 0 : tipc_sk(sk)->conn_timeout; - + previous = sock->state; switch (sock->state) { case SS_UNCONNECTED: /* Send a 'SYN-' to destination */ @@ -1483,7 +1570,7 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen, if (!timeout) m.msg_flags = MSG_DONTWAIT; - res = send_msg(NULL, sock, &m, 0); + res = tipc_sendmsg(NULL, sock, &m, 0); if ((res < 0) && (res != -EWOULDBLOCK)) goto exit; @@ -1492,56 +1579,35 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen, * case is EINPROGRESS, rather than EALREADY. */ res = -EINPROGRESS; - break; case SS_CONNECTING: - res = -EALREADY; + if (previous == SS_CONNECTING) + res = -EALREADY; + if (!timeout) + goto exit; + timeout = msecs_to_jiffies(timeout); + /* Wait until an 'ACK' or 'RST' arrives, or a timeout occurs */ + res = tipc_wait_for_connect(sock, &timeout); break; case SS_CONNECTED: res = -EISCONN; break; default: res = -EINVAL; - goto exit; - } - - if (sock->state == SS_CONNECTING) { - if (!timeout) - goto exit; - - /* Wait until an 'ACK' or 'RST' arrives, or a timeout occurs */ - release_sock(sk); - res = wait_event_interruptible_timeout(*sk_sleep(sk), - sock->state != SS_CONNECTING, - timeout ? 
(long)msecs_to_jiffies(timeout) - : MAX_SCHEDULE_TIMEOUT); - lock_sock(sk); - if (res <= 0) { - if (res == 0) - res = -ETIMEDOUT; - else - ; /* leave "res" unchanged */ - goto exit; - } + break; } - - if (unlikely(sock->state == SS_DISCONNECTING)) - res = sock_error(sk); - else - res = 0; - exit: release_sock(sk); return res; } /** - * listen - allow socket to listen for incoming connections + * tipc_listen - allow socket to listen for incoming connections * @sock: socket structure * @len: (unused) * * Returns 0 on success, errno otherwise */ -static int listen(struct socket *sock, int len) +static int tipc_listen(struct socket *sock, int len) { struct sock *sk = sock->sk; int res; @@ -1559,23 +1625,59 @@ static int listen(struct socket *sock, int len) return res; } +static int tipc_wait_for_accept(struct socket *sock, long timeo) +{ + struct sock *sk = sock->sk; + DEFINE_WAIT(wait); + int err; + + /* True wake-one mechanism for incoming connections: only + * one process gets woken up, not the 'whole herd'. + * Since we do not 'race & poll' for established sockets + * anymore, the common case will execute the loop only once. + */ + for (;;) { + prepare_to_wait_exclusive(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + if (timeo && skb_queue_empty(&sk->sk_receive_queue)) { + release_sock(sk); + timeo = schedule_timeout(timeo); + lock_sock(sk); + } + err = 0; + if (!skb_queue_empty(&sk->sk_receive_queue)) + break; + err = -EINVAL; + if (sock->state != SS_LISTENING) + break; + err = sock_intr_errno(timeo); + if (signal_pending(current)) + break; + err = -EAGAIN; + if (!timeo) + break; + } + finish_wait(sk_sleep(sk), &wait); + return err; +} + /** - * accept - wait for connection request + * tipc_accept - wait for connection request * @sock: listening socket * @newsock: new socket that is to be connected * @flags: file-related flags associated with socket * * Returns 0 on success, errno otherwise */ -static int accept(struct socket *sock, struct socket *new_sock, int flags) +static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags) { struct sock *new_sk, *sk = sock->sk; struct sk_buff *buf; - struct tipc_sock *new_tsock; - struct tipc_port *new_tport; + struct tipc_port *new_port; struct tipc_msg *msg; + struct tipc_portid peer; u32 new_ref; - + long timeo; int res; lock_sock(sk); @@ -1584,19 +1686,10 @@ static int accept(struct socket *sock, struct socket *new_sock, int flags) res = -EINVAL; goto exit; } - - while (skb_queue_empty(&sk->sk_receive_queue)) { - if (flags & O_NONBLOCK) { - res = -EWOULDBLOCK; - goto exit; - } - release_sock(sk); - res = wait_event_interruptible(*sk_sleep(sk), - (!skb_queue_empty(&sk->sk_receive_queue))); - lock_sock(sk); - if (res) - goto exit; - } + timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + res = tipc_wait_for_accept(sock, timeo); + if (res) + goto exit; buf = skb_peek(&sk->sk_receive_queue); @@ -1605,9 +1698,8 @@ static int accept(struct socket *sock, struct socket *new_sock, int flags) goto exit; new_sk = new_sock->sk; - new_tsock = tipc_sk(new_sk); - new_tport = new_tsock->p; - new_ref = new_tport->ref; + new_port = &tipc_sk(new_sk)->port; + new_ref = new_port->ref; msg = buf_msg(buf); /* we lock on new_sk; but lockdep sees the lock on sk */ @@ -1620,15 +1712,15 @@ static int accept(struct socket *sock, struct socket *new_sock, int flags) reject_rx_queue(new_sk); /* Connect new socket to it's peer */ - new_tsock->peer_name.ref = msg_origport(msg); - new_tsock->peer_name.node = msg_orignode(msg); - tipc_connect(new_ref, 
&new_tsock->peer_name); + peer.ref = msg_origport(msg); + peer.node = msg_orignode(msg); + tipc_port_connect(new_ref, &peer); new_sock->state = SS_CONNECTED; - tipc_set_portimportance(new_ref, msg_importance(msg)); + tipc_port_set_importance(new_port, msg_importance(msg)); if (msg_named(msg)) { - new_tport->conn_type = msg_nametype(msg); - new_tport->conn_instance = msg_nameinst(msg); + new_port->conn_type = msg_nametype(msg); + new_port->conn_instance = msg_nameinst(msg); } /* @@ -1639,21 +1731,20 @@ static int accept(struct socket *sock, struct socket *new_sock, int flags) struct msghdr m = {NULL,}; advance_rx_queue(sk); - send_packet(NULL, new_sock, &m, 0); + tipc_send_packet(NULL, new_sock, &m, 0); } else { __skb_dequeue(&sk->sk_receive_queue); __skb_queue_head(&new_sk->sk_receive_queue, buf); skb_set_owner_r(buf, new_sk); } release_sock(new_sk); - exit: release_sock(sk); return res; } /** - * shutdown - shutdown socket connection + * tipc_shutdown - shutdown socket connection * @sock: socket structure * @how: direction to close (must be SHUT_RDWR) * @@ -1661,10 +1752,11 @@ exit: * * Returns 0 on success, errno otherwise */ -static int shutdown(struct socket *sock, int how) +static int tipc_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; - struct tipc_port *tport = tipc_sk_port(sk); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_port *port = &tsk->port; struct sk_buff *buf; int res; @@ -1685,10 +1777,10 @@ restart: kfree_skb(buf); goto restart; } - tipc_disconnect(tport->ref); + tipc_port_disconnect(port->ref); tipc_reject_msg(buf, TIPC_CONN_SHUTDOWN); } else { - tipc_shutdown(tport->ref); + tipc_port_shutdown(port->ref); } sock->state = SS_DISCONNECTING; @@ -1714,7 +1806,7 @@ restart: } /** - * setsockopt - set socket option + * tipc_setsockopt - set socket option * @sock: socket structure * @lvl: option level * @opt: option identifier @@ -1726,11 +1818,12 @@ restart: * * Returns 0 on success, errno otherwise */ -static int setsockopt(struct socket *sock, int lvl, int opt, char __user *ov, - unsigned int ol) +static int tipc_setsockopt(struct socket *sock, int lvl, int opt, + char __user *ov, unsigned int ol) { struct sock *sk = sock->sk; - struct tipc_port *tport = tipc_sk_port(sk); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_port *port = &tsk->port; u32 value; int res; @@ -1748,16 +1841,16 @@ static int setsockopt(struct socket *sock, int lvl, int opt, char __user *ov, switch (opt) { case TIPC_IMPORTANCE: - res = tipc_set_portimportance(tport->ref, value); + tipc_port_set_importance(port, value); break; case TIPC_SRC_DROPPABLE: if (sock->type != SOCK_STREAM) - res = tipc_set_portunreliable(tport->ref, value); + tipc_port_set_unreliable(port, value); else res = -ENOPROTOOPT; break; case TIPC_DEST_DROPPABLE: - res = tipc_set_portunreturnable(tport->ref, value); + tipc_port_set_unreturnable(port, value); break; case TIPC_CONN_TIMEOUT: tipc_sk(sk)->conn_timeout = value; @@ -1773,7 +1866,7 @@ static int setsockopt(struct socket *sock, int lvl, int opt, char __user *ov, } /** - * getsockopt - get socket option + * tipc_getsockopt - get socket option * @sock: socket structure * @lvl: option level * @opt: option identifier @@ -1785,11 +1878,12 @@ static int setsockopt(struct socket *sock, int lvl, int opt, char __user *ov, * * Returns 0 on success, errno otherwise */ -static int getsockopt(struct socket *sock, int lvl, int opt, char __user *ov, - int __user *ol) +static int tipc_getsockopt(struct socket *sock, int lvl, int opt, + char __user *ov, int 
__user *ol) { struct sock *sk = sock->sk; - struct tipc_port *tport = tipc_sk_port(sk); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_port *port = &tsk->port; int len; u32 value; int res; @@ -1806,13 +1900,13 @@ static int getsockopt(struct socket *sock, int lvl, int opt, char __user *ov, switch (opt) { case TIPC_IMPORTANCE: - res = tipc_portimportance(tport->ref, &value); + value = tipc_port_importance(port); break; case TIPC_SRC_DROPPABLE: - res = tipc_portunreliable(tport->ref, &value); + value = tipc_port_unreliable(port); break; case TIPC_DEST_DROPPABLE: - res = tipc_portunreturnable(tport->ref, &value); + value = tipc_port_unreturnable(port); break; case TIPC_CONN_TIMEOUT: value = tipc_sk(sk)->conn_timeout; @@ -1842,25 +1936,47 @@ static int getsockopt(struct socket *sock, int lvl, int opt, char __user *ov, return put_user(sizeof(value), ol); } +int tipc_ioctl(struct socket *sk, unsigned int cmd, unsigned long arg) +{ + struct tipc_sioc_ln_req lnr; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case SIOCGETLINKNAME: + if (copy_from_user(&lnr, argp, sizeof(lnr))) + return -EFAULT; + if (!tipc_node_get_linkname(lnr.bearer_id, lnr.peer, + lnr.linkname, TIPC_MAX_LINK_NAME)) { + if (copy_to_user(argp, &lnr, sizeof(lnr))) + return -EFAULT; + return 0; + } + return -EADDRNOTAVAIL; + break; + default: + return -ENOIOCTLCMD; + } +} + /* Protocol switches for the various types of TIPC sockets */ static const struct proto_ops msg_ops = { .owner = THIS_MODULE, .family = AF_TIPC, - .release = release, - .bind = bind, - .connect = connect, + .release = tipc_release, + .bind = tipc_bind, + .connect = tipc_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, - .getname = get_name, - .poll = poll, - .ioctl = sock_no_ioctl, + .getname = tipc_getname, + .poll = tipc_poll, + .ioctl = tipc_ioctl, .listen = sock_no_listen, - .shutdown = shutdown, - .setsockopt = setsockopt, - .getsockopt = getsockopt, - .sendmsg = send_msg, - .recvmsg = recv_msg, + .shutdown = tipc_shutdown, + .setsockopt = tipc_setsockopt, + .getsockopt = tipc_getsockopt, + .sendmsg = tipc_sendmsg, + .recvmsg = tipc_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage }; @@ -1868,20 +1984,20 @@ static const struct proto_ops msg_ops = { static const struct proto_ops packet_ops = { .owner = THIS_MODULE, .family = AF_TIPC, - .release = release, - .bind = bind, - .connect = connect, + .release = tipc_release, + .bind = tipc_bind, + .connect = tipc_connect, .socketpair = sock_no_socketpair, - .accept = accept, - .getname = get_name, - .poll = poll, - .ioctl = sock_no_ioctl, - .listen = listen, - .shutdown = shutdown, - .setsockopt = setsockopt, - .getsockopt = getsockopt, - .sendmsg = send_packet, - .recvmsg = recv_msg, + .accept = tipc_accept, + .getname = tipc_getname, + .poll = tipc_poll, + .ioctl = tipc_ioctl, + .listen = tipc_listen, + .shutdown = tipc_shutdown, + .setsockopt = tipc_setsockopt, + .getsockopt = tipc_getsockopt, + .sendmsg = tipc_send_packet, + .recvmsg = tipc_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage }; @@ -1889,20 +2005,20 @@ static const struct proto_ops packet_ops = { static const struct proto_ops stream_ops = { .owner = THIS_MODULE, .family = AF_TIPC, - .release = release, - .bind = bind, - .connect = connect, + .release = tipc_release, + .bind = tipc_bind, + .connect = tipc_connect, .socketpair = sock_no_socketpair, - .accept = accept, - .getname = get_name, - .poll = poll, - .ioctl = sock_no_ioctl, - .listen = listen, - .shutdown = shutdown, - 
.setsockopt = setsockopt, - .getsockopt = getsockopt, - .sendmsg = send_stream, - .recvmsg = recv_stream, + .accept = tipc_accept, + .getname = tipc_getname, + .poll = tipc_poll, + .ioctl = tipc_ioctl, + .listen = tipc_listen, + .shutdown = tipc_shutdown, + .setsockopt = tipc_setsockopt, + .getsockopt = tipc_getsockopt, + .sendmsg = tipc_send_stream, + .recvmsg = tipc_recv_stream, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage }; @@ -1947,8 +2063,6 @@ int tipc_socket_init(void) proto_unregister(&tipc_proto); goto out; } - - sockets_enabled = 1; out: return res; } @@ -1958,10 +2072,6 @@ int tipc_socket_init(void) */ void tipc_socket_stop(void) { - if (!sockets_enabled) - return; - - sockets_enabled = 0; sock_unregister(tipc_family_ops.family); proto_unregister(&tipc_proto); } diff --git a/net/tipc/socket.h b/net/tipc/socket.h new file mode 100644 index 00000000000..3afcd2a70b3 --- /dev/null +++ b/net/tipc/socket.h @@ -0,0 +1,74 @@ +/* net/tipc/socket.h: Include file for TIPC socket code + * + * Copyright (c) 2014, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _TIPC_SOCK_H +#define _TIPC_SOCK_H + +#include "port.h" +#include <net/sock.h> + +/** + * struct tipc_sock - TIPC socket structure + * @sk: socket - interacts with 'port' and with user via the socket API + * @port: port - interacts with 'sk' and with the rest of the TIPC stack + * @peer_name: the peer of the connection, if any + * @conn_timeout: the time we can wait for an unresponded setup request + * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue + */ + +struct tipc_sock { + struct sock sk; + struct tipc_port port; + unsigned int conn_timeout; + atomic_t dupl_rcvcnt; +}; + +static inline struct tipc_sock *tipc_sk(const struct sock *sk) +{ + return container_of(sk, struct tipc_sock, sk); +} + +static inline struct tipc_sock *tipc_port_to_sock(const struct tipc_port *port) +{ + return container_of(port, struct tipc_sock, port); +} + +static inline void tipc_sock_wakeup(struct tipc_sock *tsk) +{ + tsk->sk.sk_write_space(&tsk->sk); +} + +int tipc_sk_rcv(struct sk_buff *buf); + +#endif diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index d38bb45d82e..642437231ad 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -42,7 +42,7 @@ /** * struct tipc_subscriber - TIPC network topology subscriber * @conid: connection identifier to server connecting to subscriber - * @lock: controll access to subscriber + * @lock: control access to subscriber * @subscription_list: list of subscription objects for this subscriber */ struct tipc_subscriber { @@ -96,20 +96,16 @@ static void subscr_send_event(struct tipc_subscription *sub, u32 found_lower, { struct tipc_subscriber *subscriber = sub->subscriber; struct kvec msg_sect; - int ret; msg_sect.iov_base = (void *)&sub->evt; msg_sect.iov_len = sizeof(struct tipc_event); - sub->evt.event = htohl(event, sub->swap); sub->evt.found_lower = htohl(found_lower, sub->swap); sub->evt.found_upper = htohl(found_upper, sub->swap); sub->evt.port.ref = htohl(port_ref, sub->swap); sub->evt.port.node = htohl(node, sub->swap); - ret = tipc_conn_sendmsg(&topsrv, subscriber->conid, NULL, - msg_sect.iov_base, msg_sect.iov_len); - if (ret < 0) - pr_err("Sending subscription event failed, no memory\n"); + tipc_conn_sendmsg(&topsrv, subscriber->conid, NULL, msg_sect.iov_base, + msg_sect.iov_len); } /** @@ -153,14 +149,6 @@ static void subscr_timeout(struct tipc_subscription *sub) /* The spin lock per subscriber is used to protect its members */ spin_lock_bh(&subscriber->lock); - /* Validate if the connection related to the subscriber is - * closed (in case subscriber is terminating) - */ - if (subscriber->conid == 0) { - spin_unlock_bh(&subscriber->lock); - return; - } - /* Validate timeout (in case subscription is being cancelled) */ if (sub->timeout == TIPC_WAIT_FOREVER) { spin_unlock_bh(&subscriber->lock); @@ -215,9 +203,6 @@ static void subscr_release(struct tipc_subscriber *subscriber) spin_lock_bh(&subscriber->lock); - /* Invalidate subscriber reference */ - subscriber->conid = 0; - /* Destroy any existing subscriptions for subscriber */ list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list, subscription_list) { @@ -278,9 +263,9 @@ static void subscr_cancel(struct tipc_subscr *s, * * Called with subscriber lock held. 
*/ -static struct tipc_subscription *subscr_subscribe(struct tipc_subscr *s, - struct tipc_subscriber *subscriber) -{ +static int subscr_subscribe(struct tipc_subscr *s, + struct tipc_subscriber *subscriber, + struct tipc_subscription **sub_p) { struct tipc_subscription *sub; int swap; @@ -291,23 +276,21 @@ static struct tipc_subscription *subscr_subscribe(struct tipc_subscr *s, if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) { s->filter &= ~htohl(TIPC_SUB_CANCEL, swap); subscr_cancel(s, subscriber); - return NULL; + return 0; } /* Refuse subscription if global limit exceeded */ if (atomic_read(&subscription_count) >= TIPC_MAX_SUBSCRIPTIONS) { pr_warn("Subscription rejected, limit reached (%u)\n", TIPC_MAX_SUBSCRIPTIONS); - subscr_terminate(subscriber); - return NULL; + return -EINVAL; } /* Allocate subscription object */ sub = kmalloc(sizeof(*sub), GFP_ATOMIC); if (!sub) { pr_warn("Subscription rejected, no memory\n"); - subscr_terminate(subscriber); - return NULL; + return -ENOMEM; } /* Initialize subscription object */ @@ -321,8 +304,7 @@ static struct tipc_subscription *subscr_subscribe(struct tipc_subscr *s, (sub->seq.lower > sub->seq.upper)) { pr_warn("Subscription rejected, illegal request\n"); kfree(sub); - subscr_terminate(subscriber); - return NULL; + return -EINVAL; } INIT_LIST_HEAD(&sub->nameseq_list); list_add(&sub->subscription_list, &subscriber->subscription_list); @@ -335,8 +317,8 @@ static struct tipc_subscription *subscr_subscribe(struct tipc_subscr *s, (Handler)subscr_timeout, (unsigned long)sub); k_start_timer(&sub->timer, sub->timeout); } - - return sub; + *sub_p = sub; + return 0; } /* Handle one termination request for the subscriber */ @@ -350,10 +332,14 @@ static void subscr_conn_msg_event(int conid, struct sockaddr_tipc *addr, void *usr_data, void *buf, size_t len) { struct tipc_subscriber *subscriber = usr_data; - struct tipc_subscription *sub; + struct tipc_subscription *sub = NULL; spin_lock_bh(&subscriber->lock); - sub = subscr_subscribe((struct tipc_subscr *)buf, subscriber); + if (subscr_subscribe((struct tipc_subscr *)buf, subscriber, &sub) < 0) { + spin_unlock_bh(&subscriber->lock); + subscr_terminate(subscriber); + return; + } if (sub) tipc_nametbl_subscribe(sub); spin_unlock_bh(&subscriber->lock); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 81362229631..e9688438073 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -163,9 +163,8 @@ static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) static inline unsigned int unix_hash_fold(__wsum n) { - unsigned int hash = (__force unsigned int)n; + unsigned int hash = (__force unsigned int)csum_fold(n); - hash ^= hash>>16; hash ^= hash>>8; return hash&(UNIX_HASH_SIZE-1); } @@ -532,13 +531,17 @@ static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *, static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *, struct msghdr *, size_t, int); -static void unix_set_peek_off(struct sock *sk, int val) +static int unix_set_peek_off(struct sock *sk, int val) { struct unix_sock *u = unix_sk(sk); - mutex_lock(&u->readlock); + if (mutex_lock_interruptible(&u->readlock)) + return -EINTR; + sk->sk_peek_off = val; mutex_unlock(&u->readlock); + + return 0; } @@ -716,7 +719,9 @@ static int unix_autobind(struct socket *sock) int err; unsigned int retries = 0; - mutex_lock(&u->readlock); + err = mutex_lock_interruptible(&u->readlock); + if (err) + return err; err = 0; if (u->addr) @@ -875,7 +880,9 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int 
addr_len) goto out; addr_len = err; - mutex_lock(&u->readlock); + err = mutex_lock_interruptible(&u->readlock); + if (err) + goto out; err = -EINVAL; if (u->addr) @@ -1200,7 +1207,7 @@ restart: sk->sk_state = TCP_ESTABLISHED; sock_hold(newsk); - smp_mb__after_atomic_inc(); /* sock_hold() does an atomic_inc() */ + smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */ unix_peer(sk) = newsk; unix_state_unlock(sk); @@ -1210,7 +1217,7 @@ restart: __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); - other->sk_data_ready(other, 0); + other->sk_data_ready(other); sock_put(other); return 0; @@ -1442,7 +1449,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct unix_sock *u = unix_sk(sk); - struct sockaddr_un *sunaddr = msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); struct sock *other = NULL; int namelen = 0; /* fake GCC */ int err; @@ -1485,10 +1492,14 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, if (len > sk->sk_sndbuf - 32) goto out; - if (len > SKB_MAX_ALLOC) + if (len > SKB_MAX_ALLOC) { data_len = min_t(size_t, len - SKB_MAX_ALLOC, MAX_SKB_FRAGS * PAGE_SIZE); + data_len = PAGE_ALIGN(data_len); + + BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE); + } skb = sock_alloc_send_pskb(sk, len - data_len, data_len, msg->msg_flags & MSG_DONTWAIT, &err, @@ -1593,7 +1604,7 @@ restart: if (max_level > unix_sk(other)->recursion_level) unix_sk(other)->recursion_level = max_level; unix_state_unlock(other); - other->sk_data_ready(other, len); + other->sk_data_ready(other); sock_put(other); scm_destroy(siocb->scm); return len; @@ -1663,6 +1674,8 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); + data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); + skb = sock_alloc_send_pskb(sk, size - data_len, data_len, msg->msg_flags & MSG_DONTWAIT, &err, get_order(UNIX_SKB_FRAGS_SZ)); @@ -1699,7 +1712,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, if (max_level > unix_sk(other)->recursion_level) unix_sk(other)->recursion_level = max_level; unix_state_unlock(other); - other->sk_data_ready(other, size); + other->sk_data_ready(other); sent += size; } @@ -1780,8 +1793,11 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, goto out; err = mutex_lock_interruptible(&u->readlock); - if (err) { - err = sock_intr_errno(sock_rcvtimeo(sk, noblock)); + if (unlikely(err)) { + /* recvmsg() in non blocking mode is supposed to return -EAGAIN + * sk_rcvtimeo is not honored by mutex_lock_interruptible() + */ + err = noblock ? 
-EAGAIN : -ERESTARTSYS; goto out; } @@ -1904,8 +1920,9 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, struct scm_cookie tmp_scm; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); - struct sockaddr_un *sunaddr = msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); int copied = 0; + int noblock = flags & MSG_DONTWAIT; int check_creds = 0; int target; int err = 0; @@ -1921,7 +1938,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, goto out; target = sock_rcvlowat(sk, flags&MSG_WAITALL, size); - timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT); + timeo = sock_rcvtimeo(sk, noblock); /* Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg @@ -1933,8 +1950,11 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, } err = mutex_lock_interruptible(&u->readlock); - if (err) { - err = sock_intr_errno(timeo); + if (unlikely(err)) { + /* recvmsg() in non blocking mode is supposed to return -EAGAIN + * sk_rcvtimeo is not honored by mutex_lock_interruptible() + */ + err = noblock ? -EAGAIN : -ERESTARTSYS; goto out; } diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 5adfd94c5b8..85d232bed87 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1925,9 +1925,23 @@ static struct miscdevice vsock_device = { .fops = &vsock_device_ops, }; -static int __vsock_core_init(void) +int __vsock_core_init(const struct vsock_transport *t, struct module *owner) { - int err; + int err = mutex_lock_interruptible(&vsock_register_mutex); + + if (err) + return err; + + if (transport) { + err = -EBUSY; + goto err_busy; + } + + /* Transport must be the owner of the protocol so that it can't + * unload while there are open sockets. + */ + vsock_proto.owner = owner; + transport = t; vsock_init_tables(); @@ -1951,36 +1965,19 @@ static int __vsock_core_init(void) goto err_unregister_proto; } + mutex_unlock(&vsock_register_mutex); return 0; err_unregister_proto: proto_unregister(&vsock_proto); err_misc_deregister: misc_deregister(&vsock_device); - return err; -} - -int vsock_core_init(const struct vsock_transport *t) -{ - int retval = mutex_lock_interruptible(&vsock_register_mutex); - if (retval) - return retval; - - if (transport) { - retval = -EBUSY; - goto out; - } - - transport = t; - retval = __vsock_core_init(); - if (retval) - transport = NULL; - -out: + transport = NULL; +err_busy: mutex_unlock(&vsock_register_mutex); - return retval; + return err; } -EXPORT_SYMBOL_GPL(vsock_core_init); +EXPORT_SYMBOL_GPL(__vsock_core_init); void vsock_core_exit(void) { @@ -2000,5 +1997,5 @@ EXPORT_SYMBOL_GPL(vsock_core_exit); MODULE_AUTHOR("VMware, Inc."); MODULE_DESCRIPTION("VMware Virtual Socket Family"); -MODULE_VERSION("1.0.0.0-k"); +MODULE_VERSION("1.0.1.0-k"); MODULE_LICENSE("GPL v2"); diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 687360da62d..9bb63ffec4f 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1779,10 +1779,8 @@ static int vmci_transport_dgram_dequeue(struct kiocb *kiocb, goto out; if (msg->msg_name) { - struct sockaddr_vm *vm_addr; - /* Provide the address of the sender. 
*/ - vm_addr = (struct sockaddr_vm *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_vm *, vm_addr, msg->msg_name); vsock_addr_init(vm_addr, dg->src.context, dg->src.resource); msg->msg_namelen = sizeof(*vm_addr); } diff --git a/net/vmw_vsock/vmci_transport_notify.c b/net/vmw_vsock/vmci_transport_notify.c index 9a730744e7b..9b7f207f2be 100644 --- a/net/vmw_vsock/vmci_transport_notify.c +++ b/net/vmw_vsock/vmci_transport_notify.c @@ -315,7 +315,7 @@ vmci_transport_handle_wrote(struct sock *sk, struct vsock_sock *vsk = vsock_sk(sk); PKT_FIELD(vsk, sent_waiting_read) = false; #endif - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } static void vmci_transport_notify_pkt_socket_init(struct sock *sk) diff --git a/net/vmw_vsock/vmci_transport_notify_qstate.c b/net/vmw_vsock/vmci_transport_notify_qstate.c index 622bd7aa101..dc9c7929a2f 100644 --- a/net/vmw_vsock/vmci_transport_notify_qstate.c +++ b/net/vmw_vsock/vmci_transport_notify_qstate.c @@ -92,7 +92,7 @@ vmci_transport_handle_wrote(struct sock *sk, bool bottom_half, struct sockaddr_vm *dst, struct sockaddr_vm *src) { - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } static void vsock_block_update_write_window(struct sock *sk) @@ -290,7 +290,7 @@ vmci_transport_notify_pkt_recv_post_dequeue( /* See the comment in * vmci_transport_notify_pkt_send_post_enqueue(). */ - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } return err; diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 16d08b39921..405f3c4cf70 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -95,6 +95,43 @@ config CFG80211_CERTIFICATION_ONUS you are a wireless researcher and are working in a controlled and approved environment by your local regulatory agency. +config CFG80211_REG_CELLULAR_HINTS + bool "cfg80211 regulatory support for cellular base station hints" + depends on CFG80211_CERTIFICATION_ONUS + ---help--- + This option enables support for parsing regulatory hints + from cellular base stations. If enabled and at least one driver + claims support for parsing cellular base station hints, the + regulatory core will allow and parse these regulatory hints. + The regulatory core will only apply these regulatory hints on + drivers that support this feature. You should only enable this + feature if you have tested and validated it on your + systems. + +config CFG80211_REG_RELAX_NO_IR + bool "cfg80211 support for NO_IR relaxation" + depends on CFG80211_CERTIFICATION_ONUS + ---help--- + This option enables support for relaxation of the NO_IR flag for + situations where certain regulatory bodies have provided clarifications + on how relaxation can occur. This feature has an inherent dependency on + userspace features which must have been properly tested and as such is + not enabled by default. + + A relaxation feature example is allowing the operation of a P2P group + owner (GO) on channels marked with NO_IR if there is an additional BSS + interface which is associated to an AP which userspace assumes or confirms + to be an authorized master, i.e., with radar detection support and DFS + capabilities. However, note that in order not to create daisy chain + scenarios, this relaxation is not allowed in cases where the BSS client + is associated to a P2P GO; in addition, the P2P GO instantiated on + a channel due to this relaxation should not allow connections from + non-P2P clients.
+ + The regulatory core will apply these relaxations only for drivers that + support this feature by declaring the appropriate channel flags and + capabilities in their registration flow. + config CFG80211_DEFAULT_PS bool "enable powersave by default" depends on CFG80211 diff --git a/net/wireless/ap.c b/net/wireless/ap.c index 324e8d851dc..bdad1f95156 100644 --- a/net/wireless/ap.c +++ b/net/wireless/ap.c @@ -6,8 +6,8 @@ #include "rdev-ops.h" -static int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev, - struct net_device *dev) +int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev, + struct net_device *dev, bool notify) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; @@ -27,21 +27,24 @@ static int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev, err = rdev_stop_ap(rdev, dev); if (!err) { wdev->beacon_interval = 0; - wdev->channel = NULL; + memset(&wdev->chandef, 0, sizeof(wdev->chandef)); wdev->ssid_len = 0; + rdev_set_qos_map(rdev, dev, NULL); + if (notify) + nl80211_send_ap_stopped(wdev); } return err; } int cfg80211_stop_ap(struct cfg80211_registered_device *rdev, - struct net_device *dev) + struct net_device *dev, bool notify) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; wdev_lock(wdev); - err = __cfg80211_stop_ap(rdev, dev); + err = __cfg80211_stop_ap(rdev, dev, notify); wdev_unlock(wdev); return err; diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 78559b5bbd1..992b34070bc 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -326,28 +326,57 @@ static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy, int cfg80211_chandef_dfs_required(struct wiphy *wiphy, - const struct cfg80211_chan_def *chandef) + const struct cfg80211_chan_def *chandef, + enum nl80211_iftype iftype) { int width; - int r; + int ret; if (WARN_ON(!cfg80211_chandef_valid(chandef))) return -EINVAL; - width = cfg80211_chandef_get_width(chandef); - if (width < 0) - return -EINVAL; + switch (iftype) { + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_MESH_POINT: + width = cfg80211_chandef_get_width(chandef); + if (width < 0) + return -EINVAL; - r = cfg80211_get_chans_dfs_required(wiphy, chandef->center_freq1, - width); - if (r) - return r; + ret = cfg80211_get_chans_dfs_required(wiphy, + chandef->center_freq1, + width); + if (ret < 0) + return ret; + else if (ret > 0) + return BIT(chandef->width); - if (!chandef->center_freq2) - return 0; + if (!chandef->center_freq2) + return 0; - return cfg80211_get_chans_dfs_required(wiphy, chandef->center_freq2, - width); + ret = cfg80211_get_chans_dfs_required(wiphy, + chandef->center_freq2, + width); + if (ret < 0) + return ret; + else if (ret > 0) + return BIT(chandef->width); + + break; + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + case NL80211_IFTYPE_MONITOR: + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_WDS: + case NL80211_IFTYPE_P2P_DEVICE: + break; + case NL80211_IFTYPE_UNSPECIFIED: + case NUM_NL80211_IFTYPES: + WARN_ON(1); + } + + return 0; } EXPORT_SYMBOL(cfg80211_chandef_dfs_required); @@ -490,6 +519,62 @@ static bool cfg80211_chandef_dfs_available(struct wiphy *wiphy, return r; } +static unsigned int cfg80211_get_chans_dfs_cac_time(struct wiphy *wiphy, + u32 center_freq, + u32 bandwidth) +{ + struct ieee80211_channel *c; + u32 start_freq, end_freq, freq; + unsigned int dfs_cac_ms = 0; + + start_freq = cfg80211_get_start_freq(center_freq, bandwidth); + end_freq = cfg80211_get_end_freq(center_freq, bandwidth); 
+ + for (freq = start_freq; freq <= end_freq; freq += 20) { + c = ieee80211_get_channel(wiphy, freq); + if (!c) + return 0; + + if (c->flags & IEEE80211_CHAN_DISABLED) + return 0; + + if (!(c->flags & IEEE80211_CHAN_RADAR)) + continue; + + if (c->dfs_cac_ms > dfs_cac_ms) + dfs_cac_ms = c->dfs_cac_ms; + } + + return dfs_cac_ms; +} + +unsigned int +cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef) +{ + int width; + unsigned int t1 = 0, t2 = 0; + + if (WARN_ON(!cfg80211_chandef_valid(chandef))) + return 0; + + width = cfg80211_chandef_get_width(chandef); + if (width < 0) + return 0; + + t1 = cfg80211_get_chans_dfs_cac_time(wiphy, + chandef->center_freq1, + width); + + if (!chandef->center_freq2) + return t1; + + t2 = cfg80211_get_chans_dfs_cac_time(wiphy, + chandef->center_freq2, + width); + + return max(t1, t2); +} static bool cfg80211_secondary_chans_ok(struct wiphy *wiphy, u32 center_freq, u32 bandwidth, @@ -531,12 +616,14 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, width = 5; break; case NL80211_CHAN_WIDTH_10: + prohibited_flags |= IEEE80211_CHAN_NO_10MHZ; width = 10; break; case NL80211_CHAN_WIDTH_20: if (!ht_cap->ht_supported) return false; case NL80211_CHAN_WIDTH_20_NOHT: + prohibited_flags |= IEEE80211_CHAN_NO_20MHZ; width = 20; break; case NL80211_CHAN_WIDTH_40: @@ -605,17 +692,111 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, } EXPORT_SYMBOL(cfg80211_chandef_usable); +/* + * For GO only, check if the channel can be used under permissive conditions + * mandated by some regulatory bodies, i.e., the channel is marked with + * IEEE80211_CHAN_GO_CONCURRENT and there is an additional station interface + * associated to an AP on the same channel or on the same UNII band + * (assuming that the AP is an authorized master). + * In addition, allow the GO to operate on a channel on which indoor operation is + * allowed, iff we are currently operating in an indoor environment. + */ +static bool cfg80211_go_permissive_chan(struct cfg80211_registered_device *rdev, + struct ieee80211_channel *chan) + { + struct wireless_dev *wdev_iter; + struct wiphy *wiphy = wiphy_idx_to_wiphy(rdev->wiphy_idx); + + ASSERT_RTNL(); + + if (!config_enabled(CONFIG_CFG80211_REG_RELAX_NO_IR) || + !(wiphy->regulatory_flags & REGULATORY_ENABLE_RELAX_NO_IR)) + return false; + + if (regulatory_indoor_allowed() && + (chan->flags & IEEE80211_CHAN_INDOOR_ONLY)) + return true; + + if (!(chan->flags & IEEE80211_CHAN_GO_CONCURRENT)) + return false; + + /* + * Generally, it is possible to rely on another device/driver to allow + * the GO concurrent relaxation; however, since the device can further + * enforce the relaxation (by doing verifications similar to this one), + * and thus fail the GO instantiation, consider only the interfaces of + * the current registered device.
+ */ + list_for_each_entry(wdev_iter, &rdev->wdev_list, list) { + struct ieee80211_channel *other_chan = NULL; + int r1, r2; + + if (wdev_iter->iftype != NL80211_IFTYPE_STATION || + !netif_running(wdev_iter->netdev)) + continue; + + wdev_lock(wdev_iter); + if (wdev_iter->current_bss) + other_chan = wdev_iter->current_bss->pub.channel; + wdev_unlock(wdev_iter); + + if (!other_chan) + continue; + + if (chan == other_chan) + return true; + + if (chan->band != IEEE80211_BAND_5GHZ) + continue; + + r1 = cfg80211_get_unii(chan->center_freq); + r2 = cfg80211_get_unii(other_chan->center_freq); + + if (r1 != -EINVAL && r1 == r2) { + /* + * At some locations channels 149-165 are considered a + * bundle, but at other locations, e.g., Indonesia, + * channels 149-161 are considered a bundle while + * channel 165 is left out and considered to be in a + * different bundle. Thus, in case there is a + * station interface connected to an AP on channel 165, + * it is assumed that channels 149-161 are allowed for + * GO operations. However, having a station interface + * connected to an AP on channels 149-161 does not + * allow GO operation on channel 165. + */ + if (chan->center_freq == 5825 && + other_chan->center_freq != 5825) + continue; + return true; + } + } + + return false; +} + bool cfg80211_reg_can_beacon(struct wiphy *wiphy, - struct cfg80211_chan_def *chandef) + struct cfg80211_chan_def *chandef, + enum nl80211_iftype iftype) { + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); bool res; u32 prohibited_flags = IEEE80211_CHAN_DISABLED | - IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR; - trace_cfg80211_reg_can_beacon(wiphy, chandef); + trace_cfg80211_reg_can_beacon(wiphy, chandef, iftype); + + /* + * Under certain conditions suggested by some regulatory bodies + * a GO can operate on channels marked with IEEE80211_NO_IR, + * so set this flag only if such relaxations are not enabled and + * the conditions are not met. + */ + if (iftype != NL80211_IFTYPE_P2P_GO || + !cfg80211_go_permissive_chan(rdev, chandef->chan)) + prohibited_flags |= IEEE80211_CHAN_NO_IR; - if (cfg80211_chandef_dfs_required(wiphy, chandef) > 0 && + if (cfg80211_chandef_dfs_required(wiphy, chandef, iftype) > 0 && cfg80211_chandef_dfs_available(wiphy, chandef)) { /* We can skip IEEE80211_CHAN_NO_IR if chandef dfs available */ prohibited_flags = IEEE80211_CHAN_DISABLED; @@ -642,8 +823,11 @@ int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev, void cfg80211_get_chan_state(struct wireless_dev *wdev, struct ieee80211_channel **chan, - enum cfg80211_chan_mode *chanmode) + enum cfg80211_chan_mode *chanmode, + u8 *radar_detect) { + int ret; + *chan = NULL; *chanmode = CHAN_MODE_UNDEFINED; @@ -660,6 +844,11 @@ cfg80211_get_chan_state(struct wireless_dev *wdev, !wdev->ibss_dfs_possible) ?
CHAN_MODE_SHARED : CHAN_MODE_EXCLUSIVE; + + /* consider worst-case - IBSS can try to return to the + * original user-specified channel as creator */ + if (wdev->ibss_dfs_possible) + *radar_detect |= BIT(wdev->chandef.width); return; } break; @@ -674,33 +863,42 @@ cfg80211_get_chan_state(struct wireless_dev *wdev, case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: if (wdev->cac_started) { - *chan = wdev->channel; + *chan = wdev->chandef.chan; *chanmode = CHAN_MODE_SHARED; + *radar_detect |= BIT(wdev->chandef.width); } else if (wdev->beacon_interval) { - *chan = wdev->channel; + *chan = wdev->chandef.chan; *chanmode = CHAN_MODE_SHARED; + + ret = cfg80211_chandef_dfs_required(wdev->wiphy, + &wdev->chandef, + wdev->iftype); + WARN_ON(ret < 0); + if (ret > 0) + *radar_detect |= BIT(wdev->chandef.width); } return; case NL80211_IFTYPE_MESH_POINT: if (wdev->mesh_id_len) { - *chan = wdev->channel; + *chan = wdev->chandef.chan; *chanmode = CHAN_MODE_SHARED; + + ret = cfg80211_chandef_dfs_required(wdev->wiphy, + &wdev->chandef, + wdev->iftype); + WARN_ON(ret < 0); + if (ret > 0) + *radar_detect |= BIT(wdev->chandef.width); } return; case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_WDS: - /* these interface types don't really have a channel */ - return; case NL80211_IFTYPE_P2P_DEVICE: - if (wdev->wiphy->features & - NL80211_FEATURE_P2P_DEVICE_NEEDS_CHANNEL) - *chanmode = CHAN_MODE_EXCLUSIVE; + /* these interface types don't really have a channel */ return; case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: WARN_ON(1); } - - return; } diff --git a/net/wireless/core.c b/net/wireless/core.c index 06db6eb5258..a1c40654dd9 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -69,7 +69,7 @@ struct cfg80211_registered_device *cfg80211_rdev_by_wiphy_idx(int wiphy_idx) int get_wiphy_idx(struct wiphy *wiphy) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); return rdev->wiphy_idx; } @@ -130,7 +130,7 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, newname)) pr_err("failed to rename debugfs dir to %s!\n", newname); - nl80211_notify_dev_rename(rdev); + nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); return 0; } @@ -204,27 +204,18 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, rdev->opencount--; if (rdev->scan_req && rdev->scan_req->wdev == wdev) { - /* - * If the scan request wasn't notified as done, set it - * to aborted and leak it after a warning. The driver - * should have notified us that it ended at the latest - * during rdev_stop_p2p_device(). 
- */ if (WARN_ON(!rdev->scan_req->notified)) rdev->scan_req->aborted = true; - ___cfg80211_scan_done(rdev, !rdev->scan_req->notified); + ___cfg80211_scan_done(rdev, false); } } -static int cfg80211_rfkill_set_block(void *data, bool blocked) +void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy) { - struct cfg80211_registered_device *rdev = data; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct wireless_dev *wdev; - if (!blocked) - return 0; - - rtnl_lock(); + ASSERT_RTNL(); list_for_each_entry(wdev, &rdev->wdev_list, list) { if (wdev->netdev) { @@ -240,7 +231,18 @@ static int cfg80211_rfkill_set_block(void *data, bool blocked) break; } } +} +EXPORT_SYMBOL_GPL(cfg80211_shutdown_all_interfaces); + +static int cfg80211_rfkill_set_block(void *data, bool blocked) +{ + struct cfg80211_registered_device *rdev = data; + + if (!blocked) + return 0; + rtnl_lock(); + cfg80211_shutdown_all_interfaces(&rdev->wiphy); rtnl_unlock(); return 0; @@ -266,6 +268,45 @@ static void cfg80211_event_work(struct work_struct *work) rtnl_unlock(); } +void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev) +{ + struct cfg80211_iface_destroy *item; + + ASSERT_RTNL(); + + spin_lock_irq(&rdev->destroy_list_lock); + while ((item = list_first_entry_or_null(&rdev->destroy_list, + struct cfg80211_iface_destroy, + list))) { + struct wireless_dev *wdev, *tmp; + u32 nlportid = item->nlportid; + + list_del(&item->list); + kfree(item); + spin_unlock_irq(&rdev->destroy_list_lock); + + list_for_each_entry_safe(wdev, tmp, &rdev->wdev_list, list) { + if (nlportid == wdev->owner_nlportid) + rdev_del_virtual_intf(rdev, wdev); + } + + spin_lock_irq(&rdev->destroy_list_lock); + } + spin_unlock_irq(&rdev->destroy_list_lock); +} + +static void cfg80211_destroy_iface_wk(struct work_struct *work) +{ + struct cfg80211_registered_device *rdev; + + rdev = container_of(work, struct cfg80211_registered_device, + destroy_work); + + rtnl_lock(); + cfg80211_destroy_ifaces(rdev); + rtnl_unlock(); +} + /* exported functions */ struct wiphy *wiphy_new(const struct cfg80211_ops *ops, int sizeof_priv) @@ -324,6 +365,10 @@ struct wiphy *wiphy_new(const struct cfg80211_ops *ops, int sizeof_priv) rdev->wiphy.dev.class = &ieee80211_class; rdev->wiphy.dev.platform_data = rdev; + INIT_LIST_HEAD(&rdev->destroy_list); + spin_lock_init(&rdev->destroy_list_lock); + INIT_WORK(&rdev->destroy_work, cfg80211_destroy_iface_wk); + #ifdef CONFIG_CFG80211_DEFAULT_PS rdev->wiphy.flags |= WIPHY_FLAG_PS_ON_BY_DEFAULT; #endif @@ -357,6 +402,8 @@ struct wiphy *wiphy_new(const struct cfg80211_ops *ops, int sizeof_priv) rdev->wiphy.rts_threshold = (u32) -1; rdev->wiphy.coverage_class = 0; + rdev->wiphy.max_num_csa_counters = 1; + return &rdev->wiphy; } EXPORT_SYMBOL(wiphy_new); @@ -402,10 +449,7 @@ static int wiphy_verify_combinations(struct wiphy *wiphy) for (j = 0; j < c->n_limits; j++) { u16 types = c->limits[j].types; - /* - * interface types shouldn't overlap, this is - * used in cfg80211_can_change_interface() - */ + /* interface types shouldn't overlap */ if (WARN_ON(types & all_iftypes)) return -EINVAL; all_iftypes |= types; @@ -441,7 +485,7 @@ static int wiphy_verify_combinations(struct wiphy *wiphy) int wiphy_register(struct wiphy *wiphy) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); int res; enum ieee80211_band band; struct ieee80211_supported_band *sband; @@ -449,9 +493,6 @@ int wiphy_register(struct wiphy *wiphy) int i; u16 ifmodes = 
wiphy->interface_modes; - /* support for 5/10 MHz is broken due to nl80211 API mess - disable */ - wiphy->flags &= ~WIPHY_FLAG_SUPPORTS_5_10_MHZ; - /* * There are major locking problems in nl80211/mac80211 for CSA, * disable for all drivers until this has been reworked. @@ -619,13 +660,15 @@ int wiphy_register(struct wiphy *wiphy) return res; } + nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); + return 0; } EXPORT_SYMBOL(wiphy_register); void wiphy_rfkill_start_polling(struct wiphy *wiphy) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); if (!rdev->ops->rfkill_poll) return; @@ -636,7 +679,7 @@ EXPORT_SYMBOL(wiphy_rfkill_start_polling); void wiphy_rfkill_stop_polling(struct wiphy *wiphy) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); rfkill_pause_polling(rdev->rfkill); } @@ -644,7 +687,7 @@ EXPORT_SYMBOL(wiphy_rfkill_stop_polling); void wiphy_unregister(struct wiphy *wiphy) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); wait_event(rdev->dev_wait, ({ int __count; @@ -657,9 +700,10 @@ void wiphy_unregister(struct wiphy *wiphy) rfkill_unregister(rdev->rfkill); rtnl_lock(); + nl80211_notify_wiphy(rdev, NL80211_CMD_DEL_WIPHY); rdev->wiphy.registered = false; - BUG_ON(!list_empty(&rdev->wdev_list)); + WARN_ON(!list_empty(&rdev->wdev_list)); /* * First remove the hardware from everywhere, this makes @@ -684,6 +728,7 @@ void wiphy_unregister(struct wiphy *wiphy) cancel_work_sync(&rdev->conn_work); flush_work(&rdev->event_work); cancel_delayed_work_sync(&rdev->dfs_update_channels_wk); + flush_work(&rdev->destroy_work); #ifdef CONFIG_PM if (rdev->wiphy.wowlan_config && rdev->ops->set_wakeup) @@ -716,7 +761,7 @@ EXPORT_SYMBOL(wiphy_free); void wiphy_rfkill_set_hw_state(struct wiphy *wiphy, bool blocked) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); if (rfkill_set_hw_state(rdev->rfkill, blocked)) schedule_work(&rdev->rfkill_sync); @@ -725,7 +770,7 @@ EXPORT_SYMBOL(wiphy_rfkill_set_hw_state); void cfg80211_unregister_wdev(struct wireless_dev *wdev) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); ASSERT_RTNL(); @@ -746,7 +791,7 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev) } EXPORT_SYMBOL(cfg80211_unregister_wdev); -static struct device_type wiphy_type = { +static const struct device_type wiphy_type = { .name = "wlan", }; @@ -760,20 +805,23 @@ void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev, rdev->num_running_monitor_ifaces += num; } -void cfg80211_leave(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev) +void __cfg80211_leave(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev) { struct net_device *dev = wdev->netdev; + ASSERT_RTNL(); + ASSERT_WDEV_LOCK(wdev); + switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: - cfg80211_leave_ibss(rdev, dev, true); + __cfg80211_leave_ibss(rdev, dev, true); break; case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_STATION: - __cfg80211_stop_sched_scan(rdev, false); + if (rdev->sched_scan_req && dev == rdev->sched_scan_req->dev) + __cfg80211_stop_sched_scan(rdev, false); - wdev_lock(wdev); #ifdef CONFIG_CFG80211_WEXT kfree(wdev->wext.ie); wdev->wext.ie = NULL; 
@@ -782,21 +830,48 @@ void cfg80211_leave(struct cfg80211_registered_device *rdev, #endif cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, true); - wdev_unlock(wdev); break; case NL80211_IFTYPE_MESH_POINT: - cfg80211_leave_mesh(rdev, dev); + __cfg80211_leave_mesh(rdev, dev); break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: - cfg80211_stop_ap(rdev, dev); + __cfg80211_stop_ap(rdev, dev, true); break; default: break; } +} + +void cfg80211_leave(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev) +{ + wdev_lock(wdev); + __cfg80211_leave(rdev, wdev); + wdev_unlock(wdev); +} - wdev->beacon_interval = 0; +void cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev, + gfp_t gfp) +{ + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct cfg80211_event *ev; + unsigned long flags; + + trace_cfg80211_stop_iface(wiphy, wdev); + + ev = kzalloc(sizeof(*ev), gfp); + if (!ev) + return; + + ev->type = EVENT_STOPPED; + + spin_lock_irqsave(&wdev->event_lock, flags); + list_add_tail(&ev->list, &wdev->event_list); + spin_unlock_irqrestore(&wdev->event_lock, flags); + queue_work(cfg80211_wq, &rdev->event_work); } +EXPORT_SYMBOL(cfg80211_stop_iface); static int cfg80211_netdev_notifier_call(struct notifier_block *nb, unsigned long state, void *ptr) @@ -804,12 +879,11 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev; - int ret; if (!wdev) return NOTIFY_DONE; - rdev = wiphy_to_dev(wdev->wiphy); + rdev = wiphy_to_rdev(wdev->wiphy); WARN_ON(wdev->iftype == NL80211_IFTYPE_UNSPECIFIED); @@ -868,7 +942,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, if (rdev->scan_req && rdev->scan_req->wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified)) rdev->scan_req->aborted = true; - ___cfg80211_scan_done(rdev, true); + ___cfg80211_scan_done(rdev, false); } if (WARN_ON(rdev->sched_scan_req && @@ -967,13 +1041,14 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, case NETDEV_PRE_UP: if (!(wdev->wiphy->interface_modes & BIT(wdev->iftype))) return notifier_from_errno(-EOPNOTSUPP); - ret = cfg80211_can_add_interface(rdev, wdev->iftype); - if (ret) - return notifier_from_errno(ret); + if (rfkill_blocked(rdev->rfkill)) + return notifier_from_errno(-ERFKILL); break; + default: + return NOTIFY_DONE; } - return NOTIFY_DONE; + return NOTIFY_OK; } static struct notifier_block cfg80211_netdev_notifier = { diff --git a/net/wireless/core.h b/net/wireless/core.h index 0a277c33bb0..7e3a3cef7df 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -62,14 +62,13 @@ struct cfg80211_registered_device { struct rb_root bss_tree; u32 bss_generation; struct cfg80211_scan_request *scan_req; /* protected by RTNL */ + struct sk_buff *scan_msg; struct cfg80211_sched_scan_request *sched_scan_req; unsigned long suspend_at; struct work_struct scan_done_wk; struct work_struct sched_scan_results_wk; -#ifdef CONFIG_NL80211_TESTMODE - struct genl_info *testmode_info; -#endif + struct genl_info *cur_cmd_info; struct work_struct conn_work; struct work_struct event_work; @@ -81,13 +80,17 @@ struct cfg80211_registered_device { struct cfg80211_coalesce *coalesce; + spinlock_t destroy_list_lock; + struct list_head destroy_list; + struct work_struct destroy_work; + /* must be last because of the way we do wiphy_priv(), * and it should at least be aligned to NETDEV_ALIGN */ struct 
wiphy wiphy __aligned(NETDEV_ALIGN); }; static inline -struct cfg80211_registered_device *wiphy_to_dev(struct wiphy *wiphy) +struct cfg80211_registered_device *wiphy_to_rdev(struct wiphy *wiphy) { BUG_ON(!wiphy); return container_of(wiphy, struct cfg80211_registered_device, wiphy); @@ -167,7 +170,6 @@ static inline void wdev_unlock(struct wireless_dev *wdev) mutex_unlock(&wdev->mtx); } -#define ASSERT_RDEV_LOCK(rdev) ASSERT_RTNL() #define ASSERT_WDEV_LOCK(wdev) lockdep_assert_held(&(wdev)->mtx) static inline bool cfg80211_has_monitors_only(struct cfg80211_registered_device *rdev) @@ -183,6 +185,7 @@ enum cfg80211_event_type { EVENT_ROAMED, EVENT_DISCONNECTED, EVENT_IBSS_JOINED, + EVENT_STOPPED, }; struct cfg80211_event { @@ -212,6 +215,7 @@ struct cfg80211_event { } dc; struct { u8 bssid[ETH_ALEN]; + struct ieee80211_channel *channel; } ij; }; }; @@ -233,6 +237,13 @@ struct cfg80211_beacon_registration { u32 nlportid; }; +struct cfg80211_iface_destroy { + struct list_head list; + u32 nlportid; +}; + +void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev); + /* free object */ void cfg80211_dev_free(struct cfg80211_registered_device *rdev); @@ -241,15 +252,11 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, void ieee80211_set_bitrate_flags(struct wiphy *wiphy); -void cfg80211_bss_expire(struct cfg80211_registered_device *dev); -void cfg80211_bss_age(struct cfg80211_registered_device *dev, +void cfg80211_bss_expire(struct cfg80211_registered_device *rdev); +void cfg80211_bss_age(struct cfg80211_registered_device *rdev, unsigned long age_secs); /* IBSS */ -int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct cfg80211_ibss_params *params, - struct cfg80211_cached_keys *connkeys); int cfg80211_join_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ibss_params *params, @@ -259,7 +266,8 @@ int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, bool nowext); int cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, bool nowext); -void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid); +void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, + struct ieee80211_channel *channel); int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); @@ -274,6 +282,8 @@ int cfg80211_join_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev, struct mesh_setup *setup, const struct mesh_config *conf); +int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, + struct net_device *dev); int cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev); int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev, @@ -281,8 +291,10 @@ int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev, struct cfg80211_chan_def *chandef); /* AP */ +int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev, + struct net_device *dev, bool notify); int cfg80211_stop_ap(struct cfg80211_registered_device *rdev, - struct net_device *dev); + struct net_device *dev, bool notify); /* MLME */ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, @@ -363,7 +375,8 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, struct key_params *params, int key_idx, bool pairwise, const u8 *mac_addr); void __cfg80211_scan_done(struct work_struct *wk); -void ___cfg80211_scan_done(struct 
cfg80211_registered_device *rdev, bool leak); +void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, + bool send_message); void __cfg80211_sched_scan_results(struct work_struct *wk); int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev, bool driver_initiated); @@ -400,35 +413,9 @@ void cfg80211_set_dfs_state(struct wiphy *wiphy, void cfg80211_dfs_channels_update_work(struct work_struct *work); - -static inline int -cfg80211_can_change_interface(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, - enum nl80211_iftype iftype) -{ - return cfg80211_can_use_iftype_chan(rdev, wdev, iftype, NULL, - CHAN_MODE_UNDEFINED, 0); -} - -static inline int -cfg80211_can_add_interface(struct cfg80211_registered_device *rdev, - enum nl80211_iftype iftype) -{ - if (rfkill_blocked(rdev->rfkill)) - return -ERFKILL; - - return cfg80211_can_change_interface(rdev, NULL, iftype); -} - -static inline int -cfg80211_can_use_chan(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, - struct ieee80211_channel *chan, - enum cfg80211_chan_mode chanmode) -{ - return cfg80211_can_use_iftype_chan(rdev, wdev, wdev->iftype, - chan, chanmode, 0); -} +unsigned int +cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef); static inline unsigned int elapsed_jiffies_msecs(unsigned long start) { @@ -437,13 +424,14 @@ static inline unsigned int elapsed_jiffies_msecs(unsigned long start) if (end >= start) return jiffies_to_msecs(end - start); - return jiffies_to_msecs(end + (MAX_JIFFY_OFFSET - start) + 1); + return jiffies_to_msecs(end + (ULONG_MAX - start) + 1); } void cfg80211_get_chan_state(struct wireless_dev *wdev, struct ieee80211_channel **chan, - enum cfg80211_chan_mode *chanmode); + enum cfg80211_chan_mode *chanmode, + u8 *radar_detect); int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev, struct cfg80211_chan_def *chandef); @@ -458,6 +446,8 @@ int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev, void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype, int num); +void __cfg80211_leave(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev); void cfg80211_leave(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); diff --git a/net/wireless/ethtool.c b/net/wireless/ethtool.c index e37862f1b12..d4860bfc020 100644 --- a/net/wireless/ethtool.c +++ b/net/wireless/ethtool.c @@ -43,7 +43,7 @@ static void cfg80211_get_ringparam(struct net_device *dev, struct ethtool_ringparam *rp) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); memset(rp, 0, sizeof(*rp)); @@ -56,7 +56,7 @@ static int cfg80211_set_ringparam(struct net_device *dev, struct ethtool_ringparam *rp) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (rp->rx_mini_pending != 0 || rp->rx_jumbo_pending != 0) return -EINVAL; @@ -70,7 +70,7 @@ static int cfg80211_set_ringparam(struct net_device *dev, static int cfg80211_get_sset_count(struct net_device *dev, int sset) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if 
(rdev->ops->get_et_sset_count) return rdev_get_et_sset_count(rdev, dev, sset); return -EOPNOTSUPP; @@ -80,7 +80,7 @@ static void cfg80211_get_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (rdev->ops->get_et_stats) rdev_get_et_stats(rdev, dev, stats, data); } @@ -88,7 +88,7 @@ static void cfg80211_get_stats(struct net_device *dev, static void cfg80211_get_strings(struct net_device *dev, u32 sset, u8 *data) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (rdev->ops->get_et_strings) rdev_get_et_strings(rdev, dev, sset, data); } diff --git a/net/wireless/genregdb.awk b/net/wireless/genregdb.awk index 9a8217d2a90..40c37fc5b67 100644 --- a/net/wireless/genregdb.awk +++ b/net/wireless/genregdb.awk @@ -66,19 +66,15 @@ function parse_reg_rule() units = $8 sub(/\)/, "", units) sub(/,/, "", units) + dfs_cac = $9 if (units == "mW") { - if (power == 100) { - power = 20 - } else if (power == 200) { - power = 23 - } else if (power == 500) { - power = 27 - } else if (power == 1000) { - power = 30 - } else { - print "Unknown power value in database!" - } + power = 10 * log(power)/log(10) + } else { + dfs_cac = $8 } + sub(/,/, "", dfs_cac) + sub(/\(/, "", dfs_cac) + sub(/\)/, "", dfs_cac) flagstr = "" for (i=8; i<=NF; i++) flagstr = flagstr $i @@ -105,11 +101,13 @@ function parse_reg_rule() flags = flags "\n\t\t\tNL80211_RRF_NO_IR | " } else if (flagarray[arg] == "NO-IR") { flags = flags "\n\t\t\tNL80211_RRF_NO_IR | " + } else if (flagarray[arg] == "AUTO-BW") { + flags = flags "\n\t\t\tNL80211_RRF_AUTO_BW | " } } flags = flags "0" - printf "\t\tREG_RULE(%d, %d, %d, %d, %d, %s),\n", start, end, bw, gain, power, flags + printf "\t\tREG_RULE_EXT(%d, %d, %d, %d, %.0f, %d, %s),\n", start, end, bw, gain, power, dfs_cac, flags rules++ } diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index 730147ed8e6..8f345da3ea5 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -14,7 +14,8 @@ #include "rdev-ops.h" -void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid) +void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, + struct ieee80211_channel *channel) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_bss *bss; @@ -28,8 +29,7 @@ void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid) if (!wdev->ssid_len) return; - bss = cfg80211_get_bss(wdev->wiphy, NULL, bssid, - wdev->ssid, wdev->ssid_len, + bss = cfg80211_get_bss(wdev->wiphy, channel, bssid, NULL, 0, WLAN_CAPABILITY_IBSS, WLAN_CAPABILITY_IBSS); if (WARN_ON(!bss)) @@ -45,7 +45,7 @@ void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid) cfg80211_upload_connect_keys(wdev); - nl80211_send_ibss_bssid(wiphy_to_dev(wdev->wiphy), dev, bssid, + nl80211_send_ibss_bssid(wiphy_to_rdev(wdev->wiphy), dev, bssid, GFP_KERNEL); #ifdef CONFIG_CFG80211_WEXT memset(&wrqu, 0, sizeof(wrqu)); @@ -54,21 +54,26 @@ void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid) #endif } -void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, gfp_t gfp) +void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, + struct ieee80211_channel *channel, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct 
cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; - trace_cfg80211_ibss_joined(dev, bssid); + trace_cfg80211_ibss_joined(dev, bssid, channel); + + if (WARN_ON(!channel)) + return; ev = kzalloc(sizeof(*ev), gfp); if (!ev) return; ev->type = EVENT_IBSS_JOINED; - memcpy(ev->cr.bssid, bssid, ETH_ALEN); + memcpy(ev->ij.bssid, bssid, ETH_ALEN); + ev->ij.channel = channel; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); @@ -77,14 +82,12 @@ void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, gfp_t gfp) } EXPORT_SYMBOL(cfg80211_ibss_joined); -int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct cfg80211_ibss_params *params, - struct cfg80211_cached_keys *connkeys) +static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_ibss_params *params, + struct cfg80211_cached_keys *connkeys) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct ieee80211_channel *check_chan; - u8 radar_detect_width = 0; int err; ASSERT_WDEV_LOCK(wdev); @@ -117,32 +120,10 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, wdev->ibss_fixed = params->channel_fixed; wdev->ibss_dfs_possible = params->userspace_handles_dfs; + wdev->chandef = params->chandef; #ifdef CONFIG_CFG80211_WEXT wdev->wext.ibss.chandef = params->chandef; #endif - check_chan = params->chandef.chan; - if (params->userspace_handles_dfs) { - /* use channel NULL to check for radar even if the current - * channel is not a radar channel - it might decide to change - * to DFS channel later. - */ - radar_detect_width = BIT(params->chandef.width); - check_chan = NULL; - } - - err = cfg80211_can_use_iftype_chan(rdev, wdev, wdev->iftype, - check_chan, - (params->channel_fixed && - !radar_detect_width) - ? CHAN_MODE_SHARED - : CHAN_MODE_EXCLUSIVE, - radar_detect_width); - - if (err) { - wdev->connect_keys = NULL; - return err; - } - err = rdev_join_ibss(rdev, dev, params); if (err) { wdev->connect_keys = NULL; @@ -175,7 +156,7 @@ int cfg80211_join_ibss(struct cfg80211_registered_device *rdev, static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int i; ASSERT_WDEV_LOCK(wdev); @@ -183,6 +164,8 @@ static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext) kfree(wdev->connect_keys); wdev->connect_keys = NULL; + rdev_set_qos_map(rdev, dev, NULL); + /* * Delete all the keys ... pairwise keys can't really * exist any more anyway, but default keys might. 
@@ -198,6 +181,7 @@ static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext) wdev->current_bss = NULL; wdev->ssid_len = 0; + memset(&wdev->chandef, 0, sizeof(wdev->chandef)); #ifdef CONFIG_CFG80211_WEXT if (!nowext) wdev->wext.ibss.ssid_len = 0; @@ -327,7 +311,7 @@ int cfg80211_ibss_wext_siwfreq(struct net_device *dev, struct iw_freq *wextfreq, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ieee80211_channel *chan = NULL; int err, freq; @@ -338,7 +322,7 @@ int cfg80211_ibss_wext_siwfreq(struct net_device *dev, if (!rdev->ops->join_ibss) return -EOPNOTSUPP; - freq = cfg80211_wext_freq(wdev->wiphy, wextfreq); + freq = cfg80211_wext_freq(wextfreq); if (freq < 0) return freq; @@ -412,7 +396,7 @@ int cfg80211_ibss_wext_siwessid(struct net_device *dev, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); size_t len = data->length; int err; @@ -436,8 +420,8 @@ int cfg80211_ibss_wext_siwessid(struct net_device *dev, if (len > 0 && ssid[len - 1] == '\0') len--; + memcpy(wdev->ssid, ssid, len); wdev->wext.ibss.ssid = wdev->ssid; - memcpy(wdev->wext.ibss.ssid, ssid, len); wdev->wext.ibss.ssid_len = len; wdev_lock(wdev); @@ -479,7 +463,7 @@ int cfg80211_ibss_wext_siwap(struct net_device *dev, struct sockaddr *ap_addr, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u8 *bssid = ap_addr->sa_data; int err; @@ -497,6 +481,9 @@ int cfg80211_ibss_wext_siwap(struct net_device *dev, if (is_zero_ether_addr(bssid) || is_broadcast_ether_addr(bssid)) bssid = NULL; + if (bssid && !is_valid_ether_addr(bssid)) + return -EINVAL; + /* both automatic */ if (!bssid && !wdev->wext.ibss.bssid) return 0; diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c index b0e1869de7d..092300b30c3 100644 --- a/net/wireless/mesh.c +++ b/net/wireless/mesh.c @@ -174,19 +174,15 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, scan_width); } - if (!cfg80211_reg_can_beacon(&rdev->wiphy, &setup->chandef)) + if (!cfg80211_reg_can_beacon(&rdev->wiphy, &setup->chandef, + NL80211_IFTYPE_MESH_POINT)) return -EINVAL; - err = cfg80211_can_use_chan(rdev, wdev, setup->chandef.chan, - CHAN_MODE_SHARED); - if (err) - return err; - err = rdev_join_mesh(rdev, dev, conf, setup); if (!err) { memcpy(wdev->ssid, setup->mesh_id, setup->mesh_id_len); wdev->mesh_id_len = setup->mesh_id_len; - wdev->channel = setup->chandef.chan; + wdev->chandef = setup->chandef; } return err; @@ -227,15 +223,10 @@ int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev, if (!netif_running(wdev->netdev)) return -ENETDOWN; - err = cfg80211_can_use_chan(rdev, wdev, chandef->chan, - CHAN_MODE_SHARED); - if (err) - return err; - err = rdev_libertas_set_mesh_channel(rdev, wdev->netdev, chandef->chan); if (!err) - wdev->channel = chandef->chan; + wdev->chandef = *chandef; return err; } @@ -247,8 +238,8 @@ int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev, return 0; } -static int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, - struct net_device *dev) +int __cfg80211_leave_mesh(struct cfg80211_registered_device 
*rdev, + struct net_device *dev) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; @@ -267,7 +258,8 @@ static int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, err = rdev_leave_mesh(rdev, dev); if (!err) { wdev->mesh_id_len = 0; - wdev->channel = NULL; + memset(&wdev->chandef, 0, sizeof(wdev->chandef)); + rdev_set_qos_map(rdev, dev, NULL); } return err; diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 52cca05044a..266766b8d80 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -23,7 +23,7 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss, { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; u8 *ie = mgmt->u.assoc_resp.variable; int ieoffs = offsetof(struct ieee80211_mgmt, u.assoc_resp.variable); @@ -54,7 +54,7 @@ EXPORT_SYMBOL(cfg80211_rx_assoc_resp); static void cfg80211_process_auth(struct wireless_dev *wdev, const u8 *buf, size_t len) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); nl80211_send_rx_auth(rdev, wdev->netdev, buf, len, GFP_KERNEL); cfg80211_sme_rx_auth(wdev, buf, len); @@ -63,7 +63,7 @@ static void cfg80211_process_auth(struct wireless_dev *wdev, static void cfg80211_process_deauth(struct wireless_dev *wdev, const u8 *buf, size_t len) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; const u8 *bssid = mgmt->bssid; u16 reason_code = le16_to_cpu(mgmt->u.deauth.reason_code); @@ -82,7 +82,7 @@ static void cfg80211_process_deauth(struct wireless_dev *wdev, static void cfg80211_process_disassoc(struct wireless_dev *wdev, const u8 *buf, size_t len) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; const u8 *bssid = mgmt->bssid; u16 reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code); @@ -123,7 +123,7 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); trace_cfg80211_send_auth_timeout(dev, addr); @@ -136,7 +136,7 @@ void cfg80211_assoc_timeout(struct net_device *dev, struct cfg80211_bss *bss) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); trace_cfg80211_send_assoc_timeout(dev, bss->bssid); @@ -172,7 +172,7 @@ void cfg80211_michael_mic_failure(struct net_device *dev, const u8 *addr, const u8 *tsc, gfp_t gfp) { struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; char *buf = kmalloc(128, gfp); @@ -233,14 +233,8 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, if (!req.bss) return -ENOENT; - err = 
cfg80211_can_use_chan(rdev, wdev, req.bss->channel, - CHAN_MODE_SHARED); - if (err) - goto out; - err = rdev_auth(rdev, dev, &req); -out: cfg80211_put_bss(&rdev->wiphy, req.bss); return err; } @@ -306,16 +300,10 @@ int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, if (!req->bss) return -ENOENT; - err = cfg80211_can_use_chan(rdev, wdev, chan, CHAN_MODE_SHARED); - if (err) - goto out; - err = rdev_assoc(rdev, dev, req); if (!err) cfg80211_hold_bss(bss_from_pub(req->bss)); - -out: - if (err) + else cfg80211_put_bss(&rdev->wiphy, req->bss); return err; @@ -414,7 +402,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, int match_len) { struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_mgmt_registration *reg, *nreg; int err = 0; u16 mgmt_type; @@ -473,7 +461,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) { struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_mgmt_registration *reg, *tmp; spin_lock_bh(&wdev->mgmt_registrations_lock); @@ -620,7 +608,7 @@ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm, const u8 *buf, size_t len, u32 flags, gfp_t gfp) { struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_mgmt_registration *reg; const struct ieee80211_txrx_stypes *stypes = &wiphy->mgmt_stypes[wdev->iftype]; @@ -739,7 +727,7 @@ void cfg80211_radar_event(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, gfp_t gfp) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); unsigned long timeout; trace_cfg80211_radar_event(wiphy, chandef); @@ -764,7 +752,7 @@ void cfg80211_cac_event(struct net_device *netdev, { struct wireless_dev *wdev = netdev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); unsigned long timeout; trace_cfg80211_cac_event(netdev, event); @@ -772,13 +760,13 @@ void cfg80211_cac_event(struct net_device *netdev, if (WARN_ON(!wdev->cac_started)) return; - if (WARN_ON(!wdev->channel)) + if (WARN_ON(!wdev->chandef.chan)) return; switch (event) { case NL80211_RADAR_CAC_FINISHED: timeout = wdev->cac_start_time + - msecs_to_jiffies(IEEE80211_DFS_MIN_CAC_TIME_MS); + msecs_to_jiffies(wdev->cac_time_ms); WARN_ON(!time_after_eq(jiffies, timeout)); cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_AVAILABLE); break; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index a693f86e597..6668daf6932 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -53,6 +53,7 @@ enum nl80211_multicast_groups { NL80211_MCGRP_SCAN, NL80211_MCGRP_REGULATORY, NL80211_MCGRP_MLME, + NL80211_MCGRP_VENDOR, NL80211_MCGRP_TESTMODE /* keep last - ifdef! 
*/ }; @@ -61,6 +62,7 @@ static const struct genl_multicast_group nl80211_mcgrps[] = { [NL80211_MCGRP_SCAN] = { .name = "scan", }, [NL80211_MCGRP_REGULATORY] = { .name = "regulatory", }, [NL80211_MCGRP_MLME] = { .name = "mlme", }, + [NL80211_MCGRP_VENDOR] = { .name = "vendor", }, #ifdef CONFIG_NL80211_TESTMODE [NL80211_MCGRP_TESTMODE] = { .name = "testmode", } #endif @@ -163,16 +165,14 @@ __cfg80211_rdev_from_attrs(struct net *netns, struct nlattr **attrs) if (attrs[NL80211_ATTR_IFINDEX]) { int ifindex = nla_get_u32(attrs[NL80211_ATTR_IFINDEX]); - netdev = dev_get_by_index(netns, ifindex); + netdev = __dev_get_by_index(netns, ifindex); if (netdev) { if (netdev->ieee80211_ptr) - tmp = wiphy_to_dev( - netdev->ieee80211_ptr->wiphy); + tmp = wiphy_to_rdev( + netdev->ieee80211_ptr->wiphy); else tmp = NULL; - dev_put(netdev); - /* not wireless device -- return error */ if (!tmp) return ERR_PTR(-EINVAL); @@ -371,11 +371,22 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_CH_SWITCH_COUNT] = { .type = NLA_U32 }, [NL80211_ATTR_CH_SWITCH_BLOCK_TX] = { .type = NLA_FLAG }, [NL80211_ATTR_CSA_IES] = { .type = NLA_NESTED }, - [NL80211_ATTR_CSA_C_OFF_BEACON] = { .type = NLA_U16 }, - [NL80211_ATTR_CSA_C_OFF_PRESP] = { .type = NLA_U16 }, + [NL80211_ATTR_CSA_C_OFF_BEACON] = { .type = NLA_BINARY }, + [NL80211_ATTR_CSA_C_OFF_PRESP] = { .type = NLA_BINARY }, [NL80211_ATTR_STA_SUPPORTED_CHANNELS] = { .type = NLA_BINARY }, [NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES] = { .type = NLA_BINARY }, [NL80211_ATTR_HANDLE_DFS] = { .type = NLA_FLAG }, + [NL80211_ATTR_OPMODE_NOTIF] = { .type = NLA_U8 }, + [NL80211_ATTR_VENDOR_ID] = { .type = NLA_U32 }, + [NL80211_ATTR_VENDOR_SUBCMD] = { .type = NLA_U32 }, + [NL80211_ATTR_VENDOR_DATA] = { .type = NLA_BINARY }, + [NL80211_ATTR_QOS_MAP] = { .type = NLA_BINARY, + .len = IEEE80211_QOS_MAP_LEN_MAX }, + [NL80211_ATTR_MAC_HINT] = { .len = ETH_ALEN }, + [NL80211_ATTR_WIPHY_FREQ_HINT] = { .type = NLA_U32 }, + [NL80211_ATTR_TDLS_PEER_CAPABILITY] = { .type = NLA_U32 }, + [NL80211_ATTR_IFACE_SOCKET_OWNER] = { .type = NLA_FLAG }, + [NL80211_ATTR_CSA_C_OFFSETS_TX] = { .type = NLA_BINARY }, }; /* policy for the key attributes */ @@ -475,7 +486,7 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb, err = PTR_ERR(*wdev); goto out_unlock; } - *rdev = wiphy_to_dev((*wdev)->wiphy); + *rdev = wiphy_to_rdev((*wdev)->wiphy); /* 0 is the first index - add 1 to parse only once */ cb->args[0] = (*rdev)->wiphy_idx + 1; cb->args[1] = (*wdev)->identifier; @@ -488,7 +499,7 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb, err = -ENODEV; goto out_unlock; } - *rdev = wiphy_to_dev(wiphy); + *rdev = wiphy_to_rdev(wiphy); *wdev = NULL; list_for_each_entry(tmp, &(*rdev)->wdev_list, list) { @@ -557,6 +568,13 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct ieee80211_channel *chan, bool large) { + /* Some channels must be completely excluded from the + * list to protect old user-space tools from breaking + */ + if (!large && chan->flags & + (IEEE80211_CHAN_NO_10MHZ | IEEE80211_CHAN_NO_20MHZ)) + return 0; + if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_FREQ, chan->center_freq)) goto nla_put_failure; @@ -584,6 +602,10 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_DFS_TIME, time)) goto nla_put_failure; + if (nla_put_u32(msg, + NL80211_FREQUENCY_ATTR_DFS_CAC_TIME, + chan->dfs_cac_ms)) + goto nla_put_failure; } } @@ -600,6 +622,18 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, if 
((chan->flags & IEEE80211_CHAN_NO_160MHZ) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_160MHZ)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_INDOOR_ONLY) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_INDOOR_ONLY)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_GO_CONCURRENT) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_GO_CONCURRENT)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_20MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_20MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_10MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_10MHZ)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, @@ -849,6 +883,19 @@ static int nl80211_key_allowed(struct wireless_dev *wdev) return 0; } +static struct ieee80211_channel *nl80211_get_valid_chan(struct wiphy *wiphy, + struct nlattr *tb) +{ + struct ieee80211_channel *chan; + + if (tb == NULL) + return NULL; + chan = ieee80211_get_channel(wiphy, nla_get_u32(tb)); + if (!chan || chan->flags & IEEE80211_CHAN_DISABLED) + return NULL; + return chan; +} + static int nl80211_put_iftypes(struct sk_buff *msg, u32 attr, u16 ifmodes) { struct nlattr *nl_modes = nla_nest_start(msg, attr); @@ -924,8 +971,10 @@ static int nl80211_put_iface_combinations(struct wiphy *wiphy, c->max_interfaces)) goto nla_put_failure; if (large && - nla_put_u32(msg, NL80211_IFACE_COMB_RADAR_DETECT_WIDTHS, - c->radar_detect_widths)) + (nla_put_u32(msg, NL80211_IFACE_COMB_RADAR_DETECT_WIDTHS, + c->radar_detect_widths) || + nla_put_u32(msg, NL80211_IFACE_COMB_RADAR_DETECT_REGIONS, + c->radar_detect_regions))) goto nla_put_failure; nla_nest_end(msg, nl_combi); @@ -980,42 +1029,42 @@ static int nl80211_send_wowlan_tcp_caps(struct cfg80211_registered_device *rdev, } static int nl80211_send_wowlan(struct sk_buff *msg, - struct cfg80211_registered_device *dev, + struct cfg80211_registered_device *rdev, bool large) { struct nlattr *nl_wowlan; - if (!dev->wiphy.wowlan) + if (!rdev->wiphy.wowlan) return 0; nl_wowlan = nla_nest_start(msg, NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED); if (!nl_wowlan) return -ENOBUFS; - if (((dev->wiphy.wowlan->flags & WIPHY_WOWLAN_ANY) && + if (((rdev->wiphy.wowlan->flags & WIPHY_WOWLAN_ANY) && nla_put_flag(msg, NL80211_WOWLAN_TRIG_ANY)) || - ((dev->wiphy.wowlan->flags & WIPHY_WOWLAN_DISCONNECT) && + ((rdev->wiphy.wowlan->flags & WIPHY_WOWLAN_DISCONNECT) && nla_put_flag(msg, NL80211_WOWLAN_TRIG_DISCONNECT)) || - ((dev->wiphy.wowlan->flags & WIPHY_WOWLAN_MAGIC_PKT) && + ((rdev->wiphy.wowlan->flags & WIPHY_WOWLAN_MAGIC_PKT) && nla_put_flag(msg, NL80211_WOWLAN_TRIG_MAGIC_PKT)) || - ((dev->wiphy.wowlan->flags & WIPHY_WOWLAN_SUPPORTS_GTK_REKEY) && + ((rdev->wiphy.wowlan->flags & WIPHY_WOWLAN_SUPPORTS_GTK_REKEY) && nla_put_flag(msg, NL80211_WOWLAN_TRIG_GTK_REKEY_SUPPORTED)) || - ((dev->wiphy.wowlan->flags & WIPHY_WOWLAN_GTK_REKEY_FAILURE) && + ((rdev->wiphy.wowlan->flags & WIPHY_WOWLAN_GTK_REKEY_FAILURE) && nla_put_flag(msg, NL80211_WOWLAN_TRIG_GTK_REKEY_FAILURE)) || - ((dev->wiphy.wowlan->flags & WIPHY_WOWLAN_EAP_IDENTITY_REQ) && + ((rdev->wiphy.wowlan->flags & WIPHY_WOWLAN_EAP_IDENTITY_REQ) && nla_put_flag(msg, NL80211_WOWLAN_TRIG_EAP_IDENT_REQUEST)) || - ((dev->wiphy.wowlan->flags & WIPHY_WOWLAN_4WAY_HANDSHAKE) && + ((rdev->wiphy.wowlan->flags & WIPHY_WOWLAN_4WAY_HANDSHAKE) && nla_put_flag(msg, NL80211_WOWLAN_TRIG_4WAY_HANDSHAKE)) || - ((dev->wiphy.wowlan->flags & WIPHY_WOWLAN_RFKILL_RELEASE) && + ((rdev->wiphy.wowlan->flags & WIPHY_WOWLAN_RFKILL_RELEASE) && 
nla_put_flag(msg, NL80211_WOWLAN_TRIG_RFKILL_RELEASE))) return -ENOBUFS; - if (dev->wiphy.wowlan->n_patterns) { + if (rdev->wiphy.wowlan->n_patterns) { struct nl80211_pattern_support pat = { - .max_patterns = dev->wiphy.wowlan->n_patterns, - .min_pattern_len = dev->wiphy.wowlan->pattern_min_len, - .max_pattern_len = dev->wiphy.wowlan->pattern_max_len, - .max_pkt_offset = dev->wiphy.wowlan->max_pkt_offset, + .max_patterns = rdev->wiphy.wowlan->n_patterns, + .min_pattern_len = rdev->wiphy.wowlan->pattern_min_len, + .max_pattern_len = rdev->wiphy.wowlan->pattern_max_len, + .max_pkt_offset = rdev->wiphy.wowlan->max_pkt_offset, }; if (nla_put(msg, NL80211_WOWLAN_TRIG_PKT_PATTERN, @@ -1023,7 +1072,7 @@ static int nl80211_send_wowlan(struct sk_buff *msg, return -ENOBUFS; } - if (large && nl80211_send_wowlan_tcp_caps(dev, msg)) + if (large && nl80211_send_wowlan_tcp_caps(rdev, msg)) return -ENOBUFS; nla_nest_end(msg, nl_wowlan); @@ -1033,19 +1082,19 @@ static int nl80211_send_wowlan(struct sk_buff *msg, #endif static int nl80211_send_coalesce(struct sk_buff *msg, - struct cfg80211_registered_device *dev) + struct cfg80211_registered_device *rdev) { struct nl80211_coalesce_rule_support rule; - if (!dev->wiphy.coalesce) + if (!rdev->wiphy.coalesce) return 0; - rule.max_rules = dev->wiphy.coalesce->n_rules; - rule.max_delay = dev->wiphy.coalesce->max_delay; - rule.pat.max_patterns = dev->wiphy.coalesce->n_patterns; - rule.pat.min_pattern_len = dev->wiphy.coalesce->pattern_min_len; - rule.pat.max_pattern_len = dev->wiphy.coalesce->pattern_max_len; - rule.pat.max_pkt_offset = dev->wiphy.coalesce->max_pkt_offset; + rule.max_rules = rdev->wiphy.coalesce->n_rules; + rule.max_delay = rdev->wiphy.coalesce->max_delay; + rule.pat.max_patterns = rdev->wiphy.coalesce->n_patterns; + rule.pat.min_pattern_len = rdev->wiphy.coalesce->pattern_min_len; + rule.pat.max_pattern_len = rdev->wiphy.coalesce->pattern_max_len; + rule.pat.max_pkt_offset = rdev->wiphy.coalesce->max_pkt_offset; if (nla_put(msg, NL80211_ATTR_COALESCE_RULE, sizeof(rule), &rule)) return -ENOBUFS; @@ -1176,7 +1225,8 @@ struct nl80211_dump_wiphy_state { bool split; }; -static int nl80211_send_wiphy(struct cfg80211_registered_device *dev, +static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, + enum nl80211_commands cmd, struct sk_buff *msg, u32 portid, u32 seq, int flags, struct nl80211_dump_wiphy_state *state) { @@ -1188,63 +1238,66 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev, struct ieee80211_channel *chan; int i; const struct ieee80211_txrx_stypes *mgmt_stypes = - dev->wiphy.mgmt_stypes; + rdev->wiphy.mgmt_stypes; u32 features; - hdr = nl80211hdr_put(msg, portid, seq, flags, NL80211_CMD_NEW_WIPHY); + hdr = nl80211hdr_put(msg, portid, seq, flags, cmd); if (!hdr) return -ENOBUFS; if (WARN_ON(!state)) return -EINVAL; - if (nla_put_u32(msg, NL80211_ATTR_WIPHY, dev->wiphy_idx) || + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_string(msg, NL80211_ATTR_WIPHY_NAME, - wiphy_name(&dev->wiphy)) || + wiphy_name(&rdev->wiphy)) || nla_put_u32(msg, NL80211_ATTR_GENERATION, cfg80211_rdev_list_generation)) goto nla_put_failure; + if (cmd != NL80211_CMD_NEW_WIPHY) + goto finish; + switch (state->split_start) { case 0: if (nla_put_u8(msg, NL80211_ATTR_WIPHY_RETRY_SHORT, - dev->wiphy.retry_short) || + rdev->wiphy.retry_short) || nla_put_u8(msg, NL80211_ATTR_WIPHY_RETRY_LONG, - dev->wiphy.retry_long) || + rdev->wiphy.retry_long) || nla_put_u32(msg, NL80211_ATTR_WIPHY_FRAG_THRESHOLD, - 
dev->wiphy.frag_threshold) || + rdev->wiphy.frag_threshold) || nla_put_u32(msg, NL80211_ATTR_WIPHY_RTS_THRESHOLD, - dev->wiphy.rts_threshold) || + rdev->wiphy.rts_threshold) || nla_put_u8(msg, NL80211_ATTR_WIPHY_COVERAGE_CLASS, - dev->wiphy.coverage_class) || + rdev->wiphy.coverage_class) || nla_put_u8(msg, NL80211_ATTR_MAX_NUM_SCAN_SSIDS, - dev->wiphy.max_scan_ssids) || + rdev->wiphy.max_scan_ssids) || nla_put_u8(msg, NL80211_ATTR_MAX_NUM_SCHED_SCAN_SSIDS, - dev->wiphy.max_sched_scan_ssids) || + rdev->wiphy.max_sched_scan_ssids) || nla_put_u16(msg, NL80211_ATTR_MAX_SCAN_IE_LEN, - dev->wiphy.max_scan_ie_len) || + rdev->wiphy.max_scan_ie_len) || nla_put_u16(msg, NL80211_ATTR_MAX_SCHED_SCAN_IE_LEN, - dev->wiphy.max_sched_scan_ie_len) || + rdev->wiphy.max_sched_scan_ie_len) || nla_put_u8(msg, NL80211_ATTR_MAX_MATCH_SETS, - dev->wiphy.max_match_sets)) + rdev->wiphy.max_match_sets)) goto nla_put_failure; - if ((dev->wiphy.flags & WIPHY_FLAG_IBSS_RSN) && + if ((rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN) && nla_put_flag(msg, NL80211_ATTR_SUPPORT_IBSS_RSN)) goto nla_put_failure; - if ((dev->wiphy.flags & WIPHY_FLAG_MESH_AUTH) && + if ((rdev->wiphy.flags & WIPHY_FLAG_MESH_AUTH) && nla_put_flag(msg, NL80211_ATTR_SUPPORT_MESH_AUTH)) goto nla_put_failure; - if ((dev->wiphy.flags & WIPHY_FLAG_AP_UAPSD) && + if ((rdev->wiphy.flags & WIPHY_FLAG_AP_UAPSD) && nla_put_flag(msg, NL80211_ATTR_SUPPORT_AP_UAPSD)) goto nla_put_failure; - if ((dev->wiphy.flags & WIPHY_FLAG_SUPPORTS_FW_ROAM) && + if ((rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_FW_ROAM) && nla_put_flag(msg, NL80211_ATTR_ROAM_SUPPORT)) goto nla_put_failure; - if ((dev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) && + if ((rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) && nla_put_flag(msg, NL80211_ATTR_TDLS_SUPPORT)) goto nla_put_failure; - if ((dev->wiphy.flags & WIPHY_FLAG_TDLS_EXTERNAL_SETUP) && + if ((rdev->wiphy.flags & WIPHY_FLAG_TDLS_EXTERNAL_SETUP) && nla_put_flag(msg, NL80211_ATTR_TDLS_EXTERNAL_SETUP)) goto nla_put_failure; state->split_start++; @@ -1252,35 +1305,35 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev, break; case 1: if (nla_put(msg, NL80211_ATTR_CIPHER_SUITES, - sizeof(u32) * dev->wiphy.n_cipher_suites, - dev->wiphy.cipher_suites)) + sizeof(u32) * rdev->wiphy.n_cipher_suites, + rdev->wiphy.cipher_suites)) goto nla_put_failure; if (nla_put_u8(msg, NL80211_ATTR_MAX_NUM_PMKIDS, - dev->wiphy.max_num_pmkids)) + rdev->wiphy.max_num_pmkids)) goto nla_put_failure; - if ((dev->wiphy.flags & WIPHY_FLAG_CONTROL_PORT_PROTOCOL) && + if ((rdev->wiphy.flags & WIPHY_FLAG_CONTROL_PORT_PROTOCOL) && nla_put_flag(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE)) goto nla_put_failure; if (nla_put_u32(msg, NL80211_ATTR_WIPHY_ANTENNA_AVAIL_TX, - dev->wiphy.available_antennas_tx) || + rdev->wiphy.available_antennas_tx) || nla_put_u32(msg, NL80211_ATTR_WIPHY_ANTENNA_AVAIL_RX, - dev->wiphy.available_antennas_rx)) + rdev->wiphy.available_antennas_rx)) goto nla_put_failure; - if ((dev->wiphy.flags & WIPHY_FLAG_AP_PROBE_RESP_OFFLOAD) && + if ((rdev->wiphy.flags & WIPHY_FLAG_AP_PROBE_RESP_OFFLOAD) && nla_put_u32(msg, NL80211_ATTR_PROBE_RESP_OFFLOAD, - dev->wiphy.probe_resp_offload)) + rdev->wiphy.probe_resp_offload)) goto nla_put_failure; - if ((dev->wiphy.available_antennas_tx || - dev->wiphy.available_antennas_rx) && - dev->ops->get_antenna) { + if ((rdev->wiphy.available_antennas_tx || + rdev->wiphy.available_antennas_rx) && + rdev->ops->get_antenna) { u32 tx_ant = 0, rx_ant = 0; int res; - res = rdev_get_antenna(dev, &tx_ant, &rx_ant); + res = 
rdev_get_antenna(rdev, &tx_ant, &rx_ant); if (!res) { if (nla_put_u32(msg, NL80211_ATTR_WIPHY_ANTENNA_TX, @@ -1297,7 +1350,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev, break; case 2: if (nl80211_put_iftypes(msg, NL80211_ATTR_SUPPORTED_IFTYPES, - dev->wiphy.interface_modes)) + rdev->wiphy.interface_modes)) goto nla_put_failure; state->split_start++; if (state->split) @@ -1311,7 +1364,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev, band < IEEE80211_NUM_BANDS; band++) { struct ieee80211_supported_band *sband; - sband = dev->wiphy.bands[band]; + sband = rdev->wiphy.bands[band]; if (!sband) continue; @@ -1388,7 +1441,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev, i = 0; #define CMD(op, n) \ do { \ - if (dev->ops->op) { \ + if (rdev->ops->op) { \ i++; \ if (nla_put_u32(msg, i, NL80211_CMD_ ## n)) \ goto nla_put_failure; \ @@ -1412,58 +1465,58 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev, CMD(set_pmksa, SET_PMKSA); CMD(del_pmksa, DEL_PMKSA); CMD(flush_pmksa, FLUSH_PMKSA); - if (dev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL) + if (rdev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL) CMD(remain_on_channel, REMAIN_ON_CHANNEL); CMD(set_bitrate_mask, SET_TX_BITRATE_MASK); CMD(mgmt_tx, FRAME); CMD(mgmt_tx_cancel_wait, FRAME_WAIT_CANCEL); - if (dev->wiphy.flags & WIPHY_FLAG_NETNS_OK) { + if (rdev->wiphy.flags & WIPHY_FLAG_NETNS_OK) { i++; if (nla_put_u32(msg, i, NL80211_CMD_SET_WIPHY_NETNS)) goto nla_put_failure; } - if (dev->ops->set_monitor_channel || dev->ops->start_ap || - dev->ops->join_mesh) { + if (rdev->ops->set_monitor_channel || rdev->ops->start_ap || + rdev->ops->join_mesh) { i++; if (nla_put_u32(msg, i, NL80211_CMD_SET_CHANNEL)) goto nla_put_failure; } CMD(set_wds_peer, SET_WDS_PEER); - if (dev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) { + if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) { CMD(tdls_mgmt, TDLS_MGMT); CMD(tdls_oper, TDLS_OPER); } - if (dev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) + if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) CMD(sched_scan_start, START_SCHED_SCAN); CMD(probe_client, PROBE_CLIENT); CMD(set_noack_map, SET_NOACK_MAP); - if (dev->wiphy.flags & WIPHY_FLAG_REPORTS_OBSS) { + if (rdev->wiphy.flags & WIPHY_FLAG_REPORTS_OBSS) { i++; if (nla_put_u32(msg, i, NL80211_CMD_REGISTER_BEACONS)) goto nla_put_failure; } CMD(start_p2p_device, START_P2P_DEVICE); CMD(set_mcast_rate, SET_MCAST_RATE); +#ifdef CONFIG_NL80211_TESTMODE + CMD(testmode_cmd, TESTMODE); +#endif if (state->split) { CMD(crit_proto_start, CRIT_PROTOCOL_START); CMD(crit_proto_stop, CRIT_PROTOCOL_STOP); - if (dev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH) + if (rdev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH) CMD(channel_switch, CHANNEL_SWITCH); + CMD(set_qos_map, SET_QOS_MAP); } - -#ifdef CONFIG_NL80211_TESTMODE - CMD(testmode_cmd, TESTMODE); -#endif - + /* add into the if now */ #undef CMD - if (dev->ops->connect || dev->ops->auth) { + if (rdev->ops->connect || rdev->ops->auth) { i++; if (nla_put_u32(msg, i, NL80211_CMD_CONNECT)) goto nla_put_failure; } - if (dev->ops->disconnect || dev->ops->deauth) { + if (rdev->ops->disconnect || rdev->ops->deauth) { i++; if (nla_put_u32(msg, i, NL80211_CMD_DISCONNECT)) goto nla_put_failure; @@ -1474,14 +1527,14 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev, if (state->split) break; case 5: - if (dev->ops->remain_on_channel && - (dev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL) && + if 
(rdev->ops->remain_on_channel && + (rdev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL) && nla_put_u32(msg, NL80211_ATTR_MAX_REMAIN_ON_CHANNEL_DURATION, - dev->wiphy.max_remain_on_channel_duration)) + rdev->wiphy.max_remain_on_channel_duration)) goto nla_put_failure; - if ((dev->wiphy.flags & WIPHY_FLAG_OFFCHAN_TX) && + if ((rdev->wiphy.flags & WIPHY_FLAG_OFFCHAN_TX) && nla_put_flag(msg, NL80211_ATTR_OFFCHANNEL_TX_OK)) goto nla_put_failure; @@ -1492,7 +1545,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev, break; case 6: #ifdef CONFIG_PM - if (nl80211_send_wowlan(msg, dev, state->split)) + if (nl80211_send_wowlan(msg, rdev, state->split)) goto nla_put_failure; state->split_start++; if (state->split) @@ -1502,10 +1555,10 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev, #endif case 7: if (nl80211_put_iftypes(msg, NL80211_ATTR_SOFTWARE_IFTYPES, - dev->wiphy.software_iftypes)) + rdev->wiphy.software_iftypes)) goto nla_put_failure; - if (nl80211_put_iface_combinations(&dev->wiphy, msg, + if (nl80211_put_iface_combinations(&rdev->wiphy, msg, state->split)) goto nla_put_failure; @@ -1513,12 +1566,12 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev, if (state->split) break; case 8: - if ((dev->wiphy.flags & WIPHY_FLAG_HAVE_AP_SME) && + if ((rdev->wiphy.flags & WIPHY_FLAG_HAVE_AP_SME) && nla_put_u32(msg, NL80211_ATTR_DEVICE_AP_SME, - dev->wiphy.ap_sme_capa)) + rdev->wiphy.ap_sme_capa)) goto nla_put_failure; - features = dev->wiphy.features; + features = rdev->wiphy.features; /* * We can only add the per-channel limit information if the * dump is split, otherwise it makes it too big. Therefore @@ -1529,16 +1582,16 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev, if (nla_put_u32(msg, NL80211_ATTR_FEATURE_FLAGS, features)) goto nla_put_failure; - if (dev->wiphy.ht_capa_mod_mask && + if (rdev->wiphy.ht_capa_mod_mask && nla_put(msg, NL80211_ATTR_HT_CAPABILITY_MASK, - sizeof(*dev->wiphy.ht_capa_mod_mask), - dev->wiphy.ht_capa_mod_mask)) + sizeof(*rdev->wiphy.ht_capa_mod_mask), + rdev->wiphy.ht_capa_mod_mask)) goto nla_put_failure; - if (dev->wiphy.flags & WIPHY_FLAG_HAVE_AP_SME && - dev->wiphy.max_acl_mac_addrs && + if (rdev->wiphy.flags & WIPHY_FLAG_HAVE_AP_SME && + rdev->wiphy.max_acl_mac_addrs && nla_put_u32(msg, NL80211_ATTR_MAC_ACL_MAX, - dev->wiphy.max_acl_mac_addrs)) + rdev->wiphy.max_acl_mac_addrs)) goto nla_put_failure; /* @@ -1554,36 +1607,85 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev, state->split_start++; break; case 9: - if (dev->wiphy.extended_capabilities && + if (rdev->wiphy.extended_capabilities && (nla_put(msg, NL80211_ATTR_EXT_CAPA, - dev->wiphy.extended_capabilities_len, - dev->wiphy.extended_capabilities) || + rdev->wiphy.extended_capabilities_len, + rdev->wiphy.extended_capabilities) || nla_put(msg, NL80211_ATTR_EXT_CAPA_MASK, - dev->wiphy.extended_capabilities_len, - dev->wiphy.extended_capabilities_mask))) + rdev->wiphy.extended_capabilities_len, + rdev->wiphy.extended_capabilities_mask))) goto nla_put_failure; - if (dev->wiphy.vht_capa_mod_mask && + if (rdev->wiphy.vht_capa_mod_mask && nla_put(msg, NL80211_ATTR_VHT_CAPABILITY_MASK, - sizeof(*dev->wiphy.vht_capa_mod_mask), - dev->wiphy.vht_capa_mod_mask)) + sizeof(*rdev->wiphy.vht_capa_mod_mask), + rdev->wiphy.vht_capa_mod_mask)) goto nla_put_failure; state->split_start++; break; case 10: - if (nl80211_send_coalesce(msg, dev)) + if (nl80211_send_coalesce(msg, rdev)) goto nla_put_failure; - if 
((dev->wiphy.flags & WIPHY_FLAG_SUPPORTS_5_10_MHZ) && + if ((rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_5_10_MHZ) && (nla_put_flag(msg, NL80211_ATTR_SUPPORT_5_MHZ) || nla_put_flag(msg, NL80211_ATTR_SUPPORT_10_MHZ))) goto nla_put_failure; + if (rdev->wiphy.max_ap_assoc_sta && + nla_put_u32(msg, NL80211_ATTR_MAX_AP_ASSOC_STA, + rdev->wiphy.max_ap_assoc_sta)) + goto nla_put_failure; + + state->split_start++; + break; + case 11: + if (rdev->wiphy.n_vendor_commands) { + const struct nl80211_vendor_cmd_info *info; + struct nlattr *nested; + + nested = nla_nest_start(msg, NL80211_ATTR_VENDOR_DATA); + if (!nested) + goto nla_put_failure; + + for (i = 0; i < rdev->wiphy.n_vendor_commands; i++) { + info = &rdev->wiphy.vendor_commands[i].info; + if (nla_put(msg, i + 1, sizeof(*info), info)) + goto nla_put_failure; + } + nla_nest_end(msg, nested); + } + + if (rdev->wiphy.n_vendor_events) { + const struct nl80211_vendor_cmd_info *info; + struct nlattr *nested; + + nested = nla_nest_start(msg, + NL80211_ATTR_VENDOR_EVENTS); + if (!nested) + goto nla_put_failure; + + for (i = 0; i < rdev->wiphy.n_vendor_events; i++) { + info = &rdev->wiphy.vendor_events[i]; + if (nla_put(msg, i + 1, sizeof(*info), info)) + goto nla_put_failure; + } + nla_nest_end(msg, nested); + } + state->split_start++; + break; + case 12: + if (rdev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH && + nla_put_u8(msg, NL80211_ATTR_MAX_CSA_COUNTERS, + rdev->wiphy.max_num_csa_counters)) + goto nla_put_failure; + /* done */ state->split_start = 0; break; } + finish: return genlmsg_end(msg, hdr); nla_put_failure: @@ -1612,15 +1714,14 @@ static int nl80211_dump_wiphy_parse(struct sk_buff *skb, struct cfg80211_registered_device *rdev; int ifidx = nla_get_u32(tb[NL80211_ATTR_IFINDEX]); - netdev = dev_get_by_index(sock_net(skb->sk), ifidx); + netdev = __dev_get_by_index(sock_net(skb->sk), ifidx); if (!netdev) return -ENODEV; if (netdev->ieee80211_ptr) { - rdev = wiphy_to_dev( + rdev = wiphy_to_rdev( netdev->ieee80211_ptr->wiphy); state->filter_wiphy = rdev->wiphy_idx; } - dev_put(netdev); } return 0; @@ -1630,7 +1731,7 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb) { int idx = 0, ret; struct nl80211_dump_wiphy_state *state = (void *)cb->args[0]; - struct cfg80211_registered_device *dev; + struct cfg80211_registered_device *rdev; rtnl_lock(); if (!state) { @@ -1649,17 +1750,18 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb) cb->args[0] = (long)state; } - list_for_each_entry(dev, &cfg80211_rdev_list, list) { - if (!net_eq(wiphy_net(&dev->wiphy), sock_net(skb->sk))) + list_for_each_entry(rdev, &cfg80211_rdev_list, list) { + if (!net_eq(wiphy_net(&rdev->wiphy), sock_net(skb->sk))) continue; if (++idx <= state->start) continue; if (state->filter_wiphy != -1 && - state->filter_wiphy != dev->wiphy_idx) + state->filter_wiphy != rdev->wiphy_idx) continue; /* attempt to fit multiple wiphy data chunks into the skb */ do { - ret = nl80211_send_wiphy(dev, skb, + ret = nl80211_send_wiphy(rdev, NL80211_CMD_NEW_WIPHY, + skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, state); @@ -1678,9 +1780,10 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb) * We can then retry with the larger buffer. 
*/ if ((ret == -ENOBUFS || ret == -EMSGSIZE) && - !skb->len && + !skb->len && !state->split && cb->min_dump_alloc < 4096) { cb->min_dump_alloc = 4096; + state->split_start = 0; rtnl_unlock(); return 1; } @@ -1706,14 +1809,15 @@ static int nl80211_dump_wiphy_done(struct netlink_callback *cb) static int nl80211_get_wiphy(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; - struct cfg80211_registered_device *dev = info->user_ptr[0]; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct nl80211_dump_wiphy_state state = {}; msg = nlmsg_new(4096, GFP_KERNEL); if (!msg) return -ENOMEM; - if (nl80211_send_wiphy(dev, msg, info->snd_portid, info->snd_seq, 0, + if (nl80211_send_wiphy(rdev, NL80211_CMD_NEW_WIPHY, msg, + info->snd_portid, info->snd_seq, 0, &state) < 0) { nlmsg_free(msg); return -ENOBUFS; @@ -1840,18 +1944,20 @@ static int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, } static int __nl80211_set_channel(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, + struct net_device *dev, struct genl_info *info) { struct cfg80211_chan_def chandef; int result; enum nl80211_iftype iftype = NL80211_IFTYPE_MONITOR; + struct wireless_dev *wdev = NULL; - if (wdev) - iftype = wdev->iftype; - + if (dev) + wdev = dev->ieee80211_ptr; if (!nl80211_can_set_dev_channel(wdev)) return -EOPNOTSUPP; + if (wdev) + iftype = wdev->iftype; result = nl80211_parse_chandef(rdev, info, &chandef); if (result) @@ -1860,14 +1966,27 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev, switch (iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: - if (wdev->beacon_interval) { - result = -EBUSY; - break; - } - if (!cfg80211_reg_can_beacon(&rdev->wiphy, &chandef)) { + if (!cfg80211_reg_can_beacon(&rdev->wiphy, &chandef, iftype)) { result = -EINVAL; break; } + if (wdev->beacon_interval) { + if (!dev || !rdev->ops->set_ap_chanwidth || + !(rdev->wiphy.features & + NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE)) { + result = -EBUSY; + break; + } + + /* Only allow dynamic channel width changes */ + if (chandef.chan != wdev->preset_chandef.chan) { + result = -EBUSY; + break; + } + result = rdev_set_ap_chanwidth(rdev, dev, &chandef); + if (result) + break; + } wdev->preset_chandef = chandef; result = 0; break; @@ -1889,7 +2008,7 @@ static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *netdev = info->user_ptr[1]; - return __nl80211_set_channel(rdev, netdev->ieee80211_ptr, info); + return __nl80211_set_channel(rdev, netdev, info); } static int nl80211_set_wds_peer(struct sk_buff *skb, struct genl_info *info) @@ -1943,9 +2062,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_IFINDEX]) { int ifindex = nla_get_u32(info->attrs[NL80211_ATTR_IFINDEX]); - netdev = dev_get_by_index(genl_info_net(info), ifindex); + netdev = __dev_get_by_index(genl_info_net(info), ifindex); if (netdev && netdev->ieee80211_ptr) - rdev = wiphy_to_dev(netdev->ieee80211_ptr->wiphy); + rdev = wiphy_to_rdev(netdev->ieee80211_ptr->wiphy); else netdev = NULL; } @@ -1971,57 +2090,52 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rdev, nla_data(info->attrs[NL80211_ATTR_WIPHY_NAME])); if (result) - goto bad_res; + return result; if (info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS]) { struct ieee80211_txq_params txq_params; struct nlattr *tb[NL80211_TXQ_ATTR_MAX + 1]; - if 
(!rdev->ops->set_txq_params) { - result = -EOPNOTSUPP; - goto bad_res; - } + if (!rdev->ops->set_txq_params) + return -EOPNOTSUPP; - if (!netdev) { - result = -EINVAL; - goto bad_res; - } + if (!netdev) + return -EINVAL; if (netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && - netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { - result = -EINVAL; - goto bad_res; - } + netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) + return -EINVAL; - if (!netif_running(netdev)) { - result = -ENETDOWN; - goto bad_res; - } + if (!netif_running(netdev)) + return -ENETDOWN; nla_for_each_nested(nl_txq_params, info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS], rem_txq_params) { - nla_parse(tb, NL80211_TXQ_ATTR_MAX, - nla_data(nl_txq_params), - nla_len(nl_txq_params), - txq_params_policy); + result = nla_parse(tb, NL80211_TXQ_ATTR_MAX, + nla_data(nl_txq_params), + nla_len(nl_txq_params), + txq_params_policy); + if (result) + return result; result = parse_txq_params(tb, &txq_params); if (result) - goto bad_res; + return result; result = rdev_set_txq_params(rdev, netdev, &txq_params); if (result) - goto bad_res; + return result; } } if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) { - result = __nl80211_set_channel(rdev, - nl80211_can_set_dev_channel(wdev) ? wdev : NULL, - info); + result = __nl80211_set_channel( + rdev, + nl80211_can_set_dev_channel(wdev) ? netdev : NULL, + info); if (result) - goto bad_res; + return result; } if (info->attrs[NL80211_ATTR_WIPHY_TX_POWER_SETTING]) { @@ -2032,19 +2146,15 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (!(rdev->wiphy.features & NL80211_FEATURE_VIF_TXPOWER)) txp_wdev = NULL; - if (!rdev->ops->set_tx_power) { - result = -EOPNOTSUPP; - goto bad_res; - } + if (!rdev->ops->set_tx_power) + return -EOPNOTSUPP; idx = NL80211_ATTR_WIPHY_TX_POWER_SETTING; type = nla_get_u32(info->attrs[idx]); if (!info->attrs[NL80211_ATTR_WIPHY_TX_POWER_LEVEL] && - (type != NL80211_TX_POWER_AUTOMATIC)) { - result = -EINVAL; - goto bad_res; - } + (type != NL80211_TX_POWER_AUTOMATIC)) + return -EINVAL; if (type != NL80211_TX_POWER_AUTOMATIC) { idx = NL80211_ATTR_WIPHY_TX_POWER_LEVEL; @@ -2053,7 +2163,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) result = rdev_set_tx_power(rdev, txp_wdev, type, mbm); if (result) - goto bad_res; + return result; } if (info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX] && @@ -2061,10 +2171,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) u32 tx_ant, rx_ant; if ((!rdev->wiphy.available_antennas_tx && !rdev->wiphy.available_antennas_rx) || - !rdev->ops->set_antenna) { - result = -EOPNOTSUPP; - goto bad_res; - } + !rdev->ops->set_antenna) + return -EOPNOTSUPP; tx_ant = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX]); rx_ant = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_ANTENNA_RX]); @@ -2072,17 +2180,15 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) /* reject antenna configurations which don't match the * available antenna masks, except for the "all" mask */ if ((~tx_ant && (tx_ant & ~rdev->wiphy.available_antennas_tx)) || - (~rx_ant && (rx_ant & ~rdev->wiphy.available_antennas_rx))) { - result = -EINVAL; - goto bad_res; - } + (~rx_ant && (rx_ant & ~rdev->wiphy.available_antennas_rx))) + return -EINVAL; tx_ant = tx_ant & rdev->wiphy.available_antennas_tx; rx_ant = rx_ant & rdev->wiphy.available_antennas_rx; result = rdev_set_antenna(rdev, tx_ant, rx_ant); if (result) - goto bad_res; + return result; } changed = 0; @@ -2090,30 +2196,27 @@ 
static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]) { retry_short = nla_get_u8( info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]); - if (retry_short == 0) { - result = -EINVAL; - goto bad_res; - } + if (retry_short == 0) + return -EINVAL; + changed |= WIPHY_PARAM_RETRY_SHORT; } if (info->attrs[NL80211_ATTR_WIPHY_RETRY_LONG]) { retry_long = nla_get_u8( info->attrs[NL80211_ATTR_WIPHY_RETRY_LONG]); - if (retry_long == 0) { - result = -EINVAL; - goto bad_res; - } + if (retry_long == 0) + return -EINVAL; + changed |= WIPHY_PARAM_RETRY_LONG; } if (info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]) { frag_threshold = nla_get_u32( info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]); - if (frag_threshold < 256) { - result = -EINVAL; - goto bad_res; - } + if (frag_threshold < 256) + return -EINVAL; + if (frag_threshold != (u32) -1) { /* * Fragments (apart from the last one) are required to @@ -2143,10 +2246,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) u32 old_frag_threshold, old_rts_threshold; u8 old_coverage_class; - if (!rdev->ops->set_wiphy_params) { - result = -EOPNOTSUPP; - goto bad_res; - } + if (!rdev->ops->set_wiphy_params) + return -EOPNOTSUPP; old_retry_short = rdev->wiphy.retry_short; old_retry_long = rdev->wiphy.retry_long; @@ -2174,17 +2275,13 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rdev->wiphy.coverage_class = old_coverage_class; } } - - bad_res: - if (netdev) - dev_put(netdev); - return result; + return 0; } static inline u64 wdev_id(struct wireless_dev *wdev) { return (u64)wdev->identifier | - ((u64)wiphy_to_dev(wdev->wiphy)->wiphy_idx << 32); + ((u64)wiphy_to_rdev(wdev->wiphy)->wiphy_idx << 32); } static int nl80211_send_chandef(struct sk_buff *msg, @@ -2310,7 +2407,7 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback * static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; - struct cfg80211_registered_device *dev = info->user_ptr[0]; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct wireless_dev *wdev = info->user_ptr[1]; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); @@ -2318,7 +2415,7 @@ static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; if (nl80211_send_iface(msg, info->snd_portid, info->snd_seq, 0, - dev, wdev) < 0) { + rdev, wdev) < 0) { nlmsg_free(msg); return -ENOBUFS; } @@ -2469,6 +2566,9 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) enum nl80211_iftype type = NL80211_IFTYPE_UNSPECIFIED; u32 flags; + /* to avoid failing a new interface creation due to pending removal */ + cfg80211_destroy_ifaces(rdev); + memset(¶ms, 0, sizeof(params)); if (!info->attrs[NL80211_ATTR_IFNAME]) @@ -2518,6 +2618,9 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) return PTR_ERR(wdev); } + if (info->attrs[NL80211_ATTR_IFACE_SOCKET_OWNER]) + wdev->owner_nlportid = info->snd_portid; + switch (type) { case NL80211_IFTYPE_MESH_POINT: if (!info->attrs[NL80211_ATTR_MESH_ID]) @@ -3097,7 +3200,6 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_ap_settings params; int err; - u8 radar_detect_width = 0; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) @@ -3213,24 +3315,10 @@ static int nl80211_start_ap(struct sk_buff *skb, 
struct genl_info *info) } else if (!nl80211_get_ap_channel(rdev, &params)) return -EINVAL; - if (!cfg80211_reg_can_beacon(&rdev->wiphy, &params.chandef) + if (!cfg80211_reg_can_beacon(&rdev->wiphy, &params.chandef, + wdev->iftype)) return -EINVAL; - err = cfg80211_chandef_dfs_required(wdev->wiphy, &params.chandef); - if (err < 0) - return err; - if (err) { - radar_detect_width = BIT(params.chandef.width); - params.radar_required = true; - } - - err = cfg80211_can_use_iftype_chan(rdev, wdev, wdev->iftype, - params.chandef.chan, - CHAN_MODE_SHARED, - radar_detect_width); - if (err) - return err; - if (info->attrs[NL80211_ATTR_ACL_POLICY]) { params.acl = parse_acl_data(&rdev->wiphy, info); if (IS_ERR(params.acl)) @@ -3242,7 +3330,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) if (!err) { wdev->preset_chandef = params.chandef; wdev->beacon_interval = params.beacon_interval; - wdev->channel = params.chandef.chan; + wdev->chandef = params.chandef; wdev->ssid_len = params.ssid_len; memcpy(wdev->ssid, params.ssid, wdev->ssid_len); } @@ -3287,7 +3375,7 @@ static int nl80211_stop_ap(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; - return cfg80211_stop_ap(rdev, dev); + return cfg80211_stop_ap(rdev, dev, false); } static const struct nla_policy sta_flags_policy[NL80211_STA_FLAG_MAX + 1] = { @@ -3568,6 +3656,10 @@ static int nl80211_send_station(struct sk_buff *msg, u32 portid, u32 seq, nla_put_u32(msg, NL80211_STA_INFO_TX_FAILED, sinfo->tx_failed)) goto nla_put_failure; + if ((sinfo->filled & STATION_INFO_EXPECTED_THROUGHPUT) && + nla_put_u32(msg, NL80211_STA_INFO_EXPECTED_THROUGHPUT, + sinfo->expected_throughput)) goto nla_put_failure; if ((sinfo->filled & STATION_INFO_BEACON_LOSS_COUNT) && nla_put_u32(msg, NL80211_STA_INFO_BEACON_LOSS, sinfo->beacon_loss_count)) @@ -3630,13 +3722,13 @@ static int nl80211_dump_station(struct sk_buff *skb, struct netlink_callback *cb) { struct station_info sinfo; - struct cfg80211_registered_device *dev; + struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; u8 mac_addr[ETH_ALEN]; int sta_idx = cb->args[2]; int err; - err = nl80211_prepare_wdev_dump(skb, cb, &dev, &wdev); + err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); if (err) return err; @@ -3645,14 +3737,14 @@ static int nl80211_dump_station(struct sk_buff *skb, goto out_err; } - if (!dev->ops->dump_station) { + if (!rdev->ops->dump_station) { err = -EOPNOTSUPP; goto out_err; } while (1) { memset(&sinfo, 0, sizeof(sinfo)); - err = rdev_dump_station(dev, wdev->netdev, sta_idx, + err = rdev_dump_station(rdev, wdev->netdev, sta_idx, mac_addr, &sinfo); if (err == -ENOENT) break; @@ -3662,7 +3754,7 @@ static int nl80211_dump_station(struct sk_buff *skb, if (nl80211_send_station(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - dev, wdev->netdev, mac_addr, + rdev, wdev->netdev, mac_addr, &sinfo) < 0) goto out; @@ -3674,7 +3766,7 @@ static int nl80211_dump_station(struct sk_buff *skb, cb->args[2] = sta_idx; err = skb->len; out_err: - nl80211_finish_wdev_dump(dev); + nl80211_finish_wdev_dump(rdev); return err; } @@ -3885,8 +3977,8 @@ static struct net_device *get_vlan(struct genl_info *info, return ERR_PTR(ret); } -static struct nla_policy -nl80211_sta_wme_policy[NL80211_STA_WME_MAX + 1] __read_mostly = { +static const struct nla_policy +nl80211_sta_wme_policy[NL80211_STA_WME_MAX + 1] = { [NL80211_STA_WME_UAPSD_QUEUES] = { .type = NLA_U8 },
[NL80211_STA_WME_MAX_SP] = { .type = NLA_U8 }, }; @@ -4151,6 +4243,12 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.vht_capa = nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]); + if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) { + params.opmode_notif_used = true; + params.opmode_notif = + nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]); + } + if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION]) { params.plink_action = nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]); @@ -4329,18 +4427,18 @@ static int nl80211_dump_mpath(struct sk_buff *skb, struct netlink_callback *cb) { struct mpath_info pinfo; - struct cfg80211_registered_device *dev; + struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; u8 dst[ETH_ALEN]; u8 next_hop[ETH_ALEN]; int path_idx = cb->args[2]; int err; - err = nl80211_prepare_wdev_dump(skb, cb, &dev, &wdev); + err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); if (err) return err; - if (!dev->ops->dump_mpath) { + if (!rdev->ops->dump_mpath) { err = -EOPNOTSUPP; goto out_err; } @@ -4351,7 +4449,7 @@ static int nl80211_dump_mpath(struct sk_buff *skb, } while (1) { - err = rdev_dump_mpath(dev, wdev->netdev, path_idx, dst, + err = rdev_dump_mpath(rdev, wdev->netdev, path_idx, dst, next_hop, &pinfo); if (err == -ENOENT) break; @@ -4372,7 +4470,7 @@ static int nl80211_dump_mpath(struct sk_buff *skb, cb->args[2] = path_idx; err = skb->len; out_err: - nl80211_finish_wdev_dump(dev); + nl80211_finish_wdev_dump(rdev); return err; } @@ -4567,6 +4665,7 @@ static const struct nla_policy reg_rule_policy[NL80211_REG_RULE_ATTR_MAX + 1] = [NL80211_ATTR_FREQ_RANGE_MAX_BW] = { .type = NLA_U32 }, [NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN] = { .type = NLA_U32 }, [NL80211_ATTR_POWER_RULE_MAX_EIRP] = { .type = NLA_U32 }, + [NL80211_ATTR_DFS_CAC_TIME] = { .type = NLA_U32 }, }; static int parse_reg_rule(struct nlattr *tb[], @@ -4602,12 +4701,15 @@ static int parse_reg_rule(struct nlattr *tb[], power_rule->max_antenna_gain = nla_get_u32(tb[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN]); + if (tb[NL80211_ATTR_DFS_CAC_TIME]) + reg_rule->dfs_cac_ms = + nla_get_u32(tb[NL80211_ATTR_DFS_CAC_TIME]); + return 0; } static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) { - int r; char *data = NULL; enum nl80211_user_reg_hint_type user_reg_hint_type; @@ -4620,11 +4722,6 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) if (unlikely(!rcu_access_pointer(cfg80211_regdomain))) return -EINPROGRESS; - if (!info->attrs[NL80211_ATTR_REG_ALPHA2]) - return -EINVAL; - - data = nla_data(info->attrs[NL80211_ATTR_REG_ALPHA2]); - if (info->attrs[NL80211_ATTR_USER_REG_HINT_TYPE]) user_reg_hint_type = nla_get_u32(info->attrs[NL80211_ATTR_USER_REG_HINT_TYPE]); @@ -4634,14 +4731,16 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) switch (user_reg_hint_type) { case NL80211_USER_REG_HINT_USER: case NL80211_USER_REG_HINT_CELL_BASE: - break; + if (!info->attrs[NL80211_ATTR_REG_ALPHA2]) + return -EINVAL; + + data = nla_data(info->attrs[NL80211_ATTR_REG_ALPHA2]); + return regulatory_hint_user(data, user_reg_hint_type); + case NL80211_USER_REG_HINT_INDOOR: + return regulatory_hint_indoor_user(); default: return -EINVAL; } - - r = regulatory_hint_user(data, user_reg_hint_type); - - return r; } static int nl80211_get_mesh_config(struct sk_buff *skb, @@ -5063,6 +5162,7 @@ static int nl80211_get_reg(struct sk_buff *skb, struct genl_info *info) const struct ieee80211_reg_rule *reg_rule; const struct 
ieee80211_freq_range *freq_range; const struct ieee80211_power_rule *power_rule; + unsigned int max_bandwidth_khz; reg_rule = &regdom->reg_rules[i]; freq_range = &reg_rule->freq_range; @@ -5072,6 +5172,11 @@ static int nl80211_get_reg(struct sk_buff *skb, struct genl_info *info) if (!nl_reg_rule) goto nla_put_failure_rcu; + max_bandwidth_khz = freq_range->max_bandwidth_khz; + if (!max_bandwidth_khz) + max_bandwidth_khz = reg_get_max_bandwidth(regdom, + reg_rule); + if (nla_put_u32(msg, NL80211_ATTR_REG_RULE_FLAGS, reg_rule->flags) || nla_put_u32(msg, NL80211_ATTR_FREQ_RANGE_START, @@ -5079,11 +5184,13 @@ static int nl80211_get_reg(struct sk_buff *skb, struct genl_info *info) nla_put_u32(msg, NL80211_ATTR_FREQ_RANGE_END, freq_range->end_freq_khz) || nla_put_u32(msg, NL80211_ATTR_FREQ_RANGE_MAX_BW, - freq_range->max_bandwidth_khz) || + max_bandwidth_khz) || nla_put_u32(msg, NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN, power_rule->max_antenna_gain) || nla_put_u32(msg, NL80211_ATTR_POWER_RULE_MAX_EIRP, - power_rule->max_eirp)) + power_rule->max_eirp) || + nla_put_u32(msg, NL80211_ATTR_DFS_CAC_TIME, + reg_rule->dfs_cac_ms)) goto nla_put_failure_rcu; nla_nest_end(msg, nl_reg_rule); @@ -5155,9 +5262,11 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) nla_for_each_nested(nl_reg_rule, info->attrs[NL80211_ATTR_REG_RULES], rem_reg_rules) { - nla_parse(tb, NL80211_REG_RULE_ATTR_MAX, - nla_data(nl_reg_rule), nla_len(nl_reg_rule), - reg_rule_policy); + r = nla_parse(tb, NL80211_REG_RULE_ATTR_MAX, - nla_data(nl_reg_rule), nla_len(nl_reg_rule), - reg_rule_policy); + if (r) + goto bad_reg; r = parse_reg_rule(tb, &rd->reg_rules[rule_idx]); if (r) goto bad_reg; @@ -5222,7 +5331,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->scan) return -EOPNOTSUPP; - if (rdev->scan_req) { + if (rdev->scan_req || rdev->scan_msg) { err = -EBUSY; goto unlock; } @@ -5235,12 +5344,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) goto unlock; } } else { - enum ieee80211_band band; - n_channels = 0; - - for (band = 0; band < IEEE80211_NUM_BANDS; band++) - if (wiphy->bands[band]) - n_channels += wiphy->bands[band]->n_channels; + n_channels = ieee80211_get_num_supported_channels(wiphy); } if (info->attrs[NL80211_ATTR_SCAN_SSIDS]) @@ -5425,6 +5529,7 @@ static int nl80211_start_sched_scan(struct sk_buff *skb, enum ieee80211_band band; size_t ie_len; struct nlattr *tb[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1]; + s32 default_match_rssi = NL80211_SCAN_RSSI_THOLD_OFF; if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) || !rdev->ops->sched_scan_start) @@ -5448,11 +5553,7 @@ static int nl80211_start_sched_scan(struct sk_buff *skb, if (!n_channels) return -EINVAL; } else { - n_channels = 0; - - for (band = 0; band < IEEE80211_NUM_BANDS; band++) - if (wiphy->bands[band]) - n_channels += wiphy->bands[band]->n_channels; + n_channels = ieee80211_get_num_supported_channels(wiphy); } if (info->attrs[NL80211_ATTR_SCAN_SSIDS]) @@ -5463,11 +5564,40 @@ static int nl80211_start_sched_scan(struct sk_buff *skb, if (n_ssids > wiphy->max_sched_scan_ssids) return -EINVAL; - if (info->attrs[NL80211_ATTR_SCHED_SCAN_MATCH]) + /* + * First, count the number of 'real' matchsets. Due to an issue with + * the old implementation, matchsets containing only the RSSI attribute + * (NL80211_SCHED_SCAN_MATCH_ATTR_RSSI) are considered as the 'default' + * RSSI for all matchsets, rather than their own matchset for reporting + * all APs with a strong RSSI. 
This is needed to be compatible with + * older userspace that treated a matchset with only the RSSI as the + * global RSSI for all other matchsets - if there are other matchsets. + */ + if (info->attrs[NL80211_ATTR_SCHED_SCAN_MATCH]) { nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCHED_SCAN_MATCH], - tmp) - n_match_sets++; + tmp) { + struct nlattr *rssi; + + err = nla_parse(tb, NL80211_SCHED_SCAN_MATCH_ATTR_MAX, + nla_data(attr), nla_len(attr), + nl80211_match_policy); + if (err) + return err; + /* add other standalone attributes here */ + if (tb[NL80211_SCHED_SCAN_MATCH_ATTR_SSID]) { + n_match_sets++; + continue; + } + rssi = tb[NL80211_SCHED_SCAN_MATCH_ATTR_RSSI]; + if (rssi) + default_match_rssi = nla_get_s32(rssi); + } + } + + /* However, if there's no other matchset, add the RSSI one */ + if (!n_match_sets && default_match_rssi != NL80211_SCAN_RSSI_THOLD_OFF) + n_match_sets = 1; if (n_match_sets > wiphy->max_match_sets) return -EINVAL; @@ -5588,11 +5718,22 @@ static int nl80211_start_sched_scan(struct sk_buff *skb, tmp) { struct nlattr *ssid, *rssi; - nla_parse(tb, NL80211_SCHED_SCAN_MATCH_ATTR_MAX, - nla_data(attr), nla_len(attr), - nl80211_match_policy); + err = nla_parse(tb, NL80211_SCHED_SCAN_MATCH_ATTR_MAX, + nla_data(attr), nla_len(attr), + nl80211_match_policy); + if (err) + goto out_free; ssid = tb[NL80211_SCHED_SCAN_MATCH_ATTR_SSID]; if (ssid) { + if (WARN_ON(i >= n_match_sets)) { + /* this indicates a programming error, + * the loop above should have verified + * things properly + */ + err = -EINVAL; + goto out_free; + } + if (nla_len(ssid) > IEEE80211_MAX_SSID_LEN) { err = -EINVAL; goto out_free; @@ -5601,19 +5742,32 @@ static int nl80211_start_sched_scan(struct sk_buff *skb, nla_data(ssid), nla_len(ssid)); request->match_sets[i].ssid.ssid_len = nla_len(ssid); + /* special attribute - old implemenation w/a */ + request->match_sets[i].rssi_thold = + default_match_rssi; + rssi = tb[NL80211_SCHED_SCAN_MATCH_ATTR_RSSI]; + if (rssi) + request->match_sets[i].rssi_thold = + nla_get_s32(rssi); } - rssi = tb[NL80211_SCHED_SCAN_MATCH_ATTR_RSSI]; - if (rssi) - request->rssi_thold = nla_get_u32(rssi); - else - request->rssi_thold = - NL80211_SCAN_RSSI_THOLD_OFF; i++; } + + /* there was no other matchset, so the RSSI one is alone */ + if (i == 0) + request->match_sets[0].rssi_thold = default_match_rssi; + + request->min_rssi_thold = INT_MAX; + for (i = 0; i < n_match_sets; i++) + request->min_rssi_thold = + min(request->match_sets[i].rssi_thold, + request->min_rssi_thold); + } else { + request->min_rssi_thold = NL80211_SCAN_RSSI_THOLD_OFF; } - if (info->attrs[NL80211_ATTR_IE]) { - request->ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + if (ie_len) { + request->ie_len = ie_len; memcpy((void *)request->ie, nla_data(info->attrs[NL80211_ATTR_IE]), request->ie_len); @@ -5667,8 +5821,14 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_chan_def chandef; + enum nl80211_dfs_regions dfs_region; + unsigned int cac_time_ms; int err; + dfs_region = reg_get_dfs_region(wdev->wiphy); + if (dfs_region == NL80211_DFS_UNSET) + return -EINVAL; + err = nl80211_parse_chandef(rdev, info, &chandef); if (err) return err; @@ -5679,7 +5839,8 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, if (wdev->cac_started) return -EBUSY; - err = cfg80211_chandef_dfs_required(wdev->wiphy, &chandef); + err = cfg80211_chandef_dfs_required(wdev->wiphy, &chandef, + wdev->iftype); if 
(err < 0) return err; @@ -5692,17 +5853,17 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, if (!rdev->ops->start_radar_detection) return -EOPNOTSUPP; - err = cfg80211_can_use_iftype_chan(rdev, wdev, wdev->iftype, - chandef.chan, CHAN_MODE_SHARED, - BIT(chandef.width)); - if (err) - return err; + cac_time_ms = cfg80211_chandef_dfs_cac_time(&rdev->wiphy, &chandef); + if (WARN_ON(!cac_time_ms)) + cac_time_ms = IEEE80211_DFS_MIN_CAC_TIME_MS; - err = rdev->ops->start_radar_detection(&rdev->wiphy, dev, &chandef); + err = rdev->ops->start_radar_detection(&rdev->wiphy, dev, &chandef, + cac_time_ms); if (!err) { - wdev->channel = chandef.chan; + wdev->chandef = chandef; wdev->cac_started = true; wdev->cac_start_time = jiffies; + wdev->cac_time_ms = cac_time_ms; } return err; } @@ -5720,6 +5881,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) u8 radar_detect_width = 0; int err; bool need_new_beacon = false; + int len, i; if (!rdev->ops->channel_switch || !(rdev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH)) @@ -5732,10 +5894,15 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) /* useless if AP is not running */ if (!wdev->beacon_interval) - return -EINVAL; + return -ENOTCONN; break; case NL80211_IFTYPE_ADHOC: + if (!wdev->ssid_len) + return -ENOTCONN; + break; case NL80211_IFTYPE_MESH_POINT: + if (!wdev->mesh_id_len) + return -ENOTCONN; break; default: return -EOPNOTSUPP; @@ -5773,26 +5940,55 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) if (!csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]) return -EINVAL; - params.counter_offset_beacon = - nla_get_u16(csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]); - if (params.counter_offset_beacon >= params.beacon_csa.tail_len) + len = nla_len(csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]); + if (!len || (len % sizeof(u16))) return -EINVAL; - /* sanity check - counters should be the same */ - if (params.beacon_csa.tail[params.counter_offset_beacon] != - params.count) + params.n_counter_offsets_beacon = len / sizeof(u16); + if (rdev->wiphy.max_num_csa_counters && + (params.n_counter_offsets_beacon > + rdev->wiphy.max_num_csa_counters)) return -EINVAL; + params.counter_offsets_beacon = + nla_data(csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]); + + /* sanity checks - counters should fit and be the same */ + for (i = 0; i < params.n_counter_offsets_beacon; i++) { + u16 offset = params.counter_offsets_beacon[i]; + + if (offset >= params.beacon_csa.tail_len) + return -EINVAL; + + if (params.beacon_csa.tail[offset] != params.count) + return -EINVAL; + } + if (csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]) { - params.counter_offset_presp = - nla_get_u16(csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]); - if (params.counter_offset_presp >= - params.beacon_csa.probe_resp_len) + len = nla_len(csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]); + if (!len || (len % sizeof(u16))) return -EINVAL; - if (params.beacon_csa.probe_resp[params.counter_offset_presp] != - params.count) + params.n_counter_offsets_presp = len / sizeof(u16); + if (rdev->wiphy.max_num_csa_counters && + (params.n_counter_offsets_beacon > + rdev->wiphy.max_num_csa_counters)) return -EINVAL; + + params.counter_offsets_presp = + nla_data(csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]); + + /* sanity checks - counters should fit and be the same */ + for (i = 0; i < params.n_counter_offsets_presp; i++) { + u16 offset = params.counter_offsets_presp[i]; + + if (offset >= params.beacon_csa.probe_resp_len) + return -EINVAL; + + if 
(params.beacon_csa.probe_resp[offset] != + params.count) + return -EINVAL; + } } skip_beacons: @@ -5800,22 +5996,25 @@ skip_beacons: if (err) return err; - if (!cfg80211_reg_can_beacon(&rdev->wiphy, &params.chandef)) + if (!cfg80211_reg_can_beacon(&rdev->wiphy, &params.chandef, + wdev->iftype)) return -EINVAL; - if (dev->ieee80211_ptr->iftype == NL80211_IFTYPE_AP || - dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_GO || - dev->ieee80211_ptr->iftype == NL80211_IFTYPE_ADHOC) { - err = cfg80211_chandef_dfs_required(wdev->wiphy, - &params.chandef); - if (err < 0) { - return err; - } else if (err) { - radar_detect_width = BIT(params.chandef.width); - params.radar_required = true; - } + err = cfg80211_chandef_dfs_required(wdev->wiphy, + &params.chandef, + wdev->iftype); + if (err < 0) + return err; + + if (err > 0) { + radar_detect_width = BIT(params.chandef.width); + params.radar_required = true; } + /* TODO: I left this here for now. With channel switch, the + * verification is a bit more complicated, because we only do + * it later when the channel switch really happens. + */ err = cfg80211_can_use_iftype_chan(rdev, wdev, wdev->iftype, params.chandef.chan, CHAN_MODE_SHARED, @@ -6042,12 +6241,12 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) { struct survey_info survey; - struct cfg80211_registered_device *dev; + struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; int survey_idx = cb->args[2]; int res; - res = nl80211_prepare_wdev_dump(skb, cb, &dev, &wdev); + res = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); if (res) return res; @@ -6056,7 +6255,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, goto out_err; } - if (!dev->ops->dump_survey) { + if (!rdev->ops->dump_survey) { res = -EOPNOTSUPP; goto out_err; } @@ -6064,7 +6263,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, while (1) { struct ieee80211_channel *chan; - res = rdev_dump_survey(dev, wdev->netdev, survey_idx, &survey); + res = rdev_dump_survey(rdev, wdev->netdev, survey_idx, &survey); if (res == -ENOENT) break; if (res) @@ -6076,7 +6275,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, goto out; } - chan = ieee80211_get_channel(&dev->wiphy, + chan = ieee80211_get_channel(&rdev->wiphy, survey.channel->center_freq); if (!chan || chan->flags & IEEE80211_CHAN_DISABLED) { survey_idx++; @@ -6095,7 +6294,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, cb->args[2] = survey_idx; res = skb->len; out_err: - nl80211_finish_wdev_dump(dev); + nl80211_finish_wdev_dump(rdev); return res; } @@ -6173,9 +6372,9 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); - chan = ieee80211_get_channel(&rdev->wiphy, - nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); - if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED)) + chan = nl80211_get_valid_chan(&rdev->wiphy, + info->attrs[NL80211_ATTR_WIPHY_FREQ]); + if (!chan) return -EINVAL; ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); @@ -6328,9 +6527,9 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); - chan = ieee80211_get_channel(&rdev->wiphy, - nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); - if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED)) + chan = nl80211_get_valid_chan(&rdev->wiphy, + info->attrs[NL80211_ATTR_WIPHY_FREQ]); + if (!chan) return -EINVAL; ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); @@ -6571,7 +6770,8 @@ static int
nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) if (err) return err; - if (!cfg80211_reg_can_beacon(&rdev->wiphy, &ibss.chandef)) + if (!cfg80211_reg_can_beacon(&rdev->wiphy, &ibss.chandef, + NL80211_IFTYPE_ADHOC)) return -EINVAL; switch (ibss.chandef.width) { @@ -6693,6 +6893,101 @@ static int nl80211_set_mcast_rate(struct sk_buff *skb, struct genl_info *info) return err; } +static struct sk_buff * +__cfg80211_alloc_vendor_skb(struct cfg80211_registered_device *rdev, + int approxlen, u32 portid, u32 seq, + enum nl80211_commands cmd, + enum nl80211_attrs attr, + const struct nl80211_vendor_cmd_info *info, + gfp_t gfp) +{ + struct sk_buff *skb; + void *hdr; + struct nlattr *data; + + skb = nlmsg_new(approxlen + 100, gfp); + if (!skb) + return NULL; + + hdr = nl80211hdr_put(skb, portid, seq, 0, cmd); + if (!hdr) { + kfree_skb(skb); + return NULL; + } + + if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx)) + goto nla_put_failure; + + if (info) { + if (nla_put_u32(skb, NL80211_ATTR_VENDOR_ID, + info->vendor_id)) + goto nla_put_failure; + if (nla_put_u32(skb, NL80211_ATTR_VENDOR_SUBCMD, + info->subcmd)) + goto nla_put_failure; + } + + data = nla_nest_start(skb, attr); + + ((void **)skb->cb)[0] = rdev; + ((void **)skb->cb)[1] = hdr; + ((void **)skb->cb)[2] = data; + + return skb; + + nla_put_failure: + kfree_skb(skb); + return NULL; +} + +struct sk_buff *__cfg80211_alloc_event_skb(struct wiphy *wiphy, + enum nl80211_commands cmd, + enum nl80211_attrs attr, + int vendor_event_idx, + int approxlen, gfp_t gfp) +{ + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + const struct nl80211_vendor_cmd_info *info; + + switch (cmd) { + case NL80211_CMD_TESTMODE: + if (WARN_ON(vendor_event_idx != -1)) + return NULL; + info = NULL; + break; + case NL80211_CMD_VENDOR: + if (WARN_ON(vendor_event_idx < 0 || + vendor_event_idx >= wiphy->n_vendor_events)) + return NULL; + info = &wiphy->vendor_events[vendor_event_idx]; + break; + default: + WARN_ON(1); + return NULL; + } + + return __cfg80211_alloc_vendor_skb(rdev, approxlen, 0, 0, + cmd, attr, info, gfp); +} +EXPORT_SYMBOL(__cfg80211_alloc_event_skb); + +void __cfg80211_send_event_skb(struct sk_buff *skb, gfp_t gfp) +{ + struct cfg80211_registered_device *rdev = ((void **)skb->cb)[0]; + void *hdr = ((void **)skb->cb)[1]; + struct nlattr *data = ((void **)skb->cb)[2]; + enum nl80211_multicast_groups mcgrp = NL80211_MCGRP_TESTMODE; + + nla_nest_end(skb, data); + genlmsg_end(skb, hdr); + + if (data->nla_type == NL80211_ATTR_VENDOR_DATA) + mcgrp = NL80211_MCGRP_VENDOR; + + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), skb, 0, + mcgrp, gfp); +} +EXPORT_SYMBOL(__cfg80211_send_event_skb); #ifdef CONFIG_NL80211_TESTMODE static int nl80211_testmode_do(struct sk_buff *skb, struct genl_info *info) @@ -6717,11 +7012,11 @@ static int nl80211_testmode_do(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NL80211_ATTR_TESTDATA]) return -EINVAL; - rdev->testmode_info = info; + rdev->cur_cmd_info = info; err = rdev_testmode_cmd(rdev, wdev, nla_data(info->attrs[NL80211_ATTR_TESTDATA]), nla_len(info->attrs[NL80211_ATTR_TESTDATA])); - rdev->testmode_info = NULL; + rdev->cur_cmd_info = NULL; return err; } @@ -6820,93 +7115,6 @@ static int nl80211_testmode_dump(struct sk_buff *skb, rtnl_unlock(); return err; } - -static struct sk_buff * -__cfg80211_testmode_alloc_skb(struct cfg80211_registered_device *rdev, - int approxlen, u32 portid, u32 seq, gfp_t gfp) -{ - struct sk_buff *skb; - void *hdr; - struct nlattr *data; - 
- skb = nlmsg_new(approxlen + 100, gfp); - if (!skb) - return NULL; - - hdr = nl80211hdr_put(skb, portid, seq, 0, NL80211_CMD_TESTMODE); - if (!hdr) { - kfree_skb(skb); - return NULL; - } - - if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx)) - goto nla_put_failure; - data = nla_nest_start(skb, NL80211_ATTR_TESTDATA); - - ((void **)skb->cb)[0] = rdev; - ((void **)skb->cb)[1] = hdr; - ((void **)skb->cb)[2] = data; - - return skb; - - nla_put_failure: - kfree_skb(skb); - return NULL; -} - -struct sk_buff *cfg80211_testmode_alloc_reply_skb(struct wiphy *wiphy, - int approxlen) -{ - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); - - if (WARN_ON(!rdev->testmode_info)) - return NULL; - - return __cfg80211_testmode_alloc_skb(rdev, approxlen, - rdev->testmode_info->snd_portid, - rdev->testmode_info->snd_seq, - GFP_KERNEL); -} -EXPORT_SYMBOL(cfg80211_testmode_alloc_reply_skb); - -int cfg80211_testmode_reply(struct sk_buff *skb) -{ - struct cfg80211_registered_device *rdev = ((void **)skb->cb)[0]; - void *hdr = ((void **)skb->cb)[1]; - struct nlattr *data = ((void **)skb->cb)[2]; - - if (WARN_ON(!rdev->testmode_info)) { - kfree_skb(skb); - return -EINVAL; - } - - nla_nest_end(skb, data); - genlmsg_end(skb, hdr); - return genlmsg_reply(skb, rdev->testmode_info); -} -EXPORT_SYMBOL(cfg80211_testmode_reply); - -struct sk_buff *cfg80211_testmode_alloc_event_skb(struct wiphy *wiphy, - int approxlen, gfp_t gfp) -{ - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); - - return __cfg80211_testmode_alloc_skb(rdev, approxlen, 0, 0, gfp); -} -EXPORT_SYMBOL(cfg80211_testmode_alloc_event_skb); - -void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp) -{ - struct cfg80211_registered_device *rdev = ((void **)skb->cb)[0]; - void *hdr = ((void **)skb->cb)[1]; - struct nlattr *data = ((void **)skb->cb)[2]; - - nla_nest_end(skb, data); - genlmsg_end(skb, hdr); - genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), skb, 0, - NL80211_MCGRP_TESTMODE, gfp); -} -EXPORT_SYMBOL(cfg80211_testmode_event); #endif static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) @@ -6958,6 +7166,9 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_MAC]) connect.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); + else if (info->attrs[NL80211_ATTR_MAC_HINT]) + connect.bssid_hint = + nla_data(info->attrs[NL80211_ATTR_MAC_HINT]); connect.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); connect.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); @@ -6976,11 +7187,14 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) { - connect.channel = - ieee80211_get_channel(wiphy, - nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); - if (!connect.channel || - connect.channel->flags & IEEE80211_CHAN_DISABLED) + connect.channel = nl80211_get_valid_chan( + wiphy, info->attrs[NL80211_ATTR_WIPHY_FREQ]); + if (!connect.channel) + return -EINVAL; + } else if (info->attrs[NL80211_ATTR_WIPHY_FREQ_HINT]) { + connect.channel_hint = nl80211_get_valid_chan( + wiphy, info->attrs[NL80211_ATTR_WIPHY_FREQ_HINT]); + if (!connect.channel_hint) return -EINVAL; } @@ -7147,6 +7361,7 @@ static int nl80211_tdls_mgmt(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; u8 action_code, dialog_token; + u32 peer_capability = 0; u16 status_code; u8 *peer; @@ -7165,9 +7380,12 @@ static int 
nl80211_tdls_mgmt(struct sk_buff *skb, struct genl_info *info) action_code = nla_get_u8(info->attrs[NL80211_ATTR_TDLS_ACTION]); status_code = nla_get_u16(info->attrs[NL80211_ATTR_STATUS_CODE]); dialog_token = nla_get_u8(info->attrs[NL80211_ATTR_TDLS_DIALOG_TOKEN]); + if (info->attrs[NL80211_ATTR_TDLS_PEER_CAPABILITY]) + peer_capability = + nla_get_u32(info->attrs[NL80211_ATTR_TDLS_PEER_CAPABILITY]); return rdev_tdls_mgmt(rdev, dev, peer, action_code, - dialog_token, status_code, + dialog_token, status_code, peer_capability, nla_data(info->attrs[NL80211_ATTR_IE]), nla_len(info->attrs[NL80211_ATTR_IE])); } @@ -7328,11 +7546,73 @@ static bool ht_rateset_to_mask(struct ieee80211_supported_band *sband, return true; } +static u16 vht_mcs_map_to_mcs_mask(u8 vht_mcs_map) +{ + u16 mcs_mask = 0; + + switch (vht_mcs_map) { + case IEEE80211_VHT_MCS_NOT_SUPPORTED: + break; + case IEEE80211_VHT_MCS_SUPPORT_0_7: + mcs_mask = 0x00FF; + break; + case IEEE80211_VHT_MCS_SUPPORT_0_8: + mcs_mask = 0x01FF; + break; + case IEEE80211_VHT_MCS_SUPPORT_0_9: + mcs_mask = 0x03FF; + break; + default: + break; + } + + return mcs_mask; +} + +static void vht_build_mcs_mask(u16 vht_mcs_map, + u16 vht_mcs_mask[NL80211_VHT_NSS_MAX]) +{ + u8 nss; + + for (nss = 0; nss < NL80211_VHT_NSS_MAX; nss++) { + vht_mcs_mask[nss] = vht_mcs_map_to_mcs_mask(vht_mcs_map & 0x03); + vht_mcs_map >>= 2; + } +} + +static bool vht_set_mcs_mask(struct ieee80211_supported_band *sband, + struct nl80211_txrate_vht *txrate, + u16 mcs[NL80211_VHT_NSS_MAX]) +{ + u16 tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map); + u16 tx_mcs_mask[NL80211_VHT_NSS_MAX] = {}; + u8 i; + + if (!sband->vht_cap.vht_supported) + return false; + + memset(mcs, 0, sizeof(u16) * NL80211_VHT_NSS_MAX); + + /* Build vht_mcs_mask from VHT capabilities */ + vht_build_mcs_mask(tx_mcs_map, tx_mcs_mask); + + for (i = 0; i < NL80211_VHT_NSS_MAX; i++) { + if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i]) + mcs[i] = txrate->mcs[i]; + else + return false; + } + + return true; +} + static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY, .len = NL80211_MAX_SUPP_RATES }, - [NL80211_TXRATE_MCS] = { .type = NLA_BINARY, - .len = NL80211_MAX_SUPP_HT_RATES }, + [NL80211_TXRATE_HT] = { .type = NLA_BINARY, + .len = NL80211_MAX_SUPP_HT_RATES }, + [NL80211_TXRATE_VHT] = { .len = sizeof(struct nl80211_txrate_vht)}, + [NL80211_TXRATE_GI] = { .type = NLA_U8 }, }; static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, @@ -7345,9 +7625,7 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, struct net_device *dev = info->user_ptr[1]; struct nlattr *tx_rates; struct ieee80211_supported_band *sband; - - if (info->attrs[NL80211_ATTR_TX_RATES] == NULL) - return -EINVAL; + u16 vht_tx_mcs_map; if (!rdev->ops->set_bitrate_mask) return -EOPNOTSUPP; @@ -7356,32 +7634,44 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, /* Default to all rates enabled */ for (i = 0; i < IEEE80211_NUM_BANDS; i++) { sband = rdev->wiphy.bands[i]; - mask.control[i].legacy = - sband ? 
(1 << sband->n_bitrates) - 1 : 0; - if (sband) - memcpy(mask.control[i].mcs, - sband->ht_cap.mcs.rx_mask, - sizeof(mask.control[i].mcs)); - else - memset(mask.control[i].mcs, 0, - sizeof(mask.control[i].mcs)); + + if (!sband) + continue; + + mask.control[i].legacy = (1 << sband->n_bitrates) - 1; + memcpy(mask.control[i].ht_mcs, + sband->ht_cap.mcs.rx_mask, + sizeof(mask.control[i].ht_mcs)); + + if (!sband->vht_cap.vht_supported) + continue; + + vht_tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map); + vht_build_mcs_mask(vht_tx_mcs_map, mask.control[i].vht_mcs); } + /* if no rates are given set it back to the defaults */ + if (!info->attrs[NL80211_ATTR_TX_RATES]) + goto out; + /* * The nested attribute uses enum nl80211_band as the index. This maps * directly to the enum ieee80211_band values used in cfg80211. */ BUILD_BUG_ON(NL80211_MAX_SUPP_HT_RATES > IEEE80211_HT_MCS_MASK_LEN * 8); - nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) - { + nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) { enum ieee80211_band band = nla_type(tx_rates); + int err; + if (band < 0 || band >= IEEE80211_NUM_BANDS) return -EINVAL; sband = rdev->wiphy.bands[band]; if (sband == NULL) return -EINVAL; - nla_parse(tb, NL80211_TXRATE_MAX, nla_data(tx_rates), - nla_len(tx_rates), nl80211_txattr_policy); + err = nla_parse(tb, NL80211_TXRATE_MAX, nla_data(tx_rates), + nla_len(tx_rates), nl80211_txattr_policy); + if (err) + return err; if (tb[NL80211_TXRATE_LEGACY]) { mask.control[band].legacy = rateset_to_mask( sband, @@ -7391,31 +7681,50 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, nla_len(tb[NL80211_TXRATE_LEGACY])) return -EINVAL; } - if (tb[NL80211_TXRATE_MCS]) { + if (tb[NL80211_TXRATE_HT]) { if (!ht_rateset_to_mask( sband, - nla_data(tb[NL80211_TXRATE_MCS]), - nla_len(tb[NL80211_TXRATE_MCS]), - mask.control[band].mcs)) + nla_data(tb[NL80211_TXRATE_HT]), + nla_len(tb[NL80211_TXRATE_HT]), + mask.control[band].ht_mcs)) + return -EINVAL; + } + if (tb[NL80211_TXRATE_VHT]) { + if (!vht_set_mcs_mask( + sband, + nla_data(tb[NL80211_TXRATE_VHT]), + mask.control[band].vht_mcs)) + return -EINVAL; + } + if (tb[NL80211_TXRATE_GI]) { + mask.control[band].gi = + nla_get_u8(tb[NL80211_TXRATE_GI]); + if (mask.control[band].gi > NL80211_TXRATE_FORCE_LGI) return -EINVAL; } if (mask.control[band].legacy == 0) { - /* don't allow empty legacy rates if HT - * is not even supported. */ - if (!rdev->wiphy.bands[band]->ht_cap.ht_supported) + /* don't allow empty legacy rates if HT or VHT + * are not even supported. 
+ */ + if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported || + rdev->wiphy.bands[band]->vht_cap.vht_supported)) return -EINVAL; for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) - if (mask.control[band].mcs[i]) - break; + if (mask.control[band].ht_mcs[i]) + goto out; + + for (i = 0; i < NL80211_VHT_NSS_MAX; i++) + if (mask.control[band].vht_mcs[i]) + goto out; /* legacy and mcs rates may not be both empty */ - if (i == IEEE80211_HT_MCS_MASK_LEN) - return -EINVAL; + return -EINVAL; } } +out: return rdev_set_bitrate_mask(rdev, dev, NULL, &mask); } @@ -7525,6 +7834,27 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) if (!chandef.chan && params.offchan) return -EINVAL; + params.buf = nla_data(info->attrs[NL80211_ATTR_FRAME]); + params.len = nla_len(info->attrs[NL80211_ATTR_FRAME]); + + if (info->attrs[NL80211_ATTR_CSA_C_OFFSETS_TX]) { + int len = nla_len(info->attrs[NL80211_ATTR_CSA_C_OFFSETS_TX]); + int i; + + if (len % sizeof(u16)) + return -EINVAL; + + params.n_csa_offsets = len / sizeof(u16); + params.csa_offsets = + nla_data(info->attrs[NL80211_ATTR_CSA_C_OFFSETS_TX]); + + /* check that all the offsets fit the frame */ + for (i = 0; i < params.n_csa_offsets; i++) { + if (params.csa_offsets[i] >= params.len) + return -EINVAL; + } + } + if (!params.dont_wait_for_ack) { msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) @@ -7538,8 +7868,6 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) } } - params.buf = nla_data(info->attrs[NL80211_ATTR_FRAME]); - params.len = nla_len(info->attrs[NL80211_ATTR_FRAME]); params.chan = chandef.chan; err = cfg80211_mlme_mgmt_tx(rdev, wdev, &params, &cookie); if (err) @@ -7669,8 +7997,8 @@ static int nl80211_get_power_save(struct sk_buff *skb, struct genl_info *info) return err; } -static struct nla_policy -nl80211_attr_cqm_policy[NL80211_ATTR_CQM_MAX + 1] __read_mostly = { +static const struct nla_policy +nl80211_attr_cqm_policy[NL80211_ATTR_CQM_MAX + 1] = { [NL80211_ATTR_CQM_RSSI_THOLD] = { .type = NLA_U32 }, [NL80211_ATTR_CQM_RSSI_HYST] = { .type = NLA_U32 }, [NL80211_ATTR_CQM_RSSI_THRESHOLD_EVENT] = { .type = NLA_U32 }, @@ -8236,6 +8564,8 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) nla_for_each_nested(pat, tb[NL80211_WOWLAN_TRIG_PKT_PATTERN], rem) { + u8 *mask_pat; + nla_parse(pat_tb, MAX_NL80211_PKTPAT,
nla_data(pat), nla_len(pat), NULL); if (!pat_tb[NL80211_PKTPAT_MASK] || @@ -8484,17 +8815,19 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev, return -EINVAL; new_rule->patterns[i].pkt_offset = pkt_offset; - new_rule->patterns[i].mask = - kmalloc(mask_len + pat_len, GFP_KERNEL); - if (!new_rule->patterns[i].mask) + mask_pat = kmalloc(mask_len + pat_len, GFP_KERNEL); + if (!mask_pat) return -ENOMEM; - new_rule->patterns[i].pattern = - new_rule->patterns[i].mask + mask_len; - memcpy(new_rule->patterns[i].mask, - nla_data(pat_tb[NL80211_PKTPAT_MASK]), mask_len); + + new_rule->patterns[i].mask = mask_pat; + memcpy(mask_pat, nla_data(pat_tb[NL80211_PKTPAT_MASK]), + mask_len); + + mask_pat += mask_len; + new_rule->patterns[i].pattern = mask_pat; new_rule->patterns[i].pattern_len = pat_len; - memcpy(new_rule->patterns[i].pattern, - nla_data(pat_tb[NL80211_PKTPAT_PATTERN]), pat_len); + memcpy(mask_pat, nla_data(pat_tb[NL80211_PKTPAT_PATTERN]), + pat_len); i++; } @@ -8739,9 +9072,8 @@ static int nl80211_start_p2p_device(struct sk_buff *skb, struct genl_info *info) if (wdev->p2p_started) return 0; - err = cfg80211_can_add_interface(rdev, wdev->iftype); - if (err) - return err; + if (rfkill_blocked(rdev->rfkill)) + return -ERFKILL; err = rdev_start_p2p_device(rdev, wdev); if (err) @@ -8875,6 +9207,162 @@ static int nl80211_crit_protocol_stop(struct sk_buff *skb, return 0; } +static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct wireless_dev *wdev = + __cfg80211_wdev_from_attrs(genl_info_net(info), info->attrs); + int i, err; + u32 vid, subcmd; + + if (!rdev->wiphy.vendor_commands) + return -EOPNOTSUPP; + + if (IS_ERR(wdev)) { + err = PTR_ERR(wdev); + if (err != -EINVAL) + return err; + wdev = NULL; + } else if (wdev->wiphy != &rdev->wiphy) { + return -EINVAL; + } + + if (!info->attrs[NL80211_ATTR_VENDOR_ID] || + !info->attrs[NL80211_ATTR_VENDOR_SUBCMD]) + return -EINVAL; + + vid = nla_get_u32(info->attrs[NL80211_ATTR_VENDOR_ID]); + subcmd = nla_get_u32(info->attrs[NL80211_ATTR_VENDOR_SUBCMD]); + for (i = 0; i < rdev->wiphy.n_vendor_commands; i++) { + const struct wiphy_vendor_command *vcmd; + void *data = NULL; + int len = 0; + + vcmd = &rdev->wiphy.vendor_commands[i]; + + if (vcmd->info.vendor_id != vid || vcmd->info.subcmd != subcmd) + continue; + + if (vcmd->flags & (WIPHY_VENDOR_CMD_NEED_WDEV | + WIPHY_VENDOR_CMD_NEED_NETDEV)) { + if (!wdev) + return -EINVAL; + if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_NETDEV && + !wdev->netdev) + return -EINVAL; + + if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_RUNNING) { + if (wdev->netdev && + !netif_running(wdev->netdev)) + return -ENETDOWN; + if (!wdev->netdev && !wdev->p2p_started) + return -ENETDOWN; + } + } else { + wdev = NULL; + } + + if (info->attrs[NL80211_ATTR_VENDOR_DATA]) { + data = nla_data(info->attrs[NL80211_ATTR_VENDOR_DATA]); + len = nla_len(info->attrs[NL80211_ATTR_VENDOR_DATA]); + } + + rdev->cur_cmd_info = info; + err = rdev->wiphy.vendor_commands[i].doit(&rdev->wiphy, wdev, + data, len); + rdev->cur_cmd_info = NULL; + return err; + } + + return -EOPNOTSUPP; +} + +struct sk_buff *__cfg80211_alloc_reply_skb(struct wiphy *wiphy, + enum nl80211_commands cmd, + enum nl80211_attrs attr, + int approxlen) +{ + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + + if (WARN_ON(!rdev->cur_cmd_info)) + return NULL; + + return __cfg80211_alloc_vendor_skb(rdev, approxlen, + rdev->cur_cmd_info->snd_portid, + 
rdev->cur_cmd_info->snd_seq, + cmd, attr, NULL, GFP_KERNEL); +} +EXPORT_SYMBOL(__cfg80211_alloc_reply_skb); + +int cfg80211_vendor_cmd_reply(struct sk_buff *skb) +{ + struct cfg80211_registered_device *rdev = ((void **)skb->cb)[0]; + void *hdr = ((void **)skb->cb)[1]; + struct nlattr *data = ((void **)skb->cb)[2]; + + if (WARN_ON(!rdev->cur_cmd_info)) { + kfree_skb(skb); + return -EINVAL; + } + + nla_nest_end(skb, data); + genlmsg_end(skb, hdr); + return genlmsg_reply(skb, rdev->cur_cmd_info); +} +EXPORT_SYMBOL_GPL(cfg80211_vendor_cmd_reply); + + +static int nl80211_set_qos_map(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct cfg80211_qos_map *qos_map = NULL; + struct net_device *dev = info->user_ptr[1]; + u8 *pos, len, num_des, des_len, des; + int ret; + + if (!rdev->ops->set_qos_map) + return -EOPNOTSUPP; + + if (info->attrs[NL80211_ATTR_QOS_MAP]) { + pos = nla_data(info->attrs[NL80211_ATTR_QOS_MAP]); + len = nla_len(info->attrs[NL80211_ATTR_QOS_MAP]); + + if (len % 2 || len < IEEE80211_QOS_MAP_LEN_MIN || + len > IEEE80211_QOS_MAP_LEN_MAX) + return -EINVAL; + + qos_map = kzalloc(sizeof(struct cfg80211_qos_map), GFP_KERNEL); + if (!qos_map) + return -ENOMEM; + + num_des = (len - IEEE80211_QOS_MAP_LEN_MIN) >> 1; + if (num_des) { + des_len = num_des * + sizeof(struct cfg80211_dscp_exception); + memcpy(qos_map->dscp_exception, pos, des_len); + qos_map->num_des = num_des; + for (des = 0; des < num_des; des++) { + if (qos_map->dscp_exception[des].up > 7) { + kfree(qos_map); + return -EINVAL; + } + } + pos += des_len; + } + memcpy(qos_map->up, pos, IEEE80211_QOS_MAP_LEN_MIN); + } + + wdev_lock(dev->ieee80211_ptr); + ret = nl80211_key_allowed(dev->ieee80211_ptr); + if (!ret) + ret = rdev_set_qos_map(rdev, dev, qos_map); + wdev_unlock(dev->ieee80211_ptr); + + kfree(qos_map); + return ret; +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -8918,7 +9406,7 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, } dev = wdev->netdev; - rdev = wiphy_to_dev(wdev->wiphy); + rdev = wiphy_to_rdev(wdev->wiphy); if (ops->internal_flags & NL80211_FLAG_NEED_NETDEV) { if (!dev) { @@ -9599,20 +10087,40 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_VENDOR, + .doit = nl80211_vendor_cmd, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WIPHY | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_SET_QOS_MAP, + .doit = nl80211_set_qos_map, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, }; /* notification functions */ -void nl80211_notify_dev_rename(struct cfg80211_registered_device *rdev) +void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev, + enum nl80211_commands cmd) { struct sk_buff *msg; struct nl80211_dump_wiphy_state state = {}; + WARN_ON(cmd != NL80211_CMD_NEW_WIPHY && + cmd != NL80211_CMD_DEL_WIPHY); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; - if (nl80211_send_wiphy(rdev, msg, 0, 0, 0, &state) < 0) { + if (nl80211_send_wiphy(rdev, cmd, msg, 0, 0, 0, &state) < 0) { nlmsg_free(msg); return; } @@ -9732,40 +10240,31 @@ void nl80211_send_scan_start(struct cfg80211_registered_device *rdev, NL80211_MCGRP_SCAN, GFP_KERNEL); } -void nl80211_send_scan_done(struct 
cfg80211_registered_device *rdev, - struct wireless_dev *wdev) +struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, bool aborted) { struct sk_buff *msg; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) - return; + return NULL; if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0, - NL80211_CMD_NEW_SCAN_RESULTS) < 0) { + aborted ? NL80211_CMD_SCAN_ABORTED : + NL80211_CMD_NEW_SCAN_RESULTS) < 0) { nlmsg_free(msg); - return; + return NULL; } - genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, - NL80211_MCGRP_SCAN, GFP_KERNEL); + return msg; } -void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev) +void nl80211_send_scan_result(struct cfg80211_registered_device *rdev, + struct sk_buff *msg) { - struct sk_buff *msg; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; - if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0, - NL80211_CMD_SCAN_ABORTED) < 0) { - nlmsg_free(msg); - return; - } - genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, NL80211_MCGRP_SCAN, GFP_KERNEL); } @@ -9940,7 +10439,7 @@ void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf, { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); const struct ieee80211_mgmt *mgmt = (void *)buf; u32 cmd; @@ -10162,7 +10661,7 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr, const u8* ie, u8 ie_len, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct sk_buff *msg; void *hdr; @@ -10342,7 +10841,7 @@ void cfg80211_ready_on_channel(struct wireless_dev *wdev, u64 cookie, unsigned int duration, gfp_t gfp) { struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); trace_cfg80211_ready_on_channel(wdev, cookie, chan, duration); nl80211_send_remain_on_chan_event(NL80211_CMD_REMAIN_ON_CHANNEL, @@ -10356,7 +10855,7 @@ void cfg80211_remain_on_channel_expired(struct wireless_dev *wdev, u64 cookie, gfp_t gfp) { struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); trace_cfg80211_ready_on_channel_expired(wdev, cookie, chan); nl80211_send_remain_on_chan_event(NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, @@ -10368,7 +10867,7 @@ void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr, struct station_info *sinfo, gfp_t gfp) { struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct sk_buff *msg; trace_cfg80211_new_sta(dev, mac_addr, sinfo); @@ -10391,7 +10890,7 @@ EXPORT_SYMBOL(cfg80211_new_sta); void cfg80211_del_sta(struct net_device *dev, const u8 *mac_addr, gfp_t gfp) { struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct sk_buff *msg; void *hdr; @@ -10428,7 +10927,7 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, gfp_t 
gfp) { struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct sk_buff *msg; void *hdr; @@ -10463,7 +10962,7 @@ static bool __nl80211_unexpected_frame(struct net_device *dev, u8 cmd, const u8 *addr, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct sk_buff *msg; void *hdr; u32 nlportid = ACCESS_ONCE(wdev->ap_unexpected_nlportid); @@ -10583,7 +11082,7 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, const u8 *buf, size_t len, bool ack, gfp_t gfp) { struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct net_device *netdev = wdev->netdev; struct sk_buff *msg; void *hdr; @@ -10627,7 +11126,7 @@ void cfg80211_cqm_rssi_notify(struct net_device *dev, { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct sk_buff *msg; struct nlattr *pinfoattr; void *hdr; @@ -10719,7 +11218,7 @@ void cfg80211_gtk_rekey_notify(struct net_device *dev, const u8 *bssid, { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); trace_cfg80211_gtk_rekey_notify(dev, bssid); nl80211_gtk_rekey_notify(rdev, dev, bssid, replay_ctr, gfp); @@ -10777,7 +11276,7 @@ void cfg80211_pmksa_candidate_notify(struct net_device *dev, int index, { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); trace_cfg80211_pmksa_candidate_notify(dev, index, bssid, preauth); nl80211_pmksa_candidate_notify(rdev, dev, index, bssid, preauth, gfp); @@ -10824,7 +11323,7 @@ void cfg80211_ch_switch_notify(struct net_device *dev, { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); ASSERT_WDEV_LOCK(wdev); @@ -10836,7 +11335,8 @@ void cfg80211_ch_switch_notify(struct net_device *dev, wdev->iftype != NL80211_IFTYPE_MESH_POINT)) return; - wdev->channel = chandef->chan; + wdev->chandef = *chandef; + wdev->preset_chandef = *chandef; nl80211_ch_switch_notify(rdev, dev, chandef, GFP_KERNEL); } EXPORT_SYMBOL(cfg80211_ch_switch_notify); @@ -10847,7 +11347,7 @@ void cfg80211_cqm_txe_notify(struct net_device *dev, { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct sk_buff *msg; struct nlattr *pinfoattr; void *hdr; @@ -10947,7 +11447,7 @@ void cfg80211_cqm_pktloss_notify(struct net_device *dev, { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct sk_buff *msg; struct 
nlattr *pinfoattr; void *hdr; @@ -10994,7 +11494,7 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, u64 cookie, bool acked, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct sk_buff *msg; void *hdr; @@ -11034,7 +11534,7 @@ void cfg80211_report_obss_beacon(struct wiphy *wiphy, const u8 *frame, size_t len, int freq, int sig_dbm) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct sk_buff *msg; void *hdr; struct cfg80211_beacon_registration *reg; @@ -11081,7 +11581,7 @@ void cfg80211_report_wowlan_wakeup(struct wireless_dev *wdev, struct cfg80211_wowlan_wakeup *wakeup, gfp_t gfp) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct sk_buff *msg; void *hdr; int size = 200; @@ -11191,7 +11691,7 @@ void cfg80211_tdls_oper_request(struct net_device *dev, const u8 *peer, u16 reason_code, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct sk_buff *msg; void *hdr; @@ -11243,9 +11743,15 @@ static int nl80211_netlink_notify(struct notifier_block * nb, rcu_read_lock(); list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list) { - list_for_each_entry_rcu(wdev, &rdev->wdev_list, list) + bool schedule_destroy_work = false; + + list_for_each_entry_rcu(wdev, &rdev->wdev_list, list) { cfg80211_mlme_unregister_socket(wdev, notify->portid); + if (wdev->owner_nlportid == notify->portid) + schedule_destroy_work = true; + } + spin_lock_bh(&rdev->beacon_registrations_lock); list_for_each_entry_safe(reg, tmp, &rdev->beacon_registrations, list) { @@ -11256,11 +11762,24 @@ static int nl80211_netlink_notify(struct notifier_block * nb, } } spin_unlock_bh(&rdev->beacon_registrations_lock); + + if (schedule_destroy_work) { + struct cfg80211_iface_destroy *destroy; + + destroy = kzalloc(sizeof(*destroy), GFP_ATOMIC); + if (destroy) { + destroy->nlportid = notify->portid; + spin_lock(&rdev->destroy_list_lock); + list_add(&destroy->list, &rdev->destroy_list); + spin_unlock(&rdev->destroy_list_lock); + schedule_work(&rdev->destroy_work); + } + } } rcu_read_unlock(); - return NOTIFY_DONE; + return NOTIFY_OK; } static struct notifier_block nl80211_netlink_notifier = { @@ -11271,7 +11790,7 @@ void cfg80211_ft_event(struct net_device *netdev, struct cfg80211_ft_event_params *ft_event) { struct wiphy *wiphy = netdev->ieee80211_ptr->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct sk_buff *msg; void *hdr; @@ -11318,7 +11837,7 @@ void cfg80211_crit_proto_stopped(struct wireless_dev *wdev, gfp_t gfp) void *hdr; u32 nlportid; - rdev = wiphy_to_dev(wdev->wiphy); + rdev = wiphy_to_rdev(wdev->wiphy); if (!rdev->crit_proto_nlportid) return; @@ -11350,6 +11869,35 @@ void cfg80211_crit_proto_stopped(struct wireless_dev *wdev, gfp_t gfp) } EXPORT_SYMBOL(cfg80211_crit_proto_stopped); +void nl80211_send_ap_stopped(struct wireless_dev *wdev) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, 
GFP_KERNEL); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_STOP_AP); + if (!hdr) + goto out; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u32(msg, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex) || + nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev))) + goto out; + + genlmsg_end(msg, hdr); + + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(wiphy), msg, 0, + NL80211_MCGRP_MLME, GFP_KERNEL); + return; + out: + nlmsg_free(msg); +} + /* initialisation/exit functions */ int nl80211_init(void) diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index b1b231324e1..49c9a482dd1 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -5,13 +5,14 @@ int nl80211_init(void); void nl80211_exit(void); -void nl80211_notify_dev_rename(struct cfg80211_registered_device *rdev); +void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev, + enum nl80211_commands cmd); void nl80211_send_scan_start(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); -void nl80211_send_scan_done(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev); -void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev); +struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, bool aborted); +void nl80211_send_scan_result(struct cfg80211_registered_device *rdev, + struct sk_buff *msg); void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev, struct net_device *netdev, u32 cmd); void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev, @@ -74,6 +75,8 @@ nl80211_radar_notify(struct cfg80211_registered_device *rdev, enum nl80211_radar_event event, struct net_device *netdev, gfp_t gfp); +void nl80211_send_ap_stopped(struct wireless_dev *wdev); + void cfg80211_rdev_free_coalesce(struct cfg80211_registered_device *rdev); #endif /* __NET_WIRELESS_NL80211_H */ diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c index a271c27fac7..722da616438 100644 --- a/net/wireless/radiotap.c +++ b/net/wireless/radiotap.c @@ -124,6 +124,10 @@ int ieee80211_radiotap_iterator_init( /* find payload start allowing for extended bitmap(s) */ if (iterator->_bitmap_shifter & (1<<IEEE80211_RADIOTAP_EXT)) { + if ((unsigned long)iterator->_arg - + (unsigned long)iterator->_rtheader + sizeof(uint32_t) > + (unsigned long)iterator->_max_length) + return -EINVAL; while (get_unaligned_le32(iterator->_arg) & (1 << IEEE80211_RADIOTAP_EXT)) { iterator->_arg += sizeof(uint32_t); diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index a6c03ab14a0..d95bbe34813 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -199,7 +199,7 @@ static inline int rdev_change_station(struct cfg80211_registered_device *rdev, } static inline int rdev_get_station(struct cfg80211_registered_device *rdev, - struct net_device *dev, u8 *mac, + struct net_device *dev, const u8 *mac, struct station_info *sinfo) { int ret; @@ -769,13 +769,16 @@ static inline int rdev_set_rekey_data(struct cfg80211_registered_device *rdev, static inline int rdev_tdls_mgmt(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *peer, u8 action_code, u8 dialog_token, - u16 status_code, const u8 *buf, size_t len) + u16 status_code, u32 peer_capability, + const u8 *buf, size_t len) { int ret; trace_rdev_tdls_mgmt(&rdev->wiphy, dev, peer, action_code, - dialog_token, status_code, buf, len); + dialog_token, status_code, peer_capability, + buf, 
len); ret = rdev->ops->tdls_mgmt(&rdev->wiphy, dev, peer, action_code, - dialog_token, status_code, buf, len); + dialog_token, status_code, peer_capability, + buf, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } @@ -932,4 +935,32 @@ static inline int rdev_channel_switch(struct cfg80211_registered_device *rdev, return ret; } +static inline int rdev_set_qos_map(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_qos_map *qos_map) +{ + int ret = -EOPNOTSUPP; + + if (rdev->ops->set_qos_map) { + trace_rdev_set_qos_map(&rdev->wiphy, dev, qos_map); + ret = rdev->ops->set_qos_map(&rdev->wiphy, dev, qos_map); + trace_rdev_return_int(&rdev->wiphy, ret); + } + + return ret; +} + +static inline int +rdev_set_ap_chanwidth(struct cfg80211_registered_device *rdev, + struct net_device *dev, struct cfg80211_chan_def *chandef) +{ + int ret; + + trace_rdev_set_ap_chanwidth(&rdev->wiphy, dev, chandef); + ret = rdev->ops->set_ap_chanwidth(&rdev->wiphy, dev, chandef); + trace_rdev_return_int(&rdev->wiphy, ret); + + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index ec54e1aac8e..1afdf45db38 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -65,11 +65,26 @@ #define REG_DBG_PRINT(args...) #endif +/** + * enum reg_request_treatment - regulatory request treatment + * + * @REG_REQ_OK: continue processing the regulatory request + * @REG_REQ_IGNORE: ignore the regulatory request + * @REG_REQ_INTERSECT: the regulatory domain resulting from this request should + * be intersected with the current one. + * @REG_REQ_ALREADY_SET: the regulatory request will not change the current + * regulatory settings, and no further processing is required. + * @REG_REQ_USER_HINT_HANDLED: a non alpha2 user hint was handled and no + * further processing is required, i.e., not need to update last_request + * etc. This should be used for user hints that do not provide an alpha2 + * but some other type of regulatory hint, i.e., indoor operation. + */ enum reg_request_treatment { REG_REQ_OK, REG_REQ_IGNORE, REG_REQ_INTERSECT, REG_REQ_ALREADY_SET, + REG_REQ_USER_HINT_HANDLED, }; static struct regulatory_request core_request_world = { @@ -91,10 +106,6 @@ static struct regulatory_request __rcu *last_request = /* To trigger userspace events */ static struct platform_device *reg_pdev; -static struct device_type reg_device_type = { - .uevent = reg_device_uevent, -}; - /* * Central wireless core regulatory domains, we only need two, * the current one and a world regulatory domain in case we have no @@ -110,6 +121,14 @@ const struct ieee80211_regdomain __rcu *cfg80211_regdomain; */ static int reg_num_devs_support_basehint; +/* + * State variable indicating if the platform on which the devices + * are attached is operating in an indoor environment. The state variable + * is relevant for all registered devices. 
+ * (protected by RTNL) + */ +static bool reg_is_indoor; + static const struct ieee80211_regdomain *get_cfg80211_regdom(void) { return rtnl_dereference(cfg80211_regdomain); @@ -135,6 +154,33 @@ static const char *reg_dfs_region_str(enum nl80211_dfs_regions dfs_region) return "Unknown"; } +enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy) +{ + const struct ieee80211_regdomain *regd = NULL; + const struct ieee80211_regdomain *wiphy_regd = NULL; + + regd = get_cfg80211_regdom(); + if (!wiphy) + goto out; + + wiphy_regd = get_wiphy_regdom(wiphy); + if (!wiphy_regd) + goto out; + + if (wiphy_regd->dfs_region == regd->dfs_region) + goto out; + + REG_DBG_PRINT("%s: device specific dfs_region " + "(%s) disagrees with cfg80211's " + "central dfs_region (%s)\n", + dev_name(&wiphy->dev), + reg_dfs_region_str(wiphy_regd->dfs_region), + reg_dfs_region_str(regd->dfs_region)); + +out: + return regd->dfs_region; +} + static void rcu_free_regdom(const struct ieee80211_regdomain *r) { if (!r) @@ -217,11 +263,15 @@ static char user_alpha2[2]; module_param(ieee80211_regdom, charp, 0444); MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code"); -static void reg_kfree_last_request(void) +static void reg_free_request(struct regulatory_request *request) { - struct regulatory_request *lr; + if (request != get_last_request()) + kfree(request); +} - lr = get_last_request(); +static void reg_free_last_request(void) +{ + struct regulatory_request *lr = get_last_request(); if (lr != &core_request_world && lr) kfree_rcu(lr, rcu_head); @@ -229,7 +279,13 @@ static void reg_kfree_last_request(void) static void reg_update_last_request(struct regulatory_request *request) { - reg_kfree_last_request(); + struct regulatory_request *lr; + + lr = get_last_request(); + if (lr == request) + return; + + reg_free_last_request(); rcu_assign_pointer(last_request, request); } @@ -460,11 +516,16 @@ static inline void reg_regdb_query(const char *alpha2) {} /* * This lets us keep regulatory code which is updated on a regulatory - * basis in userspace. Country information is filled in by - * reg_device_uevent + * basis in userspace. 
*/ static int call_crda(const char *alpha2) { + char country[12]; + char *env[] = { country, NULL }; + + snprintf(country, sizeof(country), "COUNTRY=%c%c", + alpha2[0], alpha2[1]); + if (!is_world_regdom((char *) alpha2)) pr_info("Calling CRDA for country: %c%c\n", alpha2[0], alpha2[1]); @@ -474,7 +535,7 @@ static int call_crda(const char *alpha2) /* query internal regulatory database (if it exists) */ reg_regdb_query(alpha2); - return kobject_uevent(&reg_pdev->dev.kobj, KOBJ_CHANGE); + return kobject_uevent_env(&reg_pdev->dev.kobj, KOBJ_CHANGE, env); } static enum reg_request_treatment @@ -495,6 +556,71 @@ bool reg_is_valid_request(const char *alpha2) return alpha2_equal(lr->alpha2, alpha2); } +static const struct ieee80211_regdomain *reg_get_regdomain(struct wiphy *wiphy) +{ + struct regulatory_request *lr = get_last_request(); + + /* + * Follow the driver's regulatory domain, if present, unless a country + * IE has been processed or a user wants to help compliance further + */ + if (lr->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && + lr->initiator != NL80211_REGDOM_SET_BY_USER && + wiphy->regd) + return get_wiphy_regdom(wiphy); + + return get_cfg80211_regdom(); +} + +unsigned int reg_get_max_bandwidth(const struct ieee80211_regdomain *rd, + const struct ieee80211_reg_rule *rule) +{ + const struct ieee80211_freq_range *freq_range = &rule->freq_range; + const struct ieee80211_freq_range *freq_range_tmp; + const struct ieee80211_reg_rule *tmp; + u32 start_freq, end_freq, idx, no; + + for (idx = 0; idx < rd->n_reg_rules; idx++) + if (rule == &rd->reg_rules[idx]) + break; + + if (idx == rd->n_reg_rules) + return 0; + + /* get start_freq */ + no = idx; + + while (no) { + tmp = &rd->reg_rules[--no]; + freq_range_tmp = &tmp->freq_range; + + if (freq_range_tmp->end_freq_khz < freq_range->start_freq_khz) + break; + + freq_range = freq_range_tmp; + } + + start_freq = freq_range->start_freq_khz; + + /* get end_freq */ + freq_range = &rule->freq_range; + no = idx; + + while (no < rd->n_reg_rules - 1) { + tmp = &rd->reg_rules[++no]; + freq_range_tmp = &tmp->freq_range; + + if (freq_range_tmp->start_freq_khz > freq_range->end_freq_khz) + break; + + freq_range = freq_range_tmp; + } + + end_freq = freq_range->end_freq_khz; + + return end_freq - start_freq; +} + /* Sanity check on a regulatory rule */ static bool is_valid_reg_rule(const struct ieee80211_reg_rule *rule) { @@ -603,7 +729,9 @@ reg_intersect_dfs_region(const enum nl80211_dfs_regions dfs_region1, * Helper for regdom_intersect(), this does the real * mathematical intersection fun */ -static int reg_rules_intersect(const struct ieee80211_reg_rule *rule1, +static int reg_rules_intersect(const struct ieee80211_regdomain *rd1, + const struct ieee80211_regdomain *rd2, + const struct ieee80211_reg_rule *rule1, const struct ieee80211_reg_rule *rule2, struct ieee80211_reg_rule *intersected_rule) { @@ -611,7 +739,7 @@ static int reg_rules_intersect(const struct ieee80211_reg_rule *rule1, struct ieee80211_freq_range *freq_range; const struct ieee80211_power_rule *power_rule1, *power_rule2; struct ieee80211_power_rule *power_rule; - u32 freq_diff; + u32 freq_diff, max_bandwidth1, max_bandwidth2; freq_range1 = &rule1->freq_range; freq_range2 = &rule2->freq_range; @@ -625,8 +753,32 @@ static int reg_rules_intersect(const struct ieee80211_reg_rule *rule1, freq_range2->start_freq_khz); freq_range->end_freq_khz = min(freq_range1->end_freq_khz, freq_range2->end_freq_khz); - freq_range->max_bandwidth_khz = min(freq_range1->max_bandwidth_khz, -
freq_range2->max_bandwidth_khz); + + max_bandwidth1 = freq_range1->max_bandwidth_khz; + max_bandwidth2 = freq_range2->max_bandwidth_khz; + + if (rule1->flags & NL80211_RRF_AUTO_BW) + max_bandwidth1 = reg_get_max_bandwidth(rd1, rule1); + if (rule2->flags & NL80211_RRF_AUTO_BW) + max_bandwidth2 = reg_get_max_bandwidth(rd2, rule2); + + freq_range->max_bandwidth_khz = min(max_bandwidth1, max_bandwidth2); + + intersected_rule->flags = rule1->flags | rule2->flags; + + /* + * In case NL80211_RRF_AUTO_BW requested for both rules + * set AUTO_BW in intersected rule also. Next we will + * calculate BW correctly in handle_channel function. + * In other case remove AUTO_BW flag while we calculate + * maximum bandwidth correctly and auto calculation is + * not required. + */ + if ((rule1->flags & NL80211_RRF_AUTO_BW) && + (rule2->flags & NL80211_RRF_AUTO_BW)) + intersected_rule->flags |= NL80211_RRF_AUTO_BW; + else + intersected_rule->flags &= ~NL80211_RRF_AUTO_BW; freq_diff = freq_range->end_freq_khz - freq_range->start_freq_khz; if (freq_range->max_bandwidth_khz > freq_diff) @@ -637,7 +789,8 @@ static int reg_rules_intersect(const struct ieee80211_reg_rule *rule1, power_rule->max_antenna_gain = min(power_rule1->max_antenna_gain, power_rule2->max_antenna_gain); - intersected_rule->flags = rule1->flags | rule2->flags; + intersected_rule->dfs_cac_ms = max(rule1->dfs_cac_ms, + rule2->dfs_cac_ms); if (!is_valid_reg_rule(intersected_rule)) return -EINVAL; @@ -686,7 +839,8 @@ regdom_intersect(const struct ieee80211_regdomain *rd1, rule1 = &rd1->reg_rules[x]; for (y = 0; y < rd2->n_reg_rules; y++) { rule2 = &rd2->reg_rules[y]; - if (!reg_rules_intersect(rule1, rule2, &dummy_rule)) + if (!reg_rules_intersect(rd1, rd2, rule1, rule2, + &dummy_rule)) num_rules++; } } @@ -711,7 +865,8 @@ regdom_intersect(const struct ieee80211_regdomain *rd1, * a memcpy() */ intersected_rule = &rd->reg_rules[rule_idx]; - r = reg_rules_intersect(rule1, rule2, intersected_rule); + r = reg_rules_intersect(rd1, rd2, rule1, rule2, + intersected_rule); /* * No need to memset here the intersected rule here as * we're not using the stack anymore @@ -749,6 +904,8 @@ static u32 map_regdom_flags(u32 rd_flags) channel_flags |= IEEE80211_CHAN_RADAR; if (rd_flags & NL80211_RRF_NO_OFDM) channel_flags |= IEEE80211_CHAN_NO_OFDM; + if (rd_flags & NL80211_RRF_NO_OUTDOOR) + channel_flags |= IEEE80211_CHAN_INDOOR_ONLY; return channel_flags; } @@ -794,18 +951,8 @@ const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy, u32 center_freq) { const struct ieee80211_regdomain *regd; - struct regulatory_request *lr = get_last_request(); - /* - * Follow the driver's regulatory domain, if present, unless a country - * IE has been processed or a user wants to help complaince further - */ - if (lr->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && - lr->initiator != NL80211_REGDOM_SET_BY_USER && - wiphy->regd) - regd = get_wiphy_regdom(wiphy); - else - regd = get_cfg80211_regdom(); + regd = reg_get_regdomain(wiphy); return freq_reg_info_regd(wiphy, center_freq, regd); } @@ -830,31 +977,42 @@ const char *reg_initiator_name(enum nl80211_reg_initiator initiator) EXPORT_SYMBOL(reg_initiator_name); #ifdef CONFIG_CFG80211_REG_DEBUG -static void chan_reg_rule_print_dbg(struct ieee80211_channel *chan, +static void chan_reg_rule_print_dbg(const struct ieee80211_regdomain *regd, + struct ieee80211_channel *chan, const struct ieee80211_reg_rule *reg_rule) { const struct ieee80211_power_rule *power_rule; const struct ieee80211_freq_range *freq_range; - char 
max_antenna_gain[32]; + char max_antenna_gain[32], bw[32]; power_rule = &reg_rule->power_rule; freq_range = &reg_rule->freq_range; if (!power_rule->max_antenna_gain) - snprintf(max_antenna_gain, 32, "N/A"); + snprintf(max_antenna_gain, sizeof(max_antenna_gain), "N/A"); else - snprintf(max_antenna_gain, 32, "%d", power_rule->max_antenna_gain); + snprintf(max_antenna_gain, sizeof(max_antenna_gain), "%d", + power_rule->max_antenna_gain); + + if (reg_rule->flags & NL80211_RRF_AUTO_BW) + snprintf(bw, sizeof(bw), "%d KHz, %d KHz AUTO", + freq_range->max_bandwidth_khz, + reg_get_max_bandwidth(regd, reg_rule)); + else + snprintf(bw, sizeof(bw), "%d KHz", + freq_range->max_bandwidth_khz); REG_DBG_PRINT("Updating information on frequency %d MHz with regulatory rule:\n", chan->center_freq); - REG_DBG_PRINT("%d KHz - %d KHz @ %d KHz), (%s mBi, %d mBm)\n", + REG_DBG_PRINT("%d KHz - %d KHz @ %s), (%s mBi, %d mBm)\n", freq_range->start_freq_khz, freq_range->end_freq_khz, - freq_range->max_bandwidth_khz, max_antenna_gain, + bw, max_antenna_gain, power_rule->max_eirp); } #else -static void chan_reg_rule_print_dbg(struct ieee80211_channel *chan, +static void chan_reg_rule_print_dbg(const struct ieee80211_regdomain *regd, + struct ieee80211_channel *chan, const struct ieee80211_reg_rule *reg_rule) { return; @@ -876,6 +1034,8 @@ static void handle_channel(struct wiphy *wiphy, const struct ieee80211_freq_range *freq_range = NULL; struct wiphy *request_wiphy = NULL; struct regulatory_request *lr = get_last_request(); + const struct ieee80211_regdomain *regd; + u32 max_bandwidth_khz; request_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx); @@ -912,16 +1072,22 @@ static void handle_channel(struct wiphy *wiphy, return; } - chan_reg_rule_print_dbg(chan, reg_rule); + regd = reg_get_regdomain(wiphy); + chan_reg_rule_print_dbg(regd, chan, reg_rule); power_rule = &reg_rule->power_rule; freq_range = &reg_rule->freq_range; - if (freq_range->max_bandwidth_khz < MHZ_TO_KHZ(40)) + max_bandwidth_khz = freq_range->max_bandwidth_khz; + /* Check if auto calculation requested */ + if (reg_rule->flags & NL80211_RRF_AUTO_BW) + max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); + + if (max_bandwidth_khz < MHZ_TO_KHZ(40)) bw_flags = IEEE80211_CHAN_NO_HT40; - if (freq_range->max_bandwidth_khz < MHZ_TO_KHZ(80)) + if (max_bandwidth_khz < MHZ_TO_KHZ(80)) bw_flags |= IEEE80211_CHAN_NO_80MHZ; - if (freq_range->max_bandwidth_khz < MHZ_TO_KHZ(160)) + if (max_bandwidth_khz < MHZ_TO_KHZ(160)) bw_flags |= IEEE80211_CHAN_NO_160MHZ; if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER && @@ -938,6 +1104,13 @@ static void handle_channel(struct wiphy *wiphy, (int) MBI_TO_DBI(power_rule->max_antenna_gain); chan->max_reg_power = chan->max_power = chan->orig_mpwr = (int) MBM_TO_DBM(power_rule->max_eirp); + + if (chan->flags & IEEE80211_CHAN_RADAR) { + chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS; + if (reg_rule->dfs_cac_ms) + chan->dfs_cac_ms = reg_rule->dfs_cac_ms; + } + return; } @@ -950,6 +1123,14 @@ static void handle_channel(struct wiphy *wiphy, min_t(int, chan->orig_mag, MBI_TO_DBI(power_rule->max_antenna_gain)); chan->max_reg_power = (int) MBM_TO_DBM(power_rule->max_eirp); + + if (chan->flags & IEEE80211_CHAN_RADAR) { + if (reg_rule->dfs_cac_ms) + chan->dfs_cac_ms = reg_rule->dfs_cac_ms; + else + chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS; + } + if (chan->orig_mpwr) { /* * Devices that use REGULATORY_COUNTRY_IE_FOLLOW_POWER @@ -985,12 +1166,19 @@ static bool reg_request_cell_base(struct regulatory_request *request) return
request->user_reg_hint_type == NL80211_USER_REG_HINT_CELL_BASE; } +static bool reg_request_indoor(struct regulatory_request *request) +{ + if (request->initiator != NL80211_REGDOM_SET_BY_USER) + return false; + return request->user_reg_hint_type == NL80211_USER_REG_HINT_INDOOR; +} + bool reg_last_request_cell_base(void) { return reg_request_cell_base(get_last_request()); } -#ifdef CONFIG_CFG80211_CERTIFICATION_ONUS +#ifdef CONFIG_CFG80211_REG_CELLULAR_HINTS /* Core specific check */ static enum reg_request_treatment reg_ignore_cell_hint(struct regulatory_request *pending_request) @@ -1307,6 +1495,7 @@ static void handle_channel_custom(struct wiphy *wiphy, const struct ieee80211_reg_rule *reg_rule = NULL; const struct ieee80211_power_rule *power_rule = NULL; const struct ieee80211_freq_range *freq_range = NULL; + u32 max_bandwidth_khz; reg_rule = freq_reg_info_regd(wiphy, MHZ_TO_KHZ(chan->center_freq), regd); @@ -1319,16 +1508,21 @@ static void handle_channel_custom(struct wiphy *wiphy, return; } - chan_reg_rule_print_dbg(chan, reg_rule); + chan_reg_rule_print_dbg(regd, chan, reg_rule); power_rule = &reg_rule->power_rule; freq_range = &reg_rule->freq_range; - if (freq_range->max_bandwidth_khz < MHZ_TO_KHZ(40)) + max_bandwidth_khz = freq_range->max_bandwidth_khz; + /* Check if auto calculation requested */ + if (reg_rule->flags & NL80211_RRF_AUTO_BW) + max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); + + if (max_bandwidth_khz < MHZ_TO_KHZ(40)) bw_flags = IEEE80211_CHAN_NO_HT40; - if (freq_range->max_bandwidth_khz < MHZ_TO_KHZ(80)) + if (max_bandwidth_khz < MHZ_TO_KHZ(80)) bw_flags |= IEEE80211_CHAN_NO_80MHZ; - if (freq_range->max_bandwidth_khz < MHZ_TO_KHZ(160)) + if (max_bandwidth_khz < MHZ_TO_KHZ(160)) bw_flags |= IEEE80211_CHAN_NO_160MHZ; chan->flags |= map_regdom_flags(reg_rule->flags) | bw_flags; @@ -1421,6 +1615,11 @@ __reg_process_hint_user(struct regulatory_request *user_request) { struct regulatory_request *lr = get_last_request(); + if (reg_request_indoor(user_request)) { + reg_is_indoor = true; + return REG_REQ_USER_HINT_HANDLED; + } + if (reg_request_cell_base(user_request)) return reg_ignore_cell_hint(user_request); @@ -1468,8 +1667,9 @@ reg_process_hint_user(struct regulatory_request *user_request) treatment = __reg_process_hint_user(user_request); if (treatment == REG_REQ_IGNORE || - treatment == REG_REQ_ALREADY_SET) { - kfree(user_request); + treatment == REG_REQ_ALREADY_SET || + treatment == REG_REQ_USER_HINT_HANDLED) { + reg_free_request(user_request); return treatment; } @@ -1529,14 +1729,15 @@ reg_process_hint_driver(struct wiphy *wiphy, case REG_REQ_OK: break; case REG_REQ_IGNORE: - kfree(driver_request); + case REG_REQ_USER_HINT_HANDLED: + reg_free_request(driver_request); return treatment; case REG_REQ_INTERSECT: /* fall through */ case REG_REQ_ALREADY_SET: regd = reg_copy_regd(get_cfg80211_regdom()); if (IS_ERR(regd)) { - kfree(driver_request); + reg_free_request(driver_request); return REG_REQ_IGNORE; } rcu_assign_pointer(wiphy->regd, regd); @@ -1628,12 +1829,13 @@ reg_process_hint_country_ie(struct wiphy *wiphy, case REG_REQ_OK: break; case REG_REQ_IGNORE: + case REG_REQ_USER_HINT_HANDLED: /* fall through */ case REG_REQ_ALREADY_SET: - kfree(country_ie_request); + reg_free_request(country_ie_request); return treatment; case REG_REQ_INTERSECT: - kfree(country_ie_request); + reg_free_request(country_ie_request); /* * This doesn't happen yet, not sure we * ever want to support it for this case. 
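/*
 * [Editor's note] Illustrative sketch only, not part of the patch above:
 * roughly how the new NL80211_RRF_AUTO_BW handling in handle_channel()
 * and handle_channel_custom() turns a regulatory rule into per-channel
 * bandwidth flags. reg_get_max_bandwidth() is the helper this patch adds
 * in reg.h further below; the other names come from the hunks above, and
 * the usual cfg80211/regulatory headers are assumed.
 */
static u32 sketch_reg_rule_to_bw_flags(const struct ieee80211_regdomain *regd,
				       const struct ieee80211_reg_rule *rule)
{
	u32 bw_khz = rule->freq_range.max_bandwidth_khz;
	u32 bw_flags = 0;

	/* AUTO_BW: widen to the largest contiguous span of the regdomain */
	if (rule->flags & NL80211_RRF_AUTO_BW)
		bw_khz = reg_get_max_bandwidth(regd, rule);

	if (bw_khz < MHZ_TO_KHZ(40))
		bw_flags |= IEEE80211_CHAN_NO_HT40;
	if (bw_khz < MHZ_TO_KHZ(80))
		bw_flags |= IEEE80211_CHAN_NO_80MHZ;
	if (bw_khz < MHZ_TO_KHZ(160))
		bw_flags |= IEEE80211_CHAN_NO_160MHZ;

	return bw_flags;
}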
@@ -1656,43 +1858,46 @@ static void reg_process_hint(struct regulatory_request *reg_request) struct wiphy *wiphy = NULL; enum reg_request_treatment treatment; - if (WARN_ON(!reg_request->alpha2)) - return; - if (reg_request->wiphy_idx != WIPHY_IDX_INVALID) wiphy = wiphy_idx_to_wiphy(reg_request->wiphy_idx); - if (reg_request->initiator == NL80211_REGDOM_SET_BY_DRIVER && !wiphy) { - kfree(reg_request); - return; - } - switch (reg_request->initiator) { case NL80211_REGDOM_SET_BY_CORE: reg_process_hint_core(reg_request); return; case NL80211_REGDOM_SET_BY_USER: treatment = reg_process_hint_user(reg_request); - if (treatment == REG_REQ_OK || - treatment == REG_REQ_ALREADY_SET) + if (treatment == REG_REQ_IGNORE || + treatment == REG_REQ_ALREADY_SET || + treatment == REG_REQ_USER_HINT_HANDLED) return; - schedule_delayed_work(&reg_timeout, msecs_to_jiffies(3142)); + queue_delayed_work(system_power_efficient_wq, + &reg_timeout, msecs_to_jiffies(3142)); return; case NL80211_REGDOM_SET_BY_DRIVER: + if (!wiphy) + goto out_free; treatment = reg_process_hint_driver(wiphy, reg_request); break; case NL80211_REGDOM_SET_BY_COUNTRY_IE: + if (!wiphy) + goto out_free; treatment = reg_process_hint_country_ie(wiphy, reg_request); break; default: WARN(1, "invalid initiator %d\n", reg_request->initiator); - return; + goto out_free; } /* This is required so that the orig_* parameters are saved */ if (treatment == REG_REQ_ALREADY_SET && wiphy && wiphy->regulatory_flags & REGULATORY_STRICT_REG) wiphy_update_regulatory(wiphy, reg_request->initiator); + + return; + +out_free: + reg_free_request(reg_request); } /* @@ -1708,7 +1913,7 @@ static void reg_process_pending_hints(void) /* When last_request->processed becomes true this will be rescheduled */ if (lr && !lr->processed) { - REG_DBG_PRINT("Pending regulatory request, waiting for it to be processed...\n"); + reg_process_hint(lr); return; } @@ -1818,6 +2023,22 @@ int regulatory_hint_user(const char *alpha2, return 0; } +int regulatory_hint_indoor_user(void) +{ + struct regulatory_request *request; + + request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); + if (!request) + return -ENOMEM; + + request->wiphy_idx = WIPHY_IDX_INVALID; + request->initiator = NL80211_REGDOM_SET_BY_USER; + request->user_reg_hint_type = NL80211_USER_REG_HINT_INDOOR; + queue_regulatory_request(request); + + return 0; +} + /* Driver hints */ int regulatory_hint(struct wiphy *wiphy, const char *alpha2) { @@ -1826,6 +2047,8 @@ int regulatory_hint(struct wiphy *wiphy, const char *alpha2) if (WARN_ON(!alpha2 || !wiphy)) return -EINVAL; + wiphy->regulatory_flags &= ~REGULATORY_CUSTOM_REG; + request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); if (!request) return -ENOMEM; @@ -1983,6 +2206,8 @@ static void restore_regulatory_settings(bool reset_user) ASSERT_RTNL(); + reg_is_indoor = false; + reset_regdomains(true, &world_regdom); restore_alpha2(alpha2, reset_user); @@ -2118,31 +2343,49 @@ static void print_rd_rules(const struct ieee80211_regdomain *rd) const struct ieee80211_reg_rule *reg_rule = NULL; const struct ieee80211_freq_range *freq_range = NULL; const struct ieee80211_power_rule *power_rule = NULL; + char bw[32], cac_time[32]; - pr_info(" (start_freq - end_freq @ bandwidth), (max_antenna_gain, max_eirp)\n"); + pr_info(" (start_freq - end_freq @ bandwidth), (max_antenna_gain, max_eirp), (dfs_cac_time)\n"); for (i = 0; i < rd->n_reg_rules; i++) { reg_rule = &rd->reg_rules[i]; freq_range = &reg_rule->freq_range; power_rule = &reg_rule->power_rule; + if (reg_rule->flags & 
NL80211_RRF_AUTO_BW) + snprintf(bw, sizeof(bw), "%d KHz, %d KHz AUTO", + freq_range->max_bandwidth_khz, + reg_get_max_bandwidth(rd, reg_rule)); + else + snprintf(bw, sizeof(bw), "%d KHz", + freq_range->max_bandwidth_khz); + + if (reg_rule->flags & NL80211_RRF_DFS) + scnprintf(cac_time, sizeof(cac_time), "%u s", + reg_rule->dfs_cac_ms/1000); + else + scnprintf(cac_time, sizeof(cac_time), "N/A"); + + /* * There may not be documentation for max antenna gain * in certain regions */ if (power_rule->max_antenna_gain) - pr_info(" (%d KHz - %d KHz @ %d KHz), (%d mBi, %d mBm)\n", + pr_info(" (%d KHz - %d KHz @ %s), (%d mBi, %d mBm), (%s)\n", freq_range->start_freq_khz, freq_range->end_freq_khz, - freq_range->max_bandwidth_khz, + bw, power_rule->max_antenna_gain, - power_rule->max_eirp); + power_rule->max_eirp, + cac_time); else - pr_info(" (%d KHz - %d KHz @ %d KHz), (N/A, %d mBm)\n", + pr_info(" (%d KHz - %d KHz @ %s), (N/A, %d mBm), (%s)\n", freq_range->start_freq_khz, freq_range->end_freq_khz, - freq_range->max_bandwidth_khz, - power_rule->max_eirp); + bw, + power_rule->max_eirp, + cac_time); } } @@ -2215,9 +2458,6 @@ static int reg_set_rd_user(const struct ieee80211_regdomain *rd, { const struct ieee80211_regdomain *intersected_rd = NULL; - if (is_world_regdom(rd->alpha2)) - return -EINVAL; - if (!regdom_changes(rd->alpha2)) return -EALREADY; @@ -2265,7 +2505,8 @@ static int reg_set_rd_driver(const struct ieee80211_regdomain *rd, request_wiphy = wiphy_idx_to_wiphy(driver_request->wiphy_idx); if (!request_wiphy) { - schedule_delayed_work(&reg_timeout, 0); + queue_delayed_work(system_power_efficient_wq, + &reg_timeout, 0); return -ENODEV; } @@ -2325,7 +2566,8 @@ static int reg_set_rd_country_ie(const struct ieee80211_regdomain *rd, request_wiphy = wiphy_idx_to_wiphy(country_ie_request->wiphy_idx); if (!request_wiphy) { - schedule_delayed_work(&reg_timeout, 0); + queue_delayed_work(system_power_efficient_wq, + &reg_timeout, 0); return -ENODEV; } @@ -2344,6 +2586,7 @@ static int reg_set_rd_country_ie(const struct ieee80211_regdomain *rd, int set_regdom(const struct ieee80211_regdomain *rd) { struct regulatory_request *lr; + bool user_reset = false; int r; if (!reg_is_valid_request(rd->alpha2)) { @@ -2360,6 +2603,7 @@ int set_regdom(const struct ieee80211_regdomain *rd) break; case NL80211_REGDOM_SET_BY_USER: r = reg_set_rd_user(rd, lr); + user_reset = true; break; case NL80211_REGDOM_SET_BY_DRIVER: r = reg_set_rd_driver(rd, lr); @@ -2373,8 +2617,14 @@ int set_regdom(const struct ieee80211_regdomain *rd) } if (r) { - if (r == -EALREADY) + switch (r) { + case -EALREADY: reg_set_request_processed(); + break; + default: + /* Back to world regulatory in case of errors */ + restore_regulatory_settings(user_reset); + } kfree(rd); return r; @@ -2396,26 +2646,6 @@ int set_regdom(const struct ieee80211_regdomain *rd) return 0; } -int reg_device_uevent(struct device *dev, struct kobj_uevent_env *env) -{ - struct regulatory_request *lr; - u8 alpha2[2]; - bool add = false; - - rcu_read_lock(); - lr = get_last_request(); - if (lr && !lr->processed) { - memcpy(alpha2, lr->alpha2, 2); - add = true; - } - rcu_read_unlock(); - - if (add) - return add_uevent_var(env, "COUNTRY=%c%c", - alpha2[0], alpha2[1]); - return 0; -} - void wiphy_regulatory_register(struct wiphy *wiphy) { struct regulatory_request *lr; @@ -2438,7 +2668,7 @@ void wiphy_regulatory_deregister(struct wiphy *wiphy) reg_num_devs_support_basehint--; rcu_free_regdom(get_wiphy_regdom(wiphy)); - rcu_assign_pointer(wiphy->regd, NULL); + RCU_INIT_POINTER(wiphy->regd, 
NULL); if (lr) request_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx); @@ -2458,6 +2688,40 @@ static void reg_timeout_work(struct work_struct *work) rtnl_unlock(); } +/* + * See http://www.fcc.gov/document/5-ghz-unlicensed-spectrum-unii, for + * UNII band definitions + */ +int cfg80211_get_unii(int freq) +{ + /* UNII-1 */ + if (freq >= 5150 && freq <= 5250) + return 0; + + /* UNII-2A */ + if (freq > 5250 && freq <= 5350) + return 1; + + /* UNII-2B */ + if (freq > 5350 && freq <= 5470) + return 2; + + /* UNII-2C */ + if (freq > 5470 && freq <= 5725) + return 3; + + /* UNII-3 */ + if (freq > 5725 && freq <= 5825) + return 4; + + return -EINVAL; +} + +bool regulatory_indoor_allowed(void) +{ + return reg_is_indoor; +} + int __init regulatory_init(void) { int err = 0; @@ -2466,8 +2730,6 @@ int __init regulatory_init(void) if (IS_ERR(reg_pdev)) return PTR_ERR(reg_pdev); - reg_pdev->dev.type = &reg_device_type; - spin_lock_init(&reg_requests_lock); spin_lock_init(&reg_pending_beacons_lock); diff --git a/net/wireless/reg.h b/net/wireless/reg.h index cc4c2c0a672..5e48031ccb9 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -21,11 +21,12 @@ extern const struct ieee80211_regdomain __rcu *cfg80211_regdomain; bool reg_is_valid_request(const char *alpha2); bool is_world_regdom(const char *alpha2); bool reg_supported_dfs_region(enum nl80211_dfs_regions dfs_region); +enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy); int regulatory_hint_user(const char *alpha2, enum nl80211_user_reg_hint_type user_reg_hint_type); +int regulatory_hint_indoor_user(void); -int reg_device_uevent(struct device *dev, struct kobj_uevent_env *env); void wiphy_regulatory_register(struct wiphy *wiphy); void wiphy_regulatory_deregister(struct wiphy *wiphy); @@ -33,6 +34,8 @@ int __init regulatory_init(void); void regulatory_exit(void); int set_regdom(const struct ieee80211_regdomain *rd); +unsigned int reg_get_max_bandwidth(const struct ieee80211_regdomain *rd, + const struct ieee80211_reg_rule *rule); bool reg_last_request_cell_base(void); @@ -102,4 +105,21 @@ void regulatory_hint_country_ie(struct wiphy *wiphy, */ void regulatory_hint_disconnect(void); +/** + * cfg80211_get_unii - get the U-NII band for the frequency + * @freq: the frequency for which we want to get the UNII band. + + * Get a value specifying the U-NII band frequency belongs to. + * U-NII bands are defined by the FCC in C.F.R 47 part 15. + * + * Returns -EINVAL if freq is invalid, 0 for UNII-1, 1 for UNII-2A, + * 2 for UNII-2B, 3 for UNII-2C and 4 for UNII-3. 
+ */ +int cfg80211_get_unii(int freq); + +/** + * regulatory_indoor_allowed - is indoor operation allowed + */ +bool regulatory_indoor_allowed(void); + #endif /* __NET_WIRELESS_REG_H */ diff --git a/net/wireless/scan.c b/net/wireless/scan.c index d4397eba540..0798c62e608 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -81,10 +81,10 @@ static void bss_free(struct cfg80211_internal_bss *bss) kfree(bss); } -static inline void bss_ref_get(struct cfg80211_registered_device *dev, +static inline void bss_ref_get(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { - lockdep_assert_held(&dev->bss_lock); + lockdep_assert_held(&rdev->bss_lock); bss->refcount++; if (bss->pub.hidden_beacon_bss) { @@ -95,10 +95,10 @@ static inline void bss_ref_get(struct cfg80211_registered_device *dev, } } -static inline void bss_ref_put(struct cfg80211_registered_device *dev, +static inline void bss_ref_put(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { - lockdep_assert_held(&dev->bss_lock); + lockdep_assert_held(&rdev->bss_lock); if (bss->pub.hidden_beacon_bss) { struct cfg80211_internal_bss *hbss; @@ -114,10 +114,10 @@ static inline void bss_ref_put(struct cfg80211_registered_device *dev, bss_free(bss); } -static bool __cfg80211_unlink_bss(struct cfg80211_registered_device *dev, +static bool __cfg80211_unlink_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { - lockdep_assert_held(&dev->bss_lock); + lockdep_assert_held(&rdev->bss_lock); if (!list_empty(&bss->hidden_list)) { /* @@ -134,45 +134,52 @@ static bool __cfg80211_unlink_bss(struct cfg80211_registered_device *dev, } list_del_init(&bss->list); - rb_erase(&bss->rbn, &dev->bss_tree); - bss_ref_put(dev, bss); + rb_erase(&bss->rbn, &rdev->bss_tree); + bss_ref_put(rdev, bss); return true; } -static void __cfg80211_bss_expire(struct cfg80211_registered_device *dev, +static void __cfg80211_bss_expire(struct cfg80211_registered_device *rdev, unsigned long expire_time) { struct cfg80211_internal_bss *bss, *tmp; bool expired = false; - lockdep_assert_held(&dev->bss_lock); + lockdep_assert_held(&rdev->bss_lock); - list_for_each_entry_safe(bss, tmp, &dev->bss_list, list) { + list_for_each_entry_safe(bss, tmp, &rdev->bss_list, list) { if (atomic_read(&bss->hold)) continue; if (!time_after(expire_time, bss->ts)) continue; - if (__cfg80211_unlink_bss(dev, bss)) + if (__cfg80211_unlink_bss(rdev, bss)) expired = true; } if (expired) - dev->bss_generation++; + rdev->bss_generation++; } -void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, bool leak) +void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, + bool send_message) { struct cfg80211_scan_request *request; struct wireless_dev *wdev; + struct sk_buff *msg; #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif ASSERT_RTNL(); - request = rdev->scan_req; + if (rdev->scan_msg) { + nl80211_send_scan_result(rdev, rdev->scan_msg); + rdev->scan_msg = NULL; + return; + } + request = rdev->scan_req; if (!request) return; @@ -186,18 +193,16 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, bool leak) if (wdev->netdev) cfg80211_sme_scan_done(wdev->netdev); - if (request->aborted) { - nl80211_send_scan_aborted(rdev, wdev); - } else { - if (request->flags & NL80211_SCAN_FLAG_FLUSH) { - /* flush entries from previous scans */ - spin_lock_bh(&rdev->bss_lock); - __cfg80211_bss_expire(rdev, request->scan_start); - spin_unlock_bh(&rdev->bss_lock); - } - nl80211_send_scan_done(rdev, 
wdev); + if (!request->aborted && + request->flags & NL80211_SCAN_FLAG_FLUSH) { + /* flush entries from previous scans */ + spin_lock_bh(&rdev->bss_lock); + __cfg80211_bss_expire(rdev, request->scan_start); + spin_unlock_bh(&rdev->bss_lock); } + msg = nl80211_build_scan_msg(rdev, wdev, request->aborted); + #ifdef CONFIG_CFG80211_WEXT if (wdev->netdev && !request->aborted) { memset(&wrqu, 0, sizeof(wrqu)); @@ -210,17 +215,12 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, bool leak) dev_put(wdev->netdev); rdev->scan_req = NULL; + kfree(request); - /* - * OK. If this is invoked with "leak" then we can't - * free this ... but we've cleaned it up anyway. The - * driver failed to call the scan_done callback, so - * all bets are off, it might still be trying to use - * the scan request or not ... if it accesses the dev - * in there (it shouldn't anyway) then it may crash. - */ - if (!leak) - kfree(request); + if (!send_message) + rdev->scan_msg = msg; + else + nl80211_send_scan_result(rdev, msg); } void __cfg80211_scan_done(struct work_struct *wk) @@ -231,18 +231,18 @@ void __cfg80211_scan_done(struct work_struct *wk) scan_done_wk); rtnl_lock(); - ___cfg80211_scan_done(rdev, false); + ___cfg80211_scan_done(rdev, true); rtnl_unlock(); } void cfg80211_scan_done(struct cfg80211_scan_request *request, bool aborted) { trace_cfg80211_scan_done(request, aborted); - WARN_ON(request != wiphy_to_dev(request->wiphy)->scan_req); + WARN_ON(request != wiphy_to_rdev(request->wiphy)->scan_req); request->aborted = aborted; request->notified = true; - queue_work(cfg80211_wq, &wiphy_to_dev(request->wiphy)->scan_done_wk); + queue_work(cfg80211_wq, &wiphy_to_rdev(request->wiphy)->scan_done_wk); } EXPORT_SYMBOL(cfg80211_scan_done); @@ -278,20 +278,28 @@ void cfg80211_sched_scan_results(struct wiphy *wiphy) { trace_cfg80211_sched_scan_results(wiphy); /* ignore if we're not scanning */ - if (wiphy_to_dev(wiphy)->sched_scan_req) + if (wiphy_to_rdev(wiphy)->sched_scan_req) queue_work(cfg80211_wq, - &wiphy_to_dev(wiphy)->sched_scan_results_wk); + &wiphy_to_rdev(wiphy)->sched_scan_results_wk); } EXPORT_SYMBOL(cfg80211_sched_scan_results); -void cfg80211_sched_scan_stopped(struct wiphy *wiphy) +void cfg80211_sched_scan_stopped_rtnl(struct wiphy *wiphy) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + + ASSERT_RTNL(); trace_cfg80211_sched_scan_stopped(wiphy); - rtnl_lock(); __cfg80211_stop_sched_scan(rdev, true); +} +EXPORT_SYMBOL(cfg80211_sched_scan_stopped_rtnl); + +void cfg80211_sched_scan_stopped(struct wiphy *wiphy) +{ + rtnl_lock(); + cfg80211_sched_scan_stopped_rtnl(wiphy); rtnl_unlock(); } EXPORT_SYMBOL(cfg80211_sched_scan_stopped); @@ -322,21 +330,21 @@ int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev, return 0; } -void cfg80211_bss_age(struct cfg80211_registered_device *dev, +void cfg80211_bss_age(struct cfg80211_registered_device *rdev, unsigned long age_secs) { struct cfg80211_internal_bss *bss; unsigned long age_jiffies = msecs_to_jiffies(age_secs * MSEC_PER_SEC); - spin_lock_bh(&dev->bss_lock); - list_for_each_entry(bss, &dev->bss_list, list) + spin_lock_bh(&rdev->bss_lock); + list_for_each_entry(bss, &rdev->bss_list, list) bss->ts -= age_jiffies; - spin_unlock_bh(&dev->bss_lock); + spin_unlock_bh(&rdev->bss_lock); } -void cfg80211_bss_expire(struct cfg80211_registered_device *dev) +void cfg80211_bss_expire(struct cfg80211_registered_device *rdev) { - __cfg80211_bss_expire(dev, 
jiffies - IEEE80211_SCAN_RESULT_EXPIRE); + __cfg80211_bss_expire(rdev, jiffies - IEEE80211_SCAN_RESULT_EXPIRE); } const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len) @@ -526,32 +534,34 @@ struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy, const u8 *ssid, size_t ssid_len, u16 capa_mask, u16 capa_val) { - struct cfg80211_registered_device *dev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_internal_bss *bss, *res = NULL; unsigned long now = jiffies; trace_cfg80211_get_bss(wiphy, channel, bssid, ssid, ssid_len, capa_mask, capa_val); - spin_lock_bh(&dev->bss_lock); + spin_lock_bh(&rdev->bss_lock); - list_for_each_entry(bss, &dev->bss_list, list) { + list_for_each_entry(bss, &rdev->bss_list, list) { if ((bss->pub.capability & capa_mask) != capa_val) continue; if (channel && bss->pub.channel != channel) continue; + if (!is_valid_ether_addr(bss->pub.bssid)) + continue; /* Don't get expired BSS structs */ if (time_after(now, bss->ts + IEEE80211_SCAN_RESULT_EXPIRE) && !atomic_read(&bss->hold)) continue; if (is_bss(&bss->pub, bssid, ssid, ssid_len)) { res = bss; - bss_ref_get(dev, res); + bss_ref_get(rdev, res); break; } } - spin_unlock_bh(&dev->bss_lock); + spin_unlock_bh(&rdev->bss_lock); if (!res) return NULL; trace_cfg80211_return_bss(&res->pub); @@ -559,10 +569,10 @@ struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy, } EXPORT_SYMBOL(cfg80211_get_bss); -static void rb_insert_bss(struct cfg80211_registered_device *dev, +static void rb_insert_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { - struct rb_node **p = &dev->bss_tree.rb_node; + struct rb_node **p = &rdev->bss_tree.rb_node; struct rb_node *parent = NULL; struct cfg80211_internal_bss *tbss; int cmp; @@ -585,15 +595,15 @@ static void rb_insert_bss(struct cfg80211_registered_device *dev, } rb_link_node(&bss->rbn, parent, p); - rb_insert_color(&bss->rbn, &dev->bss_tree); + rb_insert_color(&bss->rbn, &rdev->bss_tree); } static struct cfg80211_internal_bss * -rb_find_bss(struct cfg80211_registered_device *dev, +rb_find_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *res, enum bss_compare_mode mode) { - struct rb_node *n = dev->bss_tree.rb_node; + struct rb_node *n = rdev->bss_tree.rb_node; struct cfg80211_internal_bss *bss; int r; @@ -612,7 +622,7 @@ rb_find_bss(struct cfg80211_registered_device *dev, return NULL; } -static bool cfg80211_combine_bsses(struct cfg80211_registered_device *dev, +static bool cfg80211_combine_bsses(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *new) { const struct cfg80211_bss_ies *ies; @@ -642,7 +652,7 @@ static bool cfg80211_combine_bsses(struct cfg80211_registered_device *dev, /* This is the bad part ... */ - list_for_each_entry(bss, &dev->bss_list, list) { + list_for_each_entry(bss, &rdev->bss_list, list) { if (!ether_addr_equal(bss->pub.bssid, new->pub.bssid)) continue; if (bss->pub.channel != new->pub.channel) @@ -659,9 +669,6 @@ static bool cfg80211_combine_bsses(struct cfg80211_registered_device *dev, continue; if (ssidlen && ie[1] != ssidlen) continue; - /* that would be odd ... */ - if (bss->pub.beacon_ies) - continue; if (WARN_ON_ONCE(bss->pub.hidden_beacon_bss)) continue; if (WARN_ON_ONCE(!list_empty(&bss->hidden_list))) @@ -679,8 +686,9 @@ static bool cfg80211_combine_bsses(struct cfg80211_registered_device *dev, /* Returned bss is reference counted and must be cleaned up appropriately. 
*/ static struct cfg80211_internal_bss * -cfg80211_bss_update(struct cfg80211_registered_device *dev, - struct cfg80211_internal_bss *tmp) +cfg80211_bss_update(struct cfg80211_registered_device *rdev, + struct cfg80211_internal_bss *tmp, + bool signal_valid) { struct cfg80211_internal_bss *found = NULL; @@ -689,14 +697,14 @@ cfg80211_bss_update(struct cfg80211_registered_device *dev, tmp->ts = jiffies; - spin_lock_bh(&dev->bss_lock); + spin_lock_bh(&rdev->bss_lock); if (WARN_ON(!rcu_access_pointer(tmp->pub.ies))) { - spin_unlock_bh(&dev->bss_lock); + spin_unlock_bh(&rdev->bss_lock); return NULL; } - found = rb_find_bss(dev, tmp, BSS_CMP_REGULAR); + found = rb_find_bss(rdev, tmp, BSS_CMP_REGULAR); if (found) { /* Update IEs */ @@ -765,7 +773,12 @@ cfg80211_bss_update(struct cfg80211_registered_device *dev, } found->pub.beacon_interval = tmp->pub.beacon_interval; - found->pub.signal = tmp->pub.signal; + /* + * don't update the signal if beacon was heard on + * adjacent channel. + */ + if (signal_valid) + found->pub.signal = tmp->pub.signal; found->pub.capability = tmp->pub.capability; found->ts = tmp->ts; } else { @@ -778,7 +791,7 @@ cfg80211_bss_update(struct cfg80211_registered_device *dev, * is allocated on the stack since it's not needed in the * more common case of an update */ - new = kzalloc(sizeof(*new) + dev->wiphy.bss_priv_size, + new = kzalloc(sizeof(*new) + rdev->wiphy.bss_priv_size, GFP_ATOMIC); if (!new) { ies = (void *)rcu_dereference(tmp->pub.beacon_ies); @@ -794,9 +807,9 @@ cfg80211_bss_update(struct cfg80211_registered_device *dev, INIT_LIST_HEAD(&new->hidden_list); if (rcu_access_pointer(tmp->pub.proberesp_ies)) { - hidden = rb_find_bss(dev, tmp, BSS_CMP_HIDE_ZLEN); + hidden = rb_find_bss(rdev, tmp, BSS_CMP_HIDE_ZLEN); if (!hidden) - hidden = rb_find_bss(dev, tmp, + hidden = rb_find_bss(rdev, tmp, BSS_CMP_HIDE_NUL); if (hidden) { new->pub.hidden_beacon_bss = &hidden->pub; @@ -813,24 +826,24 @@ cfg80211_bss_update(struct cfg80211_registered_device *dev, * expensive search for any probe responses that should * be grouped with this beacon for updates ... */ - if (!cfg80211_combine_bsses(dev, new)) { + if (!cfg80211_combine_bsses(rdev, new)) { kfree(new); goto drop; } } - list_add_tail(&new->list, &dev->bss_list); - rb_insert_bss(dev, new); + list_add_tail(&new->list, &rdev->bss_list); + rb_insert_bss(rdev, new); found = new; } - dev->bss_generation++; - bss_ref_get(dev, found); - spin_unlock_bh(&dev->bss_lock); + rdev->bss_generation++; + bss_ref_get(rdev, found); + spin_unlock_bh(&rdev->bss_lock); return found; drop: - spin_unlock_bh(&dev->bss_lock); + spin_unlock_bh(&rdev->bss_lock); return NULL; } @@ -869,14 +882,16 @@ cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, /* Returned bss is reference counted and must be cleaned up appropriately. 
*/ struct cfg80211_bss* cfg80211_inform_bss_width(struct wiphy *wiphy, - struct ieee80211_channel *channel, + struct ieee80211_channel *rx_channel, enum nl80211_bss_scan_width scan_width, const u8 *bssid, u64 tsf, u16 capability, u16 beacon_interval, const u8 *ie, size_t ielen, s32 signal, gfp_t gfp) { struct cfg80211_bss_ies *ies; + struct ieee80211_channel *channel; struct cfg80211_internal_bss tmp = {}, *res; + bool signal_valid; if (WARN_ON(!wiphy)) return NULL; @@ -885,7 +900,7 @@ cfg80211_inform_bss_width(struct wiphy *wiphy, (signal < 0 || signal > 100))) return NULL; - channel = cfg80211_get_bss_channel(wiphy, ie, ielen, channel); + channel = cfg80211_get_bss_channel(wiphy, ie, ielen, rx_channel); if (!channel) return NULL; @@ -913,7 +928,9 @@ cfg80211_inform_bss_width(struct wiphy *wiphy, rcu_assign_pointer(tmp.pub.beacon_ies, ies); rcu_assign_pointer(tmp.pub.ies, ies); - res = cfg80211_bss_update(wiphy_to_dev(wiphy), &tmp); + signal_valid = abs(rx_channel->center_freq - channel->center_freq) <= + wiphy->max_adj_channel_rssi_comp; + res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid); if (!res) return NULL; @@ -929,20 +946,22 @@ EXPORT_SYMBOL(cfg80211_inform_bss_width); /* Returned bss is reference counted and must be cleaned up appropriately. */ struct cfg80211_bss * cfg80211_inform_bss_width_frame(struct wiphy *wiphy, - struct ieee80211_channel *channel, + struct ieee80211_channel *rx_channel, enum nl80211_bss_scan_width scan_width, struct ieee80211_mgmt *mgmt, size_t len, s32 signal, gfp_t gfp) { struct cfg80211_internal_bss tmp = {}, *res; struct cfg80211_bss_ies *ies; + struct ieee80211_channel *channel; + bool signal_valid; size_t ielen = len - offsetof(struct ieee80211_mgmt, u.probe_resp.variable); BUILD_BUG_ON(offsetof(struct ieee80211_mgmt, u.probe_resp.variable) != offsetof(struct ieee80211_mgmt, u.beacon.variable)); - trace_cfg80211_inform_bss_width_frame(wiphy, channel, scan_width, mgmt, + trace_cfg80211_inform_bss_width_frame(wiphy, rx_channel, scan_width, mgmt, len, signal); if (WARN_ON(!mgmt)) @@ -959,7 +978,7 @@ cfg80211_inform_bss_width_frame(struct wiphy *wiphy, return NULL; channel = cfg80211_get_bss_channel(wiphy, mgmt->u.beacon.variable, - ielen, channel); + ielen, rx_channel); if (!channel) return NULL; @@ -983,7 +1002,9 @@ cfg80211_inform_bss_width_frame(struct wiphy *wiphy, tmp.pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int); tmp.pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info); - res = cfg80211_bss_update(wiphy_to_dev(wiphy), &tmp); + signal_valid = abs(rx_channel->center_freq - channel->center_freq) <= + wiphy->max_adj_channel_rssi_comp; + res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid); if (!res) return NULL; @@ -998,7 +1019,7 @@ EXPORT_SYMBOL(cfg80211_inform_bss_width_frame); void cfg80211_ref_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) { - struct cfg80211_registered_device *dev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_internal_bss *bss; if (!pub) @@ -1006,15 +1027,15 @@ void cfg80211_ref_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) bss = container_of(pub, struct cfg80211_internal_bss, pub); - spin_lock_bh(&dev->bss_lock); - bss_ref_get(dev, bss); - spin_unlock_bh(&dev->bss_lock); + spin_lock_bh(&rdev->bss_lock); + bss_ref_get(rdev, bss); + spin_unlock_bh(&rdev->bss_lock); } EXPORT_SYMBOL(cfg80211_ref_bss); void cfg80211_put_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) { - struct cfg80211_registered_device 
*dev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_internal_bss *bss; if (!pub) @@ -1022,15 +1043,15 @@ void cfg80211_put_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) bss = container_of(pub, struct cfg80211_internal_bss, pub); - spin_lock_bh(&dev->bss_lock); - bss_ref_put(dev, bss); - spin_unlock_bh(&dev->bss_lock); + spin_lock_bh(&rdev->bss_lock); + bss_ref_put(rdev, bss); + spin_unlock_bh(&rdev->bss_lock); } EXPORT_SYMBOL(cfg80211_put_bss); void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) { - struct cfg80211_registered_device *dev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_internal_bss *bss; if (WARN_ON(!pub)) @@ -1038,12 +1059,12 @@ void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) bss = container_of(pub, struct cfg80211_internal_bss, pub); - spin_lock_bh(&dev->bss_lock); + spin_lock_bh(&rdev->bss_lock); if (!list_empty(&bss->list)) { - if (__cfg80211_unlink_bss(dev, bss)) - dev->bss_generation++; + if (__cfg80211_unlink_bss(rdev, bss)) + rdev->bss_generation++; } - spin_unlock_bh(&dev->bss_lock); + spin_unlock_bh(&rdev->bss_lock); } EXPORT_SYMBOL(cfg80211_unlink_bss); @@ -1060,7 +1081,7 @@ cfg80211_get_dev_from_ifindex(struct net *net, int ifindex) if (!dev) return ERR_PTR(-ENODEV); if (dev->ieee80211_ptr) - rdev = wiphy_to_dev(dev->ieee80211_ptr->wiphy); + rdev = wiphy_to_rdev(dev->ieee80211_ptr->wiphy); else rdev = ERR_PTR(-ENODEV); dev_put(dev); @@ -1089,7 +1110,7 @@ int cfg80211_wext_siwscan(struct net_device *dev, if (IS_ERR(rdev)) return PTR_ERR(rdev); - if (rdev->scan_req) { + if (rdev->scan_req || rdev->scan_msg) { err = -EBUSY; goto out; } @@ -1099,11 +1120,8 @@ int cfg80211_wext_siwscan(struct net_device *dev, /* Determine number of channels, needed to allocate creq */ if (wreq && wreq->num_channels) n_channels = wreq->num_channels; - else { - for (band = 0; band < IEEE80211_NUM_BANDS; band++) - if (wiphy->bands[band]) - n_channels += wiphy->bands[band]->n_channels; - } + else + n_channels = ieee80211_get_num_supported_channels(wiphy); creq = kzalloc(sizeof(*creq) + sizeof(struct cfg80211_ssid) + n_channels * sizeof(void *), @@ -1143,7 +1161,11 @@ int cfg80211_wext_siwscan(struct net_device *dev, int k; int wiphy_freq = wiphy->bands[band]->channels[j].center_freq; for (k = 0; k < wreq->num_channels; k++) { - int wext_freq = cfg80211_wext_freq(wiphy, &wreq->channel_list[k]); + struct iw_freq *freq = + &wreq->channel_list[k]; + int wext_freq = + cfg80211_wext_freq(freq); + if (wext_freq == wiphy_freq) goto wext_freq_found; } @@ -1455,7 +1477,7 @@ ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info, } -static int ieee80211_scan_results(struct cfg80211_registered_device *dev, +static int ieee80211_scan_results(struct cfg80211_registered_device *rdev, struct iw_request_info *info, char *buf, size_t len) { @@ -1463,18 +1485,18 @@ static int ieee80211_scan_results(struct cfg80211_registered_device *dev, char *end_buf = buf + len; struct cfg80211_internal_bss *bss; - spin_lock_bh(&dev->bss_lock); - cfg80211_bss_expire(dev); + spin_lock_bh(&rdev->bss_lock); + cfg80211_bss_expire(rdev); - list_for_each_entry(bss, &dev->bss_list, list) { + list_for_each_entry(bss, &rdev->bss_list, list) { if (buf + len - current_ev <= IW_EV_ADDR_LEN) { - spin_unlock_bh(&dev->bss_lock); + spin_unlock_bh(&rdev->bss_lock); return -E2BIG; } - current_ev = ieee80211_bss(&dev->wiphy, info, bss, + current_ev = 
ieee80211_bss(&rdev->wiphy, info, bss, current_ev, end_buf); } - spin_unlock_bh(&dev->bss_lock); + spin_unlock_bh(&rdev->bss_lock); return current_ev - buf; } @@ -1494,7 +1516,7 @@ int cfg80211_wext_giwscan(struct net_device *dev, if (IS_ERR(rdev)) return PTR_ERR(rdev); - if (rdev->scan_req) + if (rdev->scan_req || rdev->scan_msg) return -EAGAIN; res = ieee80211_scan_results(rdev, info, extra, data->length); diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 65f800890d7..8bbeeb30221 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -59,29 +59,21 @@ static void cfg80211_sme_free(struct wireless_dev *wdev) static int cfg80211_conn_scan(struct wireless_dev *wdev) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_scan_request *request; int n_channels, err; ASSERT_RTNL(); - ASSERT_RDEV_LOCK(rdev); ASSERT_WDEV_LOCK(wdev); - if (rdev->scan_req) + if (rdev->scan_req || rdev->scan_msg) return -EBUSY; - if (wdev->conn->params.channel) { + if (wdev->conn->params.channel) n_channels = 1; - } else { - enum ieee80211_band band; - n_channels = 0; + else + n_channels = ieee80211_get_num_supported_channels(wdev->wiphy); - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { - if (!wdev->wiphy->bands[band]) - continue; - n_channels += wdev->wiphy->bands[band]->n_channels; - } - } request = kzalloc(sizeof(*request) + sizeof(request->ssids[0]) + sizeof(request->channels[0]) * n_channels, GFP_KERNEL); @@ -138,7 +130,7 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev) static int cfg80211_conn_do_work(struct wireless_dev *wdev) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_connect_params *params; struct cfg80211_assoc_request req = {}; int err; @@ -157,7 +149,8 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev) case CFG80211_CONN_SCAN_AGAIN: return cfg80211_conn_scan(wdev); case CFG80211_CONN_AUTHENTICATE_NEXT: - BUG_ON(!rdev->ops->auth); + if (WARN_ON(!rdev->ops->auth)) + return -EOPNOTSUPP; wdev->conn->state = CFG80211_CONN_AUTHENTICATING; return cfg80211_mlme_auth(rdev, wdev->netdev, params->channel, params->auth_type, @@ -169,7 +162,8 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev) case CFG80211_CONN_AUTH_FAILED: return -ENOTCONN; case CFG80211_CONN_ASSOCIATE_NEXT: - BUG_ON(!rdev->ops->assoc); + if (WARN_ON(!rdev->ops->assoc)) + return -EOPNOTSUPP; wdev->conn->state = CFG80211_CONN_ASSOCIATING; if (wdev->conn->prev_bssid_valid) req.prev_bssid = wdev->conn->prev_bssid; @@ -242,7 +236,6 @@ void cfg80211_conn_work(struct work_struct *work) NULL, 0, NULL, 0, WLAN_STATUS_UNSPECIFIED_FAILURE, false, NULL); - cfg80211_sme_free(wdev); } wdev_unlock(wdev); } @@ -253,7 +246,7 @@ void cfg80211_conn_work(struct work_struct *work) /* Returned bss is reference counted and must be cleaned up appropriately. 
*/ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bss *bss; u16 capa = WLAN_CAPABILITY_ESS; @@ -283,7 +276,7 @@ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev) static void __cfg80211_sme_scan_done(struct net_device *dev) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bss *bss; ASSERT_WDEV_LOCK(wdev); @@ -314,7 +307,7 @@ void cfg80211_sme_scan_done(struct net_device *dev) void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len) { struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; u16 status_code = le16_to_cpu(mgmt->u.auth.status_code); @@ -360,7 +353,7 @@ void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len) bool cfg80211_sme_rx_assoc_resp(struct wireless_dev *wdev, u16 status) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return false; @@ -394,7 +387,7 @@ void cfg80211_sme_deauth(struct wireless_dev *wdev) void cfg80211_sme_auth_timeout(struct wireless_dev *wdev) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return; @@ -405,7 +398,7 @@ void cfg80211_sme_auth_timeout(struct wireless_dev *wdev) void cfg80211_sme_disassoc(struct wireless_dev *wdev) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return; @@ -416,7 +409,7 @@ void cfg80211_sme_disassoc(struct wireless_dev *wdev) void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return; @@ -429,7 +422,7 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, struct cfg80211_connect_params *connect, const u8 *prev_bssid) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bss *bss; int err; @@ -476,7 +469,7 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, } wdev->conn->params.ssid = wdev->ssid; - wdev->conn->params.ssid_len = connect->ssid_len; + wdev->conn->params.ssid_len = wdev->ssid_len; /* see if we have the bss already */ bss = cfg80211_get_conn_bss(wdev); @@ -488,7 +481,6 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, /* we're good if we have a matching bss struct */ if (bss) { - wdev->conn->state = CFG80211_CONN_AUTHENTICATE_NEXT; err = cfg80211_conn_do_work(wdev); cfg80211_put_bss(wdev->wiphy, bss); } else { @@ -514,7 +506,7 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, static int cfg80211_sme_disconnect(struct wireless_dev *wdev, u16 reason) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = 
wiphy_to_rdev(wdev->wiphy); int err; if (!wdev->conn) @@ -602,7 +594,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, return; } - nl80211_send_connect_result(wiphy_to_dev(wdev->wiphy), dev, + nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev, bssid, req_ie, req_ie_len, resp_ie, resp_ie_len, status, GFP_KERNEL); @@ -632,6 +624,16 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, } #endif + if (!bss && (status == WLAN_STATUS_SUCCESS)) { + WARN_ON_ONCE(!wiphy_to_rdev(wdev->wiphy)->ops->connect); + bss = cfg80211_get_bss(wdev->wiphy, NULL, bssid, + wdev->ssid, wdev->ssid_len, + WLAN_CAPABILITY_ESS, + WLAN_CAPABILITY_ESS); + if (bss) + cfg80211_hold_bss(bss_from_pub(bss)); + } + if (wdev->current_bss) { cfg80211_unhold_bss(wdev->current_bss); cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); @@ -646,19 +648,12 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, cfg80211_unhold_bss(bss_from_pub(bss)); cfg80211_put_bss(wdev->wiphy, bss); } + cfg80211_sme_free(wdev); return; } - if (!bss) { - WARN_ON_ONCE(!wiphy_to_dev(wdev->wiphy)->ops->connect); - bss = cfg80211_get_bss(wdev->wiphy, NULL, bssid, - wdev->ssid, wdev->ssid_len, - WLAN_CAPABILITY_ESS, - WLAN_CAPABILITY_ESS); - if (WARN_ON(!bss)) - return; - cfg80211_hold_bss(bss_from_pub(bss)); - } + if (WARN_ON(!bss)) + return; wdev->current_bss = bss_from_pub(bss); @@ -693,7 +688,7 @@ void cfg80211_connect_result(struct net_device *dev, const u8 *bssid, u16 status, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; @@ -748,7 +743,8 @@ void __cfg80211_roamed(struct wireless_dev *wdev, cfg80211_hold_bss(bss_from_pub(bss)); wdev->current_bss = bss_from_pub(bss); - nl80211_send_roamed(wiphy_to_dev(wdev->wiphy), wdev->netdev, bss->bssid, + nl80211_send_roamed(wiphy_to_rdev(wdev->wiphy), + wdev->netdev, bss->bssid, req_ie, req_ie_len, resp_ie, resp_ie_len, GFP_KERNEL); @@ -807,7 +803,7 @@ void cfg80211_roamed_bss(struct net_device *dev, size_t resp_ie_len, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; @@ -840,7 +836,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, size_t ie_len, u16 reason, bool from_ap) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int i; #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; @@ -870,6 +866,8 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, for (i = 0; i < 6; i++) rdev_del_key(rdev, dev, i, false, NULL); + rdev_set_qos_map(rdev, dev, NULL); + #ifdef CONFIG_CFG80211_WEXT memset(&wrqu, 0, sizeof(wrqu)); wrqu.ap_addr.sa_family = ARPHRD_ETHER; @@ -881,10 +879,10 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, } void cfg80211_disconnected(struct net_device *dev, u16 reason, - u8 *ie, size_t ie_len, gfp_t gfp) + const u8 *ie, size_t ie_len, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = 
wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; diff --git a/net/wireless/trace.h b/net/wireless/trace.h index f7aa7a72d9b..7cc887f9da1 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -186,6 +186,28 @@ #define BOOL_TO_STR(bo) (bo) ? "true" : "false" +#define QOS_MAP_ENTRY __field(u8, num_des) \ + __array(u8, dscp_exception, \ + 2 * IEEE80211_QOS_MAP_MAX_EX) \ + __array(u8, up, IEEE80211_QOS_MAP_LEN_MIN) +#define QOS_MAP_ASSIGN(qos_map) \ + do { \ + if ((qos_map)) { \ + __entry->num_des = (qos_map)->num_des; \ + memcpy(__entry->dscp_exception, \ + &(qos_map)->dscp_exception, \ + 2 * IEEE80211_QOS_MAP_MAX_EX); \ + memcpy(__entry->up, &(qos_map)->up, \ + IEEE80211_QOS_MAP_LEN_MIN); \ + } else { \ + __entry->num_des = 0; \ + memset(__entry->dscp_exception, 0, \ + 2 * IEEE80211_QOS_MAP_MAX_EX); \ + memset(__entry->up, 0, \ + IEEE80211_QOS_MAP_LEN_MIN); \ + } \ + } while (0) + /************************************************************* * rdev->ops traces * *************************************************************/ @@ -1446,9 +1468,10 @@ TRACE_EVENT(rdev_sched_scan_start, TRACE_EVENT(rdev_tdls_mgmt, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *peer, u8 action_code, u8 dialog_token, - u16 status_code, const u8 *buf, size_t len), + u16 status_code, u32 peer_capability, + const u8 *buf, size_t len), TP_ARGS(wiphy, netdev, peer, action_code, dialog_token, status_code, - buf, len), + peer_capability, buf, len), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY @@ -1456,6 +1479,7 @@ TRACE_EVENT(rdev_tdls_mgmt, __field(u8, action_code) __field(u8, dialog_token) __field(u16, status_code) + __field(u32, peer_capability) __dynamic_array(u8, buf, len) ), TP_fast_assign( @@ -1465,13 +1489,15 @@ TRACE_EVENT(rdev_tdls_mgmt, __entry->action_code = action_code; __entry->dialog_token = dialog_token; __entry->status_code = status_code; + __entry->peer_capability = peer_capability; memcpy(__get_dynamic_array(buf), buf, len); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", action_code: %u, " - "dialog_token: %u, status_code: %u, buf: %#.2x ", + "dialog_token: %u, status_code: %u, peer_capability: %u buf: %#.2x ", WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->action_code, __entry->dialog_token, - __entry->status_code, ((u8 *)__get_dynamic_array(buf))[0]) + __entry->status_code, __entry->peer_capability, + ((u8 *)__get_dynamic_array(buf))[0]) ); TRACE_EVENT(rdev_dump_survey, @@ -1850,29 +1876,69 @@ TRACE_EVENT(rdev_channel_switch, WIPHY_ENTRY NETDEV_ENTRY CHAN_DEF_ENTRY - __field(u16, counter_offset_beacon) - __field(u16, counter_offset_presp) __field(bool, radar_required) __field(bool, block_tx) __field(u8, count) + __dynamic_array(u16, bcn_ofs, params->n_counter_offsets_beacon) + __dynamic_array(u16, pres_ofs, params->n_counter_offsets_presp) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; CHAN_DEF_ASSIGN(&params->chandef); - __entry->counter_offset_beacon = params->counter_offset_beacon; - __entry->counter_offset_presp = params->counter_offset_presp; __entry->radar_required = params->radar_required; __entry->block_tx = params->block_tx; __entry->count = params->count; + memcpy(__get_dynamic_array(bcn_ofs), + params->counter_offsets_beacon, + params->n_counter_offsets_beacon * sizeof(u16)); + + /* probe response offsets are optional */ + if (params->n_counter_offsets_presp) + memcpy(__get_dynamic_array(pres_ofs), + params->counter_offsets_presp, + params->n_counter_offsets_presp * sizeof(u16)); ), TP_printk(WIPHY_PR_FMT ", " 
NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT - ", block_tx: %d, count: %u, radar_required: %d" - ", counter offsets (beacon/presp): %u/%u", + ", block_tx: %d, count: %u, radar_required: %d", WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG, - __entry->block_tx, __entry->count, __entry->radar_required, - __entry->counter_offset_beacon, - __entry->counter_offset_presp) + __entry->block_tx, __entry->count, __entry->radar_required) +); + +TRACE_EVENT(rdev_set_qos_map, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_qos_map *qos_map), + TP_ARGS(wiphy, netdev, qos_map), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + QOS_MAP_ENTRY + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + QOS_MAP_ASSIGN(qos_map); + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", num_des: %u", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->num_des) +); + +TRACE_EVENT(rdev_set_ap_chanwidth, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_chan_def *chandef), + TP_ARGS(wiphy, netdev, chandef), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + CHAN_DEF_ENTRY + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + CHAN_DEF_ASSIGN(chandef); + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG) ); /************************************************************* @@ -2028,7 +2094,8 @@ TRACE_EVENT(cfg80211_michael_mic_failure, MAC_ASSIGN(addr, addr); __entry->key_type = key_type; __entry->key_id = key_id; - memcpy(__entry->tsc, tsc, 6); + if (tsc) + memcpy(__entry->tsc, tsc, 6); ), TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT ", key type: %d, key id: %d, tsc: %pm", NETDEV_PR_ARG, MAC_PR_ARG(addr), __entry->key_type, @@ -2149,18 +2216,21 @@ TRACE_EVENT(cfg80211_cqm_rssi_notify, ); TRACE_EVENT(cfg80211_reg_can_beacon, - TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef), - TP_ARGS(wiphy, chandef), + TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, + enum nl80211_iftype iftype), + TP_ARGS(wiphy, chandef, iftype), TP_STRUCT__entry( WIPHY_ENTRY CHAN_DEF_ENTRY + __field(enum nl80211_iftype, iftype) ), TP_fast_assign( WIPHY_ASSIGN; CHAN_DEF_ASSIGN(chandef); + __entry->iftype = iftype; ), - TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT, - WIPHY_PR_ARG, CHAN_DEF_PR_ARG) + TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", iftype=%d", + WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->iftype) ); TRACE_EVENT(cfg80211_chandef_dfs_required, @@ -2238,11 +2308,6 @@ DECLARE_EVENT_CLASS(cfg80211_rx_evt, TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT, NETDEV_PR_ARG, MAC_PR_ARG(addr)) ); -DEFINE_EVENT(cfg80211_rx_evt, cfg80211_ibss_joined, - TP_PROTO(struct net_device *netdev, const u8 *addr), - TP_ARGS(netdev, addr) -); - DEFINE_EVENT(cfg80211_rx_evt, cfg80211_rx_spurious_frame, TP_PROTO(struct net_device *netdev, const u8 *addr), TP_ARGS(netdev, addr) @@ -2253,6 +2318,24 @@ DEFINE_EVENT(cfg80211_rx_evt, cfg80211_rx_unexpected_4addr_frame, TP_ARGS(netdev, addr) ); +TRACE_EVENT(cfg80211_ibss_joined, + TP_PROTO(struct net_device *netdev, const u8 *bssid, + struct ieee80211_channel *channel), + TP_ARGS(netdev, bssid, channel), + TP_STRUCT__entry( + NETDEV_ENTRY + MAC_ENTRY(bssid) + CHAN_ENTRY + ), + TP_fast_assign( + NETDEV_ASSIGN; + MAC_ASSIGN(bssid, bssid); + CHAN_ASSIGN(channel); + ), + TP_printk(NETDEV_PR_FMT ", bssid: " MAC_PR_FMT ", " CHAN_PR_FMT, + NETDEV_PR_ARG, MAC_PR_ARG(bssid), CHAN_PR_ARG) +); + TRACE_EVENT(cfg80211_probe_status, TP_PROTO(struct net_device *netdev, const u8 *addr, u64 cookie, bool acked), @@ -2558,6 
+2641,21 @@ TRACE_EVENT(cfg80211_ft_event, WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(target_ap)) ); +TRACE_EVENT(cfg80211_stop_iface, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), + TP_ARGS(wiphy, wdev), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, + WIPHY_PR_ARG, WDEV_PR_ARG) +); + #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/wireless/util.c b/net/wireless/util.c index 935dea9485d..728f1c0dc70 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -11,6 +11,7 @@ #include <net/ip.h> #include <net/dsfield.h> #include <linux/if_vlan.h> +#include <linux/mpls.h> #include "core.h" #include "rdev-ops.h" @@ -475,7 +476,8 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, EXPORT_SYMBOL(ieee80211_data_to_8023); int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, - enum nl80211_iftype iftype, u8 *bssid, bool qos) + enum nl80211_iftype iftype, + const u8 *bssid, bool qos) { struct ieee80211_hdr hdr; u16 hdrlen, ethertype; @@ -689,7 +691,8 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, EXPORT_SYMBOL(ieee80211_amsdu_to_8023s); /* Given a data frame determine the 802.1p/1d tag to use. */ -unsigned int cfg80211_classify8021d(struct sk_buff *skb) +unsigned int cfg80211_classify8021d(struct sk_buff *skb, + struct cfg80211_qos_map *qos_map) { unsigned int dscp; unsigned char vlan_priority; @@ -716,10 +719,40 @@ unsigned int cfg80211_classify8021d(struct sk_buff *skb) case htons(ETH_P_IPV6): dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & 0xfc; break; + case htons(ETH_P_MPLS_UC): + case htons(ETH_P_MPLS_MC): { + struct mpls_label mpls_tmp, *mpls; + + mpls = skb_header_pointer(skb, sizeof(struct ethhdr), + sizeof(*mpls), &mpls_tmp); + if (!mpls) + return 0; + + return (ntohl(mpls->entry) & MPLS_LS_TC_MASK) + >> MPLS_LS_TC_SHIFT; + } + case htons(ETH_P_80221): + /* 802.21 is always network control traffic */ + return 7; default: return 0; } + if (qos_map) { + unsigned int i, tmp_dscp = dscp >> 2; + + for (i = 0; i < qos_map->num_des; i++) { + if (tmp_dscp == qos_map->dscp_exception[i].dscp) + return qos_map->dscp_exception[i].up; + } + + for (i = 0; i < 8; i++) { + if (tmp_dscp >= qos_map->up[i].low && + tmp_dscp <= qos_map->up[i].high) + return i; + } + } + return dscp >> 5; } EXPORT_SYMBOL(cfg80211_classify8021d); @@ -738,7 +771,7 @@ EXPORT_SYMBOL(ieee80211_bss_get_ie); void cfg80211_upload_connect_keys(struct wireless_dev *wdev) { - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct net_device *dev = wdev->netdev; int i; @@ -804,7 +837,11 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev) ev->dc.reason, true); break; case EVENT_IBSS_JOINED: - __cfg80211_ibss_joined(wdev->netdev, ev->ij.bssid); + __cfg80211_ibss_joined(wdev->netdev, ev->ij.bssid, + ev->ij.channel); + break; + case EVENT_STOPPED: + __cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev); break; } wdev_unlock(wdev); @@ -821,7 +858,6 @@ void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev) struct wireless_dev *wdev; ASSERT_RTNL(); - ASSERT_RDEV_LOCK(rdev); list_for_each_entry(wdev, &rdev->wdev_list, list) cfg80211_process_wdev_events(wdev); @@ -834,7 +870,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, int err; enum nl80211_iftype otype = dev->ieee80211_ptr->iftype; 
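/*
 * [Editor's note] Illustrative sketch only, not part of the patch: the
 * cfg80211_classify8021d() hunk above now honours an 802.11 QoS Map when
 * one has been set via rdev_set_qos_map(). The lookup order it applies is
 * sketched here; the struct cfg80211_qos_map field names are taken from
 * the hunks above, while the helper name itself is made up for
 * illustration.
 */
static unsigned int sketch_dscp_to_up(u8 dsfield,
				      const struct cfg80211_qos_map *qos_map)
{
	unsigned int i, dscp = dsfield >> 2;	/* strip the two ECN bits */

	if (qos_map) {
		/* explicit DSCP exceptions take precedence */
		for (i = 0; i < qos_map->num_des; i++)
			if (dscp == qos_map->dscp_exception[i].dscp)
				return qos_map->dscp_exception[i].up;

		/* otherwise find the UP whose DSCP range contains this value */
		for (i = 0; i < 8; i++)
			if (dscp >= qos_map->up[i].low &&
			    dscp <= qos_map->up[i].high)
				return i;
	}

	return dsfield >> 5;	/* legacy 802.1d mapping as before */
}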
- ASSERT_RDEV_LOCK(rdev); + ASSERT_RTNL(); /* don't support changing VLANs, you just re-create them */ if (otype == NL80211_IFTYPE_AP_VLAN) @@ -856,17 +892,15 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, return -EBUSY; if (ntype != otype && netif_running(dev)) { - err = cfg80211_can_change_interface(rdev, dev->ieee80211_ptr, - ntype); - if (err) - return err; - dev->ieee80211_ptr->use_4addr = false; dev->ieee80211_ptr->mesh_id_up_len = 0; + wdev_lock(dev->ieee80211_ptr); + rdev_set_qos_map(rdev, dev, NULL); + wdev_unlock(dev->ieee80211_ptr); switch (otype) { case NL80211_IFTYPE_AP: - cfg80211_stop_ap(rdev, dev); + cfg80211_stop_ap(rdev, dev, true); break; case NL80211_IFTYPE_ADHOC: cfg80211_leave_ibss(rdev, dev, false); @@ -1233,6 +1267,120 @@ int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev, return res; } +int cfg80211_iter_combinations(struct wiphy *wiphy, + const int num_different_channels, + const u8 radar_detect, + const int iftype_num[NUM_NL80211_IFTYPES], + void (*iter)(const struct ieee80211_iface_combination *c, + void *data), + void *data) +{ + const struct ieee80211_regdomain *regdom; + enum nl80211_dfs_regions region = 0; + int i, j, iftype; + int num_interfaces = 0; + u32 used_iftypes = 0; + + if (radar_detect) { + rcu_read_lock(); + regdom = rcu_dereference(cfg80211_regdomain); + if (regdom) + region = regdom->dfs_region; + rcu_read_unlock(); + } + + for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) { + num_interfaces += iftype_num[iftype]; + if (iftype_num[iftype] > 0 && + !(wiphy->software_iftypes & BIT(iftype))) + used_iftypes |= BIT(iftype); + } + + for (i = 0; i < wiphy->n_iface_combinations; i++) { + const struct ieee80211_iface_combination *c; + struct ieee80211_iface_limit *limits; + u32 all_iftypes = 0; + + c = &wiphy->iface_combinations[i]; + + if (num_interfaces > c->max_interfaces) + continue; + if (num_different_channels > c->num_different_channels) + continue; + + limits = kmemdup(c->limits, sizeof(limits[0]) * c->n_limits, + GFP_KERNEL); + if (!limits) + return -ENOMEM; + + for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) { + if (wiphy->software_iftypes & BIT(iftype)) + continue; + for (j = 0; j < c->n_limits; j++) { + all_iftypes |= limits[j].types; + if (!(limits[j].types & BIT(iftype))) + continue; + if (limits[j].max < iftype_num[iftype]) + goto cont; + limits[j].max -= iftype_num[iftype]; + } + } + + if (radar_detect != (c->radar_detect_widths & radar_detect)) + goto cont; + + if (radar_detect && c->radar_detect_regions && + !(c->radar_detect_regions & BIT(region))) + goto cont; + + /* Finally check that all iftypes that we're currently + * using are actually part of this combination. If they + * aren't then we can't use this combination and have + * to continue to the next. + */ + if ((all_iftypes & used_iftypes) != used_iftypes) + goto cont; + + /* This combination covered all interface types and + * supported the requested numbers, so we're good. 
+ */ + + (*iter)(c, data); + cont: + kfree(limits); + } + + return 0; +} +EXPORT_SYMBOL(cfg80211_iter_combinations); + +static void +cfg80211_iter_sum_ifcombs(const struct ieee80211_iface_combination *c, + void *data) +{ + int *num = data; + (*num)++; +} + +int cfg80211_check_combinations(struct wiphy *wiphy, + const int num_different_channels, + const u8 radar_detect, + const int iftype_num[NUM_NL80211_IFTYPES]) +{ + int err, num = 0; + + err = cfg80211_iter_combinations(wiphy, num_different_channels, + radar_detect, iftype_num, + cfg80211_iter_sum_ifcombs, &num); + if (err) + return err; + if (num == 0) + return -EBUSY; + + return 0; +} +EXPORT_SYMBOL(cfg80211_check_combinations); + int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, enum nl80211_iftype iftype, @@ -1241,7 +1389,6 @@ int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev, u8 radar_detect) { struct wireless_dev *wdev_iter; - u32 used_iftypes = BIT(iftype); int num[NUM_NL80211_IFTYPES]; struct ieee80211_channel *used_channels[CFG80211_MAX_NUM_DIFFERENT_CHANNELS]; @@ -1249,43 +1396,14 @@ int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev, enum cfg80211_chan_mode chmode; int num_different_channels = 0; int total = 1; - bool radar_required = false; - int i, j; + int i; ASSERT_RTNL(); if (WARN_ON(hweight32(radar_detect) > 1)) return -EINVAL; - switch (iftype) { - case NL80211_IFTYPE_ADHOC: - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_MESH_POINT: - case NL80211_IFTYPE_P2P_GO: - case NL80211_IFTYPE_WDS: - /* if the interface could potentially choose a DFS channel, - * then mark DFS as required. - */ - if (!chan) { - if (chanmode != CHAN_MODE_UNDEFINED && radar_detect) - radar_required = true; - break; - } - radar_required = !!(chan->flags & IEEE80211_CHAN_RADAR); - break; - case NL80211_IFTYPE_P2P_CLIENT: - case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_P2P_DEVICE: - case NL80211_IFTYPE_MONITOR: - break; - case NUM_NL80211_IFTYPES: - case NL80211_IFTYPE_UNSPECIFIED: - default: - return -EINVAL; - } - - if (radar_required && !radar_detect) + if (WARN_ON(iftype >= NUM_NL80211_IFTYPES)) return -EINVAL; /* Always allow software iftypes */ @@ -1300,6 +1418,11 @@ int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev, num[iftype] = 1; + /* TODO: We'll probably not need this anymore, since this + * should only be called with CHAN_MODE_UNDEFINED. There are + * still a couple of pending calls where other chanmodes are + * used, but we should get rid of them. 
+ */ switch (chanmode) { case CHAN_MODE_UNDEFINED: break; @@ -1337,7 +1460,7 @@ int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev, */ mutex_lock_nested(&wdev_iter->mtx, 1); __acquire(wdev_iter->mtx); - cfg80211_get_chan_state(wdev_iter, &ch, &chmode); + cfg80211_get_chan_state(wdev_iter, &ch, &chmode, &radar_detect); wdev_unlock(wdev_iter); switch (chmode) { @@ -1363,65 +1486,13 @@ int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev, num[wdev_iter->iftype]++; total++; - used_iftypes |= BIT(wdev_iter->iftype); } if (total == 1 && !radar_detect) return 0; - for (i = 0; i < rdev->wiphy.n_iface_combinations; i++) { - const struct ieee80211_iface_combination *c; - struct ieee80211_iface_limit *limits; - u32 all_iftypes = 0; - - c = &rdev->wiphy.iface_combinations[i]; - - if (total > c->max_interfaces) - continue; - if (num_different_channels > c->num_different_channels) - continue; - - limits = kmemdup(c->limits, sizeof(limits[0]) * c->n_limits, - GFP_KERNEL); - if (!limits) - return -ENOMEM; - - for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) { - if (rdev->wiphy.software_iftypes & BIT(iftype)) - continue; - for (j = 0; j < c->n_limits; j++) { - all_iftypes |= limits[j].types; - if (!(limits[j].types & BIT(iftype))) - continue; - if (limits[j].max < num[iftype]) - goto cont; - limits[j].max -= num[iftype]; - } - } - - if (radar_detect && !(c->radar_detect_widths & radar_detect)) - goto cont; - - /* - * Finally check that all iftypes that we're currently - * using are actually part of this combination. If they - * aren't then we can't use this combination and have - * to continue to the next. - */ - if ((all_iftypes & used_iftypes) != used_iftypes) - goto cont; - - /* - * This combination covered all interface types and - * supported the requested numbers, so we're good. 
- */ - kfree(limits); - return 0; - cont: - kfree(limits); - } - - return -EBUSY; + return cfg80211_check_combinations(&rdev->wiphy, num_different_channels, + radar_detect, num); } int ieee80211_get_ratemask(struct ieee80211_supported_band *sband, @@ -1462,6 +1533,37 @@ int ieee80211_get_ratemask(struct ieee80211_supported_band *sband, return 0; } +unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy) +{ + enum ieee80211_band band; + unsigned int n_channels = 0; + + for (band = 0; band < IEEE80211_NUM_BANDS; band++) + if (wiphy->bands[band]) + n_channels += wiphy->bands[band]->n_channels; + + return n_channels; +} +EXPORT_SYMBOL(ieee80211_get_num_supported_channels); + +int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, + struct station_info *sinfo) +{ + struct cfg80211_registered_device *rdev; + struct wireless_dev *wdev; + + wdev = dev->ieee80211_ptr; + if (!wdev) + return -EOPNOTSUPP; + + rdev = wiphy_to_rdev(wdev->wiphy); + if (!rdev->ops->get_station) + return -EOPNOTSUPP; + + return rdev_get_station(rdev, dev, mac_addr, sinfo); +} +EXPORT_SYMBOL(cfg80211_get_station); + /* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */ /* Ethernet-II snap header (RFC1042 for most EtherTypes) */ const unsigned char rfc1042_header[] __aligned(2) = diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index e7c6e862580..11120bb1416 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -73,7 +73,7 @@ int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, struct vif_params vifparams; enum nl80211_iftype type; - rdev = wiphy_to_dev(wdev->wiphy); + rdev = wiphy_to_rdev(wdev->wiphy); switch (*mode) { case IW_MODE_INFRA: @@ -253,12 +253,12 @@ EXPORT_SYMBOL_GPL(cfg80211_wext_giwrange); /** * cfg80211_wext_freq - get wext frequency for non-"auto" - * @wiphy: the wiphy + * @dev: the net device * @freq: the wext freq encoding * * Returns a frequency, or a negative error code, or 0 for auto. 
*/ -int cfg80211_wext_freq(struct wiphy *wiphy, struct iw_freq *freq) +int cfg80211_wext_freq(struct iw_freq *freq) { /* * Parse frequency - return 0 for auto and @@ -286,7 +286,7 @@ int cfg80211_wext_siwrts(struct net_device *dev, struct iw_param *rts, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u32 orts = wdev->wiphy->rts_threshold; int err; @@ -324,7 +324,7 @@ int cfg80211_wext_siwfrag(struct net_device *dev, struct iw_param *frag, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u32 ofrag = wdev->wiphy->frag_threshold; int err; @@ -364,13 +364,13 @@ static int cfg80211_wext_siwretry(struct net_device *dev, struct iw_param *retry, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u32 changed = 0; u8 olong = wdev->wiphy->retry_long; u8 oshort = wdev->wiphy->retry_short; int err; - if (retry->disabled || + if (retry->disabled || retry->value < 1 || retry->value > 255 || (retry->flags & IW_RETRY_TYPE) != IW_RETRY_LIMIT) return -EINVAL; @@ -412,9 +412,9 @@ int cfg80211_wext_giwretry(struct net_device *dev, * First return short value, iwconfig will ask long value * later if needed */ - retry->flags |= IW_RETRY_LIMIT; + retry->flags |= IW_RETRY_LIMIT | IW_RETRY_SHORT; retry->value = wdev->wiphy->retry_short; - if (wdev->wiphy->retry_long != wdev->wiphy->retry_short) + if (wdev->wiphy->retry_long == wdev->wiphy->retry_short) retry->flags |= IW_RETRY_LONG; return 0; @@ -587,7 +587,7 @@ static int cfg80211_wext_siwencode(struct net_device *dev, struct iw_point *erq, char *keybuf) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int idx, err; bool remove = false; struct key_params params; @@ -647,7 +647,7 @@ static int cfg80211_wext_siwencodeext(struct net_device *dev, struct iw_point *erq, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct iw_encode_ext *ext = (struct iw_encode_ext *) extra; const u8 *addr; int idx; @@ -775,7 +775,7 @@ static int cfg80211_wext_siwfreq(struct net_device *dev, struct iw_freq *wextfreq, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_chan_def chandef = { .width = NL80211_CHAN_WIDTH_20_NOHT, }; @@ -787,7 +787,7 @@ static int cfg80211_wext_siwfreq(struct net_device *dev, case NL80211_IFTYPE_ADHOC: return cfg80211_ibss_wext_siwfreq(dev, info, wextfreq, extra); case NL80211_IFTYPE_MONITOR: - freq = cfg80211_wext_freq(wdev->wiphy, wextfreq); + freq = cfg80211_wext_freq(wextfreq); if (freq < 0) return freq; if (freq == 0) @@ -798,7 +798,7 @@ static int cfg80211_wext_siwfreq(struct net_device *dev, return -EINVAL; return cfg80211_set_monitor_channel(rdev, &chandef); case NL80211_IFTYPE_MESH_POINT: - freq = 
cfg80211_wext_freq(wdev->wiphy, wextfreq); + freq = cfg80211_wext_freq(wextfreq); if (freq < 0) return freq; if (freq == 0) @@ -818,7 +818,7 @@ static int cfg80211_wext_giwfreq(struct net_device *dev, struct iw_freq *freq, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_chan_def chandef; int ret; @@ -847,7 +847,7 @@ static int cfg80211_wext_siwtxpower(struct net_device *dev, union iwreq_data *data, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); enum nl80211_tx_power_setting type; int dbm = 0; @@ -899,7 +899,7 @@ static int cfg80211_wext_giwtxpower(struct net_device *dev, union iwreq_data *data, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int err, val; if ((data->txpower.flags & IW_TXPOW_TYPE) != IW_TXPOW_DBM) @@ -1119,7 +1119,7 @@ static int cfg80211_wext_siwpower(struct net_device *dev, struct iw_param *wrq, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); bool ps = wdev->ps; int timeout = wdev->ps_timeout; int err; @@ -1177,7 +1177,7 @@ static int cfg80211_wds_wext_siwap(struct net_device *dev, struct sockaddr *addr, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int err; if (WARN_ON(wdev->iftype != NL80211_IFTYPE_WDS)) @@ -1221,7 +1221,7 @@ static int cfg80211_wext_siwrate(struct net_device *dev, struct iw_param *rate, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bitrate_mask mask; u32 fixed, maxrate; struct ieee80211_supported_band *sband; @@ -1272,7 +1272,7 @@ static int cfg80211_wext_giwrate(struct net_device *dev, struct iw_param *rate, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); /* we are under RTNL - globally locked - so can use a static struct */ static struct station_info sinfo; u8 addr[ETH_ALEN]; @@ -1310,7 +1310,7 @@ static int cfg80211_wext_giwrate(struct net_device *dev, static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); /* we are under RTNL - globally locked - so can use static structs */ static struct iw_statistics wstats; static struct station_info sinfo; @@ -1449,7 +1449,7 @@ static int cfg80211_wext_siwpmksa(struct net_device *dev, struct iw_point *data, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev 
= wiphy_to_rdev(wdev->wiphy); struct cfg80211_pmksa cfg_pmksa; struct iw_pmksa *pmksa = (struct iw_pmksa *)extra; diff --git a/net/wireless/wext-compat.h b/net/wireless/wext-compat.h index 5d766b0118e..ebcacca2f73 100644 --- a/net/wireless/wext-compat.h +++ b/net/wireless/wext-compat.h @@ -50,7 +50,7 @@ int cfg80211_wext_siwgenie(struct net_device *dev, struct iw_point *data, char *extra); -int cfg80211_wext_freq(struct wiphy *wiphy, struct iw_freq *freq); +int cfg80211_wext_freq(struct iw_freq *freq); extern const struct iw_handler_def cfg80211_wext_handler; diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index 14c9a2583ba..c7e5c8eb4f2 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -21,7 +21,7 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, const u8 *prev_bssid = NULL; int err, i; - ASSERT_RDEV_LOCK(rdev); + ASSERT_RTNL(); ASSERT_WDEV_LOCK(wdev); if (!netif_running(wdev->netdev)) @@ -67,7 +67,7 @@ int cfg80211_mgd_wext_siwfreq(struct net_device *dev, struct iw_freq *wextfreq, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ieee80211_channel *chan = NULL; int err, freq; @@ -75,7 +75,7 @@ int cfg80211_mgd_wext_siwfreq(struct net_device *dev, if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; - freq = cfg80211_wext_freq(wdev->wiphy, wextfreq); + freq = cfg80211_wext_freq(wextfreq); if (freq < 0) return freq; @@ -169,7 +169,7 @@ int cfg80211_mgd_wext_siwessid(struct net_device *dev, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); size_t len = data->length; int err; @@ -260,7 +260,7 @@ int cfg80211_mgd_wext_siwap(struct net_device *dev, struct sockaddr *ap_addr, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u8 *bssid = ap_addr->sa_data; int err; @@ -333,7 +333,7 @@ int cfg80211_wext_siwgenie(struct net_device *dev, struct iw_point *data, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u8 *ie = extra; int ie_len = data->length, err; @@ -390,7 +390,7 @@ int cfg80211_wext_siwmlme(struct net_device *dev, if (!wdev) return -EOPNOTSUPP; - rdev = wiphy_to_dev(wdev->wiphy); + rdev = wiphy_to_rdev(wdev->wiphy); if (wdev->iftype != NL80211_IFTYPE_STATION) return -EINVAL; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index c8a8297cd4b..5ad4418ef09 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1064,7 +1064,7 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, x25_start_heartbeat(make); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); rc = 1; sock_put(sk); out: @@ -1082,7 +1082,7 @@ static int x25_sendmsg(struct kiocb *iocb, struct socket *sock, { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); - struct sockaddr_x25 *usx25 = (struct sockaddr_x25 *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_x25 *, usx25, msg->msg_name); struct sockaddr_x25 sx25; struct sk_buff *skb; unsigned char 
*asmptr; @@ -1258,7 +1258,7 @@ static int x25_recvmsg(struct kiocb *iocb, struct socket *sock, { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); - struct sockaddr_x25 *sx25 = (struct sockaddr_x25 *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_x25 *, sx25, msg->msg_name); size_t copied; int qbit, header_len; struct sk_buff *skb; diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index d1b0dc79bb6..7ac50098a37 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c @@ -79,7 +79,7 @@ static int x25_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more) skb_set_owner_r(skbn, sk); skb_queue_tail(&sk->sk_receive_queue, skbn); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skbn->len); + sk->sk_data_ready(sk); return 0; } diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 88843996f93..85d1d476461 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -16,6 +16,81 @@ static struct kmem_cache *secpath_cachep __read_mostly; +static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); +static struct xfrm_input_afinfo __rcu *xfrm_input_afinfo[NPROTO]; + +int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo) +{ + int err = 0; + + if (unlikely(afinfo == NULL)) + return -EINVAL; + if (unlikely(afinfo->family >= NPROTO)) + return -EAFNOSUPPORT; + spin_lock_bh(&xfrm_input_afinfo_lock); + if (unlikely(xfrm_input_afinfo[afinfo->family] != NULL)) + err = -ENOBUFS; + else + rcu_assign_pointer(xfrm_input_afinfo[afinfo->family], afinfo); + spin_unlock_bh(&xfrm_input_afinfo_lock); + return err; +} +EXPORT_SYMBOL(xfrm_input_register_afinfo); + +int xfrm_input_unregister_afinfo(struct xfrm_input_afinfo *afinfo) +{ + int err = 0; + + if (unlikely(afinfo == NULL)) + return -EINVAL; + if (unlikely(afinfo->family >= NPROTO)) + return -EAFNOSUPPORT; + spin_lock_bh(&xfrm_input_afinfo_lock); + if (likely(xfrm_input_afinfo[afinfo->family] != NULL)) { + if (unlikely(xfrm_input_afinfo[afinfo->family] != afinfo)) + err = -EINVAL; + else + RCU_INIT_POINTER(xfrm_input_afinfo[afinfo->family], NULL); + } + spin_unlock_bh(&xfrm_input_afinfo_lock); + synchronize_rcu(); + return err; +} +EXPORT_SYMBOL(xfrm_input_unregister_afinfo); + +static struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family) +{ + struct xfrm_input_afinfo *afinfo; + + if (unlikely(family >= NPROTO)) + return NULL; + rcu_read_lock(); + afinfo = rcu_dereference(xfrm_input_afinfo[family]); + if (unlikely(!afinfo)) + rcu_read_unlock(); + return afinfo; +} + +static void xfrm_input_put_afinfo(struct xfrm_input_afinfo *afinfo) +{ + rcu_read_unlock(); +} + +static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol, + int err) +{ + int ret; + struct xfrm_input_afinfo *afinfo = xfrm_input_get_afinfo(family); + + if (!afinfo) + return -EAFNOSUPPORT; + + ret = afinfo->callback(skb, protocol, err); + xfrm_input_put_afinfo(afinfo); + + return ret; +} + void __secpath_destroy(struct sec_path *sp) { int i; @@ -67,7 +142,7 @@ int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq) case IPPROTO_COMP: if (!pskb_may_pull(skb, sizeof(struct ip_comp_hdr))) return -EINVAL; - *spi = htonl(ntohs(*(__be16*)(skb_transport_header(skb) + 2))); + *spi = htonl(ntohs(*(__be16 *)(skb_transport_header(skb) + 2))); *seq = 0; return 0; default: @@ -77,8 +152,8 @@ int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq) if (!pskb_may_pull(skb, hlen)) return -EINVAL; - *spi = *(__be32*)(skb_transport_header(skb) + offset); - *seq = *(__be32*)(skb_transport_header(skb) + 
offset_seq); + *spi = *(__be32 *)(skb_transport_header(skb) + offset); + *seq = *(__be32 *)(skb_transport_header(skb) + offset_seq); return 0; } @@ -108,7 +183,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) int err; __be32 seq; __be32 seq_hi; - struct xfrm_state *x; + struct xfrm_state *x = NULL; xfrm_address_t *daddr; struct xfrm_mode *inner_mode; unsigned int family; @@ -120,9 +195,14 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) async = 1; x = xfrm_input_state(skb); seq = XFRM_SKB_CB(skb)->seq.input.low; + family = x->outer_mode->afinfo->family; goto resume; } + daddr = (xfrm_address_t *)(skb_network_header(skb) + + XFRM_SPI_SKB_CB(skb)->daddroff); + family = XFRM_SPI_SKB_CB(skb)->family; + /* Allocate new secpath or COW existing one. */ if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { struct sec_path *sp; @@ -137,10 +217,6 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) skb->sp = sp; } - daddr = (xfrm_address_t *)(skb_network_header(skb) + - XFRM_SPI_SKB_CB(skb)->daddroff); - family = XFRM_SPI_SKB_CB(skb)->family; - seq = 0; if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); @@ -162,6 +238,11 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) skb->sp->xvec[skb->sp->len++] = x; + if (xfrm_tunnel_check(skb, x, family)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); + goto drop; + } + spin_lock(&x->lock); if (unlikely(x->km.state == XFRM_STATE_ACQ)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); @@ -201,7 +282,6 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) if (nexthdr == -EINPROGRESS) return 0; - resume: spin_lock(&x->lock); if (nexthdr <= 0) { @@ -263,6 +343,10 @@ resume: } } while (!err); + err = xfrm_rcv_cb(skb, family, x->type->proto, 0); + if (err) + goto drop; + nf_reset(skb); if (decaps) { @@ -276,6 +360,7 @@ resume: drop_unlock: spin_unlock(&x->lock); drop: + xfrm_rcv_cb(skb, family, x && x->type ? 
x->type->proto : nexthdr, -1); kfree_skb(skb); return 0; } diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 3bb2cdc13b4..c51e8f7b865 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -199,6 +199,7 @@ int xfrm_output(struct sk_buff *skb) return xfrm_output2(skb); } +EXPORT_SYMBOL_GPL(xfrm_output); int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb) { @@ -213,6 +214,7 @@ int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb) return -EAFNOSUPPORT; return inner_mode->afinfo->extract_output(x, skb); } +EXPORT_SYMBOL_GPL(xfrm_inner_extract_output); void xfrm_local_error(struct sk_buff *skb, int mtu) { @@ -233,7 +235,4 @@ void xfrm_local_error(struct sk_buff *skb, int mtu) afinfo->local_error(skb, mtu); xfrm_state_put_afinfo(afinfo); } - -EXPORT_SYMBOL_GPL(xfrm_output); -EXPORT_SYMBOL_GPL(xfrm_inner_extract_output); EXPORT_SYMBOL_GPL(xfrm_local_error); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 9a91f7431c4..0525d78ba32 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -39,13 +39,6 @@ #define XFRM_QUEUE_TMO_MAX ((unsigned)(60*HZ)) #define XFRM_MAX_QUEUE_LEN 100 -DEFINE_MUTEX(xfrm_cfg_mutex); -EXPORT_SYMBOL(xfrm_cfg_mutex); - -static DEFINE_SPINLOCK(xfrm_policy_sk_bundle_lock); -static struct dst_entry *xfrm_policy_sk_bundles; -static DEFINE_RWLOCK(xfrm_policy_lock); - static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock); static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO] __read_mostly; @@ -176,7 +169,7 @@ static inline unsigned long make_jiffies(long secs) static void xfrm_policy_timer(unsigned long data) { - struct xfrm_policy *xp = (struct xfrm_policy*)data; + struct xfrm_policy *xp = (struct xfrm_policy *)data; unsigned long now = get_seconds(); long next = LONG_MAX; int warn = 0; @@ -438,7 +431,7 @@ static void xfrm_bydst_resize(struct net *net, int dir) if (!ndst) return; - write_lock_bh(&xfrm_policy_lock); + write_lock_bh(&net->xfrm.xfrm_policy_lock); for (i = hmask; i >= 0; i--) xfrm_dst_hash_transfer(odst + i, ndst, nhashmask); @@ -446,7 +439,7 @@ static void xfrm_bydst_resize(struct net *net, int dir) net->xfrm.policy_bydst[dir].table = ndst; net->xfrm.policy_bydst[dir].hmask = nhashmask; - write_unlock_bh(&xfrm_policy_lock); + write_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head)); } @@ -463,7 +456,7 @@ static void xfrm_byidx_resize(struct net *net, int total) if (!nidx) return; - write_lock_bh(&xfrm_policy_lock); + write_lock_bh(&net->xfrm.xfrm_policy_lock); for (i = hmask; i >= 0; i--) xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask); @@ -471,7 +464,7 @@ static void xfrm_byidx_resize(struct net *net, int total) net->xfrm.policy_byidx = nidx; net->xfrm.policy_idx_hmask = nhashmask; - write_unlock_bh(&xfrm_policy_lock); + write_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head)); } @@ -504,7 +497,7 @@ static inline int xfrm_byidx_should_resize(struct net *net, int total) void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si) { - read_lock_bh(&xfrm_policy_lock); + read_lock_bh(&net->xfrm.xfrm_policy_lock); si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN]; si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT]; si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD]; @@ -513,7 +506,7 @@ void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si) si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX]; si->spdhcnt = 
net->xfrm.policy_idx_hmask; si->spdhmcnt = xfrm_policy_hashmax; - read_unlock_bh(&xfrm_policy_lock); + read_unlock_bh(&net->xfrm.xfrm_policy_lock); } EXPORT_SYMBOL(xfrm_spd_getinfo); @@ -538,7 +531,7 @@ static void xfrm_hash_resize(struct work_struct *work) /* Generate new index... KAME seems to generate them ordered by cost * of an absolute inpredictability of ordering of rules. This will not pass. */ -static u32 xfrm_gen_index(struct net *net, int dir) +static u32 xfrm_gen_index(struct net *net, int dir, u32 index) { static u32 idx_generator; @@ -548,8 +541,14 @@ static u32 xfrm_gen_index(struct net *net, int dir) u32 idx; int found; - idx = (idx_generator | dir); - idx_generator += 8; + if (!index) { + idx = (idx_generator | dir); + idx_generator += 8; + } else { + idx = index; + index = 0; + } + if (idx == 0) idx = 8; list = net->xfrm.policy_byidx + idx_hash(net, idx); @@ -630,7 +629,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) struct hlist_head *chain; struct hlist_node *newpos; - write_lock_bh(&xfrm_policy_lock); + write_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); delpol = NULL; newpos = NULL; @@ -641,7 +640,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) xfrm_sec_ctx_match(pol->security, policy->security) && !WARN_ON(delpol)) { if (excl) { - write_unlock_bh(&xfrm_policy_lock); + write_unlock_bh(&net->xfrm.xfrm_policy_lock); return -EEXIST; } delpol = pol; @@ -660,7 +659,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) hlist_add_head(&policy->bydst, chain); xfrm_pol_hold(policy); net->xfrm.policy_count[dir]++; - atomic_inc(&flow_cache_genid); + atomic_inc(&net->xfrm.flow_cache_genid); /* After previous checking, family can either be AF_INET or AF_INET6 */ if (policy->family == AF_INET) @@ -672,14 +671,14 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) xfrm_policy_requeue(delpol, policy); __xfrm_policy_unlink(delpol, dir); } - policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir); + policy->index = delpol ? 
delpol->index : xfrm_gen_index(net, dir, policy->index); hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index)); policy->curlft.add_time = get_seconds(); policy->curlft.use_time = 0; if (!mod_timer(&policy->timer, jiffies + HZ)) xfrm_pol_hold(policy); list_add(&policy->walk.all, &net->xfrm.policy_all); - write_unlock_bh(&xfrm_policy_lock); + write_unlock_bh(&net->xfrm.xfrm_policy_lock); if (delpol) xfrm_policy_kill(delpol); @@ -699,7 +698,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, struct hlist_head *chain; *err = 0; - write_lock_bh(&xfrm_policy_lock); + write_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, sel, sel->family, dir); ret = NULL; hlist_for_each_entry(pol, chain, bydst) { @@ -712,7 +711,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, *err = security_xfrm_policy_delete( pol->security); if (*err) { - write_unlock_bh(&xfrm_policy_lock); + write_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); @@ -721,7 +720,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, break; } } - write_unlock_bh(&xfrm_policy_lock); + write_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); @@ -740,7 +739,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, return NULL; *err = 0; - write_lock_bh(&xfrm_policy_lock); + write_lock_bh(&net->xfrm.xfrm_policy_lock); chain = net->xfrm.policy_byidx + idx_hash(net, id); ret = NULL; hlist_for_each_entry(pol, chain, byidx) { @@ -751,7 +750,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, *err = security_xfrm_policy_delete( pol->security); if (*err) { - write_unlock_bh(&xfrm_policy_lock); + write_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); @@ -760,7 +759,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, break; } } - write_unlock_bh(&xfrm_policy_lock); + write_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); @@ -770,7 +769,7 @@ EXPORT_SYMBOL(xfrm_policy_byid); #ifdef CONFIG_SECURITY_NETWORK_XFRM static inline int -xfrm_policy_flush_secctx_check(struct net *net, u8 type, struct xfrm_audit *audit_info) +xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { int dir, err = 0; @@ -784,10 +783,7 @@ xfrm_policy_flush_secctx_check(struct net *net, u8 type, struct xfrm_audit *audi continue; err = security_xfrm_policy_delete(pol->security); if (err) { - xfrm_audit_policy_delete(pol, 0, - audit_info->loginuid, - audit_info->sessionid, - audit_info->secid); + xfrm_audit_policy_delete(pol, 0, task_valid); return err; } } @@ -801,9 +797,7 @@ xfrm_policy_flush_secctx_check(struct net *net, u8 type, struct xfrm_audit *audi pol->security); if (err) { xfrm_audit_policy_delete(pol, 0, - audit_info->loginuid, - audit_info->sessionid, - audit_info->secid); + task_valid); return err; } } @@ -813,19 +807,19 @@ xfrm_policy_flush_secctx_check(struct net *net, u8 type, struct xfrm_audit *audi } #else static inline int -xfrm_policy_flush_secctx_check(struct net *net, u8 type, struct xfrm_audit *audit_info) +xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { return 0; } #endif -int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info) +int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) { int dir, err = 0, cnt = 0; - 
write_lock_bh(&xfrm_policy_lock); + write_lock_bh(&net->xfrm.xfrm_policy_lock); - err = xfrm_policy_flush_secctx_check(net, type, audit_info); + err = xfrm_policy_flush_secctx_check(net, type, task_valid); if (err) goto out; @@ -839,16 +833,14 @@ int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info) if (pol->type != type) continue; __xfrm_policy_unlink(pol, dir); - write_unlock_bh(&xfrm_policy_lock); + write_unlock_bh(&net->xfrm.xfrm_policy_lock); cnt++; - xfrm_audit_policy_delete(pol, 1, audit_info->loginuid, - audit_info->sessionid, - audit_info->secid); + xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); - write_lock_bh(&xfrm_policy_lock); + write_lock_bh(&net->xfrm.xfrm_policy_lock); goto again1; } @@ -860,16 +852,13 @@ int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info) if (pol->type != type) continue; __xfrm_policy_unlink(pol, dir); - write_unlock_bh(&xfrm_policy_lock); + write_unlock_bh(&net->xfrm.xfrm_policy_lock); cnt++; - xfrm_audit_policy_delete(pol, 1, - audit_info->loginuid, - audit_info->sessionid, - audit_info->secid); + xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); - write_lock_bh(&xfrm_policy_lock); + write_lock_bh(&net->xfrm.xfrm_policy_lock); goto again2; } } @@ -878,7 +867,7 @@ int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info) if (!cnt) err = -ESRCH; out: - write_unlock_bh(&xfrm_policy_lock); + write_unlock_bh(&net->xfrm.xfrm_policy_lock); return err; } EXPORT_SYMBOL(xfrm_policy_flush); @@ -898,7 +887,7 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, if (list_empty(&walk->walk.all) && walk->seq != 0) return 0; - write_lock_bh(&xfrm_policy_lock); + write_lock_bh(&net->xfrm.xfrm_policy_lock); if (list_empty(&walk->walk.all)) x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all); else @@ -924,7 +913,7 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, } list_del_init(&walk->walk.all); out: - write_unlock_bh(&xfrm_policy_lock); + write_unlock_bh(&net->xfrm.xfrm_policy_lock); return error; } EXPORT_SYMBOL(xfrm_policy_walk); @@ -938,14 +927,14 @@ void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type) } EXPORT_SYMBOL(xfrm_policy_walk_init); -void xfrm_policy_walk_done(struct xfrm_policy_walk *walk) +void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net) { if (list_empty(&walk->walk.all)) return; - write_lock_bh(&xfrm_policy_lock); + write_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? 
*/ list_del(&walk->walk.all); - write_unlock_bh(&xfrm_policy_lock); + write_unlock_bh(&net->xfrm.xfrm_policy_lock); } EXPORT_SYMBOL(xfrm_policy_walk_done); @@ -990,7 +979,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, if (unlikely(!daddr || !saddr)) return NULL; - read_lock_bh(&xfrm_policy_lock); + read_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_direct(net, daddr, saddr, family, dir); ret = NULL; hlist_for_each_entry(pol, chain, bydst) { @@ -1026,7 +1015,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, if (ret) xfrm_pol_hold(ret); fail: - read_unlock_bh(&xfrm_policy_lock); + read_unlock_bh(&net->xfrm.xfrm_policy_lock); return ret; } @@ -1103,8 +1092,9 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, const struct flowi *fl) { struct xfrm_policy *pol; + struct net *net = sock_net(sk); - read_lock_bh(&xfrm_policy_lock); + read_lock_bh(&net->xfrm.xfrm_policy_lock); if ((pol = sk->sk_policy[dir]) != NULL) { bool match = xfrm_selector_match(&pol->selector, fl, sk->sk_family); @@ -1128,7 +1118,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, pol = NULL; } out: - read_unlock_bh(&xfrm_policy_lock); + read_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } @@ -1156,7 +1146,7 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, if (hlist_unhashed(&pol->bydst)) return NULL; - hlist_del(&pol->bydst); + hlist_del_init(&pol->bydst); hlist_del(&pol->byidx); list_del(&pol->walk.all); net->xfrm.policy_count[dir]--; @@ -1166,9 +1156,11 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int xfrm_policy_delete(struct xfrm_policy *pol, int dir) { - write_lock_bh(&xfrm_policy_lock); + struct net *net = xp_net(pol); + + write_lock_bh(&net->xfrm.xfrm_policy_lock); pol = __xfrm_policy_unlink(pol, dir); - write_unlock_bh(&xfrm_policy_lock); + write_unlock_bh(&net->xfrm.xfrm_policy_lock); if (pol) { xfrm_policy_kill(pol); return 0; @@ -1187,12 +1179,12 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) return -EINVAL; #endif - write_lock_bh(&xfrm_policy_lock); + write_lock_bh(&net->xfrm.xfrm_policy_lock); old_pol = sk->sk_policy[dir]; sk->sk_policy[dir] = pol; if (pol) { pol->curlft.add_time = get_seconds(); - pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir); + pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0); __xfrm_policy_link(pol, XFRM_POLICY_MAX+dir); } if (old_pol) { @@ -1204,7 +1196,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) */ __xfrm_policy_unlink(old_pol, XFRM_POLICY_MAX+dir); } - write_unlock_bh(&xfrm_policy_lock); + write_unlock_bh(&net->xfrm.xfrm_policy_lock); if (old_pol) { xfrm_policy_kill(old_pol); @@ -1215,6 +1207,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) { struct xfrm_policy *newp = xfrm_policy_alloc(xp_net(old), GFP_ATOMIC); + struct net *net = xp_net(old); if (newp) { newp->selector = old->selector; @@ -1233,9 +1226,9 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) newp->type = old->type; memcpy(newp->xfrm_vec, old->xfrm_vec, newp->xfrm_nr*sizeof(struct xfrm_tmpl)); - write_lock_bh(&xfrm_policy_lock); + write_lock_bh(&net->xfrm.xfrm_policy_lock); __xfrm_policy_link(newp, XFRM_POLICY_MAX+dir); - write_unlock_bh(&xfrm_policy_lock); + 
write_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_pol_put(newp); } return newp; @@ -1281,7 +1274,7 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family); xfrm_address_t tmp; - for (nx=0, i = 0; i < policy->xfrm_nr; i++) { + for (nx = 0, i = 0; i < policy->xfrm_nr; i++) { struct xfrm_state *x; xfrm_address_t *remote = daddr; xfrm_address_t *local = saddr; @@ -1311,9 +1304,9 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, error = (x->km.state == XFRM_STATE_ERROR ? -EINVAL : -EAGAIN); xfrm_state_put(x); - } - else if (error == -ESRCH) + } else if (error == -ESRCH) { error = -EAGAIN; + } if (!tmpl->optional) goto fail; @@ -1321,7 +1314,7 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, return nx; fail: - for (nx--; nx>=0; nx--) + for (nx--; nx >= 0; nx--) xfrm_state_put(xfrm[nx]); return error; } @@ -1358,7 +1351,7 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl, return cnx; fail: - for (cnx--; cnx>=0; cnx--) + for (cnx--; cnx >= 0; cnx--) xfrm_state_put(tpp[cnx]); return error; @@ -1636,20 +1629,22 @@ free_dst: goto out; } -static int inline -xfrm_dst_alloc_copy(void **target, const void *src, int size) +#ifdef CONFIG_XFRM_SUB_POLICY +static int xfrm_dst_alloc_copy(void **target, const void *src, int size) { if (!*target) { *target = kmalloc(size, GFP_ATOMIC); if (!*target) return -ENOMEM; } + memcpy(*target, src, size); return 0; } +#endif -static int inline -xfrm_dst_update_parent(struct dst_entry *dst, const struct xfrm_selector *sel) +static int xfrm_dst_update_parent(struct dst_entry *dst, + const struct xfrm_selector *sel) { #ifdef CONFIG_XFRM_SUB_POLICY struct xfrm_dst *xdst = (struct xfrm_dst *)dst; @@ -1660,8 +1655,8 @@ xfrm_dst_update_parent(struct dst_entry *dst, const struct xfrm_selector *sel) #endif } -static int inline -xfrm_dst_update_origin(struct dst_entry *dst, const struct flowi *fl) +static int xfrm_dst_update_origin(struct dst_entry *dst, + const struct flowi *fl) { #ifdef CONFIG_XFRM_SUB_POLICY struct xfrm_dst *xdst = (struct xfrm_dst *)dst; @@ -1699,7 +1694,7 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family, xfrm_pols_put(pols, *num_pols); return PTR_ERR(pols[1]); } - (*num_pols) ++; + (*num_pols)++; (*num_xfrms) += pols[1]->xfrm_nr; } } @@ -1753,7 +1748,7 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, } xdst->num_pols = num_pols; - memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols); + memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); xdst->policy_genid = atomic_read(&pols[0]->genid); return xdst; @@ -1837,7 +1832,7 @@ purge_queue: xfrm_pol_put(pol); } -static int xdst_queue_output(struct sk_buff *skb) +static int xdst_queue_output(struct sock *sk, struct sk_buff *skb) { unsigned long sched_next; struct dst_entry *dst = skb_dst(skb); @@ -1896,8 +1891,7 @@ static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net, if (IS_ERR(xdst)) return xdst; - if (net->xfrm.sysctl_larval_drop || num_xfrms <= 0 || - (fl->flowi_flags & FLOWI_FLAG_CAN_SLEEP)) + if (net->xfrm.sysctl_larval_drop || num_xfrms <= 0) return xdst; dst1 = &xdst->u.dst; @@ -2023,7 +2017,7 @@ make_dummy_bundle: } xdst->num_pols = num_pols; xdst->num_xfrms = num_xfrms; - memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols); + memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); dst_hold(&xdst->u.dst); return &xdst->flo; @@ -2072,7 
+2066,6 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT); int i, err, num_pols, num_xfrms = 0, drop_pols = 0; -restart: dst = NULL; xdst = NULL; route = NULL; @@ -2105,12 +2098,7 @@ restart: } dst_hold(&xdst->u.dst); - - spin_lock_bh(&xfrm_policy_sk_bundle_lock); - xdst->u.dst.next = xfrm_policy_sk_bundles; - xfrm_policy_sk_bundles = &xdst->u.dst; - spin_unlock_bh(&xfrm_policy_sk_bundle_lock); - + xdst->u.dst.flags |= DST_NOCACHE; route = xdst->route; } } @@ -2133,7 +2121,7 @@ restart: num_pols = xdst->num_pols; num_xfrms = xdst->num_xfrms; - memcpy(pols, xdst->pols, sizeof(struct xfrm_policy*) * num_pols); + memcpy(pols, xdst->pols, sizeof(struct xfrm_policy *) * num_pols); route = xdst->route; } @@ -2152,23 +2140,8 @@ restart: return make_blackhole(net, family, dst_orig); } - if (fl->flowi_flags & FLOWI_FLAG_CAN_SLEEP) { - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue(&net->xfrm.km_waitq, &wait); - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - set_current_state(TASK_RUNNING); - remove_wait_queue(&net->xfrm.km_waitq, &wait); - if (!signal_pending(current)) { - dst_release(dst); - goto restart; - } - - err = -ERESTART; - } else - err = -EAGAIN; + err = -EAGAIN; XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); goto error; @@ -2344,7 +2317,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, if (skb->sp) { int i; - for (i=skb->sp->len-1; i>=0; i--) { + for (i = skb->sp->len-1; i >= 0; i--) { struct xfrm_state *x = skb->sp->xvec[i]; if (!xfrm_selector_match(&x->sel, &fl, family)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH); @@ -2390,7 +2363,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, pol->curlft.use_time = get_seconds(); pols[0] = pol; - npols ++; + npols++; #ifdef CONFIG_XFRM_SUB_POLICY if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) { pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, @@ -2402,7 +2375,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, return 0; } pols[1]->curlft.use_time = get_seconds(); - npols ++; + npols++; } } #endif @@ -2434,7 +2407,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } xfrm_nr = ti; if (npols > 1) { - xfrm_tmpl_sort(stp, tpp, xfrm_nr, family); + xfrm_tmpl_sort(stp, tpp, xfrm_nr, family, net); tpp = stp; } @@ -2559,33 +2532,15 @@ static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst) return dst; } -static void __xfrm_garbage_collect(struct net *net) -{ - struct dst_entry *head, *next; - - spin_lock_bh(&xfrm_policy_sk_bundle_lock); - head = xfrm_policy_sk_bundles; - xfrm_policy_sk_bundles = NULL; - spin_unlock_bh(&xfrm_policy_sk_bundle_lock); - - while (head) { - next = head->next; - dst_free(head); - head = next; - } -} - void xfrm_garbage_collect(struct net *net) { - flow_cache_flush(); - __xfrm_garbage_collect(net); + flow_cache_flush(net); } EXPORT_SYMBOL(xfrm_garbage_collect); static void xfrm_garbage_collect_deferred(struct net *net) { - flow_cache_flush_deferred(); - __xfrm_garbage_collect(net); + flow_cache_flush_deferred(net); } static void xfrm_init_pmtu(struct dst_entry *dst) @@ -2820,21 +2775,19 @@ static struct notifier_block xfrm_dev_notifier = { static int __net_init xfrm_statistics_init(struct net *net) { int rv; - - if (snmp_mib_init((void __percpu **)net->mib.xfrm_statistics, - sizeof(struct linux_xfrm_mib), - __alignof__(struct linux_xfrm_mib)) < 0) + net->mib.xfrm_statistics = alloc_percpu(struct linux_xfrm_mib); 
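
The xfrm_statistics_init() hunk just above swaps the old snmp_mib_init()/snmp_mib_free() pair for plain alloc_percpu()/free_percpu(). The pattern behind that conversion is sketched below, illustrative only, with made-up demo_* names: each CPU owns its own copy of the counter array, the fast path bumps only the local copy without locking, and readers fold the copies together, which is what snmp_fold_field() does in the xfrm_proc.c show path further down.

	#include <linux/percpu.h>

	struct demo_mib {
		unsigned long mibs[8];		/* hypothetical counter slots */
	};

	static struct demo_mib __percpu *demo_stats;

	static int demo_stats_init(void)
	{
		demo_stats = alloc_percpu(struct demo_mib);
		return demo_stats ? 0 : -ENOMEM;
	}

	/* fast path: bump the counter on the local CPU, no lock needed */
	static void demo_stats_inc(int field)
	{
		this_cpu_inc(demo_stats->mibs[field]);
	}

	/* slow path (e.g. a seq_file show): sum the per-CPU copies */
	static unsigned long demo_stats_fold(int field)
	{
		unsigned long sum = 0;
		int cpu;

		for_each_possible_cpu(cpu)
			sum += per_cpu_ptr(demo_stats, cpu)->mibs[field];
		return sum;
	}

	static void demo_stats_fini(void)
	{
		free_percpu(demo_stats);
	}
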
+ if (!net->mib.xfrm_statistics) return -ENOMEM; rv = xfrm_proc_init(net); if (rv < 0) - snmp_mib_free((void __percpu **)net->mib.xfrm_statistics); + free_percpu(net->mib.xfrm_statistics); return rv; } static void xfrm_statistics_fini(struct net *net) { xfrm_proc_fini(net); - snmp_mib_free((void __percpu **)net->mib.xfrm_statistics); + free_percpu(net->mib.xfrm_statistics); } #else static int __net_init xfrm_statistics_init(struct net *net) @@ -2899,21 +2852,14 @@ out_byidx: static void xfrm_policy_fini(struct net *net) { - struct xfrm_audit audit_info; unsigned int sz; int dir; flush_work(&net->xfrm.policy_hash_work); #ifdef CONFIG_XFRM_SUB_POLICY - audit_info.loginuid = INVALID_UID; - audit_info.sessionid = -1; - audit_info.secid = 0; - xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, &audit_info); + xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, false); #endif - audit_info.loginuid = INVALID_UID; - audit_info.sessionid = -1; - audit_info.secid = 0; - xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, &audit_info); + xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, false); WARN_ON(!list_empty(&net->xfrm.policy_all)); @@ -2950,8 +2896,19 @@ static int __net_init xfrm_net_init(struct net *net) rv = xfrm_sysctl_init(net); if (rv < 0) goto out_sysctl; + rv = flow_cache_init(net); + if (rv < 0) + goto out; + + /* Initialize the per-net locks here */ + spin_lock_init(&net->xfrm.xfrm_state_lock); + rwlock_init(&net->xfrm.xfrm_policy_lock); + mutex_init(&net->xfrm.xfrm_cfg_mutex); + return 0; +out: + xfrm_sysctl_fini(net); out_sysctl: xfrm_policy_fini(net); out_policy: @@ -2964,6 +2921,7 @@ out_statistics: static void __net_exit xfrm_net_exit(struct net *net) { + flow_cache_fini(net); xfrm_sysctl_fini(net); xfrm_policy_fini(net); xfrm_state_fini(net); @@ -2992,7 +2950,7 @@ static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp, audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s", ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str); - switch(sel->family) { + switch (sel->family) { case AF_INET: audit_log_format(audit_buf, " src=%pI4", &sel->saddr.a4); if (sel->prefixlen_s != 32) @@ -3016,15 +2974,14 @@ static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp, } } -void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, - kuid_t auid, u32 sessionid, u32 secid) +void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SPD-add"); if (audit_buf == NULL) return; - xfrm_audit_helper_usrinfo(auid, sessionid, secid, audit_buf); + xfrm_audit_helper_usrinfo(task_valid, audit_buf); audit_log_format(audit_buf, " res=%u", result); xfrm_audit_common_policyinfo(xp, audit_buf); audit_log_end(audit_buf); @@ -3032,14 +2989,14 @@ void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, EXPORT_SYMBOL_GPL(xfrm_audit_policy_add); void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result, - kuid_t auid, u32 sessionid, u32 secid) + bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SPD-delete"); if (audit_buf == NULL) return; - xfrm_audit_helper_usrinfo(auid, sessionid, secid, audit_buf); + xfrm_audit_helper_usrinfo(task_valid, audit_buf); audit_log_format(audit_buf, " res=%u", result); xfrm_audit_common_policyinfo(xp, audit_buf); audit_log_end(audit_buf); @@ -3069,15 +3026,15 @@ static bool xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp, return false; } -static struct xfrm_policy * xfrm_migrate_policy_find(const struct xfrm_selector *sel, - u8 dir, u8 type) 
+static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel, + u8 dir, u8 type, struct net *net) { struct xfrm_policy *pol, *ret = NULL; struct hlist_head *chain; u32 priority = ~0U; - read_lock_bh(&xfrm_policy_lock); - chain = policy_hash_direct(&init_net, &sel->daddr, &sel->saddr, sel->family, dir); + read_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME*/ + chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir); hlist_for_each_entry(pol, chain, bydst) { if (xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type) { @@ -3086,7 +3043,7 @@ static struct xfrm_policy * xfrm_migrate_policy_find(const struct xfrm_selector break; } } - chain = &init_net.xfrm.policy_inexact[dir]; + chain = &net->xfrm.policy_inexact[dir]; hlist_for_each_entry(pol, chain, bydst) { if (xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type && @@ -3099,7 +3056,7 @@ static struct xfrm_policy * xfrm_migrate_policy_find(const struct xfrm_selector if (ret) xfrm_pol_hold(ret); - read_unlock_bh(&xfrm_policy_lock); + read_unlock_bh(&net->xfrm.xfrm_policy_lock); return ret; } @@ -3210,7 +3167,7 @@ static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate) int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_migrate, - struct xfrm_kmaddress *k) + struct xfrm_kmaddress *k, struct net *net) { int i, err, nx_cur = 0, nx_new = 0; struct xfrm_policy *pol = NULL; @@ -3223,14 +3180,14 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, goto out; /* Stage 1 - find policy */ - if ((pol = xfrm_migrate_policy_find(sel, dir, type)) == NULL) { + if ((pol = xfrm_migrate_policy_find(sel, dir, type, net)) == NULL) { err = -ENOENT; goto out; } /* Stage 2 - find and update state(s) */ for (i = 0, mp = m; i < num_migrate; i++, mp++) { - if ((x = xfrm_migrate_state_find(mp))) { + if ((x = xfrm_migrate_state_find(mp, net))) { x_cur[nx_cur] = x; nx_cur++; if ((xc = xfrm_state_migrate(x, mp))) { diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c index 80cd1e55b83..9c4fbd8935f 100644 --- a/net/xfrm/xfrm_proc.c +++ b/net/xfrm/xfrm_proc.c @@ -52,10 +52,9 @@ static int xfrm_statistics_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; int i; - for (i=0; xfrm_mib_list[i].name; i++) + for (i = 0; xfrm_mib_list[i].name; i++) seq_printf(seq, "%-24s\t%lu\n", xfrm_mib_list[i].name, - snmp_fold_field((void __percpu **) - net->mib.xfrm_statistics, + snmp_fold_field(net->mib.xfrm_statistics, xfrm_mib_list[i].entry)); return 0; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 68c2f357a18..0ab54134bb4 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -35,8 +35,6 @@ destination/tunnel endpoint. 
(output) */ -static DEFINE_SPINLOCK(xfrm_state_lock); - static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024; static inline unsigned int xfrm_dst_hash(struct net *net, @@ -127,7 +125,7 @@ static void xfrm_hash_resize(struct work_struct *work) goto out_unlock; } - spin_lock_bh(&xfrm_state_lock); + spin_lock_bh(&net->xfrm.xfrm_state_lock); nhashmask = (nsize / sizeof(struct hlist_head)) - 1U; for (i = net->xfrm.state_hmask; i >= 0; i--) @@ -144,7 +142,7 @@ static void xfrm_hash_resize(struct work_struct *work) net->xfrm.state_byspi = nspi; net->xfrm.state_hmask = nhashmask; - spin_unlock_bh(&xfrm_state_lock); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); osize = (ohashmask + 1) * sizeof(struct hlist_head); xfrm_hash_free(odst, osize); @@ -163,6 +161,7 @@ static DEFINE_SPINLOCK(xfrm_state_gc_lock); int __xfrm_state_delete(struct xfrm_state *x); int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol); +bool km_is_alive(const struct km_event *c); void km_state_expired(struct xfrm_state *x, int hard, u32 portid); static DEFINE_SPINLOCK(xfrm_type_lock); @@ -374,8 +373,6 @@ static void xfrm_state_gc_task(struct work_struct *work) hlist_for_each_entry_safe(x, tmp, &gc_list, gclist) xfrm_state_gc_destroy(x); - - wake_up(&net->xfrm.km_waitq); } static inline unsigned long make_jiffies(long secs) @@ -386,11 +383,10 @@ static inline unsigned long make_jiffies(long secs) return secs*HZ; } -static enum hrtimer_restart xfrm_timer_handler(struct hrtimer * me) +static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) { struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer); struct xfrm_state *x = container_of(thr, struct xfrm_state, mtimer); - struct net *net = xs_net(x); unsigned long now = get_seconds(); long next = LONG_MAX; int warn = 0; @@ -453,27 +449,21 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer * me) if (warn) km_state_expired(x, 0, 0); resched: - if (next != LONG_MAX){ + if (next != LONG_MAX) { tasklet_hrtimer_start(&x->mtimer, ktime_set(next, 0), HRTIMER_MODE_REL); } goto out; expired: - if (x->km.state == XFRM_STATE_ACQ && x->id.spi == 0) { + if (x->km.state == XFRM_STATE_ACQ && x->id.spi == 0) x->km.state = XFRM_STATE_EXPIRED; - wake_up(&net->xfrm.km_waitq); - next = 2; - goto resched; - } err = __xfrm_state_delete(x); if (!err) km_state_expired(x, 1, 0); - xfrm_audit_state_delete(x, err ? 0 : 1, - audit_get_loginuid(current), - audit_get_sessionid(current), 0); + xfrm_audit_state_delete(x, err ? 0 : 1, true); out: spin_unlock(&x->lock); @@ -535,14 +525,14 @@ int __xfrm_state_delete(struct xfrm_state *x) if (x->km.state != XFRM_STATE_DEAD) { x->km.state = XFRM_STATE_DEAD; - spin_lock(&xfrm_state_lock); + spin_lock(&net->xfrm.xfrm_state_lock); list_del(&x->km.all); hlist_del(&x->bydst); hlist_del(&x->bysrc); if (x->id.spi) hlist_del(&x->byspi); net->xfrm.state_num--; - spin_unlock(&xfrm_state_lock); + spin_unlock(&net->xfrm.xfrm_state_lock); /* All xfrm_state objects are created by xfrm_state_alloc. 
* The xfrm_state_alloc call gives a reference, and that @@ -570,7 +560,7 @@ EXPORT_SYMBOL(xfrm_state_delete); #ifdef CONFIG_SECURITY_NETWORK_XFRM static inline int -xfrm_state_flush_secctx_check(struct net *net, u8 proto, struct xfrm_audit *audit_info) +xfrm_state_flush_secctx_check(struct net *net, u8 proto, bool task_valid) { int i, err = 0; @@ -580,10 +570,7 @@ xfrm_state_flush_secctx_check(struct net *net, u8 proto, struct xfrm_audit *audi hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { if (xfrm_id_proto_match(x->id.proto, proto) && (err = security_xfrm_state_delete(x)) != 0) { - xfrm_audit_state_delete(x, 0, - audit_info->loginuid, - audit_info->sessionid, - audit_info->secid); + xfrm_audit_state_delete(x, 0, task_valid); return err; } } @@ -593,18 +580,18 @@ xfrm_state_flush_secctx_check(struct net *net, u8 proto, struct xfrm_audit *audi } #else static inline int -xfrm_state_flush_secctx_check(struct net *net, u8 proto, struct xfrm_audit *audit_info) +xfrm_state_flush_secctx_check(struct net *net, u8 proto, bool task_valid) { return 0; } #endif -int xfrm_state_flush(struct net *net, u8 proto, struct xfrm_audit *audit_info) +int xfrm_state_flush(struct net *net, u8 proto, bool task_valid) { int i, err = 0, cnt = 0; - spin_lock_bh(&xfrm_state_lock); - err = xfrm_state_flush_secctx_check(net, proto, audit_info); + spin_lock_bh(&net->xfrm.xfrm_state_lock); + err = xfrm_state_flush_secctx_check(net, proto, task_valid); if (err) goto out; @@ -616,18 +603,16 @@ restart: if (!xfrm_state_kern(x) && xfrm_id_proto_match(x->id.proto, proto)) { xfrm_state_hold(x); - spin_unlock_bh(&xfrm_state_lock); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); err = xfrm_state_delete(x); xfrm_audit_state_delete(x, err ? 0 : 1, - audit_info->loginuid, - audit_info->sessionid, - audit_info->secid); + task_valid); xfrm_state_put(x); if (!err) cnt++; - spin_lock_bh(&xfrm_state_lock); + spin_lock_bh(&net->xfrm.xfrm_state_lock); goto restart; } } @@ -636,19 +621,18 @@ restart: err = 0; out: - spin_unlock_bh(&xfrm_state_lock); - wake_up(&net->xfrm.km_waitq); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); return err; } EXPORT_SYMBOL(xfrm_state_flush); void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si) { - spin_lock_bh(&xfrm_state_lock); + spin_lock_bh(&net->xfrm.xfrm_state_lock); si->sadcnt = net->xfrm.state_num; si->sadhcnt = net->xfrm.state_hmask; si->sadhmcnt = xfrm_state_hashmax; - spin_unlock_bh(&xfrm_state_lock); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); } EXPORT_SYMBOL(xfrm_sad_getinfo); @@ -798,10 +782,11 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, struct xfrm_state *best = NULL; u32 mark = pol->mark.v & pol->mark.m; unsigned short encap_family = tmpl->encap_family; + struct km_event c; to_put = NULL; - spin_lock_bh(&xfrm_state_lock); + spin_lock_bh(&net->xfrm.xfrm_state_lock); h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { if (x->props.family == encap_family && @@ -842,6 +827,17 @@ found: error = -EEXIST; goto out; } + + c.net = net; + /* If the KMs have no listeners (yet...), avoid allocating an SA + * for each and every packet - garbage collection might not + * handle the flood. + */ + if (!km_is_alive(&c)) { + error = -ESRCH; + goto out; + } + x = xfrm_state_alloc(net); if (x == NULL) { error = -ENOMEM; @@ -886,7 +882,7 @@ out: xfrm_state_hold(x); else *err = acquire_in_progress ? 
-EAGAIN : error; - spin_unlock_bh(&xfrm_state_lock); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); if (to_put) xfrm_state_put(to_put); return x; @@ -900,7 +896,7 @@ xfrm_stateonly_find(struct net *net, u32 mark, unsigned int h; struct xfrm_state *rx = NULL, *x = NULL; - spin_lock(&xfrm_state_lock); + spin_lock_bh(&net->xfrm.xfrm_state_lock); h = xfrm_dst_hash(net, daddr, saddr, reqid, family); hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { if (x->props.family == family && @@ -918,13 +914,35 @@ xfrm_stateonly_find(struct net *net, u32 mark, if (rx) xfrm_state_hold(rx); - spin_unlock(&xfrm_state_lock); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); return rx; } EXPORT_SYMBOL(xfrm_stateonly_find); +struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi, + unsigned short family) +{ + struct xfrm_state *x; + struct xfrm_state_walk *w; + + spin_lock_bh(&net->xfrm.xfrm_state_lock); + list_for_each_entry(w, &net->xfrm.state_all, all) { + x = container_of(w, struct xfrm_state, km); + if (x->props.family != family || + x->id.spi != spi) + continue; + + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + xfrm_state_hold(x); + return x; + } + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + return NULL; +} +EXPORT_SYMBOL(xfrm_state_lookup_byspi); + static void __xfrm_state_insert(struct xfrm_state *x) { struct net *net = xs_net(x); @@ -950,14 +968,12 @@ static void __xfrm_state_insert(struct xfrm_state *x) if (x->replay_maxage) mod_timer(&x->rtimer, jiffies + x->replay_maxage); - wake_up(&net->xfrm.km_waitq); - net->xfrm.state_num++; xfrm_hash_grow_check(net, x->bydst.next != NULL); } -/* xfrm_state_lock is held */ +/* net->xfrm.xfrm_state_lock is held */ static void __xfrm_state_bump_genids(struct xfrm_state *xnew) { struct net *net = xs_net(xnew); @@ -980,14 +996,16 @@ static void __xfrm_state_bump_genids(struct xfrm_state *xnew) void xfrm_state_insert(struct xfrm_state *x) { - spin_lock_bh(&xfrm_state_lock); + struct net *net = xs_net(x); + + spin_lock_bh(&net->xfrm.xfrm_state_lock); __xfrm_state_bump_genids(x); __xfrm_state_insert(x); - spin_unlock_bh(&xfrm_state_lock); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); } EXPORT_SYMBOL(xfrm_state_insert); -/* xfrm_state_lock is held */ +/* net->xfrm.xfrm_state_lock is held */ static struct xfrm_state *__find_acq_core(struct net *net, const struct xfrm_mark *m, unsigned short family, u8 mode, @@ -1079,7 +1097,7 @@ int xfrm_state_add(struct xfrm_state *x) to_put = NULL; - spin_lock_bh(&xfrm_state_lock); + spin_lock_bh(&net->xfrm.xfrm_state_lock); x1 = __xfrm_state_locate(x, use_spi, family); if (x1) { @@ -1108,7 +1126,7 @@ int xfrm_state_add(struct xfrm_state *x) err = 0; out: - spin_unlock_bh(&xfrm_state_lock); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); if (x1) { xfrm_state_delete(x1); @@ -1123,10 +1141,9 @@ out: EXPORT_SYMBOL(xfrm_state_add); #ifdef CONFIG_XFRM_MIGRATE -static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, int *errp) +static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig) { struct net *net = xs_net(orig); - int err = -ENOMEM; struct xfrm_state *x = xfrm_state_alloc(net); if (!x) goto out; @@ -1147,6 +1164,11 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, int *errp) } x->props.aalgo = orig->props.aalgo; + if (orig->aead) { + x->aead = xfrm_algo_aead_clone(orig->aead); + if (!x->aead) + goto error; + } if (orig->ealg) { x->ealg = xfrm_algo_clone(orig->ealg); if (!x->ealg) @@ -1175,20 +1197,21 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, int 
*errp) } if (orig->replay_esn) { - err = xfrm_replay_clone(x, orig); - if (err) + if (xfrm_replay_clone(x, orig)) goto error; } memcpy(&x->mark, &orig->mark, sizeof(x->mark)); - err = xfrm_init_state(x); - if (err) + if (xfrm_init_state(x) < 0) goto error; x->props.flags = orig->props.flags; x->props.extra_flags = orig->props.extra_flags; + x->tfcpad = orig->tfcpad; + x->replay_maxdiff = orig->replay_maxdiff; + x->replay_maxage = orig->replay_maxage; x->curlft.add_time = orig->curlft.add_time; x->km.state = orig->km.state; x->km.seq = orig->km.seq; @@ -1198,21 +1221,20 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, int *errp) error: xfrm_state_put(x); out: - if (errp) - *errp = err; return NULL; } -/* xfrm_state_lock is held */ -struct xfrm_state * xfrm_migrate_state_find(struct xfrm_migrate *m) +struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net) { unsigned int h; - struct xfrm_state *x; + struct xfrm_state *x = NULL; + + spin_lock_bh(&net->xfrm.xfrm_state_lock); if (m->reqid) { - h = xfrm_dst_hash(&init_net, &m->old_daddr, &m->old_saddr, + h = xfrm_dst_hash(net, &m->old_daddr, &m->old_saddr, m->reqid, m->old_family); - hlist_for_each_entry(x, init_net.xfrm.state_bydst+h, bydst) { + hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { if (x->props.mode != m->mode || x->id.proto != m->proto) continue; @@ -1224,12 +1246,12 @@ struct xfrm_state * xfrm_migrate_state_find(struct xfrm_migrate *m) m->old_family)) continue; xfrm_state_hold(x); - return x; + break; } } else { - h = xfrm_src_hash(&init_net, &m->old_daddr, &m->old_saddr, + h = xfrm_src_hash(net, &m->old_daddr, &m->old_saddr, m->old_family); - hlist_for_each_entry(x, init_net.xfrm.state_bysrc+h, bysrc) { + hlist_for_each_entry(x, net->xfrm.state_bysrc+h, bysrc) { if (x->props.mode != m->mode || x->id.proto != m->proto) continue; @@ -1239,21 +1261,22 @@ struct xfrm_state * xfrm_migrate_state_find(struct xfrm_migrate *m) m->old_family)) continue; xfrm_state_hold(x); - return x; + break; } } - return NULL; + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + + return x; } EXPORT_SYMBOL(xfrm_migrate_state_find); -struct xfrm_state * xfrm_state_migrate(struct xfrm_state *x, - struct xfrm_migrate *m) +struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, + struct xfrm_migrate *m) { struct xfrm_state *xc; - int err; - xc = xfrm_state_clone(x, &err); + xc = xfrm_state_clone(x); if (!xc) return NULL; @@ -1266,7 +1289,7 @@ struct xfrm_state * xfrm_state_migrate(struct xfrm_state *x, state is to be updated as it is a part of triplet */ xfrm_state_insert(xc); } else { - if ((err = xfrm_state_add(xc)) < 0) + if (xfrm_state_add(xc) < 0) goto error; } @@ -1283,10 +1306,11 @@ int xfrm_state_update(struct xfrm_state *x) struct xfrm_state *x1, *to_put; int err; int use_spi = xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY); + struct net *net = xs_net(x); to_put = NULL; - spin_lock_bh(&xfrm_state_lock); + spin_lock_bh(&net->xfrm.xfrm_state_lock); x1 = __xfrm_state_locate(x, use_spi, x->props.family); err = -ESRCH; @@ -1306,7 +1330,7 @@ int xfrm_state_update(struct xfrm_state *x) err = 0; out: - spin_unlock_bh(&xfrm_state_lock); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); if (to_put) xfrm_state_put(to_put); @@ -1357,7 +1381,7 @@ int xfrm_state_check_expire(struct xfrm_state *x) if (x->curlft.bytes >= x->lft.hard_byte_limit || x->curlft.packets >= x->lft.hard_packet_limit) { x->km.state = XFRM_STATE_EXPIRED; - tasklet_hrtimer_start(&x->mtimer, ktime_set(0,0), HRTIMER_MODE_REL); + 
tasklet_hrtimer_start(&x->mtimer, ktime_set(0, 0), HRTIMER_MODE_REL); return -EINVAL; } @@ -1377,9 +1401,9 @@ xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 { struct xfrm_state *x; - spin_lock_bh(&xfrm_state_lock); + spin_lock_bh(&net->xfrm.xfrm_state_lock); x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family); - spin_unlock_bh(&xfrm_state_lock); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } EXPORT_SYMBOL(xfrm_state_lookup); @@ -1391,9 +1415,9 @@ xfrm_state_lookup_byaddr(struct net *net, u32 mark, { struct xfrm_state *x; - spin_lock_bh(&xfrm_state_lock); + spin_lock_bh(&net->xfrm.xfrm_state_lock); x = __xfrm_state_lookup_byaddr(net, mark, daddr, saddr, proto, family); - spin_unlock_bh(&xfrm_state_lock); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } EXPORT_SYMBOL(xfrm_state_lookup_byaddr); @@ -1405,9 +1429,9 @@ xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, { struct xfrm_state *x; - spin_lock_bh(&xfrm_state_lock); + spin_lock_bh(&net->xfrm.xfrm_state_lock); x = __find_acq_core(net, mark, family, mode, reqid, proto, daddr, saddr, create); - spin_unlock_bh(&xfrm_state_lock); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } @@ -1416,17 +1440,17 @@ EXPORT_SYMBOL(xfrm_find_acq); #ifdef CONFIG_XFRM_SUB_POLICY int xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n, - unsigned short family) + unsigned short family, struct net *net) { int err = 0; struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); if (!afinfo) return -EAFNOSUPPORT; - spin_lock_bh(&xfrm_state_lock); + spin_lock_bh(&net->xfrm.xfrm_state_lock); /*FIXME*/ if (afinfo->tmpl_sort) err = afinfo->tmpl_sort(dst, src, n); - spin_unlock_bh(&xfrm_state_lock); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); xfrm_state_put_afinfo(afinfo); return err; } @@ -1438,13 +1462,15 @@ xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n, { int err = 0; struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); + struct net *net = xs_net(*src); + if (!afinfo) return -EAFNOSUPPORT; - spin_lock_bh(&xfrm_state_lock); + spin_lock_bh(&net->xfrm.xfrm_state_lock); if (afinfo->state_sort) err = afinfo->state_sort(dst, src, n); - spin_unlock_bh(&xfrm_state_lock); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); xfrm_state_put_afinfo(afinfo); return err; } @@ -1476,9 +1502,9 @@ struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq) { struct xfrm_state *x; - spin_lock_bh(&xfrm_state_lock); + spin_lock_bh(&net->xfrm.xfrm_state_lock); x = __xfrm_find_acq_byseq(net, mark, seq); - spin_unlock_bh(&xfrm_state_lock); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } EXPORT_SYMBOL(xfrm_find_acq_byseq); @@ -1496,6 +1522,30 @@ u32 xfrm_get_acqseq(void) } EXPORT_SYMBOL(xfrm_get_acqseq); +int verify_spi_info(u8 proto, u32 min, u32 max) +{ + switch (proto) { + case IPPROTO_AH: + case IPPROTO_ESP: + break; + + case IPPROTO_COMP: + /* IPCOMP spi is 16-bits. 
*/ + if (max >= 0x10000) + return -EINVAL; + break; + + default: + return -EINVAL; + } + + if (min > max) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(verify_spi_info); + int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high) { struct net *net = xs_net(x); @@ -1525,8 +1575,8 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high) x->id.spi = minspi; } else { u32 spi = 0; - for (h=0; h<high-low+1; h++) { - spi = low + net_random()%(high-low+1); + for (h = 0; h < high-low+1; h++) { + spi = low + prandom_u32()%(high-low+1); x0 = xfrm_state_lookup(net, mark, &x->id.daddr, htonl(spi), x->id.proto, x->props.family); if (x0 == NULL) { x->id.spi = htonl(spi); @@ -1536,10 +1586,10 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high) } } if (x->id.spi) { - spin_lock_bh(&xfrm_state_lock); + spin_lock_bh(&net->xfrm.xfrm_state_lock); h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); hlist_add_head(&x->byspi, net->xfrm.state_byspi+h); - spin_unlock_bh(&xfrm_state_lock); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); err = 0; } @@ -1551,6 +1601,23 @@ unlock: } EXPORT_SYMBOL(xfrm_alloc_spi); +static bool __xfrm_state_filter_match(struct xfrm_state *x, + struct xfrm_address_filter *filter) +{ + if (filter) { + if ((filter->family == AF_INET || + filter->family == AF_INET6) && + x->props.family != filter->family) + return false; + + return addr_match(&x->props.saddr, &filter->saddr, + filter->splen) && + addr_match(&x->id.daddr, &filter->daddr, + filter->dplen); + } + return true; +} + int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk, int (*func)(struct xfrm_state *, int, void*), void *data) @@ -1562,7 +1629,7 @@ int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk, if (walk->seq != 0 && list_empty(&walk->all)) return 0; - spin_lock_bh(&xfrm_state_lock); + spin_lock_bh(&net->xfrm.xfrm_state_lock); if (list_empty(&walk->all)) x = list_first_entry(&net->xfrm.state_all, struct xfrm_state_walk, all); else @@ -1573,6 +1640,8 @@ int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk, state = container_of(x, struct xfrm_state, km); if (!xfrm_id_proto_match(state->id.proto, walk->proto)) continue; + if (!__xfrm_state_filter_match(state, walk->filter)) + continue; err = func(state, walk->seq, data); if (err) { list_move_tail(&walk->all, &x->all); @@ -1586,34 +1655,38 @@ int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk, } list_del_init(&walk->all); out: - spin_unlock_bh(&xfrm_state_lock); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); return err; } EXPORT_SYMBOL(xfrm_state_walk); -void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto) +void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto, + struct xfrm_address_filter *filter) { INIT_LIST_HEAD(&walk->all); walk->proto = proto; walk->state = XFRM_STATE_DEAD; walk->seq = 0; + walk->filter = filter; } EXPORT_SYMBOL(xfrm_state_walk_init); -void xfrm_state_walk_done(struct xfrm_state_walk *walk) +void xfrm_state_walk_done(struct xfrm_state_walk *walk, struct net *net) { + kfree(walk->filter); + if (list_empty(&walk->all)) return; - spin_lock_bh(&xfrm_state_lock); + spin_lock_bh(&net->xfrm.xfrm_state_lock); list_del(&walk->all); - spin_unlock_bh(&xfrm_state_lock); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); } EXPORT_SYMBOL(xfrm_state_walk_done); static void xfrm_replay_timer_handler(unsigned long data) { - struct xfrm_state *x = (struct xfrm_state*)data; + struct xfrm_state *x = (struct xfrm_state *)data; spin_lock(&x->lock); @@ -1655,16 
+1728,12 @@ EXPORT_SYMBOL(km_state_notify); void km_state_expired(struct xfrm_state *x, int hard, u32 portid) { - struct net *net = xs_net(x); struct km_event c; c.data.hard = hard; c.portid = portid; c.event = XFRM_MSG_EXPIRE; km_state_notify(x, &c); - - if (hard) - wake_up(&net->xfrm.km_waitq); } EXPORT_SYMBOL(km_state_expired); @@ -1707,16 +1776,12 @@ EXPORT_SYMBOL(km_new_mapping); void km_policy_expired(struct xfrm_policy *pol, int dir, int hard, u32 portid) { - struct net *net = xp_net(pol); struct km_event c; c.data.hard = hard; c.portid = portid; c.event = XFRM_MSG_POLEXPIRE; km_policy_notify(pol, dir, &c); - - if (hard) - wake_up(&net->xfrm.km_waitq); } EXPORT_SYMBOL(km_policy_expired); @@ -1762,6 +1827,24 @@ int km_report(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address } EXPORT_SYMBOL(km_report); +bool km_is_alive(const struct km_event *c) +{ + struct xfrm_mgr *km; + bool is_alive = false; + + rcu_read_lock(); + list_for_each_entry_rcu(km, &xfrm_km_list, list) { + if (km->is_alive && km->is_alive(c)) { + is_alive = true; + break; + } + } + rcu_read_unlock(); + + return is_alive; +} +EXPORT_SYMBOL(km_is_alive); + int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen) { int err; @@ -2025,7 +2108,7 @@ int __net_init xfrm_state_init(struct net *net) INIT_WORK(&net->xfrm.state_hash_work, xfrm_hash_resize); INIT_HLIST_HEAD(&net->xfrm.state_gc_list); INIT_WORK(&net->xfrm.state_gc_work, xfrm_state_gc_task); - init_waitqueue_head(&net->xfrm.km_waitq); + spin_lock_init(&net->xfrm.xfrm_state_lock); return 0; out_byspi: @@ -2038,14 +2121,10 @@ out_bydst: void xfrm_state_fini(struct net *net) { - struct xfrm_audit audit_info; unsigned int sz; flush_work(&net->xfrm.state_hash_work); - audit_info.loginuid = INVALID_UID; - audit_info.sessionid = -1; - audit_info.secid = 0; - xfrm_state_flush(net, IPSEC_PROTO_ANY, &audit_info); + xfrm_state_flush(net, IPSEC_PROTO_ANY, false); flush_work(&net->xfrm.state_gc_work); WARN_ON(!list_empty(&net->xfrm.state_all)); @@ -2070,7 +2149,7 @@ static void xfrm_audit_helper_sainfo(struct xfrm_state *x, audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s", ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str); - switch(x->props.family) { + switch (x->props.family) { case AF_INET: audit_log_format(audit_buf, " src=%pI4 dst=%pI4", &x->props.saddr.a4, &x->id.daddr.a4); @@ -2100,7 +2179,7 @@ static void xfrm_audit_helper_pktinfo(struct sk_buff *skb, u16 family, iph6 = ipv6_hdr(skb); audit_log_format(audit_buf, " src=%pI6 dst=%pI6 flowlbl=0x%x%02x%02x", - &iph6->saddr,&iph6->daddr, + &iph6->saddr, &iph6->daddr, iph6->flow_lbl[0] & 0x0f, iph6->flow_lbl[1], iph6->flow_lbl[2]); @@ -2108,30 +2187,28 @@ static void xfrm_audit_helper_pktinfo(struct sk_buff *skb, u16 family, } } -void xfrm_audit_state_add(struct xfrm_state *x, int result, - kuid_t auid, u32 sessionid, u32 secid) +void xfrm_audit_state_add(struct xfrm_state *x, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SAD-add"); if (audit_buf == NULL) return; - xfrm_audit_helper_usrinfo(auid, sessionid, secid, audit_buf); + xfrm_audit_helper_usrinfo(task_valid, audit_buf); xfrm_audit_helper_sainfo(x, audit_buf); audit_log_format(audit_buf, " res=%u", result); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_add); -void xfrm_audit_state_delete(struct xfrm_state *x, int result, - kuid_t auid, u32 sessionid, u32 secid) +void xfrm_audit_state_delete(struct xfrm_state *x, int result, bool task_valid) { struct audit_buffer 
*audit_buf; audit_buf = xfrm_audit_start("SAD-delete"); if (audit_buf == NULL) return; - xfrm_audit_helper_usrinfo(auid, sessionid, secid, audit_buf); + xfrm_audit_helper_usrinfo(task_valid, audit_buf); xfrm_audit_helper_sainfo(x, audit_buf); audit_log_format(audit_buf, " res=%u", result); audit_log_end(audit_buf); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index f964d4c00ff..d4db6ebb089 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -32,11 +32,6 @@ #include <linux/in6.h> #endif -static inline int aead_len(struct xfrm_algo_aead *alg) -{ - return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); -} - static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type) { struct nlattr *rt = attrs[type]; @@ -142,7 +137,8 @@ static inline int verify_replay(struct xfrm_usersa_info *p, if (!rt) return 0; - if (p->id.proto != IPPROTO_ESP) + /* As only ESP and AH support ESN feature. */ + if ((p->id.proto != IPPROTO_ESP) && (p->id.proto != IPPROTO_AH)) return -EINVAL; if (p->replay_window != 0) @@ -209,7 +205,8 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, attrs[XFRMA_ALG_AUTH] || attrs[XFRMA_ALG_AUTH_TRUNC] || attrs[XFRMA_ALG_CRYPT] || - attrs[XFRMA_TFCPAD]) + attrs[XFRMA_TFCPAD] || + (ntohl(p->id.spi) >= 0x10000)) goto out; break; @@ -599,9 +596,6 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, struct xfrm_state *x; int err; struct km_event c; - kuid_t loginuid = audit_get_loginuid(current); - u32 sessionid = audit_get_sessionid(current); - u32 sid; err = verify_newsa_info(p, attrs); if (err) @@ -617,8 +611,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, else err = xfrm_state_update(x); - security_task_getsecid(current, &sid); - xfrm_audit_state_add(x, err ? 0 : 1, loginuid, sessionid, sid); + xfrm_audit_state_add(x, err ? 0 : 1, true); if (err < 0) { x->km.state = XFRM_STATE_DEAD; @@ -678,9 +671,6 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, int err = -ESRCH; struct km_event c; struct xfrm_usersa_id *p = nlmsg_data(nlh); - kuid_t loginuid = audit_get_loginuid(current); - u32 sessionid = audit_get_sessionid(current); - u32 sid; x = xfrm_user_state_lookup(net, p, attrs, &err); if (x == NULL) @@ -705,8 +695,7 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, km_state_notify(x, &c); out: - security_task_getsecid(current, &sid); - xfrm_audit_state_delete(x, err ? 0 : 1, loginuid, sessionid, sid); + xfrm_audit_state_delete(x, err ? 
0 : 1, true); xfrm_state_put(x); return err; } @@ -877,10 +866,14 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr) static int xfrm_dump_sa_done(struct netlink_callback *cb) { struct xfrm_state_walk *walk = (struct xfrm_state_walk *) &cb->args[1]; - xfrm_state_walk_done(walk); + struct sock *sk = cb->skb->sk; + struct net *net = sock_net(sk); + + xfrm_state_walk_done(walk, net); return 0; } +static const struct nla_policy xfrma_policy[XFRMA_MAX+1]; static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); @@ -896,8 +889,31 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) info.nlmsg_flags = NLM_F_MULTI; if (!cb->args[0]) { + struct nlattr *attrs[XFRMA_MAX+1]; + struct xfrm_address_filter *filter = NULL; + u8 proto = 0; + int err; + cb->args[0] = 1; - xfrm_state_walk_init(walk, 0); + + err = nlmsg_parse(cb->nlh, 0, attrs, XFRMA_MAX, + xfrma_policy); + if (err < 0) + return err; + + if (attrs[XFRMA_ADDRESS_FILTER]) { + filter = kmalloc(sizeof(*filter), GFP_KERNEL); + if (filter == NULL) + return -ENOMEM; + + memcpy(filter, nla_data(attrs[XFRMA_ADDRESS_FILTER]), + sizeof(*filter)); + } + + if (attrs[XFRMA_PROTO]) + proto = nla_get_u8(attrs[XFRMA_PROTO]); + + xfrm_state_walk_init(walk, proto, filter); } (void) xfrm_state_walk(net, walk, dump_one_state, &info); @@ -930,6 +946,20 @@ static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb, return skb; } +/* A wrapper for nlmsg_multicast() checking that nlsk is still available. + * Must be called with RCU read lock. + */ +static inline int xfrm_nlmsg_multicast(struct net *net, struct sk_buff *skb, + u32 pid, unsigned int group) +{ + struct sock *nlsk = rcu_dereference(net->xfrm.nlsk); + + if (nlsk) + return nlmsg_multicast(nlsk, skb, pid, group, GFP_ATOMIC); + else + return -1; +} + static inline size_t xfrm_spdinfo_msgsize(void) { return NLMSG_ALIGN(4) @@ -1074,29 +1104,6 @@ out_noput: return err; } -static int verify_userspi_info(struct xfrm_userspi_info *p) -{ - switch (p->info.id.proto) { - case IPPROTO_AH: - case IPPROTO_ESP: - break; - - case IPPROTO_COMP: - /* IPCOMP spi is 16-bits. 
*/ - if (p->max >= 0x10000) - return -EINVAL; - break; - - default: - return -EINVAL; - } - - if (p->min > p->max) - return -EINVAL; - - return 0; -} - static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs) { @@ -1111,7 +1118,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, struct xfrm_mark m; p = nlmsg_data(nlh); - err = verify_userspi_info(p); + err = verify_spi_info(p->info.id.proto, p->min, p->max); if (err) goto out_noput; @@ -1189,6 +1196,8 @@ static int verify_policy_type(u8 type) static int verify_newpolicy_info(struct xfrm_userpolicy_info *p) { + int ret; + switch (p->share) { case XFRM_SHARE_ANY: case XFRM_SHARE_SESSION: @@ -1224,7 +1233,13 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p) return -EINVAL; } - return verify_policy_dir(p->dir); + ret = verify_policy_dir(p->dir); + if (ret) + return ret; + if (p->index && ((p->index & XFRM_POLICY_MAX) != p->dir)) + return -EINVAL; + + return 0; } static int copy_from_user_sec_ctx(struct xfrm_policy *pol, struct nlattr **attrs) @@ -1236,7 +1251,7 @@ static int copy_from_user_sec_ctx(struct xfrm_policy *pol, struct nlattr **attrs return 0; uctx = nla_data(rt); - return security_xfrm_policy_alloc(&pol->security, uctx); + return security_xfrm_policy_alloc(&pol->security, uctx, GFP_KERNEL); } static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, @@ -1404,9 +1419,6 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, struct km_event c; int err; int excl; - kuid_t loginuid = audit_get_loginuid(current); - u32 sessionid = audit_get_sessionid(current); - u32 sid; err = verify_newpolicy_info(p); if (err) @@ -1425,8 +1437,7 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, * a type XFRM_MSG_UPDPOLICY - JHS */ excl = nlh->nlmsg_type == XFRM_MSG_NEWPOLICY; err = xfrm_policy_insert(p->dir, xp, excl); - security_task_getsecid(current, &sid); - xfrm_audit_policy_add(xp, err ? 0 : 1, loginuid, sessionid, sid); + xfrm_audit_policy_add(xp, err ? 0 : 1, true); if (err) { security_xfrm_policy_free(xp->security); @@ -1547,8 +1558,9 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr static int xfrm_dump_policy_done(struct netlink_callback *cb) { struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *) &cb->args[1]; + struct net *net = sock_net(cb->skb->sk); - xfrm_policy_walk_done(walk); + xfrm_policy_walk_done(walk, net); return 0; } @@ -1640,7 +1652,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, if (rt) { struct xfrm_user_sec_ctx *uctx = nla_data(rt); - err = security_xfrm_policy_alloc(&ctx, uctx); + err = security_xfrm_policy_alloc(&ctx, uctx, GFP_KERNEL); if (err) return err; } @@ -1662,13 +1674,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, NETLINK_CB(skb).portid); } } else { - kuid_t loginuid = audit_get_loginuid(current); - u32 sessionid = audit_get_sessionid(current); - u32 sid; - - security_task_getsecid(current, &sid); - xfrm_audit_policy_delete(xp, err ? 0 : 1, loginuid, sessionid, - sid); + xfrm_audit_policy_delete(xp, err ? 
0 : 1, true); if (err != 0) goto out; @@ -1693,13 +1699,9 @@ static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct km_event c; struct xfrm_usersa_flush *p = nlmsg_data(nlh); - struct xfrm_audit audit_info; int err; - audit_info.loginuid = audit_get_loginuid(current); - audit_info.sessionid = audit_get_sessionid(current); - security_task_getsecid(current, &audit_info.secid); - err = xfrm_state_flush(net, p->proto, &audit_info); + err = xfrm_state_flush(net, p->proto, true); if (err) { if (err == -ESRCH) /* empty table */ return 0; @@ -1740,11 +1742,11 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct return -EMSGSIZE; id = nlmsg_data(nlh); - memcpy(&id->sa_id.daddr, &x->id.daddr,sizeof(x->id.daddr)); + memcpy(&id->sa_id.daddr, &x->id.daddr, sizeof(x->id.daddr)); id->sa_id.spi = x->id.spi; id->sa_id.family = x->props.family; id->sa_id.proto = x->id.proto; - memcpy(&id->saddr, &x->props.saddr,sizeof(x->props.saddr)); + memcpy(&id->saddr, &x->props.saddr, sizeof(x->props.saddr)); id->reqid = x->props.reqid; id->flags = c->data.aevent; @@ -1833,7 +1835,7 @@ static int xfrm_new_ae(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct xfrm_state *x; struct km_event c; - int err = - EINVAL; + int err = -EINVAL; u32 mark = 0; struct xfrm_mark m; struct xfrm_aevent_id *p = nlmsg_data(nlh); @@ -1883,16 +1885,12 @@ static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh, struct km_event c; u8 type = XFRM_POLICY_TYPE_MAIN; int err; - struct xfrm_audit audit_info; err = copy_from_user_policy_type(&type, attrs); if (err) return err; - audit_info.loginuid = audit_get_loginuid(current); - audit_info.sessionid = audit_get_sessionid(current); - security_task_getsecid(current, &audit_info.secid); - err = xfrm_policy_flush(net, type, &audit_info); + err = xfrm_policy_flush(net, type, true); if (err) { if (err == -ESRCH) /* empty table */ return 0; @@ -1942,7 +1940,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, if (rt) { struct xfrm_user_sec_ctx *uctx = nla_data(rt); - err = security_xfrm_policy_alloc(&ctx, uctx); + err = security_xfrm_policy_alloc(&ctx, uctx, GFP_KERNEL); if (err) return err; } @@ -1958,14 +1956,8 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, err = 0; if (up->hard) { - kuid_t loginuid = audit_get_loginuid(current); - u32 sessionid = audit_get_sessionid(current); - u32 sid; - - security_task_getsecid(current, &sid); xfrm_policy_delete(xp, p->dir); - xfrm_audit_policy_delete(xp, 1, loginuid, sessionid, sid); - + xfrm_audit_policy_delete(xp, 1, true); } else { // reset the timers here? 
WARN(1, "Dont know what to do with soft policy expire\n"); @@ -2001,13 +1993,8 @@ static int xfrm_add_sa_expire(struct sk_buff *skb, struct nlmsghdr *nlh, km_state_expired(x, ue->hard, nlh->nlmsg_pid); if (ue->hard) { - kuid_t loginuid = audit_get_loginuid(current); - u32 sessionid = audit_get_sessionid(current); - u32 sid; - - security_task_getsecid(current, &sid); __xfrm_state_delete(x); - xfrm_audit_state_delete(x, 1, loginuid, sessionid, sid); + xfrm_audit_state_delete(x, 1, true); } err = 0; out: @@ -2129,6 +2116,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, u8 type; int err; int n = 0; + struct net *net = sock_net(skb->sk); if (attrs[XFRMA_MIGRATE] == NULL) return -EINVAL; @@ -2146,7 +2134,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, if (!n) return 0; - xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp); + xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net); return 0; } @@ -2253,7 +2241,7 @@ static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, if (build_migrate(skb, m, num_migrate, k, sel, dir, type) < 0) BUG(); - return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_MIGRATE, GFP_ATOMIC); + return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_MIGRATE); } #else static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, @@ -2316,6 +2304,8 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_TFCPAD] = { .type = NLA_U32 }, [XFRMA_REPLAY_ESN_VAL] = { .len = sizeof(struct xfrm_replay_state_esn) }, [XFRMA_SA_EXTRA_FLAGS] = { .type = NLA_U32 }, + [XFRMA_PROTO] = { .type = NLA_U8 }, + [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, }; static const struct xfrm_link { @@ -2363,7 +2353,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) link = &xfrm_dispatch[type]; /* All operations require privileges, even GET */ - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + if (!netlink_net_capable(skb, CAP_NET_ADMIN)) return -EPERM; if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) || @@ -2394,9 +2384,11 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) static void xfrm_netlink_rcv(struct sk_buff *skb) { - mutex_lock(&xfrm_cfg_mutex); + struct net *net = sock_net(skb->sk); + + mutex_lock(&net->xfrm.xfrm_cfg_mutex); netlink_rcv_skb(skb, &xfrm_user_rcv_msg); - mutex_unlock(&xfrm_cfg_mutex); + mutex_unlock(&net->xfrm.xfrm_cfg_mutex); } static inline size_t xfrm_expire_msgsize(void) @@ -2440,7 +2432,7 @@ static int xfrm_exp_state_notify(struct xfrm_state *x, const struct km_event *c) return -EMSGSIZE; } - return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_EXPIRE, GFP_ATOMIC); + return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_EXPIRE); } static int xfrm_aevent_state_notify(struct xfrm_state *x, const struct km_event *c) @@ -2455,7 +2447,7 @@ static int xfrm_aevent_state_notify(struct xfrm_state *x, const struct km_event if (build_aevent(skb, x, c) < 0) BUG(); - return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_AEVENTS, GFP_ATOMIC); + return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_AEVENTS); } static int xfrm_notify_sa_flush(const struct km_event *c) @@ -2481,7 +2473,7 @@ static int xfrm_notify_sa_flush(const struct km_event *c) nlmsg_end(skb, nlh); - return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_SA, GFP_ATOMIC); + return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_SA); } static inline size_t xfrm_sa_len(struct xfrm_state *x) @@ -2568,7 +2560,7 @@ static int xfrm_notify_sa(struct xfrm_state 
*x, const struct km_event *c) nlmsg_end(skb, nlh); - return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_SA, GFP_ATOMIC); + return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_SA); out_free_skb: kfree_skb(skb); @@ -2659,7 +2651,7 @@ static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt, if (build_acquire(skb, x, xt, xp) < 0) BUG(); - return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_ACQUIRE, GFP_ATOMIC); + return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_ACQUIRE); } /* User gives us xfrm_user_policy_info followed by an array of 0 @@ -2773,7 +2765,7 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, const struct if (build_polexpire(skb, xp, dir, c) < 0) BUG(); - return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_EXPIRE, GFP_ATOMIC); + return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_EXPIRE); } static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_event *c) @@ -2835,7 +2827,7 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_e nlmsg_end(skb, nlh); - return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_POLICY, GFP_ATOMIC); + return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_POLICY); out_free_skb: kfree_skb(skb); @@ -2863,7 +2855,7 @@ static int xfrm_notify_policy_flush(const struct km_event *c) nlmsg_end(skb, nlh); - return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_POLICY, GFP_ATOMIC); + return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_POLICY); out_free_skb: kfree_skb(skb); @@ -2932,7 +2924,7 @@ static int xfrm_send_report(struct net *net, u8 proto, if (build_report(skb, proto, sel, addr) < 0) BUG(); - return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_REPORT, GFP_ATOMIC); + return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_REPORT); } static inline size_t xfrm_mapping_msgsize(void) @@ -2984,7 +2976,12 @@ static int xfrm_send_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, if (build_mapping(skb, x, ipaddr, sport) < 0) BUG(); - return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_MAPPING, GFP_ATOMIC); + return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_MAPPING); +} + +static bool xfrm_is_alive(const struct km_event *c) +{ + return (bool)xfrm_acquire_is_on(c->net); } static struct xfrm_mgr netlink_mgr = { @@ -2996,6 +2993,7 @@ static struct xfrm_mgr netlink_mgr = { .report = xfrm_send_report, .migrate = xfrm_send_migrate, .new_mapping = xfrm_send_mapping, + .is_alive = xfrm_is_alive, }; static int __net_init xfrm_user_net_init(struct net *net) |
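The hunks above convert the file-scope xfrm_state_lock into net->xfrm.xfrm_state_lock (initialised in xfrm_state_init()), so each network namespace serialises its own SA database instead of all namespaces contending on one global spinlock. The sketch below is only a userspace pthread analogue of that per-instance-lock pattern; struct ns_sadb, ns_sadb_init() and sadb_insert() are hypothetical names, not kernel interfaces.

/* Illustrative only: a lock per "namespace" instead of one global lock.
 * Hypothetical names; build with -pthread.
 */
#include <pthread.h>
#include <stdio.h>

struct sa_entry {
	unsigned int spi;
	struct sa_entry *next;
};

struct ns_sadb {                        /* stands in for net->xfrm */
	pthread_mutex_t state_lock;     /* per-instance, like xfrm_state_lock after the change */
	struct sa_entry *head;
	unsigned int state_num;
};

static void ns_sadb_init(struct ns_sadb *ns)
{
	pthread_mutex_init(&ns->state_lock, NULL);  /* analogue of spin_lock_init() in xfrm_state_init() */
	ns->head = NULL;
	ns->state_num = 0;
}

static void sadb_insert(struct ns_sadb *ns, struct sa_entry *e)
{
	pthread_mutex_lock(&ns->state_lock);        /* only this namespace's table is locked */
	e->next = ns->head;
	ns->head = e;
	ns->state_num++;
	pthread_mutex_unlock(&ns->state_lock);
}

int main(void)
{
	struct ns_sadb a, b;
	struct sa_entry e1 = { .spi = 0x100 }, e2 = { .spi = 0x200 };

	ns_sadb_init(&a);
	ns_sadb_init(&b);
	sadb_insert(&a, &e1);   /* inserts into different "namespaces" never contend */
	sadb_insert(&b, &e2);
	printf("a=%u b=%u\n", a.state_num, b.state_num);
	return 0;
}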
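km_is_alive() above lets xfrm_state_find() return -ESRCH instead of allocating a larval ACQUIRE state when no key manager reports itself alive, which matches the in-diff comment about not flooding garbage collection with SAs nobody will resolve. Below is a standalone sketch of that gate; struct key_mgr, mgr_list and acquire() are hypothetical stand-ins for xfrm_mgr, xfrm_km_list (walked under RCU in the kernel) and the ACQUIRE path.

/* Illustrative sketch: only create an "acquire" record if some registered
 * manager reports itself alive. Hypothetical names, plain userspace C.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct key_mgr {
	bool (*is_alive)(void);
	struct key_mgr *next;
};

static struct key_mgr *mgr_list;        /* analogue of xfrm_km_list */

static bool any_mgr_alive(void)
{
	for (struct key_mgr *m = mgr_list; m; m = m->next)
		if (m->is_alive && m->is_alive())
			return true;            /* first live listener is enough */
	return false;
}

struct acquire { unsigned int reqid; };

static struct acquire *acquire(unsigned int reqid)
{
	if (!any_mgr_alive())
		return NULL;                /* like returning -ESRCH instead of a larval SA */

	struct acquire *a = malloc(sizeof(*a));
	if (a)
		a->reqid = reqid;
	return a;
}

static bool daemon_running(void) { return true; }

int main(void)
{
	printf("no managers: %p\n", (void *)acquire(1));   /* NULL: nothing queued */

	struct key_mgr ike = { .is_alive = daemon_running };
	mgr_list = &ike;
	struct acquire *a = acquire(1);
	printf("with manager: %p\n", (void *)a);
	free(a);
	return 0;
}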
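verify_userspi_info() is removed from xfrm_user.c and re-exported from xfrm_state.c as verify_spi_info() above, so the SPI-range sanity check can be shared: AH and ESP accept any 32-bit range, IPComp is capped at 16 bits (its CPI field is only 16 bits wide), other protocols and min > max are rejected. A standalone rendition follows; the PROTO_* values are the IANA protocol numbers, declared locally rather than pulled from kernel headers.

/* Standalone rendition of the verify_spi_info() checks; not the kernel code. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

enum {                       /* IANA protocol numbers */
	PROTO_ESP  = 50,
	PROTO_AH   = 51,
	PROTO_COMP = 108,    /* IPComp */
};

static int check_spi_range(uint8_t proto, uint32_t min, uint32_t max)
{
	switch (proto) {
	case PROTO_AH:
	case PROTO_ESP:
		break;
	case PROTO_COMP:
		if (max >= 0x10000)   /* IPComp CPIs are only 16 bits */
			return -EINVAL;
		break;
	default:
		return -EINVAL;       /* no SPI allocation for other protocols */
	}

	if (min > max)
		return -EINVAL;

	return 0;
}

int main(void)
{
	printf("%d\n", check_spi_range(PROTO_ESP, 0x100, 0x1ff));     /* 0 */
	printf("%d\n", check_spi_range(PROTO_COMP, 0x100, 0x20000));  /* -22 (EINVAL) */
	printf("%d\n", check_spi_range(PROTO_AH, 5, 1));              /* -22 (EINVAL) */
	return 0;
}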
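In xfrm_alloc_spi() the random fallback now draws from prandom_u32() instead of net_random(), but the selection loop is unchanged: try up to high - low + 1 times, each time proposing low + r % (high - low + 1) and keeping the first SPI that has no existing state. The sketch below mirrors that loop in userspace; spi_in_use() is a hypothetical stand-in for the xfrm_state_lookup() collision check, and the modulo bias of the original is deliberately kept.

/* Illustrative sketch of the random-SPI fallback. Hypothetical helpers. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for xfrm_state_lookup(): is this SPI already taken? */
static bool spi_in_use(uint32_t spi)
{
	return spi == 0x1234;       /* pretend exactly one SPI is occupied */
}

/* Returns a free SPI in [low, high], or 0 on failure (0 is never a valid SPI). */
static uint32_t pick_spi(uint32_t low, uint32_t high)
{
	uint32_t range = high - low + 1;

	if (high < low || range == 0)   /* guard against an empty or wrapping range */
		return 0;

	for (uint32_t i = 0; i < range; i++) {
		uint32_t spi = low + (uint32_t)rand() % range;  /* like low + prandom_u32() % range */
		if (!spi_in_use(spi))
			return spi;
	}
	return 0;                   /* every probe collided; caller reports failure */
}

int main(void)
{
	srand(42);
	printf("picked spi 0x%x\n", pick_spi(0x1000, 0x1fff));
	return 0;
}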
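The new XFRMA_ADDRESS_FILTER attribute and __xfrm_state_filter_match() above let an SA dump be restricted to states whose source and destination fall inside caller-supplied prefixes (the splen/dplen bits of struct xfrm_address_filter). The sketch below shows a simplified IPv4-only version of that prefix test with hypothetical struct names; the kernel's addr_match() also covers IPv6 by walking 32-bit words.

/* Illustrative IPv4-only prefix match, roughly what __xfrm_state_filter_match()
 * relies on. Hypothetical struct and field names.
 */
#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* True if the first plen bits of a and b (both network byte order) agree. */
static bool prefix_match4(uint32_t a, uint32_t b, unsigned int plen)
{
	if (plen == 0)
		return true;
	if (plen > 32)
		plen = 32;
	uint32_t mask = htonl(plen == 32 ? 0xffffffffu : ~0u << (32 - plen));
	return ((a ^ b) & mask) == 0;
}

struct sa4 { uint32_t saddr, daddr; };
struct filter4 { uint32_t saddr, daddr; uint8_t splen, dplen; };

static bool sa_matches(const struct sa4 *x, const struct filter4 *f)
{
	return prefix_match4(x->saddr, f->saddr, f->splen) &&
	       prefix_match4(x->daddr, f->daddr, f->dplen);
}

int main(void)
{
	struct sa4 x = { .saddr = inet_addr("10.0.0.5"), .daddr = inet_addr("192.168.1.9") };
	struct filter4 f = { .saddr = inet_addr("10.0.0.0"), .daddr = inet_addr("192.168.1.0"),
			     .splen = 24, .dplen = 24 };

	printf("match: %d\n", sa_matches(&x, &f));   /* prints 1 */
	return 0;
}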
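xfrm_nlmsg_multicast() exists because net->xfrm.nlsk may already have been cleared while a namespace is shutting down, so every notifier above now takes one RCU-protected snapshot of the socket and drops the event if it is gone, rather than handing a stale pointer to nlmsg_multicast(). The toy below shows the same load-once-then-check shape with a C11 atomic pointer and hypothetical names; it does not model RCU's deferred freeing.

/* Illustrative: re-read a possibly-cleared pointer once, then use that copy.
 * Hypothetical names; C11 atomics stand in (loosely) for rcu_dereference().
 */
#include <stdatomic.h>
#include <stdio.h>

struct nl_sock { const char *name; };

static _Atomic(struct nl_sock *) nlsk;   /* analogue of net->xfrm.nlsk */

static int notify(const char *msg)
{
	struct nl_sock *sk = atomic_load(&nlsk);  /* single snapshot of the pointer */

	if (!sk)
		return -1;                        /* namespace going away: drop the event */

	printf("multicast via %s: %s\n", sk->name, msg);
	return 0;
}

int main(void)
{
	static struct nl_sock sock = { .name = "xfrm-nl" };

	atomic_store(&nlsk, &sock);
	notify("SA added");          /* delivered */

	atomic_store(&nlsk, NULL);   /* teardown clears the pointer */
	notify("SA deleted");        /* safely skipped, returns -1 */
	return 0;
}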
