Diffstat (limited to 'net/core')
38 files changed, 8914 insertions, 4619 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index 674641b13ae..71093d94ad2 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -9,10 +9,11 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ - sock_diag.o + sock_diag.o dev_ioctl.o tso.o obj-$(CONFIG_XFRM) += flow.o obj-y += net-sysfs.o +obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_NET_DMA) += user_dma.o @@ -20,4 +21,6 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o -obj-$(CONFIG_NETPRIO_CGROUP) += netprio_cgroup.o +obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o +obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o +obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o diff --git a/net/core/datagram.c b/net/core/datagram.c index 0337e2b7686..488dd1a825c 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -48,6 +48,7 @@ #include <linux/highmem.h> #include <linux/spinlock.h> #include <linux/slab.h> +#include <linux/pagemap.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -56,6 +57,7 @@ #include <net/sock.h> #include <net/tcp_states.h> #include <trace/events/skb.h> +#include <net/busy_poll.h> /* * Is a socket 'connection oriented' ? @@ -78,9 +80,10 @@ static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int syn return autoremove_wake_function(wait, mode, sync, key); } /* - * Wait for a packet.. + * Wait for the last received packet to be different from skb */ -static int wait_for_packet(struct sock *sk, int *err, long *timeo_p) +static int wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, + const struct sk_buff *skb) { int error; DEFINE_WAIT_FUNC(wait, receiver_wake_function); @@ -92,7 +95,7 @@ static int wait_for_packet(struct sock *sk, int *err, long *timeo_p) if (error) goto out_err; - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (sk->sk_receive_queue.prev != skb) goto out; /* Socket shut down? */ @@ -131,9 +134,9 @@ out_noerr: * __skb_recv_datagram - Receive a datagram skbuff * @sk: socket * @flags: MSG_ flags + * @peeked: returns non-zero if this packet has been seen before * @off: an offset in bytes to peek skb from. 
Returns an offset * within an skb where data actually starts - * @peeked: returns non-zero if this packet has been seen before * @err: error code returned * * Get a datagram skbuff, understands the peeking, nonblocking wakeups @@ -161,7 +164,7 @@ out_noerr: struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, int *peeked, int *off, int *err) { - struct sk_buff *skb; + struct sk_buff *skb, *last; long timeo; /* * Caller is allowed not to check sk->sk_err before skb_recv_datagram() @@ -182,13 +185,17 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, */ unsigned long cpu_flags; struct sk_buff_head *queue = &sk->sk_receive_queue; + int _off = *off; + last = (struct sk_buff *)queue; spin_lock_irqsave(&queue->lock, cpu_flags); skb_queue_walk(queue, skb) { + last = skb; *peeked = skb->peeked; if (flags & MSG_PEEK) { - if (*off >= skb->len) { - *off -= skb->len; + if (_off >= skb->len && (skb->len || _off || + skb->peeked)) { + _off -= skb->len; continue; } skb->peeked = 1; @@ -197,16 +204,21 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, __skb_unlink(skb, queue); spin_unlock_irqrestore(&queue->lock, cpu_flags); + *off = _off; return skb; } spin_unlock_irqrestore(&queue->lock, cpu_flags); + if (sk_can_busy_loop(sk) && + sk_busy_loop(sk, flags & MSG_DONTWAIT)) + continue; + /* User doesn't want to wait */ error = -EAGAIN; if (!timeo) goto no_packet; - } while (!wait_for_packet(sk, err, &timeo)); + } while (!wait_for_more_packets(sk, err, &timeo, last)); return NULL; @@ -562,6 +574,77 @@ fault: } EXPORT_SYMBOL(skb_copy_datagram_from_iovec); +/** + * zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec + * @skb: buffer to copy + * @from: io vector to copy from + * @offset: offset in the io vector to start copying from + * @count: amount of vectors to copy to buffer from + * + * The function will first copy up to headlen, and then pin the userspace + * pages and build frags through them. + * + * Returns 0, -EFAULT or -EMSGSIZE. 
+ * Note: the iovec is not modified during the copy + */ +int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, + int offset, size_t count) +{ + int len = iov_length(from, count) - offset; + int copy = min_t(int, skb_headlen(skb), len); + int size; + int i = 0; + + /* copy up to skb headlen */ + if (skb_copy_datagram_from_iovec(skb, 0, from, offset, copy)) + return -EFAULT; + + if (len == copy) + return 0; + + offset += copy; + while (count--) { + struct page *page[MAX_SKB_FRAGS]; + int num_pages; + unsigned long base; + unsigned long truesize; + + /* Skip over from offset and copied */ + if (offset >= from->iov_len) { + offset -= from->iov_len; + ++from; + continue; + } + len = from->iov_len - offset; + base = (unsigned long)from->iov_base + offset; + size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; + if (i + size > MAX_SKB_FRAGS) + return -EMSGSIZE; + num_pages = get_user_pages_fast(base, size, 0, &page[i]); + if (num_pages != size) { + release_pages(&page[i], num_pages, 0); + return -EFAULT; + } + truesize = size * PAGE_SIZE; + skb->data_len += len; + skb->len += len; + skb->truesize += truesize; + atomic_add(truesize, &skb->sk->sk_wmem_alloc); + while (len) { + int off = base & ~PAGE_MASK; + int size = min_t(int, len, PAGE_SIZE - off); + skb_fill_page_desc(skb, i, page[i], off, size); + base += size; + len -= size; + i++; + } + offset = 0; + ++from; + } + return 0; +} +EXPORT_SYMBOL(zerocopy_sg_from_iovec); + static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, u8 __user *to, int len, __wsum *csump) @@ -657,17 +740,37 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) netdev_rx_csum_fault(skb->dev); - skb->ip_summed = CHECKSUM_UNNECESSARY; } + skb->csum_valid = !sum; return sum; } EXPORT_SYMBOL(__skb_checksum_complete_head); __sum16 __skb_checksum_complete(struct sk_buff *skb) { - return __skb_checksum_complete_head(skb, skb->len); + __wsum csum; + __sum16 sum; + + csum = skb_checksum(skb, 0, skb->len, 0); + + /* skb->csum holds pseudo checksum */ + sum = csum_fold(csum_add(skb->csum, csum)); + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev); + } + + /* Save full packet checksum */ + skb->csum = csum; + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum_complete_sw = 1; + skb->csum_valid = !sum; + + return sum; } EXPORT_SYMBOL(__skb_checksum_complete); @@ -749,7 +852,9 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, /* exceptional events? */ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) - mask |= POLLERR; + mask |= POLLERR | + (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? 
POLLPRI : 0); + if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLRDHUP | POLLIN | POLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) diff --git a/net/core/dev.c b/net/core/dev.c index f64e439b4a0..367a586d0c8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -97,8 +97,6 @@ #include <net/net_namespace.h> #include <net/sock.h> #include <linux/rtnetlink.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> #include <linux/stat.h> #include <net/dst.h> #include <net/pkt_sched.h> @@ -106,12 +104,10 @@ #include <net/xfrm.h> #include <linux/highmem.h> #include <linux/init.h> -#include <linux/kmod.h> #include <linux/module.h> #include <linux/netpoll.h> #include <linux/rcupdate.h> #include <linux/delay.h> -#include <net/wext.h> #include <net/iw_handler.h> #include <asm/current.h> #include <linux/audit.h> @@ -132,9 +128,10 @@ #include <linux/pci.h> #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> -#include <linux/net_tstamp.h> #include <linux/static_key.h> -#include <net/flow_keys.h> +#include <linux/hashtable.h> +#include <linux/vmalloc.h> +#include <linux/if_macvlan.h> #include "net-sysfs.h" @@ -144,43 +141,17 @@ /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) -/* - * The list of packet types we will receive (as opposed to discard) - * and the routines to invoke. - * - * Why 16. Because with 16 the only overlap we get on a hash of the - * low nibble of the protocol value is RARP/SNAP/X.25. - * - * NOTE: That is no longer true with the addition of VLAN tags. Not - * sure which should go first, but I bet it won't make much - * difference if we are running VLANs. The good news is that - * this protocol won't be in the list unless compiled in, so - * the average user (w/out VLANs) will not be adversely affected. - * --BLG - * - * 0800 IP - * 8100 802.1Q VLAN - * 0001 802.3 - * 0002 AX.25 - * 0004 802.2 - * 8035 RARP - * 0005 SNAP - * 0805 X.25 - * 0806 ARP - * 8137 IPX - * 0009 Localtalk - * 86DD IPv6 - */ - -#define PTYPE_HASH_SIZE (16) -#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1) - static DEFINE_SPINLOCK(ptype_lock); static DEFINE_SPINLOCK(offload_lock); -static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; -static struct list_head ptype_all __read_mostly; /* Taps */ +struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; +struct list_head ptype_all __read_mostly; /* Taps */ static struct list_head offload_base __read_mostly; +static int netif_rx_internal(struct sk_buff *skb); +static int call_netdevice_notifiers_info(unsigned long val, + struct net_device *dev, + struct netdev_notifier_info *info); + /* * The @dev_base_head list is protected by @dev_base_lock and the rtnl * semaphore. 
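A note on the zerocopy_sg_from_iovec() helper added in the datagram.c hunk above: before pinning user pages with get_user_pages_fast(), it computes how many pages each iovec segment spans as ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT, i.e. ceil((offset-in-page + len) / PAGE_SIZE), and bails out with -EMSGSIZE if the running frag count would exceed MAX_SKB_FRAGS. A minimal user-space sketch of that page-span arithmetic (PAGE_SHIFT and the sample values are assumptions for illustration, not kernel code):

#include <stdio.h>

#define PAGE_SHIFT 12                  /* assumed 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

/* Pages spanned by [base, base + len): ~PAGE_MASK equals PAGE_SIZE - 1,
 * so this rounds (offset-in-page + len) up to whole pages. */
static unsigned long pages_spanned(unsigned long base, unsigned long len)
{
        return ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
}

int main(void)
{
        /* 100 bytes straddling a page boundary -> 2 pages */
        printf("%lu\n", pages_spanned(PAGE_SIZE - 50, 100));
        /* one aligned page -> 1 page */
        printf("%lu\n", pages_spanned(PAGE_SIZE, PAGE_SIZE));
        return 0;
}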
@@ -203,7 +174,13 @@ static struct list_head offload_base __read_mostly; DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); -seqcount_t devnet_rename_seq; +/* protects napi_hash addition/deletion and napi_gen_id */ +static DEFINE_SPINLOCK(napi_hash_lock); + +static unsigned int napi_gen_id; +static DEFINE_HASHTABLE(napi_hash, 8); + +static seqcount_t devnet_rename_seq; static inline void dev_base_seq_inc(struct net *net) { @@ -237,7 +214,7 @@ static inline void rps_unlock(struct softnet_data *sd) } /* Device list insertion */ -static int list_netdevice(struct net_device *dev) +static void list_netdevice(struct net_device *dev) { struct net *net = dev_net(dev); @@ -251,8 +228,6 @@ static int list_netdevice(struct net_device *dev) write_unlock_bh(&dev_base_lock); dev_base_seq_inc(net); - - return 0; } /* Device list removal @@ -510,7 +485,7 @@ EXPORT_SYMBOL(dev_add_offload); * and must not be freed until after all the CPU's have gone * through a quiescent state. */ -void __dev_remove_offload(struct packet_offload *po) +static void __dev_remove_offload(struct packet_offload *po) { struct list_head *head = &offload_base; struct packet_offload *po1; @@ -528,7 +503,6 @@ void __dev_remove_offload(struct packet_offload *po) out: spin_unlock(&offload_lock); } -EXPORT_SYMBOL(__dev_remove_offload); /** * dev_remove_offload - remove packet offload handler @@ -695,11 +669,10 @@ __setup("netdev=", netdev_boot_setup); struct net_device *__dev_get_by_name(struct net *net, const char *name) { - struct hlist_node *p; struct net_device *dev; struct hlist_head *head = dev_name_hash(net, name); - hlist_for_each_entry(dev, p, head, name_hlist) + hlist_for_each_entry(dev, head, name_hlist) if (!strncmp(dev->name, name, IFNAMSIZ)) return dev; @@ -721,11 +694,10 @@ EXPORT_SYMBOL(__dev_get_by_name); struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) { - struct hlist_node *p; struct net_device *dev; struct hlist_head *head = dev_name_hash(net, name); - hlist_for_each_entry_rcu(dev, p, head, name_hlist) + hlist_for_each_entry_rcu(dev, head, name_hlist) if (!strncmp(dev->name, name, IFNAMSIZ)) return dev; @@ -772,11 +744,10 @@ EXPORT_SYMBOL(dev_get_by_name); struct net_device *__dev_get_by_index(struct net *net, int ifindex) { - struct hlist_node *p; struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex); - hlist_for_each_entry(dev, p, head, index_hlist) + hlist_for_each_entry(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev; @@ -797,11 +768,10 @@ EXPORT_SYMBOL(__dev_get_by_index); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) { - struct hlist_node *p; struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex); - hlist_for_each_entry_rcu(dev, p, head, index_hlist) + hlist_for_each_entry_rcu(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev; @@ -835,6 +805,40 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex) EXPORT_SYMBOL(dev_get_by_index); /** + * netdev_get_name - get a netdevice name, knowing its ifindex. + * @net: network namespace + * @name: a pointer to the buffer where the name will be stored. + * @ifindex: the ifindex of the interface to get the name from. + * + * The use of raw_seqcount_begin() and cond_resched() before + * retrying is required as we want to give the writers a chance + * to complete when CONFIG_PREEMPT is not set. 
+ */ +int netdev_get_name(struct net *net, char *name, int ifindex) +{ + struct net_device *dev; + unsigned int seq; + +retry: + seq = raw_seqcount_begin(&devnet_rename_seq); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); + if (!dev) { + rcu_read_unlock(); + return -ENODEV; + } + + strcpy(name, dev->name); + rcu_read_unlock(); + if (read_seqcount_retry(&devnet_rename_seq, seq)) { + cond_resched(); + goto retry; + } + + return 0; +} + +/** * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace * @type: media type of device @@ -1118,6 +1122,8 @@ rollback: write_seqcount_end(&devnet_rename_seq); + netdev_adjacent_rename_links(dev, oldname); + write_lock_bh(&dev_base_lock); hlist_del_rcu(&dev->name_hlist); write_unlock_bh(&dev_base_lock); @@ -1137,6 +1143,7 @@ rollback: err = ret; write_seqcount_begin(&devnet_rename_seq); memcpy(dev->name, oldname, IFNAMSIZ); + memcpy(oldname, newname, IFNAMSIZ); goto rollback; } else { pr_err("%s: name change rollback failed: %d\n", @@ -1203,8 +1210,12 @@ EXPORT_SYMBOL(netdev_features_change); void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { - call_netdevice_notifiers(NETDEV_CHANGE, dev); - rtmsg_ifinfo(RTM_NEWLINK, dev, 0); + struct netdev_notifier_change_info change_info; + + change_info.flags_changed = 0; + call_netdevice_notifiers_info(NETDEV_CHANGE, dev, + &change_info.info); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); } } EXPORT_SYMBOL(netdev_state_change); @@ -1227,36 +1238,6 @@ void netdev_notify_peers(struct net_device *dev) } EXPORT_SYMBOL(netdev_notify_peers); -/** - * dev_load - load a network module - * @net: the applicable net namespace - * @name: name of interface - * - * If a network interface is not present and the process has suitable - * privileges this function loads the module. If module loading is not - * available in this kernel then it becomes a nop. - */ - -void dev_load(struct net *net, const char *name) -{ - struct net_device *dev; - int no_module; - - rcu_read_lock(); - dev = dev_get_by_name_rcu(net, name); - rcu_read_unlock(); - - no_module = !dev; - if (no_module && capable(CAP_NET_ADMIN)) - no_module = request_module("netdev-%s", name); - if (no_module && capable(CAP_SYS_MODULE)) { - if (!request_module("%s", name)) - pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n", - name); - } -} -EXPORT_SYMBOL(dev_load); - static int __dev_open(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; @@ -1267,6 +1248,12 @@ static int __dev_open(struct net_device *dev) if (!netif_device_present(dev)) return -ENODEV; + /* Block netpoll from trying to do any rx path servicing. 
+ * If we don't do this there is a chance ndo_poll_controller + * or ndo_poll may be running while we open the device + */ + netpoll_poll_disable(dev); + ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); ret = notifier_to_errno(ret); if (ret) @@ -1280,6 +1267,8 @@ static int __dev_open(struct net_device *dev) if (!ret && ops->ndo_open) ret = ops->ndo_open(dev); + netpoll_poll_enable(dev); + if (ret) clear_bit(__LINK_STATE_START, &dev->state); else { @@ -1316,7 +1305,7 @@ int dev_open(struct net_device *dev) if (ret < 0) return ret; - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); call_netdevice_notifiers(NETDEV_UP, dev); return ret; @@ -1330,7 +1319,10 @@ static int __dev_close_many(struct list_head *head) ASSERT_RTNL(); might_sleep(); - list_for_each_entry(dev, head, unreg_list) { + list_for_each_entry(dev, head, close_list) { + /* Temporarily disable netpoll until the interface is down */ + netpoll_poll_disable(dev); + call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); clear_bit(__LINK_STATE_START, &dev->state); @@ -1341,12 +1333,12 @@ static int __dev_close_many(struct list_head *head) * dev->stop() will invoke napi_disable() on all of it's * napi_struct instances on this device. */ - smp_mb__after_clear_bit(); /* Commit netif_running(). */ + smp_mb__after_atomic(); /* Commit netif_running(). */ } dev_deactivate_many(head); - list_for_each_entry(dev, head, unreg_list) { + list_for_each_entry(dev, head, close_list) { const struct net_device_ops *ops = dev->netdev_ops; /* @@ -1361,6 +1353,7 @@ static int __dev_close_many(struct list_head *head) dev->flags &= ~IFF_UP; net_dmaengine_put(); + netpoll_poll_enable(dev); } return 0; @@ -1371,30 +1364,30 @@ static int __dev_close(struct net_device *dev) int retval; LIST_HEAD(single); - list_add(&dev->unreg_list, &single); + list_add(&dev->close_list, &single); retval = __dev_close_many(&single); list_del(&single); + return retval; } static int dev_close_many(struct list_head *head) { struct net_device *dev, *tmp; - LIST_HEAD(tmp_list); - list_for_each_entry_safe(dev, tmp, head, unreg_list) + /* Remove the devices that don't need to be closed */ + list_for_each_entry_safe(dev, tmp, head, close_list) if (!(dev->flags & IFF_UP)) - list_move(&dev->unreg_list, &tmp_list); + list_del_init(&dev->close_list); __dev_close_many(head); - list_for_each_entry(dev, head, unreg_list) { - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + list_for_each_entry_safe(dev, tmp, head, close_list) { + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); call_netdevice_notifiers(NETDEV_DOWN, dev); + list_del_init(&dev->close_list); } - /* rollback_registered_many needs the complete original list */ - list_splice(&tmp_list, head); return 0; } @@ -1412,7 +1405,7 @@ int dev_close(struct net_device *dev) if (dev->flags & IFF_UP) { LIST_HEAD(single); - list_add(&dev->unreg_list, &single); + list_add(&dev->close_list, &single); dev_close_many(&single); list_del(&single); } @@ -1438,6 +1431,10 @@ void dev_disable_lro(struct net_device *dev) if (is_vlan_dev(dev)) dev = vlan_dev_real_dev(dev); + /* the same for macvlan devices */ + if (netif_is_macvlan(dev)) + dev = macvlan_dev_real_dev(dev); + dev->wanted_features &= ~NETIF_F_LRO; netdev_update_features(dev); @@ -1446,6 +1443,14 @@ void dev_disable_lro(struct net_device *dev) } EXPORT_SYMBOL(dev_disable_lro); +static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, + struct net_device *dev) +{ + struct 
netdev_notifier_info info; + + netdev_notifier_info_init(&info, dev); + return nb->notifier_call(nb, val, &info); +} static int dev_boot_phase = 1; @@ -1478,7 +1483,7 @@ int register_netdevice_notifier(struct notifier_block *nb) goto unlock; for_each_net(net) { for_each_netdev(net, dev) { - err = nb->notifier_call(nb, NETDEV_REGISTER, dev); + err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); err = notifier_to_errno(err); if (err) goto rollback; @@ -1486,7 +1491,7 @@ int register_netdevice_notifier(struct notifier_block *nb) if (!(dev->flags & IFF_UP)) continue; - nb->notifier_call(nb, NETDEV_UP, dev); + call_netdevice_notifier(nb, NETDEV_UP, dev); } } @@ -1502,10 +1507,11 @@ rollback: goto outroll; if (dev->flags & IFF_UP) { - nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); - nb->notifier_call(nb, NETDEV_DOWN, dev); + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); } - nb->notifier_call(nb, NETDEV_UNREGISTER, dev); + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } } @@ -1543,10 +1549,11 @@ int unregister_netdevice_notifier(struct notifier_block *nb) for_each_net(net) { for_each_netdev(net, dev) { if (dev->flags & IFF_UP) { - nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); - nb->notifier_call(nb, NETDEV_DOWN, dev); + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); } - nb->notifier_call(nb, NETDEV_UNREGISTER, dev); + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } } unlock: @@ -1556,6 +1563,25 @@ unlock: EXPORT_SYMBOL(unregister_netdevice_notifier); /** + * call_netdevice_notifiers_info - call all network notifier blocks + * @val: value passed unmodified to notifier function + * @dev: net_device pointer passed unmodified to notifier function + * @info: notifier information data + * + * Call all network notifier blocks. Parameters and return value + * are as for raw_notifier_call_chain(). 
+ */ + +static int call_netdevice_notifiers_info(unsigned long val, + struct net_device *dev, + struct netdev_notifier_info *info) +{ + ASSERT_RTNL(); + netdev_notifier_info_init(info, dev); + return raw_notifier_call_chain(&netdev_chain, val, info); +} + +/** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function * @dev: net_device pointer passed unmodified to notifier function @@ -1566,8 +1592,9 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - ASSERT_RTNL(); - return raw_notifier_call_chain(&netdev_chain, val, dev); + struct netdev_notifier_info info; + + return call_netdevice_notifiers_info(val, dev, &info); } EXPORT_SYMBOL(call_netdevice_notifiers); @@ -1591,7 +1618,6 @@ void net_enable_timestamp(void) return; } #endif - WARN_ON(in_interrupt()); static_key_slow_inc(&netstamp_needed); } EXPORT_SYMBOL(net_enable_timestamp); @@ -1621,59 +1647,7 @@ static inline void net_timestamp_set(struct sk_buff *skb) __net_timestamp(SKB); \ } \ -static int net_hwtstamp_validate(struct ifreq *ifr) -{ - struct hwtstamp_config cfg; - enum hwtstamp_tx_types tx_type; - enum hwtstamp_rx_filters rx_filter; - int tx_type_valid = 0; - int rx_filter_valid = 0; - - if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) - return -EFAULT; - - if (cfg.flags) /* reserved for future extensions */ - return -EINVAL; - - tx_type = cfg.tx_type; - rx_filter = cfg.rx_filter; - - switch (tx_type) { - case HWTSTAMP_TX_OFF: - case HWTSTAMP_TX_ON: - case HWTSTAMP_TX_ONESTEP_SYNC: - tx_type_valid = 1; - break; - } - - switch (rx_filter) { - case HWTSTAMP_FILTER_NONE: - case HWTSTAMP_FILTER_ALL: - case HWTSTAMP_FILTER_SOME: - case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: - case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: - case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: - case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: - case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: - case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: - case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: - case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: - case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: - case HWTSTAMP_FILTER_PTP_V2_EVENT: - case HWTSTAMP_FILTER_PTP_V2_SYNC: - case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: - rx_filter_valid = 1; - break; - } - - if (!tx_type_valid || !rx_filter_valid) - return -ERANGE; - - return 0; -} - -static inline bool is_skb_forwardable(struct net_device *dev, - struct sk_buff *skb) +bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb) { unsigned int len; @@ -1692,6 +1666,30 @@ static inline bool is_skb_forwardable(struct net_device *dev, return false; } +EXPORT_SYMBOL_GPL(is_skb_forwardable); + +int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { + if (skb_copy_ubufs(skb, GFP_ATOMIC)) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; + } + } + + if (unlikely(!is_skb_forwardable(dev, skb))) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; + } + + skb_scrub_packet(skb, true); + skb->protocol = eth_type_trans(skb, dev); + + return 0; +} +EXPORT_SYMBOL_GPL(__dev_forward_skb); /** * dev_forward_skb - loopback an skb to another netif @@ -1713,32 +1711,7 @@ static inline bool is_skb_forwardable(struct net_device *dev, */ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - if (skb_copy_ubufs(skb, GFP_ATOMIC)) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - 
return NET_RX_DROP; - } - } - - skb_orphan(skb); - nf_reset(skb); - - if (unlikely(!is_skb_forwardable(dev, skb))) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - return NET_RX_DROP; - } - skb->skb_iif = 0; - skb->dev = dev; - skb_dst_drop(skb); - skb->tstamp.tv64 = 0; - skb->pkt_type = PACKET_HOST; - skb->protocol = eth_type_trans(skb, dev); - skb->mark = 0; - secpath_reset(skb); - nf_reset(skb); - return netif_rx(skb); + return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); @@ -1802,7 +1775,7 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) skb_reset_mac_header(skb2); if (skb_network_header(skb2) < skb2->data || - skb2->network_header > skb2->tail) { + skb_network_header(skb2) > skb_tail_pointer(skb2)) { net_crit_ratelimited("protocol %04x is buggy, dev %s\n", ntohs(skb2->protocol), dev->name); @@ -1857,6 +1830,231 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq) } } +#ifdef CONFIG_XPS +static DEFINE_MUTEX(xps_map_mutex); +#define xmap_dereference(P) \ + rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) + +static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps, + int cpu, u16 index) +{ + struct xps_map *map = NULL; + int pos; + + if (dev_maps) + map = xmap_dereference(dev_maps->cpu_map[cpu]); + + for (pos = 0; map && pos < map->len; pos++) { + if (map->queues[pos] == index) { + if (map->len > 1) { + map->queues[pos] = map->queues[--map->len]; + } else { + RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL); + kfree_rcu(map, rcu); + map = NULL; + } + break; + } + } + + return map; +} + +static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) +{ + struct xps_dev_maps *dev_maps; + int cpu, i; + bool active = false; + + mutex_lock(&xps_map_mutex); + dev_maps = xmap_dereference(dev->xps_maps); + + if (!dev_maps) + goto out_no_maps; + + for_each_possible_cpu(cpu) { + for (i = index; i < dev->num_tx_queues; i++) { + if (!remove_xps_queue(dev_maps, cpu, i)) + break; + } + if (i == dev->num_tx_queues) + active = true; + } + + if (!active) { + RCU_INIT_POINTER(dev->xps_maps, NULL); + kfree_rcu(dev_maps, rcu); + } + + for (i = index; i < dev->num_tx_queues; i++) + netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), + NUMA_NO_NODE); + +out_no_maps: + mutex_unlock(&xps_map_mutex); +} + +static struct xps_map *expand_xps_map(struct xps_map *map, + int cpu, u16 index) +{ + struct xps_map *new_map; + int alloc_len = XPS_MIN_MAP_ALLOC; + int i, pos; + + for (pos = 0; map && pos < map->len; pos++) { + if (map->queues[pos] != index) + continue; + return map; + } + + /* Need to add queue to this CPU's existing map */ + if (map) { + if (pos < map->alloc_len) + return map; + + alloc_len = map->alloc_len * 2; + } + + /* Need to allocate new map to store queue on this CPU's map */ + new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, + cpu_to_node(cpu)); + if (!new_map) + return NULL; + + for (i = 0; i < pos; i++) + new_map->queues[i] = map->queues[i]; + new_map->alloc_len = alloc_len; + new_map->len = pos; + + return new_map; +} + +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, + u16 index) +{ + struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; + struct xps_map *map, *new_map; + int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES); + int cpu, numa_node_id = -2; + bool active = false; + + mutex_lock(&xps_map_mutex); + + dev_maps = xmap_dereference(dev->xps_maps); + + /* allocate memory for queue 
storage */ + for_each_online_cpu(cpu) { + if (!cpumask_test_cpu(cpu, mask)) + continue; + + if (!new_dev_maps) + new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); + if (!new_dev_maps) { + mutex_unlock(&xps_map_mutex); + return -ENOMEM; + } + + map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : + NULL; + + map = expand_xps_map(map, cpu, index); + if (!map) + goto error; + + RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); + } + + if (!new_dev_maps) + goto out_no_new_maps; + + for_each_possible_cpu(cpu) { + if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { + /* add queue to CPU maps */ + int pos = 0; + + map = xmap_dereference(new_dev_maps->cpu_map[cpu]); + while ((pos < map->len) && (map->queues[pos] != index)) + pos++; + + if (pos == map->len) + map->queues[map->len++] = index; +#ifdef CONFIG_NUMA + if (numa_node_id == -2) + numa_node_id = cpu_to_node(cpu); + else if (numa_node_id != cpu_to_node(cpu)) + numa_node_id = -1; +#endif + } else if (dev_maps) { + /* fill in the new device map from the old device map */ + map = xmap_dereference(dev_maps->cpu_map[cpu]); + RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); + } + + } + + rcu_assign_pointer(dev->xps_maps, new_dev_maps); + + /* Cleanup old maps */ + if (dev_maps) { + for_each_possible_cpu(cpu) { + new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); + map = xmap_dereference(dev_maps->cpu_map[cpu]); + if (map && map != new_map) + kfree_rcu(map, rcu); + } + + kfree_rcu(dev_maps, rcu); + } + + dev_maps = new_dev_maps; + active = true; + +out_no_new_maps: + /* update Tx queue numa node */ + netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), + (numa_node_id >= 0) ? numa_node_id : + NUMA_NO_NODE); + + if (!dev_maps) + goto out_no_maps; + + /* removes queue from unused CPUs */ + for_each_possible_cpu(cpu) { + if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) + continue; + + if (remove_xps_queue(dev_maps, cpu, index)) + active = true; + } + + /* free map if not active */ + if (!active) { + RCU_INIT_POINTER(dev->xps_maps, NULL); + kfree_rcu(dev_maps, rcu); + } + +out_no_maps: + mutex_unlock(&xps_map_mutex); + + return 0; +error: + /* remove any maps that we added */ + for_each_possible_cpu(cpu) { + new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); + map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : + NULL; + if (new_map && new_map != map) + kfree(new_map); + } + + mutex_unlock(&xps_map_mutex); + + kfree(new_dev_maps); + return -ENOMEM; +} +EXPORT_SYMBOL(netif_set_xps_queue); + +#endif /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. 
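The netif_set_xps_queue() rework above is the in-kernel side of the per-queue XPS configuration; user space picks which CPUs may transmit on a given queue by writing a hex CPU mask to /sys/class/net/<dev>/queues/tx-<n>/xps_cpus, whose store handler ends up calling this function. A hedged user-space sketch (the interface name "eth0" and queue 0 are assumptions for illustration; needs root and an XPS-capable device):

#include <stdio.h>

int main(void)
{
        /* Allow CPUs 0 and 1 (mask 0x3) to transmit on queue tx-0 of eth0 */
        const char *path = "/sys/class/net/eth0/queues/tx-0/xps_cpus";
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                return 1;
        }
        fprintf(f, "3\n");
        return fclose(f) ? 1 : 0;
}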
@@ -1880,8 +2078,12 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) if (dev->num_tc) netif_setup_tc(dev, txq); - if (txq < dev->real_num_tx_queues) + if (txq < dev->real_num_tx_queues) { qdisc_reset_all_tx_gt(dev, txq); +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(dev, txq); +#endif + } } dev->real_num_tx_queues = txq; @@ -1889,7 +2091,7 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) } EXPORT_SYMBOL(netif_set_real_num_tx_queues); -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS /** * netif_set_real_num_rx_queues - set actual number of RX queues used * @dev: Network device @@ -1955,30 +2157,42 @@ void __netif_schedule(struct Qdisc *q) } EXPORT_SYMBOL(__netif_schedule); -void dev_kfree_skb_irq(struct sk_buff *skb) +struct dev_kfree_skb_cb { + enum skb_free_reason reason; +}; + +static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) { - if (atomic_dec_and_test(&skb->users)) { - struct softnet_data *sd; - unsigned long flags; + return (struct dev_kfree_skb_cb *)skb->cb; +} + +void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) +{ + unsigned long flags; - local_irq_save(flags); - sd = &__get_cpu_var(softnet_data); - skb->next = sd->completion_queue; - sd->completion_queue = skb; - raise_softirq_irqoff(NET_TX_SOFTIRQ); - local_irq_restore(flags); + if (likely(atomic_read(&skb->users) == 1)) { + smp_rmb(); + atomic_set(&skb->users, 0); + } else if (likely(!atomic_dec_and_test(&skb->users))) { + return; } + get_kfree_skb_cb(skb)->reason = reason; + local_irq_save(flags); + skb->next = __this_cpu_read(softnet_data.completion_queue); + __this_cpu_write(softnet_data.completion_queue, skb); + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); } -EXPORT_SYMBOL(dev_kfree_skb_irq); +EXPORT_SYMBOL(__dev_kfree_skb_irq); -void dev_kfree_skb_any(struct sk_buff *skb) +void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) { if (in_irq() || irqs_disabled()) - dev_kfree_skb_irq(skb); + __dev_kfree_skb_irq(skb, reason); else dev_kfree_skb(skb); } -EXPORT_SYMBOL(dev_kfree_skb_any); +EXPORT_SYMBOL(__dev_kfree_skb_any); /** @@ -2018,6 +2232,9 @@ static void skb_warn_bad_offload(const struct sk_buff *skb) struct net_device *dev = skb->dev; const char *driver = ""; + if (!net_ratelimit()) + return; + if (dev && dev->dev.parent) driver = dev_driver_string(dev->dev.parent); @@ -2046,6 +2263,15 @@ int skb_checksum_help(struct sk_buff *skb) return -EINVAL; } + /* Before computing a checksum, we should make sure no frag could + * be modified by an external entity : checksum could be wrong. + */ + if (skb_has_shared_frag(skb)) { + ret = __skb_linearize(skb); + if (ret) + goto out; + } + offset = skb_checksum_start_offset(skb); BUG_ON(offset >= skb_headlen(skb)); csum = skb_checksum(skb, offset, skb->len - offset, 0); @@ -2068,52 +2294,77 @@ out: } EXPORT_SYMBOL(skb_checksum_help); -/** - * skb_gso_segment - Perform segmentation on skb. - * @skb: buffer to segment - * @features: features for the output path (see dev->features) - * - * This function segments the given skb and returns a list of segments. - * - * It may return NULL if the skb requires no segmentation. This is - * only possible when GSO is used for verifying header integrity. 
- */ -struct sk_buff *skb_gso_segment(struct sk_buff *skb, - netdev_features_t features) +__be16 skb_network_protocol(struct sk_buff *skb, int *depth) { - struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); - struct packet_offload *ptype; + unsigned int vlan_depth = skb->mac_len; __be16 type = skb->protocol; - int vlan_depth = ETH_HLEN; - int err; - while (type == htons(ETH_P_8021Q)) { - struct vlan_hdr *vh; + /* Tunnel gso handlers can set protocol to ethernet. */ + if (type == htons(ETH_P_TEB)) { + struct ethhdr *eth; - if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) - return ERR_PTR(-EINVAL); + if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) + return 0; - vh = (struct vlan_hdr *)(skb->data + vlan_depth); - type = vh->h_vlan_encapsulated_proto; - vlan_depth += VLAN_HLEN; + eth = (struct ethhdr *)skb_mac_header(skb); + type = eth->h_proto; } - skb_reset_mac_header(skb); - skb->mac_len = skb->network_header - skb->mac_header; - __skb_pull(skb, skb->mac_len); + /* if skb->protocol is 802.1Q/AD then the header should already be + * present at mac_len - VLAN_HLEN (if mac_len > 0), or at + * ETH_HLEN otherwise + */ + if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { + if (vlan_depth) { + if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN))) + return 0; + vlan_depth -= VLAN_HLEN; + } else { + vlan_depth = ETH_HLEN; + } + do { + struct vlan_hdr *vh; - if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { - skb_warn_bad_offload(skb); + if (unlikely(!pskb_may_pull(skb, + vlan_depth + VLAN_HLEN))) + return 0; - if (skb_header_cloned(skb) && - (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) - return ERR_PTR(err); + vh = (struct vlan_hdr *)(skb->data + vlan_depth); + type = vh->h_vlan_encapsulated_proto; + vlan_depth += VLAN_HLEN; + } while (type == htons(ETH_P_8021Q) || + type == htons(ETH_P_8021AD)); } + *depth = vlan_depth; + + return type; +} + +/** + * skb_mac_gso_segment - mac layer segmentation handler. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + */ +struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + int vlan_depth = skb->mac_len; + __be16 type = skb_network_protocol(skb, &vlan_depth); + + if (unlikely(!type)) + return ERR_PTR(-EINVAL); + + __skb_pull(skb, vlan_depth); + rcu_read_lock(); list_for_each_entry_rcu(ptype, &offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { + int err; + err = ptype->callbacks.gso_send_check(skb); segs = ERR_PTR(err); if (err || skb_gso_ok(skb, features)) @@ -2131,7 +2382,52 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, return segs; } -EXPORT_SYMBOL(skb_gso_segment); +EXPORT_SYMBOL(skb_mac_gso_segment); + + +/* openvswitch calls this on rx path, so we need a different check. + */ +static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) +{ + if (tx_path) + return skb->ip_summed != CHECKSUM_PARTIAL; + else + return skb->ip_summed == CHECKSUM_NONE; +} + +/** + * __skb_gso_segment - Perform segmentation on skb. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * @tx_path: whether it is called in TX path + * + * This function segments the given skb and returns a list of segments. + * + * It may return NULL if the skb requires no segmentation. This is + * only possible when GSO is used for verifying header integrity. 
+ */ +struct sk_buff *__skb_gso_segment(struct sk_buff *skb, + netdev_features_t features, bool tx_path) +{ + if (unlikely(skb_needs_check(skb, tx_path))) { + int err; + + skb_warn_bad_offload(skb); + + if (skb_header_cloned(skb) && + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + return ERR_PTR(err); + } + + SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); + SKB_GSO_CB(skb)->encap_level = 0; + + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + return skb_mac_gso_segment(skb, features); +} +EXPORT_SYMBOL(__skb_gso_segment); /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG @@ -2188,13 +2484,8 @@ static void dev_gso_skb_destructor(struct sk_buff *skb) { struct dev_gso_cb *cb; - do { - struct sk_buff *nskb = skb->next; - - skb->next = nskb->next; - nskb->next = NULL; - kfree_skb(nskb); - } while (skb->next); + kfree_skb_list(skb->next); + skb->next = NULL; cb = DEV_GSO_CB(skb); if (cb->destructor) @@ -2229,24 +2520,40 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) return 0; } -static bool can_checksum_protocol(netdev_features_t features, __be16 protocol) +/* If MPLS offload request, verify we are testing hardware MPLS features + * instead of standard features for the netdev. + */ +#ifdef CONFIG_NET_MPLS_GSO +static netdev_features_t net_mpls_features(struct sk_buff *skb, + netdev_features_t features, + __be16 type) { - return ((features & NETIF_F_GEN_CSUM) || - ((features & NETIF_F_V4_CSUM) && - protocol == htons(ETH_P_IP)) || - ((features & NETIF_F_V6_CSUM) && - protocol == htons(ETH_P_IPV6)) || - ((features & NETIF_F_FCOE_CRC) && - protocol == htons(ETH_P_FCOE))); + if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC)) + features &= skb->dev->mpls_features; + + return features; } +#else +static netdev_features_t net_mpls_features(struct sk_buff *skb, + netdev_features_t features, + __be16 type) +{ + return features; +} +#endif static netdev_features_t harmonize_features(struct sk_buff *skb, - __be16 protocol, netdev_features_t features) + netdev_features_t features) { + int tmp; + __be16 type; + + type = skb_network_protocol(skb, &tmp); + features = net_mpls_features(skb, features, type); + if (skb->ip_summed != CHECKSUM_NONE && - !can_checksum_protocol(features, protocol)) { + !can_checksum_protocol(features, type)) { features &= ~NETIF_F_ALL_CSUM; - features &= ~NETIF_F_SG; } else if (illegal_highdma(skb->dev, skb)) { features &= ~NETIF_F_SG; } @@ -2262,39 +2569,24 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) features &= ~NETIF_F_GSO_MASK; - if (protocol == htons(ETH_P_8021Q)) { + if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; } else if (!vlan_tx_tag_present(skb)) { - return harmonize_features(skb, protocol, features); + return harmonize_features(skb, features); } - features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX); + features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX); - if (protocol != htons(ETH_P_8021Q)) { - return harmonize_features(skb, protocol, features); - } else { + if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | - NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX; - return harmonize_features(skb, protocol, features); - } -} -EXPORT_SYMBOL(netif_skb_features); + 
NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; -/* - * Returns true if either: - * 1. skb has frag_list and the device doesn't support FRAGLIST, or - * 2. skb is fragmented and the device does not support SG. - */ -static inline int skb_needs_linearize(struct sk_buff *skb, - int features) -{ - return skb_is_nonlinear(skb) && - ((skb_has_frag_list(skb) && - !(features & NETIF_F_FRAGLIST)) || - (skb_shinfo(skb)->nr_frags && - !(features & NETIF_F_SG))); + return harmonize_features(skb, features); } +EXPORT_SYMBOL(netif_skb_features); int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) @@ -2316,8 +2608,9 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, features = netif_skb_features(skb); if (vlan_tx_tag_present(skb) && - !(features & NETIF_F_HW_VLAN_TX)) { - skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); + !vlan_hw_offload_capable(features, skb->vlan_proto)) { + skb = __vlan_put_tag(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); if (unlikely(!skb)) goto out; @@ -2362,6 +2655,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, dev_queue_xmit_nit(skb, dev); skb_len = skb->len; + trace_net_dev_start_xmit(skb, dev); rc = ops->ndo_start_xmit(skb, dev); trace_net_dev_xmit(skb, rc, dev, skb_len); if (rc == NETDEV_TX_OK) @@ -2376,17 +2670,11 @@ gso: skb->next = nskb->next; nskb->next = NULL; - /* - * If device doesn't need nskb->dst, release it right now while - * its hot in this cpu cache - */ - if (dev->priv_flags & IFF_XMIT_DST_RELEASE) - skb_dst_drop(nskb); - if (!list_empty(&ptype_all)) dev_queue_xmit_nit(nskb, dev); skb_len = nskb->len; + trace_net_dev_start_xmit(nskb, dev); rc = ops->ndo_start_xmit(nskb, dev); trace_net_dev_xmit(nskb, rc, dev, skb_len); if (unlikely(rc != NETDEV_TX_OK)) { @@ -2402,134 +2690,46 @@ gso: } while (skb->next); out_kfree_gso_skb: - if (likely(skb->next == NULL)) + if (likely(skb->next == NULL)) { skb->destructor = DEV_GSO_CB(skb)->destructor; + consume_skb(skb); + return rc; + } out_kfree_skb: kfree_skb(skb); out: return rc; } +EXPORT_SYMBOL_GPL(dev_hard_start_xmit); -static u32 hashrnd __read_mostly; - -/* - * Returns a Tx hash based on the given packet descriptor a Tx queues' number - * to be used as a distribution range. 
- */ -u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, - unsigned int num_tx_queues) +static void qdisc_pkt_len_init(struct sk_buff *skb) { - u32 hash; - u16 qoffset = 0; - u16 qcount = num_tx_queues; + const struct skb_shared_info *shinfo = skb_shinfo(skb); - if (skb_rx_queue_recorded(skb)) { - hash = skb_get_rx_queue(skb); - while (unlikely(hash >= num_tx_queues)) - hash -= num_tx_queues; - return hash; - } - - if (dev->num_tc) { - u8 tc = netdev_get_prio_tc_map(dev, skb->priority); - qoffset = dev->tc_to_txq[tc].offset; - qcount = dev->tc_to_txq[tc].count; - } - - if (skb->sk && skb->sk->sk_hash) - hash = skb->sk->sk_hash; - else - hash = (__force u16) skb->protocol; - hash = jhash_1word(hash, hashrnd); - - return (u16) (((u64) hash * qcount) >> 32) + qoffset; -} -EXPORT_SYMBOL(__skb_tx_hash); - -static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) -{ - if (unlikely(queue_index >= dev->real_num_tx_queues)) { - net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n", - dev->name, queue_index, - dev->real_num_tx_queues); - return 0; - } - return queue_index; -} - -static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) -{ -#ifdef CONFIG_XPS - struct xps_dev_maps *dev_maps; - struct xps_map *map; - int queue_index = -1; - - rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps); - if (dev_maps) { - map = rcu_dereference( - dev_maps->cpu_map[raw_smp_processor_id()]); - if (map) { - if (map->len == 1) - queue_index = map->queues[0]; - else { - u32 hash; - if (skb->sk && skb->sk->sk_hash) - hash = skb->sk->sk_hash; - else - hash = (__force u16) skb->protocol ^ - skb->rxhash; - hash = jhash_1word(hash, hashrnd); - queue_index = map->queues[ - ((u64)hash * map->len) >> 32]; - } - if (unlikely(queue_index >= dev->real_num_tx_queues)) - queue_index = -1; - } - } - rcu_read_unlock(); - - return queue_index; -#else - return -1; -#endif -} - -struct netdev_queue *netdev_pick_tx(struct net_device *dev, - struct sk_buff *skb) -{ - int queue_index; - const struct net_device_ops *ops = dev->netdev_ops; + qdisc_skb_cb(skb)->pkt_len = skb->len; - if (dev->real_num_tx_queues == 1) - queue_index = 0; - else if (ops->ndo_select_queue) { - queue_index = ops->ndo_select_queue(dev, skb); - queue_index = dev_cap_txqueue(dev, queue_index); - } else { - struct sock *sk = skb->sk; - queue_index = sk_tx_queue_get(sk); + /* To get more precise estimation of bytes sent on wire, + * we add to pkt_len the headers size of all segments + */ + if (shinfo->gso_size) { + unsigned int hdr_len; + u16 gso_segs = shinfo->gso_segs; - if (queue_index < 0 || skb->ooo_okay || - queue_index >= dev->real_num_tx_queues) { - int old_index = queue_index; + /* mac layer + network layer */ + hdr_len = skb_transport_header(skb) - skb_mac_header(skb); - queue_index = get_xps_queue(dev, skb); - if (queue_index < 0) - queue_index = skb_tx_hash(dev, skb); + /* + transport layer */ + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) + hdr_len += tcp_hdrlen(skb); + else + hdr_len += sizeof(struct udphdr); - if (queue_index != old_index && sk) { - struct dst_entry *dst = - rcu_dereference_check(sk->sk_dst_cache, 1); + if (shinfo->gso_type & SKB_GSO_DODGY) + gso_segs = DIV_ROUND_UP(skb->len - hdr_len, + shinfo->gso_size); - if (dst && skb_dst(skb) == dst) - sk_tx_queue_set(sk, queue_index); - } - } + qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; } - - skb_set_queue_mapping(skb, queue_index); - return 
netdev_get_tx_queue(dev, queue_index); } static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, @@ -2540,7 +2740,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, bool contended; int rc; - qdisc_skb_cb(skb)->pkt_len = skb->len; + qdisc_pkt_len_init(skb); qdisc_calculate_pkt_len(skb, q); /* * Heuristic to force contended enqueues to serialize on a @@ -2595,7 +2795,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, return rc; } -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) static void skb_update_prio(struct sk_buff *skb) { struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); @@ -2632,8 +2832,9 @@ int dev_loopback_xmit(struct sk_buff *skb) EXPORT_SYMBOL(dev_loopback_xmit); /** - * dev_queue_xmit - transmit a buffer + * __dev_queue_xmit - transmit a buffer * @skb: buffer to transmit + * @accel_priv: private data used for L2 forwarding offload * * Queue a buffer for transmission to a network device. The caller must * have set the device and priority and built the buffer before calling @@ -2656,13 +2857,15 @@ EXPORT_SYMBOL(dev_loopback_xmit); * the BH enable code must have IRQs enabled so that it will not deadlock. * --BLG */ -int dev_queue_xmit(struct sk_buff *skb) +static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) { struct net_device *dev = skb->dev; struct netdev_queue *txq; struct Qdisc *q; int rc = -ENOMEM; + skb_reset_mac_header(skb); + /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ @@ -2670,7 +2873,7 @@ int dev_queue_xmit(struct sk_buff *skb) skb_update_prio(skb); - txq = netdev_pick_tx(dev, skb); + txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); #ifdef CONFIG_NET_CLS_ACT @@ -2729,14 +2932,26 @@ recursion_alert: rc = -ENETDOWN; rcu_read_unlock_bh(); + atomic_long_inc(&dev->tx_dropped); kfree_skb(skb); return rc; out: rcu_read_unlock_bh(); return rc; } + +int dev_queue_xmit(struct sk_buff *skb) +{ + return __dev_queue_xmit(skb, NULL); +} EXPORT_SYMBOL(dev_queue_xmit); +int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) +{ + return __dev_queue_xmit(skb, accel_priv); +} +EXPORT_SYMBOL(dev_queue_xmit_accel); + /*======================================================================= Receiver routines @@ -2757,41 +2972,6 @@ static inline void ____napi_schedule(struct softnet_data *sd, __raise_softirq_irqoff(NET_RX_SOFTIRQ); } -/* - * __skb_get_rxhash: calculate a flow hash based on src/dst addresses - * and src/dst port numbers. Sets rxhash in skb to non-zero hash value - * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb - * if hash is a canonical 4-tuple hash over transport ports. - */ -void __skb_get_rxhash(struct sk_buff *skb) -{ - struct flow_keys keys; - u32 hash; - - if (!skb_flow_dissect(skb, &keys)) - return; - - if (keys.ports) - skb->l4_rxhash = 1; - - /* get a consistent hash (same value on both flow directions) */ - if (((__force u32)keys.dst < (__force u32)keys.src) || - (((__force u32)keys.dst == (__force u32)keys.src) && - ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) { - swap(keys.dst, keys.src); - swap(keys.port16[0], keys.port16[1]); - } - - hash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src, - (__force u32)keys.ports, hashrnd); - if (!hash) - hash = 1; - - skb->rxhash = hash; -} -EXPORT_SYMBOL(__skb_get_rxhash); - #ifdef CONFIG_RPS /* One global table that all flow-based protocols share. 
*/ @@ -2825,7 +3005,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; - flow_id = skb->rxhash & flow_table->mask; + flow_id = skb_get_hash(skb) & flow_table->mask; rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) @@ -2859,6 +3039,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_sock_flow_table *sock_flow_table; int cpu = -1; u16 tcpu; + u32 hash; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); @@ -2887,7 +3068,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } skb_reset_network_header(skb); - if (!skb_get_rxhash(skb)) + hash = skb_get_hash(skb); + if (!hash) goto done; flow_table = rcu_dereference(rxqueue->rps_flow_table); @@ -2896,11 +3078,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, u16 next_cpu; struct rps_dev_flow *rflow; - rflow = &flow_table->flows[skb->rxhash & flow_table->mask]; + rflow = &flow_table->flows[hash & flow_table->mask]; tcpu = rflow->cpu; - next_cpu = sock_flow_table->ents[skb->rxhash & - sock_flow_table->mask]; + next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask]; /* * If the desired CPU (where last recvmsg was done) is @@ -2929,7 +3110,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } if (map) { - tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; + tcpu = map->cpus[((u64) hash * map->len) >> 32]; if (cpu_online(tcpu)) { cpu = tcpu; @@ -3013,6 +3194,46 @@ static int rps_ipi_queued(struct softnet_data *sd) return 0; } +#ifdef CONFIG_NET_FLOW_LIMIT +int netdev_flow_limit_table_len __read_mostly = (1 << 12); +#endif + +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) +{ +#ifdef CONFIG_NET_FLOW_LIMIT + struct sd_flow_limit *fl; + struct softnet_data *sd; + unsigned int old_flow, new_flow; + + if (qlen < (netdev_max_backlog >> 1)) + return false; + + sd = &__get_cpu_var(softnet_data); + + rcu_read_lock(); + fl = rcu_dereference(sd->flow_limit); + if (fl) { + new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); + old_flow = fl->history[fl->history_head]; + fl->history[fl->history_head] = new_flow; + + fl->history_head++; + fl->history_head &= FLOW_LIMIT_HISTORY - 1; + + if (likely(fl->buckets[old_flow])) + fl->buckets[old_flow]--; + + if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { + fl->count++; + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); +#endif + return false; +} + /* * enqueue_to_backlog is called to queue an skb to a per CPU backlog * queue (may be a remote CPU queue). @@ -3022,13 +3243,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, { struct softnet_data *sd; unsigned long flags; + unsigned int qlen; sd = &per_cpu(softnet_data, cpu); local_irq_save(flags); rps_lock(sd); - if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) { + qlen = skb_queue_len(&sd->input_pkt_queue); + if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { if (skb_queue_len(&sd->input_pkt_queue)) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); @@ -3058,29 +3281,10 @@ enqueue: return NET_RX_DROP; } -/** - * netif_rx - post buffer to the network code - * @skb: buffer to post - * - * This function receives a packet from a device driver and queues it for - * the upper (protocol) levels to process. It always succeeds. The buffer - * may be dropped during processing for congestion control or by the - * protocol layers. 
- * - * return values: - * NET_RX_SUCCESS (no congestion) - * NET_RX_DROP (packet was dropped) - * - */ - -int netif_rx(struct sk_buff *skb) +static int netif_rx_internal(struct sk_buff *skb) { int ret; - /* if netpoll wants it, pretend we never saw it */ - if (netpoll_rx(skb)) - return NET_RX_DROP; - net_timestamp_check(netdev_tstamp_prequeue, skb); trace_netif_rx(skb); @@ -3109,14 +3313,38 @@ int netif_rx(struct sk_buff *skb) } return ret; } + +/** + * netif_rx - post buffer to the network code + * @skb: buffer to post + * + * This function receives a packet from a device driver and queues it for + * the upper (protocol) levels to process. It always succeeds. The buffer + * may be dropped during processing for congestion control or by the + * protocol layers. + * + * return values: + * NET_RX_SUCCESS (no congestion) + * NET_RX_DROP (packet was dropped) + * + */ + +int netif_rx(struct sk_buff *skb) +{ + trace_netif_rx_entry(skb); + + return netif_rx_internal(skb); +} EXPORT_SYMBOL(netif_rx); int netif_rx_ni(struct sk_buff *skb) { int err; + trace_netif_rx_ni_entry(skb); + preempt_disable(); - err = netif_rx(skb); + err = netif_rx_internal(skb); if (local_softirq_pending()) do_softirq(); preempt_enable(); @@ -3142,7 +3370,10 @@ static void net_tx_action(struct softirq_action *h) clist = clist->next; WARN_ON(atomic_read(&skb->users)); - trace_kfree_skb(skb, net_tx_action); + if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) + trace_consume_skb(skb); + else + trace_kfree_skb(skb, net_tx_action); __kfree_skb(skb); } } @@ -3164,7 +3395,7 @@ static void net_tx_action(struct softirq_action *h) root_lock = qdisc_lock(q); if (spin_trylock(root_lock)) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); @@ -3174,7 +3405,7 @@ static void net_tx_action(struct softirq_action *h) &q->state)) { __netif_reschedule(q); } else { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); } @@ -3260,7 +3491,7 @@ out: * @rx_handler: receive handler to register * @rx_handler_data: data pointer that is used by rx handler * - * Register a receive hander for a device. This handler will then be + * Register a receive handler for a device. This handler will then be * called from __netif_receive_skb. A negative errno code is returned * on a failure. * @@ -3277,6 +3508,7 @@ int netdev_rx_handler_register(struct net_device *dev, if (dev->rx_handler) return -EBUSY; + /* Note: rx_handler_data must be set before rx_handler */ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); rcu_assign_pointer(dev->rx_handler, rx_handler); @@ -3288,7 +3520,7 @@ EXPORT_SYMBOL_GPL(netdev_rx_handler_register); * netdev_rx_handler_unregister - unregister receive handler * @dev: device to unregister a handler from * - * Unregister a receive hander from a device. + * Unregister a receive handler from a device. * * The caller must hold the rtnl_mutex. */ @@ -3297,6 +3529,11 @@ void netdev_rx_handler_unregister(struct net_device *dev) ASSERT_RTNL(); RCU_INIT_POINTER(dev->rx_handler, NULL); + /* a reader seeing a non NULL rx_handler in a rcu_read_lock() + * section has a guarantee to see a non NULL rx_handler_data + * as well. 
+ */ + synchronize_net(); RCU_INIT_POINTER(dev->rx_handler_data, NULL); } EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); @@ -3308,17 +3545,18 @@ EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); static bool skb_pfmemalloc_protocol(struct sk_buff *skb) { switch (skb->protocol) { - case __constant_htons(ETH_P_ARP): - case __constant_htons(ETH_P_IP): - case __constant_htons(ETH_P_IPV6): - case __constant_htons(ETH_P_8021Q): + case htons(ETH_P_ARP): + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): return true; default: return false; } } -static int __netif_receive_skb(struct sk_buff *skb) +static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) { struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; @@ -3327,32 +3565,16 @@ static int __netif_receive_skb(struct sk_buff *skb) bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; - unsigned long pflags = current->flags; net_timestamp_check(!netdev_tstamp_prequeue, skb); trace_netif_receive_skb(skb); - /* - * PFMEMALLOC skbs are special, they should - * - be delivered to SOCK_MEMALLOC sockets only - * - stay away from userspace - * - have bounded memory usage - * - * Use PF_MEMALLOC as this saves us from propagating the allocation - * context down to all allocation sites. - */ - if (sk_memalloc_socks() && skb_pfmemalloc(skb)) - current->flags |= PF_MEMALLOC; - - /* if we've gotten here through NAPI, check netpoll */ - if (netpoll_receive_skb(skb)) - goto out; - orig_dev = skb->dev; skb_reset_network_header(skb); - skb_reset_transport_header(skb); + if (!skb_transport_header_was_set(skb)) + skb_reset_transport_header(skb); skb_reset_mac_len(skb); pt_prev = NULL; @@ -3364,7 +3586,8 @@ another_round: __this_cpu_inc(softnet_data.processed); - if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { + if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || + skb->protocol == cpu_to_be16(ETH_P_8021AD)) { skb = vlan_untag(skb); if (unlikely(!skb)) goto unlock; @@ -3377,7 +3600,7 @@ another_round: } #endif - if (sk_memalloc_socks() && skb_pfmemalloc(skb)) + if (pfmemalloc) goto skip_taps; list_for_each_entry_rcu(ptype, &ptype_all, list) { @@ -3396,8 +3619,7 @@ skip_taps: ncls: #endif - if (sk_memalloc_socks() && skb_pfmemalloc(skb) - && !skb_pfmemalloc_protocol(skb)) + if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; if (vlan_tx_tag_present(skb)) { @@ -3419,6 +3641,7 @@ ncls: } switch (rx_handler(&skb)) { case RX_HANDLER_CONSUMED: + ret = NET_RX_SUCCESS; goto unlock; case RX_HANDLER_ANOTHER: goto another_round; @@ -3431,8 +3654,15 @@ ncls: } } - if (vlan_tx_nonzero_tag_present(skb)) - skb->pkt_type = PACKET_OTHERHOST; + if (unlikely(vlan_tx_tag_present(skb))) { + if (vlan_tx_tag_get_id(skb)) + skb->pkt_type = PACKET_OTHERHOST; + /* Note: we might in the future use prio bits + * and set skb->priority like in vlan_do_receive() + * For the time being, just ignore Priority Code Point + */ + skb->vlan_tci = 0; + } /* deliver only exact match when indicated */ null_or_dev = deliver_exact ? skb->dev : NULL; @@ -3466,27 +3696,35 @@ drop: unlock: rcu_read_unlock(); -out: - tsk_restore_flags(current, pflags, PF_MEMALLOC); return ret; } -/** - * netif_receive_skb - process receive buffer from network - * @skb: buffer to process - * - * netif_receive_skb() is the main receive data processing function. - * It always succeeds. The buffer may be dropped during processing - * for congestion control or by the protocol layers. 
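
The pfmemalloc handling above moves from inline flag twiddling to a wrapper: __netif_receive_skb() sets PF_MEMALLOC around the call into __netif_receive_skb_core() only when the skb actually came from a memalloc reserve, and afterwards restores just that one bit (the tsk_restore_flags() idiom). A minimal model of that scoping, with illustrative names:

#define PF_MEMALLOC_MODEL 0x1UL

struct task_model { unsigned long flags; };

/* stand-in for __netif_receive_skb_core() */
static int core(int pfmemalloc) { return pfmemalloc; }

static int receive(struct task_model *tsk, int skb_pfmemalloc)
{
        int ret;

        if (skb_pfmemalloc) {
                unsigned long pflags = tsk->flags;

                tsk->flags |= PF_MEMALLOC_MODEL;
                ret = core(1);
                /* tsk_restore_flags(): put back only the one bit,
                 * leaving any other flag changes made by core() alone */
                tsk->flags = (tsk->flags & ~PF_MEMALLOC_MODEL) |
                             (pflags & PF_MEMALLOC_MODEL);
        } else {
                ret = core(0);
        }
        return ret;
}
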
- * - * This function may only be called from softirq context and interrupts - * should be enabled. - * - * Return values (usually ignored): - * NET_RX_SUCCESS: no congestion - * NET_RX_DROP: packet was dropped - */ -int netif_receive_skb(struct sk_buff *skb) +static int __netif_receive_skb(struct sk_buff *skb) +{ + int ret; + + if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { + unsigned long pflags = current->flags; + + /* + * PFMEMALLOC skbs are special, they should + * - be delivered to SOCK_MEMALLOC sockets only + * - stay away from userspace + * - have bounded memory usage + * + * Use PF_MEMALLOC as this saves us from propagating the allocation + * context down to all allocation sites. + */ + current->flags |= PF_MEMALLOC; + ret = __netif_receive_skb_core(skb, true); + tsk_restore_flags(current, pflags, PF_MEMALLOC); + } else + ret = __netif_receive_skb_core(skb, false); + + return ret; +} + +static int netif_receive_skb_internal(struct sk_buff *skb) { net_timestamp_check(netdev_tstamp_prequeue, skb); @@ -3512,6 +3750,28 @@ int netif_receive_skb(struct sk_buff *skb) #endif return __netif_receive_skb(skb); } + +/** + * netif_receive_skb - process receive buffer from network + * @skb: buffer to process + * + * netif_receive_skb() is the main receive data processing function. + * It always succeeds. The buffer may be dropped during processing + * for congestion control or by the protocol layers. + * + * This function may only be called from softirq context and interrupts + * should be enabled. + * + * Return values (usually ignored): + * NET_RX_SUCCESS: no congestion + * NET_RX_DROP: packet was dropped + */ +int netif_receive_skb(struct sk_buff *skb) +{ + trace_netif_receive_skb_entry(skb); + + return netif_receive_skb_internal(skb); +} EXPORT_SYMBOL(netif_receive_skb); /* Network device is going away, flush any packets still pending @@ -3561,7 +3821,7 @@ static int napi_gro_complete(struct sk_buff *skb) if (ptype->type != type || !ptype->callbacks.gro_complete) continue; - err = ptype->callbacks.gro_complete(skb); + err = ptype->callbacks.gro_complete(skb, 0); break; } rcu_read_unlock(); @@ -3573,7 +3833,7 @@ static int napi_gro_complete(struct sk_buff *skb) } out: - return netif_receive_skb(skb); + return netif_receive_skb_internal(skb); } /* napi->gro_list contains packets ordered by age. 
@@ -3609,21 +3869,66 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff *p; unsigned int maclen = skb->dev->hard_header_len; + u32 hash = skb_get_hash_raw(skb); for (p = napi->gro_list; p; p = p->next) { unsigned long diffs; + NAPI_GRO_CB(p)->flush = 0; + + if (hash != skb_get_hash_raw(p)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), - skb_gro_mac_header(skb)); + skb_mac_header(skb)); else if (!diffs) diffs = memcmp(skb_mac_header(p), - skb_gro_mac_header(skb), + skb_mac_header(skb), maclen); NAPI_GRO_CB(p)->same_flow = !diffs; - NAPI_GRO_CB(p)->flush = 0; + } +} + +static void skb_gro_reset_offset(struct sk_buff *skb) +{ + const struct skb_shared_info *pinfo = skb_shinfo(skb); + const skb_frag_t *frag0 = &pinfo->frags[0]; + + NAPI_GRO_CB(skb)->data_offset = 0; + NAPI_GRO_CB(skb)->frag0 = NULL; + NAPI_GRO_CB(skb)->frag0_len = 0; + + if (skb_mac_header(skb) == skb_tail_pointer(skb) && + pinfo->nr_frags && + !PageHighMem(skb_frag_page(frag0))) { + NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); + NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); + } +} + +static void gro_pull_from_frag0(struct sk_buff *skb, int grow) +{ + struct skb_shared_info *pinfo = skb_shinfo(skb); + + BUG_ON(skb->end - skb->tail < grow); + + memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); + + skb->data_len -= grow; + skb->tail += grow; + + pinfo->frags[0].page_offset += grow; + skb_frag_size_sub(&pinfo->frags[0], grow); + + if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { + skb_frag_unref(skb, 0); + memmove(pinfo->frags, pinfo->frags + 1, + --pinfo->nr_frags * sizeof(pinfo->frags[0])); } } @@ -3634,16 +3939,17 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff __be16 type = skb->protocol; struct list_head *head = &offload_base; int same_flow; - int mac_len; enum gro_result ret; + int grow; - if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) + if (!(skb->dev->features & NETIF_F_GRO)) goto normal; if (skb_is_gso(skb) || skb_has_frag_list(skb)) goto normal; gro_list_prepare(napi, skb); + NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */ rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { @@ -3651,11 +3957,11 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff continue; skb_set_network_header(skb, skb_gro_offset(skb)); - mac_len = skb->network_header - skb->mac_header; - skb->mac_len = mac_len; + skb_reset_mac_len(skb); NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; + NAPI_GRO_CB(skb)->udp_mark = 0; pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); break; @@ -3680,39 +3986,35 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (same_flow) goto ok; - if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS) + if (NAPI_GRO_CB(skb)->flush) goto normal; - napi->gro_count++; + if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { + struct sk_buff *nskb = napi->gro_list; + + /* locate the end of the list to select the 'oldest' flow */ + while (nskb->next) { + pp = &nskb->next; + nskb = *pp; + } + *pp = NULL; + nskb->next = NULL; + napi_gro_complete(nskb); + } else { + napi->gro_count++; + } NAPI_GRO_CB(skb)->count = 1; NAPI_GRO_CB(skb)->age = jiffies; + NAPI_GRO_CB(skb)->last = skb; 
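
gro_pull_from_frag0(), factored out above, copies the first `grow` bytes of the first page fragment into the skb's linear area and discards the fragment once drained; dev_gro_receive() uses it below to linearize headers. A userspace model with plain buffers standing in for the skb and frag structures (page refcounting is omitted):

#include <string.h>
#include <stdint.h>

struct frag_model { uint8_t *data; size_t size; };

struct skb_model {
        uint8_t linear[256];
        size_t tail;                    /* bytes used in linear area */
        struct frag_model frags[4];
        int nr_frags;
        size_t data_len;                /* bytes held in fragments */
};

static void pull_from_frag0(struct skb_model *skb, size_t grow)
{
        struct frag_model *f0 = &skb->frags[0];

        memcpy(skb->linear + skb->tail, f0->data, grow);
        skb->tail += grow;
        skb->data_len -= grow;

        f0->data += grow;
        f0->size -= grow;

        if (!f0->size) {
                /* fragment drained: shift the remaining frags down */
                memmove(skb->frags, skb->frags + 1,
                        --skb->nr_frags * sizeof(skb->frags[0]));
        }
}
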
skb_shinfo(skb)->gso_size = skb_gro_len(skb); skb->next = napi->gro_list; napi->gro_list = skb; ret = GRO_HELD; pull: - if (skb_headlen(skb) < skb_gro_offset(skb)) { - int grow = skb_gro_offset(skb) - skb_headlen(skb); - - BUG_ON(skb->end - skb->tail < grow); - - memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); - - skb->tail += grow; - skb->data_len -= grow; - - skb_shinfo(skb)->frags[0].page_offset += grow; - skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow); - - if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) { - skb_frag_unref(skb, 0); - memmove(skb_shinfo(skb)->frags, - skb_shinfo(skb)->frags + 1, - --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); - } - } - + grow = skb_gro_offset(skb) - skb_headlen(skb); + if (grow > 0) + gro_pull_from_frag0(skb, grow); ok: return ret; @@ -3721,12 +4023,39 @@ normal: goto pull; } +struct packet_offload *gro_find_receive_by_type(__be16 type) +{ + struct list_head *offload_head = &offload_base; + struct packet_offload *ptype; + + list_for_each_entry_rcu(ptype, offload_head, list) { + if (ptype->type != type || !ptype->callbacks.gro_receive) + continue; + return ptype; + } + return NULL; +} +EXPORT_SYMBOL(gro_find_receive_by_type); + +struct packet_offload *gro_find_complete_by_type(__be16 type) +{ + struct list_head *offload_head = &offload_base; + struct packet_offload *ptype; + + list_for_each_entry_rcu(ptype, offload_head, list) { + if (ptype->type != type || !ptype->callbacks.gro_complete) + continue; + return ptype; + } + return NULL; +} +EXPORT_SYMBOL(gro_find_complete_by_type); static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) { switch (ret) { case GRO_NORMAL: - if (netif_receive_skb(skb)) + if (netif_receive_skb_internal(skb)) ret = GRO_DROP; break; @@ -3749,25 +4078,10 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) return ret; } -static void skb_gro_reset_offset(struct sk_buff *skb) -{ - const struct skb_shared_info *pinfo = skb_shinfo(skb); - const skb_frag_t *frag0 = &pinfo->frags[0]; - - NAPI_GRO_CB(skb)->data_offset = 0; - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; - - if (skb->mac_header == skb->tail && - pinfo->nr_frags && - !PageHighMem(skb_frag_page(frag0))) { - NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); - NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); - } -} - gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { + trace_napi_gro_receive_entry(skb); + skb_gro_reset_offset(skb); return napi_skb_finish(dev_gro_receive(napi, skb), skb); @@ -3782,6 +4096,9 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->vlan_tci = 0; skb->dev = napi->dev; skb->skb_iif = 0; + skb->encapsulation = 0; + skb_shinfo(skb)->gso_type = 0; + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); napi->skb = skb; } @@ -3792,24 +4109,22 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) if (!skb) { skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); - if (skb) - napi->skb = skb; + napi->skb = skb; } return skb; } EXPORT_SYMBOL(napi_get_frags); -static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, - gro_result_t ret) +static gro_result_t napi_frags_finish(struct napi_struct *napi, + struct sk_buff *skb, + gro_result_t ret) { switch (ret) { case GRO_NORMAL: case GRO_HELD: + __skb_push(skb, ETH_HLEN); skb->protocol = eth_type_trans(skb, skb->dev); - - if (ret == GRO_HELD) - skb_gro_pull(skb, -ETH_HLEN); - else if (netif_receive_skb(skb)) + if (ret == 
GRO_NORMAL && netif_receive_skb_internal(skb)) ret = GRO_DROP; break; @@ -3825,39 +4140,42 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff * return ret; } +/* Upper GRO stack assumes network header starts at gro_offset=0 + * Drivers could call both napi_gro_frags() and napi_gro_receive() + * We copy ethernet header into skb->data to have a common layout. + */ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) { struct sk_buff *skb = napi->skb; - struct ethhdr *eth; - unsigned int hlen; - unsigned int off; + const struct ethhdr *eth; + unsigned int hlen = sizeof(*eth); napi->skb = NULL; skb_reset_mac_header(skb); skb_gro_reset_offset(skb); - off = skb_gro_offset(skb); - hlen = off + sizeof(*eth); - eth = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, hlen)) { - eth = skb_gro_header_slow(skb, hlen, off); + eth = skb_gro_header_fast(skb, 0); + if (unlikely(skb_gro_header_hard(skb, hlen))) { + eth = skb_gro_header_slow(skb, hlen, 0); if (unlikely(!eth)) { napi_reuse_skb(napi, skb); - skb = NULL; - goto out; + return NULL; } + } else { + gro_pull_from_frag0(skb, hlen); + NAPI_GRO_CB(skb)->frag0 += hlen; + NAPI_GRO_CB(skb)->frag0_len -= hlen; } - - skb_gro_pull(skb, sizeof(*eth)); + __skb_pull(skb, hlen); /* * This works because the only protocols we care about don't require - * special handling. We'll fix it up properly at the end. + * special handling. + * We'll fix it up properly in napi_frags_finish() */ skb->protocol = eth->h_proto; -out: return skb; } @@ -3868,12 +4186,14 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) if (!skb) return GRO_DROP; + trace_napi_gro_frags_entry(skb); + return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); } EXPORT_SYMBOL(napi_gro_frags); /* - * net_rps_action sends any pending IPI's for rps. + * net_rps_action_and_irq_enable sends any pending IPI's for rps. * Note: called with local irq disabled, but exits with local irq enabled. */ static void net_rps_action_and_irq_enable(struct softnet_data *sd) @@ -3891,8 +4211,8 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) struct softnet_data *next = remsd->rps_ipi_next; if (cpu_online(remsd->cpu)) - __smp_call_function_single(remsd->cpu, - &remsd->csd, 0); + smp_call_function_single_async(remsd->cpu, + &remsd->csd); remsd = next; } } else @@ -3916,9 +4236,8 @@ static int process_backlog(struct napi_struct *napi, int quota) #endif napi->weight = weight_p; local_irq_disable(); - while (work < quota) { + while (1) { struct sk_buff *skb; - unsigned int qlen; while ((skb = __skb_dequeue(&sd->process_queue))) { local_irq_enable(); @@ -3932,24 +4251,24 @@ static int process_backlog(struct napi_struct *napi, int quota) } rps_lock(sd); - qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen) - skb_queue_splice_tail_init(&sd->input_pkt_queue, - &sd->process_queue); - - if (qlen < quota - work) { + if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). * only current cpu owns and manipulates this napi, - * and NAPI_STATE_SCHED is the only possible flag set on backlog. - * we can use a plain write instead of clear_bit(), + * and NAPI_STATE_SCHED is the only possible flag set + * on backlog. + * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. 
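
The reworked process_backlog() above drains a private process_queue with interrupts enabled and, under rps_lock, either completes NAPI or splices the whole input_pkt_queue across in one shot, so the producer and consumer rarely contend on the same queue. The splice is O(1) because the queue keeps a tail pointer; a sketch with a singly linked list, locking reduced to comments:

struct pkt { struct pkt *next; };
struct queue { struct pkt *head, **tail; };

static void queue_init(struct queue *q)
{
        q->head = NULL;
        q->tail = &q->head;
}

/* skb_queue_splice_tail_init() analogue: move everything from `from`
 * to the tail of `to` in constant time, leaving `from` empty.
 * In the kernel this runs under rps_lock with IRQs disabled.
 */
static void splice_tail_init(struct queue *from, struct queue *to)
{
        if (!from->head)
                return;
        *to->tail = from->head;
        to->tail = from->tail;
        queue_init(from);
}
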
*/ list_del(&napi->poll_list); napi->state = 0; + rps_unlock(sd); - quota = work + qlen; + break; } + + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); rps_unlock(sd); } local_irq_enable(); @@ -3979,7 +4298,7 @@ void __napi_complete(struct napi_struct *n) BUG_ON(n->gro_list); list_del(&n->poll_list); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); } EXPORT_SYMBOL(__napi_complete); @@ -4002,6 +4321,58 @@ void napi_complete(struct napi_struct *n) } EXPORT_SYMBOL(napi_complete); +/* must be called under rcu_read_lock(), as we dont take a reference */ +struct napi_struct *napi_by_id(unsigned int napi_id) +{ + unsigned int hash = napi_id % HASH_SIZE(napi_hash); + struct napi_struct *napi; + + hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) + if (napi->napi_id == napi_id) + return napi; + + return NULL; +} +EXPORT_SYMBOL_GPL(napi_by_id); + +void napi_hash_add(struct napi_struct *napi) +{ + if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { + + spin_lock(&napi_hash_lock); + + /* 0 is not a valid id, we also skip an id that is taken + * we expect both events to be extremely rare + */ + napi->napi_id = 0; + while (!napi->napi_id) { + napi->napi_id = ++napi_gen_id; + if (napi_by_id(napi->napi_id)) + napi->napi_id = 0; + } + + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + + spin_unlock(&napi_hash_lock); + } +} +EXPORT_SYMBOL_GPL(napi_hash_add); + +/* Warning : caller is responsible to make sure rcu grace period + * is respected before freeing memory containing @napi + */ +void napi_hash_del(struct napi_struct *napi) +{ + spin_lock(&napi_hash_lock); + + if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) + hlist_del_rcu(&napi->napi_hash_node); + + spin_unlock(&napi_hash_lock); +} +EXPORT_SYMBOL_GPL(napi_hash_del); + void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { @@ -4010,6 +4381,9 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, napi->gro_list = NULL; napi->skb = NULL; napi->poll = poll; + if (weight > NAPI_POLL_WEIGHT) + pr_err_once("netif_napi_add() called with weight %d on device %s\n", + weight, dev->name); napi->weight = weight; list_add(&napi->dev_list, &dev->napi_list); napi->dev = dev; @@ -4023,17 +4397,10 @@ EXPORT_SYMBOL(netif_napi_add); void netif_napi_del(struct napi_struct *napi) { - struct sk_buff *skb, *next; - list_del_init(&napi->dev_list); napi_free_frags(napi); - for (skb = napi->gro_list; skb; skb = next) { - next = skb->next; - skb->next = NULL; - kfree_skb(skb); - } - + kfree_skb_list(napi->gro_list); napi->gro_list = NULL; napi->gro_count = 0; } @@ -4056,7 +4423,7 @@ static void net_rx_action(struct softirq_action *h) * Allow this to run for 2 jiffies since which will allow * an average latency of 1.5/HZ. 
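
napi_hash_add() above assigns each NAPI context a non-zero id from a monotonically increasing generator, retrying on the reserved value 0 and on collisions, both of which the comment expects to be extremely rare. A compact model; the fixed-size table merely stands in for the real hash lookup:

#include <stdbool.h>

static unsigned int gen_id;
static bool in_use[1024];               /* stands in for napi_hash */

static unsigned int alloc_napi_id(void)
{
        unsigned int id = 0;

        while (!id) {
                id = ++gen_id;                  /* may wrap to 0: reserved */
                if (id && in_use[id % 1024])
                        id = 0;                 /* taken: try the next one */
        }
        in_use[id % 1024] = true;
        return id;
}
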
*/ - if (unlikely(budget <= 0 || time_after(jiffies, time_limit))) + if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) goto softnet_break; local_irq_enable(); @@ -4134,540 +4501,717 @@ softnet_break: goto out; } -static gifconf_func_t *gifconf_list[NPROTO]; +struct netdev_adjacent { + struct net_device *dev; + + /* upper master flag, there can only be one master device per list */ + bool master; + + /* counter for the number of times this device was added to us */ + u16 ref_nr; + + /* private field for the users */ + void *private; + + struct list_head list; + struct rcu_head rcu; +}; + +static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *adj_list) +{ + struct netdev_adjacent *adj; + + list_for_each_entry(adj, adj_list, list) { + if (adj->dev == adj_dev) + return adj; + } + return NULL; +} /** - * register_gifconf - register a SIOCGIF handler - * @family: Address family - * @gifconf: Function handler + * netdev_has_upper_dev - Check if device is linked to an upper device + * @dev: device + * @upper_dev: upper device to check * - * Register protocol dependent address dumping routines. The handler - * that is passed must not be freed or reused until it has been replaced - * by another handler. + * Find out if a device is linked to specified upper device and return true + * in case it is. Note that this checks only immediate upper device, + * not through a complete stack of devices. The caller must hold the RTNL lock. */ -int register_gifconf(unsigned int family, gifconf_func_t *gifconf) +bool netdev_has_upper_dev(struct net_device *dev, + struct net_device *upper_dev) { - if (family >= NPROTO) - return -EINVAL; - gifconf_list[family] = gifconf; - return 0; -} -EXPORT_SYMBOL(register_gifconf); + ASSERT_RTNL(); + return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper); +} +EXPORT_SYMBOL(netdev_has_upper_dev); -/* - * Map an interface index to its name (SIOCGIFNAME) +/** + * netdev_has_any_upper_dev - Check if device is linked to some device + * @dev: device + * + * Find out if a device is linked to an upper device and return true in case + * it is. The caller must hold the RTNL lock. */ +static bool netdev_has_any_upper_dev(struct net_device *dev) +{ + ASSERT_RTNL(); -/* - * We need this ioctl for efficient implementation of the - * if_indextoname() function required by the IPv6 API. Without - * it, we would have to search all the interfaces to find a - * match. --pb - */ + return !list_empty(&dev->all_adj_list.upper); +} -static int dev_ifname(struct net *net, struct ifreq __user *arg) +/** + * netdev_master_upper_dev_get - Get master upper device + * @dev: device + * + * Find a master upper device and return pointer to it or NULL in case + * it's not there. The caller must hold the RTNL lock. + */ +struct net_device *netdev_master_upper_dev_get(struct net_device *dev) { - struct net_device *dev; - struct ifreq ifr; - unsigned seq; + struct netdev_adjacent *upper; - /* - * Fetch the caller's info block. 
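
One thing to watch in the new adjacency API: the kernel-doc for netdev_has_upper_dev() promises a check of only the immediate upper device, but the body searches all_adj_list.upper, which (per the linking code further down) holds the full transitive set, so doc and implementation appear to disagree. A hypothetical caller written against the implementation's behaviour; `dev` and `upper` are placeholders:

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static bool is_stacked_under(struct net_device *dev,
                             struct net_device *upper)
{
        bool linked;

        rtnl_lock();                    /* the helper asserts RTNL */
        linked = netdev_has_upper_dev(dev, upper);
        rtnl_unlock();

        return linked;
}
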
- */ + ASSERT_RTNL(); - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - return -EFAULT; + if (list_empty(&dev->adj_list.upper)) + return NULL; -retry: - seq = read_seqcount_begin(&devnet_rename_seq); - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex); - if (!dev) { - rcu_read_unlock(); - return -ENODEV; - } + upper = list_first_entry(&dev->adj_list.upper, + struct netdev_adjacent, list); + if (likely(upper->master)) + return upper->dev; + return NULL; +} +EXPORT_SYMBOL(netdev_master_upper_dev_get); - strcpy(ifr.ifr_name, dev->name); - rcu_read_unlock(); - if (read_seqcount_retry(&devnet_rename_seq, seq)) - goto retry; +void *netdev_adjacent_get_private(struct list_head *adj_list) +{ + struct netdev_adjacent *adj; - if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) - return -EFAULT; - return 0; + adj = list_entry(adj_list, struct netdev_adjacent, list); + + return adj->private; } +EXPORT_SYMBOL(netdev_adjacent_get_private); -/* - * Perform a SIOCGIFCONF call. This structure will change - * size eventually, and there is nothing I can do about it. - * Thus we will need a 'compatibility mode'. +/** + * netdev_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock. */ - -static int dev_ifconf(struct net *net, char __user *arg) +struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) { - struct ifconf ifc; - struct net_device *dev; - char __user *pos; - int len; - int total; - int i; - - /* - * Fetch the caller's info block. - */ + struct netdev_adjacent *upper; - if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) - return -EFAULT; + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); - pos = ifc.ifc_buf; - len = ifc.ifc_len; + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); - /* - * Loop over the interfaces, and write an info block for each. - */ + if (&upper->list == &dev->adj_list.upper) + return NULL; - total = 0; - for_each_netdev(net, dev) { - for (i = 0; i < NPROTO; i++) { - if (gifconf_list[i]) { - int done; - if (!pos) - done = gifconf_list[i](dev, NULL, 0); - else - done = gifconf_list[i](dev, pos + total, - len - total); - if (done < 0) - return -EFAULT; - total += done; - } - } - } + *iter = &upper->list; - /* - * All done. Write the updated control block back to the caller. - */ - ifc.ifc_len = total; - - /* - * Both BSD and Solaris return 0 here, so we do too. - */ - return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; + return upper->dev; } +EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); -#ifdef CONFIG_PROC_FS +/** + * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock. 
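
netdev_upper_get_next_dev_rcu() keeps its cursor in a caller-owned list_head pointer seeded with the list head itself, returning NULL when the walk comes back around. A hypothetical walk over a device's direct upper devices under RCU; `dev` is a placeholder:

#include <linux/netdevice.h>

static void walk_uppers(struct net_device *dev)
{
        struct net_device *upper;
        struct list_head *iter;

        rcu_read_lock();
        iter = &dev->adj_list.upper;    /* seed with the list head */
        while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)))
                pr_debug("upper: %s\n", upper->name);
        rcu_read_unlock();
}
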
+ */ +struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; -#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1) + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); -#define get_bucket(x) ((x) >> BUCKET_SPACE) -#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) -#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); -static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos) -{ - struct net *net = seq_file_net(seq); - struct net_device *dev; - struct hlist_node *p; - struct hlist_head *h; - unsigned int count = 0, offset = get_offset(*pos); + if (&upper->list == &dev->all_adj_list.upper) + return NULL; - h = &net->dev_name_head[get_bucket(*pos)]; - hlist_for_each_entry_rcu(dev, p, h, name_hlist) { - if (++count == offset) - return dev; - } + *iter = &upper->list; - return NULL; + return upper->dev; } +EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); -static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos) +/** + * netdev_lower_get_next_private - Get the next ->private from the + * lower neighbour list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must hold either hold the + * RTNL lock or its own locking that guarantees that the neighbour lower + * list will remain unchainged. + */ +void *netdev_lower_get_next_private(struct net_device *dev, + struct list_head **iter) { - struct net_device *dev; - unsigned int bucket; + struct netdev_adjacent *lower; - do { - dev = dev_from_same_bucket(seq, pos); - if (dev) - return dev; + lower = list_entry(*iter, struct netdev_adjacent, list); - bucket = get_bucket(*pos) + 1; - *pos = set_bucket_offset(bucket, 1); - } while (bucket < NETDEV_HASHENTRIES); + if (&lower->list == &dev->adj_list.lower) + return NULL; - return NULL; + *iter = lower->list.next; + + return lower->private; } +EXPORT_SYMBOL(netdev_lower_get_next_private); -/* - * This is invoked by the /proc filesystem handler to display a device - * in detail. +/** + * netdev_lower_get_next_private_rcu - Get the next ->private from the + * lower neighbour list, RCU + * variant + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must hold RCU read lock. 
*/ -void *dev_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(RCU) +void *netdev_lower_get_next_private_rcu(struct net_device *dev, + struct list_head **iter) { - rcu_read_lock(); - if (!*pos) - return SEQ_START_TOKEN; + struct netdev_adjacent *lower; + + WARN_ON_ONCE(!rcu_read_lock_held()); - if (get_bucket(*pos) >= NETDEV_HASHENTRIES) + lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) return NULL; - return dev_from_bucket(seq, pos); -} + *iter = &lower->list; -void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - ++*pos; - return dev_from_bucket(seq, pos); + return lower->private; } +EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); -void dev_seq_stop(struct seq_file *seq, void *v) - __releases(RCU) +/** + * netdev_lower_get_next - Get the next device from the lower neighbour + * list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent from the dev's lower neighbour + * list, starting from iter position. The caller must hold RTNL lock or + * its own locking that guarantees that the neighbour lower + * list will remain unchainged. + */ +void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) { - rcu_read_unlock(); -} + struct netdev_adjacent *lower; -static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) -{ - struct rtnl_link_stats64 temp; - const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); + lower = list_entry((*iter)->next, struct netdev_adjacent, list); - seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " - "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", - dev->name, stats->rx_bytes, stats->rx_packets, - stats->rx_errors, - stats->rx_dropped + stats->rx_missed_errors, - stats->rx_fifo_errors, - stats->rx_length_errors + stats->rx_over_errors + - stats->rx_crc_errors + stats->rx_frame_errors, - stats->rx_compressed, stats->multicast, - stats->tx_bytes, stats->tx_packets, - stats->tx_errors, stats->tx_dropped, - stats->tx_fifo_errors, stats->collisions, - stats->tx_carrier_errors + - stats->tx_aborted_errors + - stats->tx_window_errors + - stats->tx_heartbeat_errors, - stats->tx_compressed); -} + if (&lower->list == &dev->adj_list.lower) + return NULL; -/* - * Called from the PROCfs module. This now uses the new arbitrary sized - * /proc/net interface to create /proc/net/dev - */ -static int dev_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) - seq_puts(seq, "Inter-| Receive " - " | Transmit\n" - " face |bytes packets errs drop fifo frame " - "compressed multicast|bytes packets errs " - "drop fifo colls carrier compressed\n"); - else - dev_seq_printf_stats(seq, v); - return 0; + *iter = &lower->list; + + return lower->dev; } +EXPORT_SYMBOL(netdev_lower_get_next); -static struct softnet_data *softnet_get_online(loff_t *pos) +/** + * netdev_lower_get_first_private_rcu - Get the first ->private from the + * lower neighbour list, RCU + * variant + * @dev: device + * + * Gets the first netdev_adjacent->private from the dev's lower neighbour + * list. The caller must hold RCU read lock. 
+ */ +void *netdev_lower_get_first_private_rcu(struct net_device *dev) { - struct softnet_data *sd = NULL; + struct netdev_adjacent *lower; - while (*pos < nr_cpu_ids) - if (cpu_online(*pos)) { - sd = &per_cpu(softnet_data, *pos); - break; - } else - ++*pos; - return sd; + lower = list_first_or_null_rcu(&dev->adj_list.lower, + struct netdev_adjacent, list); + if (lower) + return lower->private; + return NULL; } +EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); -static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) +/** + * netdev_master_upper_dev_get_rcu - Get master upper device + * @dev: device + * + * Find a master upper device and return pointer to it or NULL in case + * it's not there. The caller must hold the RCU read lock. + */ +struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) { - return softnet_get_online(pos); + struct netdev_adjacent *upper; + + upper = list_first_or_null_rcu(&dev->adj_list.upper, + struct netdev_adjacent, list); + if (upper && likely(upper->master)) + return upper->dev; + return NULL; } +EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); -static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) +static int netdev_adjacent_sysfs_add(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list) { - ++*pos; - return softnet_get_online(pos); + char linkname[IFNAMSIZ+7]; + sprintf(linkname, dev_list == &dev->adj_list.upper ? + "upper_%s" : "lower_%s", adj_dev->name); + return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), + linkname); } - -static void softnet_seq_stop(struct seq_file *seq, void *v) +static void netdev_adjacent_sysfs_del(struct net_device *dev, + char *name, + struct list_head *dev_list) { + char linkname[IFNAMSIZ+7]; + sprintf(linkname, dev_list == &dev->adj_list.upper ? + "upper_%s" : "lower_%s", name); + sysfs_remove_link(&(dev->dev.kobj), linkname); } -static int softnet_seq_show(struct seq_file *seq, void *v) +#define netdev_adjacent_is_neigh_list(dev, dev_list) \ + (dev_list == &dev->adj_list.upper || \ + dev_list == &dev->adj_list.lower) + +static int __netdev_adjacent_dev_insert(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list, + void *private, bool master) { - struct softnet_data *sd = v; + struct netdev_adjacent *adj; + int ret; + + adj = __netdev_find_adj(dev, adj_dev, dev_list); + + if (adj) { + adj->ref_nr++; + return 0; + } + + adj = kmalloc(sizeof(*adj), GFP_KERNEL); + if (!adj) + return -ENOMEM; + + adj->dev = adj_dev; + adj->master = master; + adj->ref_nr = 1; + adj->private = private; + dev_hold(adj_dev); + + pr_debug("dev_hold for %s, because of link added from %s to %s\n", + adj_dev->name, dev->name, adj_dev->name); + + if (netdev_adjacent_is_neigh_list(dev, dev_list)) { + ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); + if (ret) + goto free_adj; + } + + /* Ensure that master link is always the first item in list. 
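
__netdev_adjacent_dev_insert() reference-counts edges: the same dev/adj_dev pair can be created once as a neighbour link and again by the mesh maintenance below, so a repeat insert only bumps ref_nr, and removal frees the edge only when the count reaches zero. A userspace model of the find-or-create step (sysfs links, dev_hold() and RCU are omitted):

#include <stdlib.h>

struct edge { void *peer; int ref_nr; struct edge *next; };

static struct edge *edge_insert(struct edge **list, void *peer)
{
        struct edge *e;

        for (e = *list; e; e = e->next)
                if (e->peer == peer) {
                        e->ref_nr++;    /* already linked: take a ref */
                        return e;
                }

        e = calloc(1, sizeof(*e));
        if (!e)
                return NULL;
        e->peer = peer;
        e->ref_nr = 1;
        e->next = *list;
        *list = e;
        return e;
}
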
*/ + if (master) { + ret = sysfs_create_link(&(dev->dev.kobj), + &(adj_dev->dev.kobj), "master"); + if (ret) + goto remove_symlinks; + + list_add_rcu(&adj->list, dev_list); + } else { + list_add_tail_rcu(&adj->list, dev_list); + } - seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", - sd->processed, sd->dropped, sd->time_squeeze, 0, - 0, 0, 0, 0, /* was fastroute */ - sd->cpu_collision, sd->received_rps); return 0; -} -static const struct seq_operations dev_seq_ops = { - .start = dev_seq_start, - .next = dev_seq_next, - .stop = dev_seq_stop, - .show = dev_seq_show, -}; +remove_symlinks: + if (netdev_adjacent_is_neigh_list(dev, dev_list)) + netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); +free_adj: + kfree(adj); + dev_put(adj_dev); -static int dev_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &dev_seq_ops, - sizeof(struct seq_net_private)); + return ret; } -static const struct file_operations dev_seq_fops = { - .owner = THIS_MODULE, - .open = dev_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; +static void __netdev_adjacent_dev_remove(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list) +{ + struct netdev_adjacent *adj; -static const struct seq_operations softnet_seq_ops = { - .start = softnet_seq_start, - .next = softnet_seq_next, - .stop = softnet_seq_stop, - .show = softnet_seq_show, -}; + adj = __netdev_find_adj(dev, adj_dev, dev_list); -static int softnet_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &softnet_seq_ops); -} + if (!adj) { + pr_err("tried to remove device %s from %s\n", + dev->name, adj_dev->name); + BUG(); + } -static const struct file_operations softnet_seq_fops = { - .owner = THIS_MODULE, - .open = softnet_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; + if (adj->ref_nr > 1) { + pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name, + adj->ref_nr-1); + adj->ref_nr--; + return; + } + + if (adj->master) + sysfs_remove_link(&(dev->dev.kobj), "master"); -static void *ptype_get_idx(loff_t pos) + if (netdev_adjacent_is_neigh_list(dev, dev_list)) + netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); + + list_del_rcu(&adj->list); + pr_debug("dev_put for %s, because link removed from %s to %s\n", + adj_dev->name, dev->name, adj_dev->name); + dev_put(adj_dev); + kfree_rcu(adj, rcu); +} + +static int __netdev_adjacent_dev_link_lists(struct net_device *dev, + struct net_device *upper_dev, + struct list_head *up_list, + struct list_head *down_list, + void *private, bool master) { - struct packet_type *pt = NULL; - loff_t i = 0; - int t; + int ret; - list_for_each_entry_rcu(pt, &ptype_all, list) { - if (i == pos) - return pt; - ++i; - } + ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, + master); + if (ret) + return ret; - for (t = 0; t < PTYPE_HASH_SIZE; t++) { - list_for_each_entry_rcu(pt, &ptype_base[t], list) { - if (i == pos) - return pt; - ++i; - } + ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, + false); + if (ret) { + __netdev_adjacent_dev_remove(dev, upper_dev, up_list); + return ret; } - return NULL; + + return 0; } -static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(RCU) +static int __netdev_adjacent_dev_link(struct net_device *dev, + struct net_device *upper_dev) { - rcu_read_lock(); - return *pos ? 
ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; + return __netdev_adjacent_dev_link_lists(dev, upper_dev, + &dev->all_adj_list.upper, + &upper_dev->all_adj_list.lower, + NULL, false); } -static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) +static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, + struct net_device *upper_dev, + struct list_head *up_list, + struct list_head *down_list) { - struct packet_type *pt; - struct list_head *nxt; - int hash; + __netdev_adjacent_dev_remove(dev, upper_dev, up_list); + __netdev_adjacent_dev_remove(upper_dev, dev, down_list); +} - ++*pos; - if (v == SEQ_START_TOKEN) - return ptype_get_idx(0); +static void __netdev_adjacent_dev_unlink(struct net_device *dev, + struct net_device *upper_dev) +{ + __netdev_adjacent_dev_unlink_lists(dev, upper_dev, + &dev->all_adj_list.upper, + &upper_dev->all_adj_list.lower); +} - pt = v; - nxt = pt->list.next; - if (pt->type == htons(ETH_P_ALL)) { - if (nxt != &ptype_all) - goto found; - hash = 0; - nxt = ptype_base[0].next; - } else - hash = ntohs(pt->type) & PTYPE_HASH_MASK; +static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, + struct net_device *upper_dev, + void *private, bool master) +{ + int ret = __netdev_adjacent_dev_link(dev, upper_dev); - while (nxt == &ptype_base[hash]) { - if (++hash >= PTYPE_HASH_SIZE) - return NULL; - nxt = ptype_base[hash].next; + if (ret) + return ret; + + ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, + &dev->adj_list.upper, + &upper_dev->adj_list.lower, + private, master); + if (ret) { + __netdev_adjacent_dev_unlink(dev, upper_dev); + return ret; } -found: - return list_entry(nxt, struct packet_type, list); + + return 0; } -static void ptype_seq_stop(struct seq_file *seq, void *v) - __releases(RCU) +static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, + struct net_device *upper_dev) { - rcu_read_unlock(); + __netdev_adjacent_dev_unlink(dev, upper_dev); + __netdev_adjacent_dev_unlink_lists(dev, upper_dev, + &dev->adj_list.upper, + &upper_dev->adj_list.lower); } -static int ptype_seq_show(struct seq_file *seq, void *v) +static int __netdev_upper_dev_link(struct net_device *dev, + struct net_device *upper_dev, bool master, + void *private) { - struct packet_type *pt = v; + struct netdev_adjacent *i, *j, *to_i, *to_j; + int ret = 0; - if (v == SEQ_START_TOKEN) - seq_puts(seq, "Type Device Function\n"); - else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { - if (pt->type == htons(ETH_P_ALL)) - seq_puts(seq, "ALL "); - else - seq_printf(seq, "%04x", ntohs(pt->type)); + ASSERT_RTNL(); + + if (dev == upper_dev) + return -EBUSY; - seq_printf(seq, " %-8s %pF\n", - pt->dev ? pt->dev->name : "", pt->func); + /* To prevent loops, check if dev is not upper device to upper_dev. */ + if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper)) + return -EBUSY; + + if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper)) + return -EEXIST; + + if (master && netdev_master_upper_dev_get(dev)) + return -EBUSY; + + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, + master); + if (ret) + return ret; + + /* Now that we linked these devs, make all the upper_dev's + * all_adj_list.upper visible to every dev's all_adj_list.lower an + * versa, and don't forget the devices itself. All of these + * links are non-neighbours. 
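
The comment above describes the invariant concretely: if A is stacked on dev, and dev is being linked under upper which itself sits below B, then linking dev to upper must also create A-upper, dev-B and A-B as non-neighbour edges, so that all_adj_list stays a transitive closure. A minimal model of the interlinking step, with opaque pointers standing in for devices:

static int link_mesh(void **lowers, int n_low, void **uppers, int n_up,
                     int (*link)(void *from, void *to))
{
        int i, j, err;

        /* every device below gains an edge to every device above */
        for (i = 0; i < n_low; i++)
                for (j = 0; j < n_up; j++) {
                        err = link(lowers[i], uppers[j]);
                        if (err)
                                return err;     /* caller rolls back */
                }
        return 0;
}
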
+ */ + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { + pr_debug("Interlinking %s with %s, non-neighbour\n", + i->dev->name, j->dev->name); + ret = __netdev_adjacent_dev_link(i->dev, j->dev); + if (ret) + goto rollback_mesh; + } + } + + /* add dev to every upper_dev's upper device */ + list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { + pr_debug("linking %s's upper device %s with %s\n", + upper_dev->name, i->dev->name, dev->name); + ret = __netdev_adjacent_dev_link(dev, i->dev); + if (ret) + goto rollback_upper_mesh; } + /* add upper_dev to every dev's lower device */ + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + pr_debug("linking %s's lower device %s with %s\n", dev->name, + i->dev->name, upper_dev->name); + ret = __netdev_adjacent_dev_link(i->dev, upper_dev); + if (ret) + goto rollback_lower_mesh; + } + + call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); return 0; -} -static const struct seq_operations ptype_seq_ops = { - .start = ptype_seq_start, - .next = ptype_seq_next, - .stop = ptype_seq_stop, - .show = ptype_seq_show, -}; +rollback_lower_mesh: + to_i = i; + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + if (i == to_i) + break; + __netdev_adjacent_dev_unlink(i->dev, upper_dev); + } -static int ptype_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &ptype_seq_ops, - sizeof(struct seq_net_private)); -} + i = NULL; -static const struct file_operations ptype_seq_fops = { - .owner = THIS_MODULE, - .open = ptype_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; +rollback_upper_mesh: + to_i = i; + list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { + if (i == to_i) + break; + __netdev_adjacent_dev_unlink(dev, i->dev); + } + i = j = NULL; -static int __net_init dev_proc_net_init(struct net *net) -{ - int rc = -ENOMEM; +rollback_mesh: + to_i = i; + to_j = j; + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { + if (i == to_i && j == to_j) + break; + __netdev_adjacent_dev_unlink(i->dev, j->dev); + } + if (i == to_i) + break; + } - if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops)) - goto out; - if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops)) - goto out_dev; - if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops)) - goto out_softnet; + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); - if (wext_proc_init(net)) - goto out_ptype; - rc = 0; -out: - return rc; -out_ptype: - proc_net_remove(net, "ptype"); -out_softnet: - proc_net_remove(net, "softnet_stat"); -out_dev: - proc_net_remove(net, "dev"); - goto out; + return ret; } -static void __net_exit dev_proc_net_exit(struct net *net) +/** + * netdev_upper_dev_link - Add a link to the upper device + * @dev: device + * @upper_dev: new upper device + * + * Adds a link to device which is upper to this one. The caller must hold + * the RTNL lock. On a failure a negative errno code is returned. + * On success the reference counts are adjusted and the function + * returns zero. 
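
The rollback_* labels above undo a partially applied mesh by re-walking each list and unlinking only the entries handled before the failure point (the to_i/to_j sentinels). The same unwind pattern on an array, undoing in reverse rather than re-walking forward, which has the same effect:

static int apply_all(int *items, int n,
                     int (*do_one)(int), void (*undo_one)(int))
{
        int i, err;

        for (i = 0; i < n; i++) {
                err = do_one(items[i]);
                if (err)
                        goto rollback;
        }
        return 0;

rollback:
        while (--i >= 0)        /* undo only what succeeded */
                undo_one(items[i]);
        return err;
}
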
+ */ +int netdev_upper_dev_link(struct net_device *dev, + struct net_device *upper_dev) { - wext_proc_exit(net); - - proc_net_remove(net, "ptype"); - proc_net_remove(net, "softnet_stat"); - proc_net_remove(net, "dev"); + return __netdev_upper_dev_link(dev, upper_dev, false, NULL); } +EXPORT_SYMBOL(netdev_upper_dev_link); -static struct pernet_operations __net_initdata dev_proc_ops = { - .init = dev_proc_net_init, - .exit = dev_proc_net_exit, -}; - -static int __init dev_proc_init(void) +/** + * netdev_master_upper_dev_link - Add a master link to the upper device + * @dev: device + * @upper_dev: new upper device + * + * Adds a link to device which is upper to this one. In this case, only + * one master upper device can be linked, although other non-master devices + * might be linked as well. The caller must hold the RTNL lock. + * On a failure a negative errno code is returned. On success the reference + * counts are adjusted and the function returns zero. + */ +int netdev_master_upper_dev_link(struct net_device *dev, + struct net_device *upper_dev) { - return register_pernet_subsys(&dev_proc_ops); + return __netdev_upper_dev_link(dev, upper_dev, true, NULL); } -#else -#define dev_proc_init() 0 -#endif /* CONFIG_PROC_FS */ +EXPORT_SYMBOL(netdev_master_upper_dev_link); +int netdev_master_upper_dev_link_private(struct net_device *dev, + struct net_device *upper_dev, + void *private) +{ + return __netdev_upper_dev_link(dev, upper_dev, true, private); +} +EXPORT_SYMBOL(netdev_master_upper_dev_link_private); /** - * netdev_set_master - set up master pointer - * @slave: slave device - * @master: new master device - * - * Changes the master device of the slave. Pass %NULL to break the - * bonding. The caller must hold the RTNL semaphore. On a failure - * a negative errno code is returned. On success the reference counts - * are adjusted and the function returns zero. + * netdev_upper_dev_unlink - Removes a link to upper device + * @dev: device + * @upper_dev: new upper device + * + * Removes a link to device which is upper to this one. The caller must hold + * the RTNL lock. */ -int netdev_set_master(struct net_device *slave, struct net_device *master) +void netdev_upper_dev_unlink(struct net_device *dev, + struct net_device *upper_dev) { - struct net_device *old = slave->master; - + struct netdev_adjacent *i, *j; ASSERT_RTNL(); - if (master) { - if (old) - return -EBUSY; - dev_hold(master); + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); + + /* Here is the tricky part. We must remove all dev's lower + * devices from all upper_dev's upper devices and vice + * versa, to maintain the graph relationship. 
+ */ + list_for_each_entry(i, &dev->all_adj_list.lower, list) + list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) + __netdev_adjacent_dev_unlink(i->dev, j->dev); + + /* remove also the devices itself from lower/upper device + * list + */ + list_for_each_entry(i, &dev->all_adj_list.lower, list) + __netdev_adjacent_dev_unlink(i->dev, upper_dev); + + list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) + __netdev_adjacent_dev_unlink(dev, i->dev); + + call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); +} +EXPORT_SYMBOL(netdev_upper_dev_unlink); + +void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) +{ + struct netdev_adjacent *iter; + + list_for_each_entry(iter, &dev->adj_list.upper, list) { + netdev_adjacent_sysfs_del(iter->dev, oldname, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.lower); } - slave->master = master; + list_for_each_entry(iter, &dev->adj_list.lower, list) { + netdev_adjacent_sysfs_del(iter->dev, oldname, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.upper); + } +} - if (old) - dev_put(old); - return 0; +void *netdev_lower_dev_get_private(struct net_device *dev, + struct net_device *lower_dev) +{ + struct netdev_adjacent *lower; + + if (!lower_dev) + return NULL; + lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower); + if (!lower) + return NULL; + + return lower->private; } -EXPORT_SYMBOL(netdev_set_master); +EXPORT_SYMBOL(netdev_lower_dev_get_private); -/** - * netdev_set_bond_master - set up bonding master/slave pair - * @slave: slave device - * @master: new master device - * - * Changes the master device of the slave. Pass %NULL to break the - * bonding. The caller must hold the RTNL semaphore. On a failure - * a negative errno code is returned. On success %RTM_NEWLINK is sent - * to the routing socket and the function returns zero. 
- */ -int netdev_set_bond_master(struct net_device *slave, struct net_device *master) + +int dev_get_nest_level(struct net_device *dev, + bool (*type_check)(struct net_device *dev)) { - int err; + struct net_device *lower = NULL; + struct list_head *iter; + int max_nest = -1; + int nest; ASSERT_RTNL(); - err = netdev_set_master(slave, master); - if (err) - return err; - if (master) - slave->flags |= IFF_SLAVE; - else - slave->flags &= ~IFF_SLAVE; + netdev_for_each_lower_dev(dev, lower, iter) { + nest = dev_get_nest_level(lower, type_check); + if (max_nest < nest) + max_nest = nest; + } - rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); - return 0; + if (type_check(dev)) + max_nest++; + + return max_nest; } -EXPORT_SYMBOL(netdev_set_bond_master); +EXPORT_SYMBOL(dev_get_nest_level); static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; - if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags) + if (ops->ndo_change_rx_flags) ops->ndo_change_rx_flags(dev, flags); } -static int __dev_set_promiscuity(struct net_device *dev, int inc) +static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags; kuid_t uid; @@ -4710,6 +5254,8 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc) dev_change_rx_flags(dev, IFF_PROMISC); } + if (notify) + __dev_notify_flags(dev, old_flags, IFF_PROMISC); return 0; } @@ -4729,7 +5275,7 @@ int dev_set_promiscuity(struct net_device *dev, int inc) unsigned int old_flags = dev->flags; int err; - err = __dev_set_promiscuity(dev, inc); + err = __dev_set_promiscuity(dev, inc, true); if (err < 0) return err; if (dev->flags != old_flags) @@ -4738,22 +5284,9 @@ int dev_set_promiscuity(struct net_device *dev, int inc) } EXPORT_SYMBOL(dev_set_promiscuity); -/** - * dev_set_allmulti - update allmulti count on a device - * @dev: device - * @inc: modifier - * - * Add or remove reception of all multicast frames to a device. While the - * count in the device remains above zero the interface remains listening - * to all interfaces. Once it hits zero the device reverts back to normal - * filtering operation. A negative @inc value is used to drop the counter - * when releasing a resource needing all multicasts. - * Return 0 if successful or a negative errno code on error. - */ - -int dev_set_allmulti(struct net_device *dev, int inc) +static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) { - unsigned int old_flags = dev->flags; + unsigned int old_flags = dev->flags, old_gflags = dev->gflags; ASSERT_RTNL(); @@ -4776,9 +5309,30 @@ int dev_set_allmulti(struct net_device *dev, int inc) if (dev->flags ^ old_flags) { dev_change_rx_flags(dev, IFF_ALLMULTI); dev_set_rx_mode(dev); + if (notify) + __dev_notify_flags(dev, old_flags, + dev->gflags ^ old_gflags); } return 0; } + +/** + * dev_set_allmulti - update allmulti count on a device + * @dev: device + * @inc: modifier + * + * Add or remove reception of all multicast frames to a device. While the + * count in the device remains above zero the interface remains listening + * to all interfaces. Once it hits zero the device reverts back to normal + * filtering operation. A negative @inc value is used to drop the counter + * when releasing a resource needing all multicasts. + * Return 0 if successful or a negative errno code on error. 
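
dev_get_nest_level() above recurses through the lower-device graph and counts only devices matching type_check on the deepest path, the sort of depth stacked drivers (macvlan on macvlan and similar) typically need for lockdep subclass selection. A model over a plain tree:

struct node {
        int is_type;            /* would be type_check(dev) */
        struct node **lowers;
        int n_lowers;
};

static int nest_level(const struct node *n)
{
        int i, nest, max = -1;

        for (i = 0; i < n->n_lowers; i++) {
                nest = nest_level(n->lowers[i]);
                if (nest > max)
                        max = nest;
        }
        /* count this node only if it is of the interesting type;
         * a matching leaf yields 0, a non-matching leaf yields -1 */
        return n->is_type ? max + 1 : max;
}
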
+ */ + +int dev_set_allmulti(struct net_device *dev, int inc) +{ + return __dev_set_allmulti(dev, inc, true); +} EXPORT_SYMBOL(dev_set_allmulti); /* @@ -4803,10 +5357,10 @@ void __dev_set_rx_mode(struct net_device *dev) * therefore calling __dev_set_promiscuity here is safe. */ if (!netdev_uc_empty(dev) && !dev->uc_promisc) { - __dev_set_promiscuity(dev, 1); + __dev_set_promiscuity(dev, 1, false); dev->uc_promisc = true; } else if (netdev_uc_empty(dev) && dev->uc_promisc) { - __dev_set_promiscuity(dev, -1); + __dev_set_promiscuity(dev, -1, false); dev->uc_promisc = false; } } @@ -4895,9 +5449,13 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; + unsigned int old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; - dev_set_promiscuity(dev, inc); + + if (__dev_set_promiscuity(dev, inc, false) >= 0) + if (dev->flags != old_flags) + dev_set_rx_mode(dev); } /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI @@ -4908,16 +5466,20 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) int inc = (flags & IFF_ALLMULTI) ? 1 : -1; dev->gflags ^= IFF_ALLMULTI; - dev_set_allmulti(dev, inc); + __dev_set_allmulti(dev, inc, false); } return ret; } -void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, + unsigned int gchanges) { unsigned int changes = dev->flags ^ old_flags; + if (gchanges) + rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); + if (changes & IFF_UP) { if (dev->flags & IFF_UP) call_netdevice_notifiers(NETDEV_UP, dev); @@ -4926,8 +5488,13 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) } if (dev->flags & IFF_UP && - (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) - call_netdevice_notifiers(NETDEV_CHANGE, dev); + (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { + struct netdev_notifier_change_info change_info; + + change_info.flags_changed = changes; + call_netdevice_notifiers_info(NETDEV_CHANGE, dev, + &change_info.info); + } } /** @@ -4941,21 +5508,29 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) int dev_change_flags(struct net_device *dev, unsigned int flags) { int ret; - unsigned int changes, old_flags = dev->flags; + unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; ret = __dev_change_flags(dev, flags); if (ret < 0) return ret; - changes = old_flags ^ dev->flags; - if (changes) - rtmsg_ifinfo(RTM_NEWLINK, dev, changes); - - __dev_notify_flags(dev, old_flags); + changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); + __dev_notify_flags(dev, old_flags, changes); return ret; } EXPORT_SYMBOL(dev_change_flags); +static int __dev_set_mtu(struct net_device *dev, int new_mtu) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_change_mtu) + return ops->ndo_change_mtu(dev, new_mtu); + + dev->mtu = new_mtu; + return 0; +} + /** * dev_set_mtu - Change maximum transfer unit * @dev: device @@ -4965,8 +5540,7 @@ EXPORT_SYMBOL(dev_change_flags); */ int dev_set_mtu(struct net_device *dev, int new_mtu) { - const struct net_device_ops *ops = dev->netdev_ops; - int err; + int err, orig_mtu; if (new_mtu == dev->mtu) return 0; @@ -4978,14 +5552,25 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) if (!netif_device_present(dev)) return -ENODEV; - err = 0; - if (ops->ndo_change_mtu) - err = ops->ndo_change_mtu(dev, new_mtu); - else - 
dev->mtu = new_mtu; + err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); + err = notifier_to_errno(err); + if (err) + return err; - if (!err) - call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); + orig_mtu = dev->mtu; + err = __dev_set_mtu(dev, new_mtu); + + if (!err) { + err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); + err = notifier_to_errno(err); + if (err) { + /* setting mtu back and notifying everyone again, + * so that they have a chance to revert changes. + */ + __dev_set_mtu(dev, orig_mtu); + call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); + } + } return err; } EXPORT_SYMBOL(dev_set_mtu); @@ -5020,381 +5605,51 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) if (!netif_device_present(dev)) return -ENODEV; err = ops->ndo_set_mac_address(dev, sa); - if (!err) - call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + if (err) + return err; + dev->addr_assign_type = NET_ADDR_SET; + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); add_device_randomness(dev->dev_addr, dev->addr_len); - return err; + return 0; } EXPORT_SYMBOL(dev_set_mac_address); -/* - * Perform the SIOCxIFxxx calls, inside rcu_read_lock() - */ -static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) -{ - int err; - struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); - - if (!dev) - return -ENODEV; - - switch (cmd) { - case SIOCGIFFLAGS: /* Get interface flags */ - ifr->ifr_flags = (short) dev_get_flags(dev); - return 0; - - case SIOCGIFMETRIC: /* Get the metric on the interface - (currently unused) */ - ifr->ifr_metric = 0; - return 0; - - case SIOCGIFMTU: /* Get the MTU of a device */ - ifr->ifr_mtu = dev->mtu; - return 0; - - case SIOCGIFHWADDR: - if (!dev->addr_len) - memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); - else - memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); - ifr->ifr_hwaddr.sa_family = dev->type; - return 0; - - case SIOCGIFSLAVE: - err = -EINVAL; - break; - - case SIOCGIFMAP: - ifr->ifr_map.mem_start = dev->mem_start; - ifr->ifr_map.mem_end = dev->mem_end; - ifr->ifr_map.base_addr = dev->base_addr; - ifr->ifr_map.irq = dev->irq; - ifr->ifr_map.dma = dev->dma; - ifr->ifr_map.port = dev->if_port; - return 0; - - case SIOCGIFINDEX: - ifr->ifr_ifindex = dev->ifindex; - return 0; - - case SIOCGIFTXQLEN: - ifr->ifr_qlen = dev->tx_queue_len; - return 0; - - default: - /* dev_ioctl() should ensure this case - * is never reached - */ - WARN_ON(1); - err = -ENOTTY; - break; - - } - return err; -} - -/* - * Perform the SIOCxIFxxx calls, inside rtnl_lock() +/** + * dev_change_carrier - Change device carrier + * @dev: device + * @new_carrier: new value + * + * Change device carrier */ -static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) +int dev_change_carrier(struct net_device *dev, bool new_carrier) { - int err; - struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); - const struct net_device_ops *ops; - - if (!dev) - return -ENODEV; - - ops = dev->netdev_ops; - - switch (cmd) { - case SIOCSIFFLAGS: /* Set interface flags */ - return dev_change_flags(dev, ifr->ifr_flags); - - case SIOCSIFMETRIC: /* Set the metric on the interface - (currently unused) */ - return -EOPNOTSUPP; - - case SIOCSIFMTU: /* Set the MTU of a device */ - return dev_set_mtu(dev, ifr->ifr_mtu); - - case SIOCSIFHWADDR: - return dev_set_mac_address(dev, &ifr->ifr_hwaddr); - - case SIOCSIFHWBROADCAST: - if (ifr->ifr_hwaddr.sa_family != dev->type) - return 
-EINVAL; - memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); - call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); - return 0; + const struct net_device_ops *ops = dev->netdev_ops; - case SIOCSIFMAP: - if (ops->ndo_set_config) { - if (!netif_device_present(dev)) - return -ENODEV; - return ops->ndo_set_config(dev, &ifr->ifr_map); - } + if (!ops->ndo_change_carrier) return -EOPNOTSUPP; - - case SIOCADDMULTI: - if (!ops->ndo_set_rx_mode || - ifr->ifr_hwaddr.sa_family != AF_UNSPEC) - return -EINVAL; - if (!netif_device_present(dev)) - return -ENODEV; - return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); - - case SIOCDELMULTI: - if (!ops->ndo_set_rx_mode || - ifr->ifr_hwaddr.sa_family != AF_UNSPEC) - return -EINVAL; - if (!netif_device_present(dev)) - return -ENODEV; - return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); - - case SIOCSIFTXQLEN: - if (ifr->ifr_qlen < 0) - return -EINVAL; - dev->tx_queue_len = ifr->ifr_qlen; - return 0; - - case SIOCSIFNAME: - ifr->ifr_newname[IFNAMSIZ-1] = '\0'; - return dev_change_name(dev, ifr->ifr_newname); - - case SIOCSHWTSTAMP: - err = net_hwtstamp_validate(ifr); - if (err) - return err; - /* fall through */ - - /* - * Unknown or private ioctl - */ - default: - if ((cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15) || - cmd == SIOCBONDENSLAVE || - cmd == SIOCBONDRELEASE || - cmd == SIOCBONDSETHWADDR || - cmd == SIOCBONDSLAVEINFOQUERY || - cmd == SIOCBONDINFOQUERY || - cmd == SIOCBONDCHANGEACTIVE || - cmd == SIOCGMIIPHY || - cmd == SIOCGMIIREG || - cmd == SIOCSMIIREG || - cmd == SIOCBRADDIF || - cmd == SIOCBRDELIF || - cmd == SIOCSHWTSTAMP || - cmd == SIOCWANDEV) { - err = -EOPNOTSUPP; - if (ops->ndo_do_ioctl) { - if (netif_device_present(dev)) - err = ops->ndo_do_ioctl(dev, ifr, cmd); - else - err = -ENODEV; - } - } else - err = -EINVAL; - - } - return err; + if (!netif_device_present(dev)) + return -ENODEV; + return ops->ndo_change_carrier(dev, new_carrier); } - -/* - * This function handles all "interface"-type I/O control requests. The actual - * 'doing' part of this is dev_ifsioc above. - */ +EXPORT_SYMBOL(dev_change_carrier); /** - * dev_ioctl - network device ioctl - * @net: the applicable net namespace - * @cmd: command to issue - * @arg: pointer to a struct ifreq in user space + * dev_get_phys_port_id - Get device physical port ID + * @dev: device + * @ppid: port ID * - * Issue ioctl functions to devices. This is normally called by the - * user space syscall interfaces but can sometimes be useful for - * other purposes. The return value is the return from the syscall if - * positive or a negative errno code on error. + * Get device physical port ID */ - -int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) +int dev_get_phys_port_id(struct net_device *dev, + struct netdev_phys_port_id *ppid) { - struct ifreq ifr; - int ret; - char *colon; - - /* One special case: SIOCGIFCONF takes ifconf argument - and requires shared lock, because it sleeps writing - to user space. - */ - - if (cmd == SIOCGIFCONF) { - rtnl_lock(); - ret = dev_ifconf(net, (char __user *) arg); - rtnl_unlock(); - return ret; - } - if (cmd == SIOCGIFNAME) - return dev_ifname(net, (struct ifreq __user *)arg); - - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - return -EFAULT; - - ifr.ifr_name[IFNAMSIZ-1] = 0; - - colon = strchr(ifr.ifr_name, ':'); - if (colon) - *colon = 0; - - /* - * See which interface the caller is talking about. 
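dev_change_carrier() above is only a thin dispatcher into the driver's new ndo_change_carrier hook. For a software device the hook can be a couple of lines that flip the carrier state; a minimal sketch, using a hypothetical foo driver (netif_carrier_on/off are the existing helpers):

static int foo_change_carrier(struct net_device *dev, bool new_carrier)
{
	/* reflect the requested administrative carrier state */
	if (new_carrier)
		netif_carrier_on(dev);
	else
		netif_carrier_off(dev);
	return 0;
}

static const struct net_device_ops foo_netdev_ops = {
	.ndo_change_carrier	= foo_change_carrier,
};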
- */ - - switch (cmd) { - /* - * These ioctl calls: - * - can be done by all. - * - atomic and do not require locking. - * - return a value - */ - case SIOCGIFFLAGS: - case SIOCGIFMETRIC: - case SIOCGIFMTU: - case SIOCGIFHWADDR: - case SIOCGIFSLAVE: - case SIOCGIFMAP: - case SIOCGIFINDEX: - case SIOCGIFTXQLEN: - dev_load(net, ifr.ifr_name); - rcu_read_lock(); - ret = dev_ifsioc_locked(net, &ifr, cmd); - rcu_read_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } - return ret; - - case SIOCETHTOOL: - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ethtool(net, &ifr); - rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } - return ret; - - /* - * These ioctl calls: - * - require superuser power. - * - require strict serialization. - * - return a value - */ - case SIOCGMIIPHY: - case SIOCGMIIREG: - case SIOCSIFNAME: - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); - rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } - return ret; - - /* - * These ioctl calls: - * - require superuser power. - * - require strict serialization. - * - do not return a value - */ - case SIOCSIFMAP: - case SIOCSIFTXQLEN: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - /* fall through */ - /* - * These ioctl calls: - * - require local superuser power. - * - require strict serialization. - * - do not return a value - */ - case SIOCSIFFLAGS: - case SIOCSIFMETRIC: - case SIOCSIFMTU: - case SIOCSIFHWADDR: - case SIOCSIFSLAVE: - case SIOCADDMULTI: - case SIOCDELMULTI: - case SIOCSIFHWBROADCAST: - case SIOCSMIIREG: - case SIOCBONDENSLAVE: - case SIOCBONDRELEASE: - case SIOCBONDSETHWADDR: - case SIOCBONDCHANGEACTIVE: - case SIOCBRADDIF: - case SIOCBRDELIF: - case SIOCSHWTSTAMP: - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - /* fall through */ - case SIOCBONDSLAVEINFOQUERY: - case SIOCBONDINFOQUERY: - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); - rtnl_unlock(); - return ret; - - case SIOCGIFMEM: - /* Get the per device memory space. We can add this but - * currently do not support it */ - case SIOCSIFMEM: - /* Set the per device memory buffer space. - * Not applicable in our case */ - case SIOCSIFLINK: - return -ENOTTY; + const struct net_device_ops *ops = dev->netdev_ops; - /* - * Unknown or private ioctl. 
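The SIOCxIFxxx plumbing deleted in this hunk is not going away; it reappears nearly verbatim in the new net/core/dev_ioctl.c further down in this diff, so the user-space contract is unchanged. For reference, a minimal sketch of the caller side of one of these ioctls ("eth0" is only an example interface name):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

int main(void)
{
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
	/* ends up in dev_ifsioc_locked() under rcu_read_lock() */
	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
		printf("%s: mtu %d\n", ifr.ifr_name, ifr.ifr_mtu);
	close(fd);
	return 0;
}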
- */ - default: - if (cmd == SIOCWANDEV || - (cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15)) { - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); - rtnl_unlock(); - if (!ret && copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - return ret; - } - /* Take care of Wireless Extensions */ - if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) - return wext_handle_ioctl(net, &ifr, cmd, arg); - return -ENOTTY; - } + if (!ops->ndo_get_phys_port_id) + return -EOPNOTSUPP; + return ops->ndo_get_phys_port_id(dev, ppid); } - +EXPORT_SYMBOL(dev_get_phys_port_id); /** * dev_new_index - allocate an ifindex @@ -5417,15 +5672,18 @@ static int dev_new_index(struct net *net) /* Delayed registration/unregistration */ static LIST_HEAD(net_todo_list); +DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); static void net_set_todo(struct net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); + dev_net(dev)->dev_unreg_count++; } static void rollback_registered_many(struct list_head *head) { struct net_device *dev, *tmp; + LIST_HEAD(close_head); BUG_ON(dev_boot_phase); ASSERT_RTNL(); @@ -5448,7 +5706,9 @@ static void rollback_registered_many(struct list_head *head) } /* If device is running, close it first. */ - dev_close_many(head); + list_for_each_entry(dev, head, unreg_list) + list_add_tail(&dev->close_list, &close_head); + dev_close_many(&close_head); list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ @@ -5469,10 +5729,6 @@ static void rollback_registered_many(struct list_head *head) */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); - /* * Flush the unicast and multicast chains */ @@ -5482,11 +5738,19 @@ static void rollback_registered_many(struct list_head *head) if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); - /* Notifier chain MUST detach us from master device. */ - WARN_ON(dev->master); + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); + + /* Notifier chain MUST detach us from all upper devices. */ + WARN_ON(netdev_has_any_upper_dev(dev)); /* Remove entries from kobject tree */ netdev_unregister_kobject(dev); +#ifdef CONFIG_XPS + /* Remove XPS queueing entries */ + netif_reset_xps_queues_gt(dev, 0); +#endif } synchronize_net(); @@ -5514,20 +5778,25 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); } - /* Fix illegal SG+CSUM combinations. */ - if ((features & NETIF_F_SG) && - !(features & NETIF_F_ALL_CSUM)) { - netdev_dbg(dev, - "Dropping NETIF_F_SG since no checksum feature.\n"); - features &= ~NETIF_F_SG; - } - /* TSO requires that SG is present as well. */ if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); features &= ~NETIF_F_ALL_TSO; } + if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && + !(features & NETIF_F_IP_CSUM)) { + netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); + features &= ~NETIF_F_TSO; + features &= ~NETIF_F_TSO_ECN; + } + + if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && + !(features & NETIF_F_IPV6_CSUM)) { + netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); + features &= ~NETIF_F_TSO6; + }
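The two added checks close a long-standing gap: a driver could previously advertise TSO without any usable checksum offload. The driver side of the contract, as a hypothetical probe fragment -- if NETIF_F_IP_CSUM were omitted here, netdev_fix_features() would now quietly strip the TSO bits on the next netdev_update_features() pass instead of letting the invalid combination reach the data path:

static void foo_init_features(struct net_device *dev)
{
	/* TSO must be backed by scatter/gather and a checksum offload */
	dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;
	dev->features = dev->hw_features;
}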
+ + /* TSO ECN requires that TSO is present as well. */ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) features &= ~NETIF_F_TSO_ECN; @@ -5556,6 +5825,13 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, } } +#ifdef CONFIG_NET_RX_BUSY_POLL + if (dev->netdev_ops->ndo_busy_poll) + features |= NETIF_F_BUSY_POLL; + else +#endif + features &= ~NETIF_F_BUSY_POLL; + return features; } @@ -5655,7 +5931,7 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, } EXPORT_SYMBOL(netif_stacked_transfer_operstate); -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS static int netif_alloc_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; @@ -5664,10 +5940,9 @@ static int netif_alloc_rx_queues(struct net_device *dev) BUG_ON(count < 1); rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); - if (!rx) { - pr_err("netdev: Unable to allocate %u rx queues\n", count); + if (!rx) return -ENOMEM; - } + dev->_rx = rx; for (i = 0; i < count; i++) @@ -5690,17 +5965,24 @@ static void netdev_init_one_queue(struct net_device *dev, #endif } +static void netif_free_tx_queues(struct net_device *dev) +{ + kvfree(dev->_tx); +} + static int netif_alloc_netdev_queues(struct net_device *dev) { unsigned int count = dev->num_tx_queues; struct netdev_queue *tx; + size_t sz = count * sizeof(*tx); - BUG_ON(count < 1); + BUG_ON(count < 1 || count > 0xffff); - tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL); + tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); if (!tx) { - pr_err("netdev: Unable to allocate %u tx queues\n", count); - return -ENOMEM; + tx = vzalloc(sz); + if (!tx) + return -ENOMEM; } dev->_tx = tx; @@ -5760,6 +6042,15 @@ int register_netdevice(struct net_device *dev) } } + if (((dev->hw_features | dev->features) & + NETIF_F_HW_VLAN_CTAG_FILTER) && + (!dev->netdev_ops->ndo_vlan_rx_add_vid || + !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { + netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); + ret = -EINVAL; + goto err_uninit; + } + ret = -EBUSY; if (!dev->ifindex) dev->ifindex = dev_new_index(net); @@ -5776,19 +6067,22 @@ int register_netdevice(struct net_device *dev) dev->features |= NETIF_F_SOFT_FEATURES; dev->wanted_features = dev->features & dev->hw_features; - /* Turn on no cache copy if HW is doing checksum */ if (!(dev->flags & IFF_LOOPBACK)) { dev->hw_features |= NETIF_F_NOCACHE_COPY; - if (dev->features & NETIF_F_ALL_CSUM) { - dev->wanted_features |= NETIF_F_NOCACHE_COPY; - dev->features |= NETIF_F_NOCACHE_COPY; - } } /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. */ dev->vlan_features |= NETIF_F_HIGHDMA; + /* Make NETIF_F_SG inheritable to tunnel devices. + */ + dev->hw_enc_features |= NETIF_F_SG; + + /* Make NETIF_F_SG inheritable to MPLS. + */ + dev->mpls_features |= NETIF_F_SG; + ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) @@ -5815,6 +6109,13 @@ int register_netdevice(struct net_device *dev) list_netdevice(dev); add_device_randomness(dev->dev_addr, dev->addr_len); + /* If the device has a permanent device address, the driver should + * set dev_addr, and addr_assign_type should be set to + * NET_ADDR_PERM (the default value). + */ + if (dev->addr_assign_type == NET_ADDR_PERM) + memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); + /* Notify protocols that a new device appeared.
*/ ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); ret = notifier_to_errno(ret); @@ -5828,7 +6129,7 @@ int register_netdevice(struct net_device *dev) */ if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); out: return ret; @@ -6046,6 +6347,12 @@ void netdev_run_todo(void) if (dev->destructor) dev->destructor(dev); + /* Report a network device has been unregistered */ + rtnl_lock(); + dev_net(dev)->dev_unreg_count--; + __rtnl_unlock(); + wake_up(&netdev_unregistering_wq); + /* Free network device */ kobject_put(&dev->dev.kobj); } @@ -6097,6 +6404,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, netdev_stats_to_stats64(storage, &dev->stats); } storage->rx_dropped += atomic_long_read(&dev->rx_dropped); + storage->tx_dropped += atomic_long_read(&dev->tx_dropped); return storage; } EXPORT_SYMBOL(dev_get_stats); @@ -6129,6 +6437,13 @@ void netdev_set_default_ethtool_ops(struct net_device *dev, } EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); +void netdev_freemem(struct net_device *dev) +{ + char *addr = (char *)dev - dev->padded; + + kvfree(addr); +} + /** * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for @@ -6138,7 +6453,7 @@ EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); * @rxqs: the number of RX subqueues to allocate * * Allocates a struct net_device with private data area for driver use - * and performs basic initialization. Also allocates subquue structs + * and performs basic initialization. Also allocates subqueue structs * for each queue on the device. */ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, @@ -6156,7 +6471,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, return NULL; } -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS if (rxqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); return NULL; @@ -6172,18 +6487,18 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, /* ensure 32-byte alignment of whole construct */ alloc_size += NETDEV_ALIGN - 1; - p = kzalloc(alloc_size, GFP_KERNEL); - if (!p) { - pr_err("alloc_netdev: Unable to allocate device\n"); + p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!p) + p = vzalloc(alloc_size); + if (!p) return NULL; - } dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) - goto free_p; + goto free_dev; if (dev_addr_init(dev)) goto free_pcpu; @@ -6198,7 +6513,12 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); + INIT_LIST_HEAD(&dev->close_list); INIT_LIST_HEAD(&dev->link_watch_list); + INIT_LIST_HEAD(&dev->adj_list.upper); + INIT_LIST_HEAD(&dev->adj_list.lower); + INIT_LIST_HEAD(&dev->all_adj_list.upper); + INIT_LIST_HEAD(&dev->all_adj_list.lower); dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); @@ -6207,7 +6527,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (netif_alloc_netdev_queues(dev)) goto free_all; -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS dev->num_rx_queues = rxqs; dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) @@ -6226,13 +6546,8 @@ free_all: free_pcpu: free_percpu(dev->pcpu_refcnt); - kfree(dev->_tx); -#ifdef CONFIG_RPS - kfree(dev->_rx); -#endif - -free_p: - kfree(p); +free_dev: + netdev_freemem(dev); 
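Both the struct net_device allocation above and the tx queue array earlier in this hunk now use the same fallback idiom: try kmalloc quietly (__GFP_NOWARN, so no OOM splat), fall back to vmalloc when physically contiguous memory is scarce, and free the result with kvfree(), which accepts either. Condensed into a sketch around a hypothetical table type:

struct foo_entry { u64 key; };

static struct foo_entry *foo_alloc_table(size_t count)
{
	size_t sz = count * sizeof(struct foo_entry);
	struct foo_entry *tbl;

	/* physically contiguous when cheaply available... */
	tbl = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!tbl)
		/* ...otherwise settle for virtually contiguous */
		tbl = vzalloc(sz);
	return tbl;	/* release with kvfree(tbl) */
}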
return NULL; } EXPORT_SYMBOL(alloc_netdev_mqs); @@ -6251,8 +6566,8 @@ void free_netdev(struct net_device *dev) release_net(dev_net(dev)); - kfree(dev->_tx); -#ifdef CONFIG_RPS + netif_free_tx_queues(dev); +#ifdef CONFIG_SYSFS kfree(dev->_rx); #endif @@ -6269,7 +6584,7 @@ void free_netdev(struct net_device *dev) /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { - kfree((char *)dev - dev->padded); + netdev_freemem(dev); return; } @@ -6327,6 +6642,9 @@ EXPORT_SYMBOL(unregister_netdevice_queue); /** * unregister_netdevice_many - unregister many devices * @head: list of devices + * + * Note: As most callers use a stack-allocated list_head, + * we force a list_del() to make sure the stack won't be corrupted later. */ void unregister_netdevice_many(struct list_head *head) { @@ -6336,6 +6654,7 @@ void unregister_netdevice_many(struct list_head *head) rollback_registered_many(head); list_for_each_entry(dev, head, unreg_list) net_set_todo(dev); + list_del(head); } } EXPORT_SYMBOL(unregister_netdevice_many); @@ -6431,7 +6750,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); /* * Flush the unicast and multicast chains @@ -6470,7 +6789,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char * Prevent userspace races by waiting until the network * device is fully set up before sending notifications. */ - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); synchronize_net(); err = 0; @@ -6522,11 +6841,11 @@ static int dev_cpu_callback(struct notifier_block *nfb, /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { - netif_rx(skb); + netif_rx_internal(skb); input_queue_head_incr(oldsd); } while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { - netif_rx(skb); + netif_rx_internal(skb); input_queue_head_incr(oldsd); } @@ -6562,7 +6881,7 @@ netdev_features_t netdev_increment_features(netdev_features_t all, } EXPORT_SYMBOL(netdev_increment_features); -static struct hlist_head *netdev_create_hash(void) +static struct hlist_head * __net_init netdev_create_hash(void) { int i; struct hlist_head *hash; @@ -6731,6 +7050,34 @@ static void __net_exit default_device_exit(struct net *net) rtnl_unlock(); }
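The list_del() added to unregister_netdevice_many() above is what makes the usual calling pattern safe: callers batch devices onto an on-stack list head, exactly as default_device_exit_batch() below does with dev_kill_list (and may now do without its own trailing list_del()). A sketch of the same pattern in a hypothetical rtnl_link driver's namespace-exit path (foo_link_ops is assumed):

static void __net_exit foo_exit_net(struct net *net)
{
	struct net_device *dev, *tmp;
	LIST_HEAD(kill_list);

	rtnl_lock();
	for_each_netdev_safe(net, dev, tmp)
		if (dev->rtnl_link_ops == &foo_link_ops)
			unregister_netdevice_queue(dev, &kill_list);
	/* re-initializes kill_list, so the stack frame may safely unwind */
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();
}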
+static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) +{ + /* Return with the rtnl_lock held when there are no network + * devices unregistering in any network namespace in net_list. + */ + struct net *net; + bool unregistering; + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&netdev_unregistering_wq, &wait, + TASK_UNINTERRUPTIBLE); + unregistering = false; + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) { + if (net->dev_unreg_count > 0) { + unregistering = true; + break; + } + } + if (!unregistering) + break; + __rtnl_unlock(); + schedule(); + } + finish_wait(&netdev_unregistering_wq, &wait); +} + static void __net_exit default_device_exit_batch(struct list_head *net_list) { /* At exit all network devices must be removed from a network @@ -6742,7 +7089,18 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) struct net *net; LIST_HEAD(dev_kill_list); - rtnl_lock(); + /* To prevent network device cleanup code from dereferencing + * loopback devices or network devices that have been freed, + * wait here for all pending unregistrations to complete, + * before unregistering the loopback device and allowing the + * network namespace to be freed. + * + * The netdev todo list containing all network device + * unregistrations that happen in default_device_exit_batch + * will run in the rtnl_unlock() at the end of + * default_device_exit_batch. + */ + rtnl_lock_unregistering(net_list); list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { if (dev->rtnl_link_ops) @@ -6752,7 +7110,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) } } unregister_netdevice_many(&dev_kill_list); - list_del(&dev_kill_list); rtnl_unlock(); } @@ -6800,24 +7157,18 @@ static int __init net_dev_init(void) for_each_possible_cpu(i) { struct softnet_data *sd = &per_cpu(softnet_data, i); - memset(sd, 0, sizeof(*sd)); skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); - sd->completion_queue = NULL; INIT_LIST_HEAD(&sd->poll_list); - sd->output_queue = NULL; sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS sd->csd.func = rps_trigger_softirq; sd->csd.info = sd; - sd->csd.flags = 0; sd->cpu = i; #endif sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; - sd->backlog.gro_list = NULL; - sd->backlog.gro_count = 0; } dev_boot_phase = 0; @@ -6842,19 +7193,9 @@ static int __init net_dev_init(void) hotcpu_notifier(dev_cpu_callback, 0); dst_init(); - dev_mcast_init(); rc = 0; out: return rc; } subsys_initcall(net_dev_init); - -static int __init initialize_hashrnd(void) -{ - get_random_bytes(&hashrnd, sizeof(hashrnd)); - return 0; -} - -late_initcall_sync(initialize_hashrnd); - diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index b079c7bbc15..b6b230600b9 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -15,7 +15,6 @@ #include <linux/rtnetlink.h> #include <linux/export.h> #include <linux/list.h> -#include <linux/proc_fs.h> /* * General list handling functions @@ -23,7 +22,8 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, - unsigned char addr_type, bool global) + unsigned char addr_type, bool global, + bool sync) { struct netdev_hw_addr *ha; int alloc_size; @@ -38,7 +38,8 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list, ha->type = addr_type; ha->refcount = 1; ha->global_use = global; - ha->synced = false; + ha->synced = sync ?
1 : 0; + ha->sync_cnt = 0; list_add_tail_rcu(&ha->list, &list->list); list->count++; @@ -47,7 +48,8 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list, static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, - unsigned char addr_type, bool global) + unsigned char addr_type, bool global, bool sync, + int sync_count) { struct netdev_hw_addr *ha; @@ -64,43 +66,63 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, else ha->global_use = true; } + if (sync) { + if (ha->synced && sync_count) + return -EEXIST; + else + ha->synced++; + } ha->refcount++; return 0; } } - return __hw_addr_create_ex(list, addr, addr_len, addr_type, global); + return __hw_addr_create_ex(list, addr, addr_len, addr_type, global, + sync); } static int __hw_addr_add(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type) { - return __hw_addr_add_ex(list, addr, addr_len, addr_type, false); + return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false, + 0); +} + +static int __hw_addr_del_entry(struct netdev_hw_addr_list *list, + struct netdev_hw_addr *ha, bool global, + bool sync) +{ + if (global && !ha->global_use) + return -ENOENT; + + if (sync && !ha->synced) + return -ENOENT; + + if (global) + ha->global_use = false; + + if (sync) + ha->synced--; + + if (--ha->refcount) + return 0; + list_del_rcu(&ha->list); + kfree_rcu(ha, rcu_head); + list->count--; + return 0; } static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, - unsigned char addr_type, bool global) + unsigned char addr_type, bool global, bool sync) { struct netdev_hw_addr *ha; list_for_each_entry(ha, &list->list, list) { if (!memcmp(ha->addr, addr, addr_len) && - (ha->type == addr_type || !addr_type)) { - if (global) { - if (!ha->global_use) - break; - else - ha->global_use = false; - } - if (--ha->refcount) - return 0; - list_del_rcu(&ha->list); - kfree_rcu(ha, rcu_head); - list->count--; - return 0; - } + (ha->type == addr_type || !addr_type)) + return __hw_addr_del_entry(list, ha, global, sync); } return -ENOENT; } @@ -109,50 +131,68 @@ static int __hw_addr_del(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type) { - return __hw_addr_del_ex(list, addr, addr_len, addr_type, false); + return __hw_addr_del_ex(list, addr, addr_len, addr_type, false, false); } -int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len, unsigned char addr_type) +static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr *ha, + int addr_len) { int err; - struct netdev_hw_addr *ha, *ha2; - unsigned char type; - list_for_each_entry(ha, &from_list->list, list) { - type = addr_type ? addr_type : ha->type; - err = __hw_addr_add(to_list, ha->addr, addr_len, type); - if (err) - goto unroll; + err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type, + false, true, ha->sync_cnt); + if (err && err != -EEXIST) + return err; + + if (!err) { + ha->sync_cnt++; + ha->refcount++; } + return 0; +} -unroll: - list_for_each_entry(ha2, &from_list->list, list) { - if (ha2 == ha) - break; - type = addr_type ? 
addr_type : ha2->type; - __hw_addr_del(to_list, ha2->addr, addr_len, type); - } - return err; +static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + struct netdev_hw_addr *ha, + int addr_len) +{ + int err; + + err = __hw_addr_del_ex(to_list, ha->addr, addr_len, ha->type, + false, true); + if (err) + return; + ha->sync_cnt--; + /* address on from list is not marked synced */ + __hw_addr_del_entry(from_list, ha, false, false); } -EXPORT_SYMBOL(__hw_addr_add_multiple); -void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len, unsigned char addr_type) +static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len) { - struct netdev_hw_addr *ha; - unsigned char type; + int err = 0; + struct netdev_hw_addr *ha, *tmp; - list_for_each_entry(ha, &from_list->list, list) { - type = addr_type ? addr_type : ha->type; - __hw_addr_del(to_list, ha->addr, addr_len, type); + list_for_each_entry_safe(ha, tmp, &from_list->list, list) { + if (ha->sync_cnt == ha->refcount) { + __hw_addr_unsync_one(to_list, from_list, ha, addr_len); + } else { + err = __hw_addr_sync_one(to_list, ha, addr_len); + if (err) + break; + } } + return err; } -EXPORT_SYMBOL(__hw_addr_del_multiple); +/* This function only works where there is a strict 1-1 relationship + * between the source and destination of the sync. If you ever need to + * sync addresses to more than one destination, you need to use + * __hw_addr_sync_multiple(). + */ int __hw_addr_sync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len) @@ -161,17 +201,12 @@ int __hw_addr_sync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &from_list->list, list) { - if (!ha->synced) { - err = __hw_addr_add(to_list, ha->addr, - addr_len, ha->type); + if (!ha->sync_cnt) { + err = __hw_addr_sync_one(to_list, ha, addr_len); if (err) break; - ha->synced = true; - ha->refcount++; - } else if (ha->refcount == 1) { - __hw_addr_del(to_list, ha->addr, addr_len, ha->type); - __hw_addr_del(from_list, ha->addr, addr_len, ha->type); - } + } else if (ha->refcount == 1) + __hw_addr_unsync_one(to_list, from_list, ha, addr_len); } return err; } @@ -184,18 +219,98 @@ void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &from_list->list, list) { - if (ha->synced) { - __hw_addr_del(to_list, ha->addr, - addr_len, ha->type); - ha->synced = false; - __hw_addr_del(from_list, ha->addr, - addr_len, ha->type); - } + if (ha->sync_cnt) + __hw_addr_unsync_one(to_list, from_list, ha, addr_len); } } EXPORT_SYMBOL(__hw_addr_unsync);
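The __hw_addr_sync_dev()/__hw_addr_unsync_dev() pair introduced just below lets a driver that must program each filter entry individually consume only the delta between calls. A usage sketch from a hypothetical driver, where foo_hw_add_filter()/foo_hw_del_filter() stand in for whatever writes the hardware filter table:

static int foo_uc_sync(struct net_device *dev, const unsigned char *addr)
{
	return foo_hw_add_filter(netdev_priv(dev), addr);
}

static int foo_uc_unsync(struct net_device *dev, const unsigned char *addr)
{
	return foo_hw_del_filter(netdev_priv(dev), addr);
}

static void foo_set_rx_mode(struct net_device *dev)
{
	/* adds newly appeared addresses, drops stale ones */
	__hw_addr_sync_dev(&dev->uc, dev, foo_uc_sync, foo_uc_unsync);
}

static int foo_stop(struct net_device *dev)
{
	/* forget everything we programmed while the device was up */
	__hw_addr_unsync_dev(&dev->uc, dev, foo_uc_unsync);
	return 0;
}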
-void __hw_addr_flush(struct netdev_hw_addr_list *list) +/** + * __hw_addr_sync_dev - Synchronize device's multicast list + * @list: address list to synchronize + * @dev: device to sync + * @sync: function to call if address should be added + * @unsync: function to call if address should be removed + * + * This function is intended to be called from the ndo_set_rx_mode + * function of devices that require explicit address add/remove + * notifications. The unsync function may be NULL, in which case + * the addresses requiring removal will simply be removed without + * any notification to the device. + **/ +int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*sync)(struct net_device *, const unsigned char *), + int (*unsync)(struct net_device *, + const unsigned char *)) +{ + struct netdev_hw_addr *ha, *tmp; + int err; + + /* first go through and flush out any stale entries */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (!ha->sync_cnt || ha->refcount != 1) + continue; + + /* if unsync is defined and fails defer unsyncing address */ + if (unsync && unsync(dev, ha->addr)) + continue; + + ha->sync_cnt--; + __hw_addr_del_entry(list, ha, false, false); + } + + /* go through and sync new entries to the list */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (ha->sync_cnt) + continue; + + err = sync(dev, ha->addr); + if (err) + return err; + + ha->sync_cnt++; + ha->refcount++; + } + + return 0; +} +EXPORT_SYMBOL(__hw_addr_sync_dev); + +/** + * __hw_addr_unsync_dev - Remove synchronized addresses from device + * @list: address list to remove synchronized addresses from + * @dev: device to sync + * @unsync: function to call if address should be removed + * + * Remove all addresses that were added to the device by __hw_addr_sync_dev(). + * This function is intended to be called from the ndo_stop or ndo_open + * functions on devices that require explicit address add/remove + * notifications. If the unsync function pointer is NULL then this function + * can be used to just reset the sync_cnt for the addresses in the list. + **/ +void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*unsync)(struct net_device *, + const unsigned char *)) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (!ha->sync_cnt) + continue; + + /* if unsync is defined and fails defer unsyncing address */ + if (unsync && unsync(dev, ha->addr)) + continue; + + ha->sync_cnt--; + __hw_addr_del_entry(list, ha, false, false); + } +} +EXPORT_SYMBOL(__hw_addr_unsync_dev); + +static void __hw_addr_flush(struct netdev_hw_addr_list *list) { struct netdev_hw_addr *ha, *tmp; @@ -205,7 +320,6 @@ void __hw_addr_flush(struct netdev_hw_addr_list *list) list->count = 0; } -EXPORT_SYMBOL(__hw_addr_flush); void __hw_addr_init(struct netdev_hw_addr_list *list) { @@ -331,59 +445,6 @@ int dev_addr_del(struct net_device *dev, const unsigned char *addr, } EXPORT_SYMBOL(dev_addr_del); -/** - * dev_addr_add_multiple - Add device addresses from another device - * @to_dev: device to which addresses will be added - * @from_dev: device from which addresses will be added - * @addr_type: address type - 0 means type will be used from from_dev - * - * Add device addresses of the one device to another. - ** - * The caller must hold the rtnl_mutex.
- */ -int dev_addr_add_multiple(struct net_device *to_dev, - struct net_device *from_dev, - unsigned char addr_type) -{ - int err; - - ASSERT_RTNL(); - - if (from_dev->addr_len != to_dev->addr_len) - return -EINVAL; - err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, - to_dev->addr_len, addr_type); - if (!err) - call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); - return err; -} -EXPORT_SYMBOL(dev_addr_add_multiple); - -/** - * dev_addr_del_multiple - Delete device addresses by another device - * @to_dev: device where the addresses will be deleted - * @from_dev: device supplying the addresses to be deleted - * @addr_type: address type - 0 means type will be used from from_dev - * - * Deletes addresses in to device by the list of addresses in from device. - * - * The caller must hold the rtnl_mutex. - */ -int dev_addr_del_multiple(struct net_device *to_dev, - struct net_device *from_dev, - unsigned char addr_type) -{ - ASSERT_RTNL(); - - if (from_dev->addr_len != to_dev->addr_len) - return -EINVAL; - __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, - to_dev->addr_len, addr_type); - call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); - return 0; -} -EXPORT_SYMBOL(dev_addr_del_multiple); - /* * Unicast list handling functions */ @@ -407,7 +468,7 @@ int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr) } } err = __hw_addr_create_ex(&dev->uc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_UNICAST, true); + NETDEV_HW_ADDR_T_UNICAST, true, false); if (!err) __dev_set_rx_mode(dev); out: @@ -470,7 +531,8 @@ EXPORT_SYMBOL(dev_uc_del); * locked by netif_addr_lock_bh. * * This function is intended to be called from the dev->set_rx_mode - * function of layered software devices. + * function of layered software devices. This function assumes that + * addresses will only ever be synced to the @to devices and no other. */ int dev_uc_sync(struct net_device *to, struct net_device *from) { @@ -489,6 +551,36 @@ int dev_uc_sync(struct net_device *to, struct net_device *from) EXPORT_SYMBOL(dev_uc_sync); /** + * dev_uc_sync_multiple - Synchronize device's unicast list to another + * device, but allow for multiple calls to sync to multiple devices. + * @to: destination device + * @from: source device + * + * Add newly added addresses to the destination device and release + * addresses that have been deleted from the source. The source device + * must be locked by netif_addr_lock_bh. + * + * This function is intended to be called from the dev->set_rx_mode + * function of layered software devices. It allows for a single source + * device to be synced to multiple destination devices. 
+ */ +int dev_uc_sync_multiple(struct net_device *to, struct net_device *from) +{ + int err = 0; + + if (to->addr_len != from->addr_len) + return -EINVAL; + + netif_addr_lock_nested(to); + err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len); + if (!err) + __dev_set_rx_mode(to); + netif_addr_unlock(to); + return err; +} +EXPORT_SYMBOL(dev_uc_sync_multiple); + +/** * dev_uc_unsync - Remove synchronized addresses from the destination device * @to: destination device * @from: source device @@ -560,7 +652,7 @@ int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr) } } err = __hw_addr_create_ex(&dev->mc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_MULTICAST, true); + NETDEV_HW_ADDR_T_MULTICAST, true, false); if (!err) __dev_set_rx_mode(dev); out: @@ -576,7 +668,7 @@ static int __dev_mc_add(struct net_device *dev, const unsigned char *addr, netif_addr_lock_bh(dev); err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_MULTICAST, global); + NETDEV_HW_ADDR_T_MULTICAST, global, false, 0); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); @@ -616,7 +708,7 @@ static int __dev_mc_del(struct net_device *dev, const unsigned char *addr, netif_addr_lock_bh(dev); err = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_MULTICAST, global); + NETDEV_HW_ADDR_T_MULTICAST, global, false); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); @@ -652,7 +744,7 @@ int dev_mc_del_global(struct net_device *dev, const unsigned char *addr) EXPORT_SYMBOL(dev_mc_del_global); /** - * dev_mc_sync - Synchronize device's unicast list to another device + * dev_mc_sync - Synchronize device's multicast list to another device * @to: destination device * @from: source device * @@ -680,6 +772,36 @@ int dev_mc_sync(struct net_device *to, struct net_device *from) EXPORT_SYMBOL(dev_mc_sync); /** + * dev_mc_sync_multiple - Synchronize device's multicast list to another + * device, but allow for multiple calls to sync to multiple devices. + * @to: destination device + * @from: source device + * + * Add newly added addresses to the destination device and release + * addresses that have no users left. The source device must be + * locked by netif_addr_lock_bh. + * + * This function is intended to be called from the ndo_set_rx_mode + * function of layered software devices. It allows for a single + * source device to be synced to multiple destination devices. 
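dev_uc_sync_multiple() (and its multicast twin below) exists for stacked devices with several lower ports: the per-address sync_cnt added earlier in this diff lets one upper device push its lists into every port without a second sync corrupting the bookkeeping of the first. Roughly how a team/bond-style master would use it (struct foo_master and foo_port are hypothetical):

static void foo_master_set_rx_mode(struct net_device *master)
{
	struct foo_master *fm = netdev_priv(master);
	struct foo_port *port;

	rcu_read_lock();
	list_for_each_entry_rcu(port, &fm->port_list, list) {
		dev_uc_sync_multiple(port->dev, master);
		dev_mc_sync_multiple(port->dev, master);
	}
	rcu_read_unlock();
}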
+ */ +int dev_mc_sync_multiple(struct net_device *to, struct net_device *from) +{ + int err = 0; + + if (to->addr_len != from->addr_len) + return -EINVAL; + + netif_addr_lock_nested(to); + err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len); + if (!err) + __dev_set_rx_mode(to); + netif_addr_unlock(to); + return err; +} +EXPORT_SYMBOL(dev_mc_sync_multiple); + +/** * dev_mc_unsync - Remove synchronized addresses from the destination device * @to: destination device * @from: source device @@ -727,76 +849,3 @@ void dev_mc_init(struct net_device *dev) __hw_addr_init(&dev->mc); } EXPORT_SYMBOL(dev_mc_init); - -#ifdef CONFIG_PROC_FS -#include <linux/seq_file.h> - -static int dev_mc_seq_show(struct seq_file *seq, void *v) -{ - struct netdev_hw_addr *ha; - struct net_device *dev = v; - - if (v == SEQ_START_TOKEN) - return 0; - - netif_addr_lock_bh(dev); - netdev_for_each_mc_addr(ha, dev) { - int i; - - seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex, - dev->name, ha->refcount, ha->global_use); - - for (i = 0; i < dev->addr_len; i++) - seq_printf(seq, "%02x", ha->addr[i]); - - seq_putc(seq, '\n'); - } - netif_addr_unlock_bh(dev); - return 0; -} - -static const struct seq_operations dev_mc_seq_ops = { - .start = dev_seq_start, - .next = dev_seq_next, - .stop = dev_seq_stop, - .show = dev_mc_seq_show, -}; - -static int dev_mc_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &dev_mc_seq_ops, - sizeof(struct seq_net_private)); -} - -static const struct file_operations dev_mc_seq_fops = { - .owner = THIS_MODULE, - .open = dev_mc_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -#endif - -static int __net_init dev_mc_net_init(struct net *net) -{ - if (!proc_net_fops_create(net, "dev_mcast", 0, &dev_mc_seq_fops)) - return -ENOMEM; - return 0; -} - -static void __net_exit dev_mc_net_exit(struct net *net) -{ - proc_net_remove(net, "dev_mcast"); -} - -static struct pernet_operations __net_initdata dev_mc_net_ops = { - .init = dev_mc_net_init, - .exit = dev_mc_net_exit, -}; - -void __init dev_mcast_init(void) -{ - register_pernet_subsys(&dev_mc_net_ops); -} - diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c new file mode 100644 index 00000000000..cf999e09bcd --- /dev/null +++ b/net/core/dev_ioctl.c @@ -0,0 +1,567 @@ +#include <linux/kmod.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/rtnetlink.h> +#include <linux/net_tstamp.h> +#include <linux/wireless.h> +#include <net/wext.h> + +/* + * Map an interface index to its name (SIOCGIFNAME) + */ + +/* + * We need this ioctl for efficient implementation of the + * if_indextoname() function required by the IPv6 API. Without + * it, we would have to search all the interfaces to find a + * match. --pb + */ + +static int dev_ifname(struct net *net, struct ifreq __user *arg) +{ + struct ifreq ifr; + int error; + + /* + * Fetch the caller's info block. + */ + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + + error = netdev_get_name(net, ifr.ifr_name, ifr.ifr_ifindex); + if (error) + return error; + + if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return 0; +} + +static gifconf_func_t *gifconf_list[NPROTO]; + +/** + * register_gifconf - register a SIOCGIF handler + * @family: Address family + * @gifconf: Function handler + * + * Register protocol dependent address dumping routines. 
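Each address family that wants its addresses reported through SIOCGIFCONF registers one of these dumpers; AF_INET's real handler is inet_gifconf() in net/ipv4/devinet.c. A skeletal sketch of a registration (handler body elided, family chosen for illustration only):

static int foo_gifconf(struct net_device *dev, char __user *buf, int len)
{
	/* write one struct ifreq per address into buf; with a NULL buf,
	 * dev_ifconf() is only asking how much space would be needed */
	return 0;
}

static int __init foo_af_init(void)
{
	return register_gifconf(AF_INET, foo_gifconf);
}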
The handler + * that is passed must not be freed or reused until it has been replaced + * by another handler. + */ +int register_gifconf(unsigned int family, gifconf_func_t *gifconf) +{ + if (family >= NPROTO) + return -EINVAL; + gifconf_list[family] = gifconf; + return 0; +} +EXPORT_SYMBOL(register_gifconf); + +/* + * Perform a SIOCGIFCONF call. This structure will change + * size eventually, and there is nothing I can do about it. + * Thus we will need a 'compatibility mode'. + */ + +static int dev_ifconf(struct net *net, char __user *arg) +{ + struct ifconf ifc; + struct net_device *dev; + char __user *pos; + int len; + int total; + int i; + + /* + * Fetch the caller's info block. + */ + + if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) + return -EFAULT; + + pos = ifc.ifc_buf; + len = ifc.ifc_len; + + /* + * Loop over the interfaces, and write an info block for each. + */ + + total = 0; + for_each_netdev(net, dev) { + for (i = 0; i < NPROTO; i++) { + if (gifconf_list[i]) { + int done; + if (!pos) + done = gifconf_list[i](dev, NULL, 0); + else + done = gifconf_list[i](dev, pos + total, + len - total); + if (done < 0) + return -EFAULT; + total += done; + } + } + } + + /* + * All done. Write the updated control block back to the caller. + */ + ifc.ifc_len = total; + + /* + * Both BSD and Solaris return 0 here, so we do too. + */ + return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; +} + +/* + * Perform the SIOCxIFxxx calls, inside rcu_read_lock() + */ +static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) +{ + int err; + struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); + + if (!dev) + return -ENODEV; + + switch (cmd) { + case SIOCGIFFLAGS: /* Get interface flags */ + ifr->ifr_flags = (short) dev_get_flags(dev); + return 0; + + case SIOCGIFMETRIC: /* Get the metric on the interface + (currently unused) */ + ifr->ifr_metric = 0; + return 0; + + case SIOCGIFMTU: /* Get the MTU of a device */ + ifr->ifr_mtu = dev->mtu; + return 0; + + case SIOCGIFHWADDR: + if (!dev->addr_len) + memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); + else + memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, + min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + ifr->ifr_hwaddr.sa_family = dev->type; + return 0; + + case SIOCGIFSLAVE: + err = -EINVAL; + break; + + case SIOCGIFMAP: + ifr->ifr_map.mem_start = dev->mem_start; + ifr->ifr_map.mem_end = dev->mem_end; + ifr->ifr_map.base_addr = dev->base_addr; + ifr->ifr_map.irq = dev->irq; + ifr->ifr_map.dma = dev->dma; + ifr->ifr_map.port = dev->if_port; + return 0; + + case SIOCGIFINDEX: + ifr->ifr_ifindex = dev->ifindex; + return 0; + + case SIOCGIFTXQLEN: + ifr->ifr_qlen = dev->tx_queue_len; + return 0; + + default: + /* dev_ioctl() should ensure this case + * is never reached + */ + WARN_ON(1); + err = -ENOTTY; + break; + + } + return err; +} + +static int net_hwtstamp_validate(struct ifreq *ifr) +{ + struct hwtstamp_config cfg; + enum hwtstamp_tx_types tx_type; + enum hwtstamp_rx_filters rx_filter; + int tx_type_valid = 0; + int rx_filter_valid = 0; + + if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) + return -EFAULT; + + if (cfg.flags) /* reserved for future extensions */ + return -EINVAL; + + tx_type = cfg.tx_type; + rx_filter = cfg.rx_filter; + + switch (tx_type) { + case HWTSTAMP_TX_OFF: + case HWTSTAMP_TX_ON: + case HWTSTAMP_TX_ONESTEP_SYNC: + tx_type_valid = 1; + break; + } + + switch (rx_filter) { + case HWTSTAMP_FILTER_NONE: + case HWTSTAMP_FILTER_ALL: + case 
HWTSTAMP_FILTER_SOME: + case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + rx_filter_valid = 1; + break; + } + + if (!tx_type_valid || !rx_filter_valid) + return -ERANGE; + + return 0; +} + +/* + * Perform the SIOCxIFxxx calls, inside rtnl_lock() + */ +static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) +{ + int err; + struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); + const struct net_device_ops *ops; + + if (!dev) + return -ENODEV; + + ops = dev->netdev_ops; + + switch (cmd) { + case SIOCSIFFLAGS: /* Set interface flags */ + return dev_change_flags(dev, ifr->ifr_flags); + + case SIOCSIFMETRIC: /* Set the metric on the interface + (currently unused) */ + return -EOPNOTSUPP; + + case SIOCSIFMTU: /* Set the MTU of a device */ + return dev_set_mtu(dev, ifr->ifr_mtu); + + case SIOCSIFHWADDR: + return dev_set_mac_address(dev, &ifr->ifr_hwaddr); + + case SIOCSIFHWBROADCAST: + if (ifr->ifr_hwaddr.sa_family != dev->type) + return -EINVAL; + memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, + min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return 0; + + case SIOCSIFMAP: + if (ops->ndo_set_config) { + if (!netif_device_present(dev)) + return -ENODEV; + return ops->ndo_set_config(dev, &ifr->ifr_map); + } + return -EOPNOTSUPP; + + case SIOCADDMULTI: + if (!ops->ndo_set_rx_mode || + ifr->ifr_hwaddr.sa_family != AF_UNSPEC) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); + + case SIOCDELMULTI: + if (!ops->ndo_set_rx_mode || + ifr->ifr_hwaddr.sa_family != AF_UNSPEC) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); + + case SIOCSIFTXQLEN: + if (ifr->ifr_qlen < 0) + return -EINVAL; + dev->tx_queue_len = ifr->ifr_qlen; + return 0; + + case SIOCSIFNAME: + ifr->ifr_newname[IFNAMSIZ-1] = '\0'; + return dev_change_name(dev, ifr->ifr_newname); + + case SIOCSHWTSTAMP: + err = net_hwtstamp_validate(ifr); + if (err) + return err; + /* fall through */ + + /* + * Unknown or private ioctl + */ + default: + if ((cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15) || + cmd == SIOCBONDENSLAVE || + cmd == SIOCBONDRELEASE || + cmd == SIOCBONDSETHWADDR || + cmd == SIOCBONDSLAVEINFOQUERY || + cmd == SIOCBONDINFOQUERY || + cmd == SIOCBONDCHANGEACTIVE || + cmd == SIOCGMIIPHY || + cmd == SIOCGMIIREG || + cmd == SIOCSMIIREG || + cmd == SIOCBRADDIF || + cmd == SIOCBRDELIF || + cmd == SIOCSHWTSTAMP || + cmd == SIOCGHWTSTAMP || + cmd == SIOCWANDEV) { + err = -EOPNOTSUPP; + if (ops->ndo_do_ioctl) { + if (netif_device_present(dev)) + err = ops->ndo_do_ioctl(dev, ifr, cmd); + else + err = -ENODEV; + } + } else + err = -EINVAL; + + } + return err; +} + +/** + * dev_load - load a network module + * @net: the applicable net namespace + * @name: name of interface + * + * If a network interface is not present and the process has suitable + * privileges this function loads the module. 
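The request_module("netdev-%s", name) path in dev_load() below only works if drivers advertise an alias for the interface names they claim; netdevice.h provides a helper macro for exactly that. A one-line sketch with a hypothetical name:

/* expands to MODULE_ALIAS("netdev-foo0"), letting a CAP_NET_ADMIN
 * caller autoload this module by configuring interface "foo0" */
MODULE_ALIAS_NETDEV("foo0");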
If module loading is not + * available in this kernel then it becomes a nop. + */ + +void dev_load(struct net *net, const char *name) +{ + struct net_device *dev; + int no_module; + + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, name); + rcu_read_unlock(); + + no_module = !dev; + if (no_module && capable(CAP_NET_ADMIN)) + no_module = request_module("netdev-%s", name); + if (no_module && capable(CAP_SYS_MODULE)) { + if (!request_module("%s", name)) + pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n", + name); + } +} +EXPORT_SYMBOL(dev_load); + +/* + * This function handles all "interface"-type I/O control requests. The actual + * 'doing' part of this is dev_ifsioc above. + */ + +/** + * dev_ioctl - network device ioctl + * @net: the applicable net namespace + * @cmd: command to issue + * @arg: pointer to a struct ifreq in user space + * + * Issue ioctl functions to devices. This is normally called by the + * user space syscall interfaces but can sometimes be useful for + * other purposes. The return value is the return from the syscall if + * positive or a negative errno code on error. + */ + +int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) +{ + struct ifreq ifr; + int ret; + char *colon; + + /* One special case: SIOCGIFCONF takes ifconf argument + and requires shared lock, because it sleeps writing + to user space. + */ + + if (cmd == SIOCGIFCONF) { + rtnl_lock(); + ret = dev_ifconf(net, (char __user *) arg); + rtnl_unlock(); + return ret; + } + if (cmd == SIOCGIFNAME) + return dev_ifname(net, (struct ifreq __user *)arg); + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + + ifr.ifr_name[IFNAMSIZ-1] = 0; + + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; + + /* + * See which interface the caller is talking about. + */ + + switch (cmd) { + /* + * These ioctl calls: + * - can be done by all. + * - atomic and do not require locking. + * - return a value + */ + case SIOCGIFFLAGS: + case SIOCGIFMETRIC: + case SIOCGIFMTU: + case SIOCGIFHWADDR: + case SIOCGIFSLAVE: + case SIOCGIFMAP: + case SIOCGIFINDEX: + case SIOCGIFTXQLEN: + dev_load(net, ifr.ifr_name); + rcu_read_lock(); + ret = dev_ifsioc_locked(net, &ifr, cmd); + rcu_read_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + + case SIOCETHTOOL: + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ethtool(net, &ifr); + rtnl_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + + /* + * These ioctl calls: + * - require superuser power. + * - require strict serialization. + * - return a value + */ + case SIOCGMIIPHY: + case SIOCGMIIREG: + case SIOCSIFNAME: + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, &ifr, cmd); + rtnl_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + + /* + * These ioctl calls: + * - require superuser power. + * - require strict serialization. + * - do not return a value + */ + case SIOCSIFMAP: + case SIOCSIFTXQLEN: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + /* fall through */ + /* + * These ioctl calls: + * - require local superuser power. + * - require strict serialization. 
+ * - do not return a value + */ + case SIOCSIFFLAGS: + case SIOCSIFMETRIC: + case SIOCSIFMTU: + case SIOCSIFHWADDR: + case SIOCSIFSLAVE: + case SIOCADDMULTI: + case SIOCDELMULTI: + case SIOCSIFHWBROADCAST: + case SIOCSMIIREG: + case SIOCBONDENSLAVE: + case SIOCBONDRELEASE: + case SIOCBONDSETHWADDR: + case SIOCBONDCHANGEACTIVE: + case SIOCBRADDIF: + case SIOCBRDELIF: + case SIOCSHWTSTAMP: + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + /* fall through */ + case SIOCBONDSLAVEINFOQUERY: + case SIOCBONDINFOQUERY: + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, &ifr, cmd); + rtnl_unlock(); + return ret; + + case SIOCGIFMEM: + /* Get the per device memory space. We can add this but + * currently do not support it */ + case SIOCSIFMEM: + /* Set the per device memory buffer space. + * Not applicable in our case */ + case SIOCSIFLINK: + return -ENOTTY; + + /* + * Unknown or private ioctl. + */ + default: + if (cmd == SIOCWANDEV || + cmd == SIOCGHWTSTAMP || + (cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15)) { + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, &ifr, cmd); + rtnl_unlock(); + if (!ret && copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + return ret; + } + /* Take care of Wireless Extensions */ + if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) + return wext_handle_ioctl(net, &ifr, cmd, arg); + return -ENOTTY; + } +} diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index d23b6682f4e..e70301eb7a4 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -64,7 +64,6 @@ static struct genl_family net_drop_monitor_family = { .hdrsize = 0, .name = "NET_DM", .version = 2, - .maxattr = NET_DM_CMD_MAX, }; static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); @@ -106,6 +105,10 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) return skb; } +static struct genl_multicast_group dropmon_mcgrps[] = { + { .name = "events", }, +}; + static void send_dm_alert(struct work_struct *work) { struct sk_buff *skb; @@ -116,7 +119,8 @@ static void send_dm_alert(struct work_struct *work) skb = reset_per_cpu_data(data); if (skb) - genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL); + genlmsg_multicast(&net_drop_monitor_family, skb, 0, + 0, GFP_KERNEL); } /* @@ -295,9 +299,9 @@ static int net_dm_cmd_trace(struct sk_buff *skb, } static int dropmon_net_event(struct notifier_block *ev_block, - unsigned long event, void *ptr) + unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct dm_hw_stat_delta *new_stat = NULL; struct dm_hw_stat_delta *tmp; @@ -333,7 +337,7 @@ out: return NOTIFY_DONE; } -static struct genl_ops dropmon_ops[] = { +static const struct genl_ops dropmon_ops[] = { { .cmd = NET_DM_CMD_CONFIG, .doit = net_dm_cmd_config, @@ -364,13 +368,13 @@ static int __init init_net_drop_monitor(void) return -ENOSPC; } - rc = genl_register_family_with_ops(&net_drop_monitor_family, - dropmon_ops, - ARRAY_SIZE(dropmon_ops)); + rc = genl_register_family_with_ops_groups(&net_drop_monitor_family, + dropmon_ops, dropmon_mcgrps); if (rc) { pr_err("Could not create drop monitor netlink family\n"); return rc; } + WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); rc = register_netdevice_notifier(&dropmon_net_notifier); if (rc < 0) { diff --git a/net/core/dst.c b/net/core/dst.c index ee6153e2cf4..a028409ee43 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -142,12 +142,12 @@ loop: 
mutex_unlock(&dst_gc_mutex); } -int dst_discard(struct sk_buff *skb) +int dst_discard_sk(struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } -EXPORT_SYMBOL(dst_discard); +EXPORT_SYMBOL(dst_discard_sk); const u32 dst_default_metrics[RTAX_MAX + 1] = { /* This initializer is needed to force linker to place this variable @@ -179,11 +179,12 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst_init_metrics(dst, dst_default_metrics, true); dst->expires = 0UL; dst->path = dst; + dst->from = NULL; #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif dst->input = dst_discard; - dst->output = dst_discard; + dst->output = dst_discard_sk; dst->error = 0; dst->obsolete = initial_obsolete; dst->header_len = 0; @@ -208,8 +209,10 @@ static void ___dst_free(struct dst_entry *dst) /* The first case (dev==NULL) is required, when protocol module is unloaded. */ - if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) - dst->input = dst->output = dst_discard; + if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { + dst->input = dst_discard; + dst->output = dst_discard_sk; + } dst->obsolete = DST_OBSOLETE_DEAD; } @@ -266,6 +269,15 @@ again: } EXPORT_SYMBOL(dst_destroy); +static void dst_destroy_rcu(struct rcu_head *head) +{ + struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + + dst = dst_destroy(dst); + if (dst) + __dst_free(dst); +} + void dst_release(struct dst_entry *dst) { if (dst) { @@ -273,11 +285,8 @@ void dst_release(struct dst_entry *dst) newrefcnt = atomic_dec_return(&dst->__refcnt); WARN_ON(newrefcnt < 0); - if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) { - dst = dst_destroy(dst); - if (dst) - __dst_free(dst); - } + if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) + call_rcu(&dst->rcu_head, dst_destroy_rcu); } } EXPORT_SYMBOL(dst_release); @@ -319,27 +328,28 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) EXPORT_SYMBOL(__dst_destroy_metrics_generic); /** - * skb_dst_set_noref - sets skb dst, without a reference + * __skb_dst_set_noref - sets skb dst, without a reference * @skb: buffer * @dst: dst entry + * @force: if force is set, use noref version even for DST_NOCACHE entries * * Sets skb dst, assuming a reference was not taken on dst * skb_dst_drop() should not dst_release() this dst */ -void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) +void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, bool force) { WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); /* If dst not in cache, we must take a reference, because * dst_release() will destroy dst as soon as its refcount becomes zero */ - if (unlikely(dst->flags & DST_NOCACHE)) { + if (unlikely((dst->flags & DST_NOCACHE) && !force)) { dst_hold(dst); skb_dst_set(skb, dst); } else { skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; } } -EXPORT_SYMBOL(skb_dst_set_noref); +EXPORT_SYMBOL(__skb_dst_set_noref); /* Dirty hack. 
We did it in 2.2 (in __dst_free), * we have _very_ good reasons not to repeat @@ -359,7 +369,8 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, return; if (!unregister) { - dst->input = dst->output = dst_discard; + dst->input = dst_discard; + dst->output = dst_discard_sk; } else { dst->dev = dev_net(dst->dev)->loopback_dev; dev_hold(dst->dev); @@ -370,7 +381,7 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct dst_entry *dst, *last = NULL; switch (event) { diff --git a/net/core/ethtool.c b/net/core/ethtool.c index a8705432e4b..17cb912793f 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -60,10 +60,13 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6", [NETIF_F_HIGHDMA_BIT] = "highdma", [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist", - [NETIF_F_HW_VLAN_TX_BIT] = "tx-vlan-hw-insert", + [NETIF_F_HW_VLAN_CTAG_TX_BIT] = "tx-vlan-hw-insert", - [NETIF_F_HW_VLAN_RX_BIT] = "rx-vlan-hw-parse", - [NETIF_F_HW_VLAN_FILTER_BIT] = "rx-vlan-filter", + [NETIF_F_HW_VLAN_CTAG_RX_BIT] = "rx-vlan-hw-parse", + [NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter", + [NETIF_F_HW_VLAN_STAG_TX_BIT] = "tx-vlan-stag-hw-insert", + [NETIF_F_HW_VLAN_STAG_RX_BIT] = "rx-vlan-stag-hw-parse", + [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter", [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged", [NETIF_F_GSO_BIT] = "tx-generic-segmentation", [NETIF_F_LLTX_BIT] = "tx-lockless", @@ -77,6 +80,11 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation", [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", + [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", + [NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation", + [NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation", + [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", + [NETIF_F_GSO_MPLS_BIT] = "tx-mpls-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp", @@ -88,6 +96,8 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_LOOPBACK_BIT] = "loopback", [NETIF_F_RXFCS_BIT] = "rx-fcs", [NETIF_F_RXALL_BIT] = "rx-all", + [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", + [NETIF_F_BUSY_POLL_BIT] = "busy-poll", }; static int ethtool_get_features(struct net_device *dev, void __user *useraddr) @@ -175,7 +185,7 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) if (sset == ETH_SS_FEATURES) return ARRAY_SIZE(netdev_features_strings); - if (ops && ops->get_sset_count && ops->get_strings) + if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else return -EOPNOTSUPP; @@ -265,18 +275,24 @@ static int ethtool_set_one_feature(struct net_device *dev, #define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \ ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH) -#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_RX | \ - NETIF_F_HW_VLAN_TX | NETIF_F_NTUPLE | NETIF_F_RXHASH) +#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \ + NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \ + NETIF_F_RXHASH) static u32 __ethtool_get_flags(struct net_device *dev) { u32 flags = 0; - if (dev->features & NETIF_F_LRO) 
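dst_dev_event() above, like the other notifiers in this diff, switches from casting ptr directly to netdev_notifier_info_to_dev(), since notifier data is now a struct netdev_notifier_info rather than the net_device itself. A sketch of the resulting calling convention; my_netdev_event is a made-up handler:

    static int my_netdev_event(struct notifier_block *nb,
                               unsigned long event, void *ptr)
    {
            /* ptr is a struct netdev_notifier_info *, not the device */
            struct net_device *dev = netdev_notifier_info_to_dev(ptr);

            if (event == NETDEV_UP)
                    pr_info("%s is up\n", dev->name);
            return NOTIFY_DONE;
    }

    static struct notifier_block my_nb = {
            .notifier_call = my_netdev_event,
    };
    /* register_netdevice_notifier(&my_nb) at init,
     * unregister_netdevice_notifier(&my_nb) at exit.
     */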
flags |= ETH_FLAG_LRO; - if (dev->features & NETIF_F_HW_VLAN_RX) flags |= ETH_FLAG_RXVLAN; - if (dev->features & NETIF_F_HW_VLAN_TX) flags |= ETH_FLAG_TXVLAN; - if (dev->features & NETIF_F_NTUPLE) flags |= ETH_FLAG_NTUPLE; - if (dev->features & NETIF_F_RXHASH) flags |= ETH_FLAG_RXHASH; + if (dev->features & NETIF_F_LRO) + flags |= ETH_FLAG_LRO; + if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) + flags |= ETH_FLAG_RXVLAN; + if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) + flags |= ETH_FLAG_TXVLAN; + if (dev->features & NETIF_F_NTUPLE) + flags |= ETH_FLAG_NTUPLE; + if (dev->features & NETIF_F_RXHASH) + flags |= ETH_FLAG_RXHASH; return flags; } @@ -288,11 +304,16 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data) if (data & ~ETH_ALL_FLAGS) return -EINVAL; - if (data & ETH_FLAG_LRO) features |= NETIF_F_LRO; - if (data & ETH_FLAG_RXVLAN) features |= NETIF_F_HW_VLAN_RX; - if (data & ETH_FLAG_TXVLAN) features |= NETIF_F_HW_VLAN_TX; - if (data & ETH_FLAG_NTUPLE) features |= NETIF_F_NTUPLE; - if (data & ETH_FLAG_RXHASH) features |= NETIF_F_RXHASH; + if (data & ETH_FLAG_LRO) + features |= NETIF_F_LRO; + if (data & ETH_FLAG_RXVLAN) + features |= NETIF_F_HW_VLAN_CTAG_RX; + if (data & ETH_FLAG_TXVLAN) + features |= NETIF_F_HW_VLAN_CTAG_TX; + if (data & ETH_FLAG_NTUPLE) + features |= NETIF_F_NTUPLE; + if (data & ETH_FLAG_RXHASH) + features |= NETIF_F_RXHASH; /* allow changing only bits set in hw_features */ changed = (features ^ dev->features) & ETH_ALL_FEATURES; @@ -311,7 +332,7 @@ int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) { ASSERT_RTNL(); - if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings) + if (!dev->ethtool_ops->get_settings) return -EOPNOTSUPP; memset(cmd, 0, sizeof(struct ethtool_cmd)); @@ -355,7 +376,7 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, memset(&info, 0, sizeof(info)); info.cmd = ETHTOOL_GDRVINFO; - if (ops && ops->get_drvinfo) { + if (ops->get_drvinfo) { ops->get_drvinfo(dev, &info); } else if (dev->dev.parent && dev->dev.parent->driver) { strlcpy(info.bus_info, dev_name(dev->dev.parent), @@ -370,7 +391,7 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, * this method of obtaining string set info is deprecated; * Use ETHTOOL_GSSET_INFO instead. 
*/ - if (ops && ops->get_sset_count) { + if (ops->get_sset_count) { int rc; rc = ops->get_sset_count(dev, ETH_SS_TEST); @@ -383,9 +404,9 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, if (rc >= 0) info.n_priv_flags = rc; } - if (ops && ops->get_regs_len) + if (ops->get_regs_len) info.regdump_len = ops->get_regs_len(dev); - if (ops && ops->get_eeprom_len) + if (ops->get_eeprom_len) info.eedump_len = ops->get_eeprom_len(dev); if (copy_to_user(useraddr, &info, sizeof(info))) @@ -536,6 +557,23 @@ err_out: return ret; } +static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, + struct ethtool_rxnfc *rx_rings, + u32 size) +{ + int i; + + if (copy_from_user(indir, useraddr, size * sizeof(indir[0]))) + return -EFAULT; + + /* Validate ring indices */ + for (i = 0; i < size; i++) + if (indir[i] >= rx_rings->data) + return -EINVAL; + + return 0; +} + static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, void __user *useraddr) { @@ -544,7 +582,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, int ret; if (!dev->ethtool_ops->get_rxfh_indir_size || - !dev->ethtool_ops->get_rxfh_indir) + !dev->ethtool_ops->get_rxfh) return -EOPNOTSUPP; dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); if (dev_size == 0) @@ -570,7 +608,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, if (!indir) return -ENOMEM; - ret = dev->ethtool_ops->get_rxfh_indir(dev, indir); + ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL); if (ret) goto out; @@ -590,13 +628,15 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, struct ethtool_rxnfc rx_rings; u32 user_size, dev_size, i; u32 *indir; + const struct ethtool_ops *ops = dev->ethtool_ops; int ret; + u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]); - if (!dev->ethtool_ops->get_rxfh_indir_size || - !dev->ethtool_ops->set_rxfh_indir || - !dev->ethtool_ops->get_rxnfc) + if (!ops->get_rxfh_indir_size || !ops->set_rxfh || + !ops->get_rxnfc) return -EOPNOTSUPP; - dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); + + dev_size = ops->get_rxfh_indir_size(dev); if (dev_size == 0) return -EOPNOTSUPP; @@ -613,7 +653,7 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, return -ENOMEM; rx_rings.cmd = ETHTOOL_GRXRINGS; - ret = dev->ethtool_ops->get_rxnfc(dev, &rx_rings, NULL); + ret = ops->get_rxnfc(dev, &rx_rings, NULL); if (ret) goto out; @@ -621,28 +661,184 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, for (i = 0; i < dev_size; i++) indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); } else { - if (copy_from_user(indir, - useraddr + - offsetof(struct ethtool_rxfh_indir, - ring_index[0]), - dev_size * sizeof(indir[0]))) { + ret = ethtool_copy_validate_indir(indir, + useraddr + ringidx_offset, + &rx_rings, + dev_size); + if (ret) + goto out; + } + + ret = ops->set_rxfh(dev, indir, NULL); + +out: + kfree(indir); + return ret; +} + +static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, + void __user *useraddr) +{ + int ret; + const struct ethtool_ops *ops = dev->ethtool_ops; + u32 user_indir_size, user_key_size; + u32 dev_indir_size = 0, dev_key_size = 0; + struct ethtool_rxfh rxfh; + u32 total_size; + u32 indir_bytes; + u32 *indir = NULL; + u8 *hkey = NULL; + u8 *rss_config; + + if (!(dev->ethtool_ops->get_rxfh_indir_size || + dev->ethtool_ops->get_rxfh_key_size) || + !dev->ethtool_ops->get_rxfh) + return -EOPNOTSUPP; + + 
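ethtool_get_rxfh_indir() above implements a two-call userspace protocol: a request with size == 0 only reports the device's table size, and a second call carrying exactly that size reads the table. A hedged userspace sketch with error handling trimmed; get_indir is a made-up helper:

    #include <stdlib.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <net/if.h>
    #include <linux/ethtool.h>
    #include <linux/sockios.h>

    static struct ethtool_rxfh_indir *get_indir(int fd, struct ifreq *ifr)
    {
            struct ethtool_rxfh_indir head = { .cmd = ETHTOOL_GRXFHINDIR };
            struct ethtool_rxfh_indir *tbl;

            ifr->ifr_data = (void *)&head;          /* size-only query */
            if (ioctl(fd, SIOCETHTOOL, ifr) < 0)
                    return NULL;

            tbl = calloc(1, sizeof(*tbl) + head.size * sizeof(__u32));
            if (!tbl)
                    return NULL;
            tbl->cmd = ETHTOOL_GRXFHINDIR;
            tbl->size = head.size;                  /* must match device */
            ifr->ifr_data = (void *)tbl;
            if (ioctl(fd, SIOCETHTOOL, ifr) < 0) {  /* fills ring_index[] */
                    free(tbl);
                    return NULL;
            }
            return tbl;
    }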
if (ops->get_rxfh_indir_size) + dev_indir_size = ops->get_rxfh_indir_size(dev); + if (ops->get_rxfh_key_size) + dev_key_size = ops->get_rxfh_key_size(dev); + + if ((dev_key_size + dev_indir_size) == 0) + return -EOPNOTSUPP; + + if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) + return -EFAULT; + user_indir_size = rxfh.indir_size; + user_key_size = rxfh.key_size; + + /* Check that reserved fields are 0 for now */ + if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1]) + return -EINVAL; + + rxfh.indir_size = dev_indir_size; + rxfh.key_size = dev_key_size; + if (copy_to_user(useraddr, &rxfh, sizeof(rxfh))) + return -EFAULT; + + /* If the user buffer size is 0, this is just a query for the + * device table size and key size. Otherwise, if the User size is + * not equal to device table size or key size it's an error. + */ + if (!user_indir_size && !user_key_size) + return 0; + + if ((user_indir_size && (user_indir_size != dev_indir_size)) || + (user_key_size && (user_key_size != dev_key_size))) + return -EINVAL; + + indir_bytes = user_indir_size * sizeof(indir[0]); + total_size = indir_bytes + user_key_size; + rss_config = kzalloc(total_size, GFP_USER); + if (!rss_config) + return -ENOMEM; + + if (user_indir_size) + indir = (u32 *)rss_config; + + if (user_key_size) + hkey = rss_config + indir_bytes; + + ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey); + if (!ret) { + if (copy_to_user(useraddr + + offsetof(struct ethtool_rxfh, rss_config[0]), + rss_config, total_size)) ret = -EFAULT; + } + + kfree(rss_config); + + return ret; +} + +static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, + void __user *useraddr) +{ + int ret; + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rxnfc rx_rings; + struct ethtool_rxfh rxfh; + u32 dev_indir_size = 0, dev_key_size = 0, i; + u32 *indir = NULL, indir_bytes = 0; + u8 *hkey = NULL; + u8 *rss_config; + u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); + + if (!(ops->get_rxfh_indir_size || ops->get_rxfh_key_size) || + !ops->get_rxnfc || !ops->set_rxfh) + return -EOPNOTSUPP; + + if (ops->get_rxfh_indir_size) + dev_indir_size = ops->get_rxfh_indir_size(dev); + if (ops->get_rxfh_key_size) + dev_key_size = dev->ethtool_ops->get_rxfh_key_size(dev); + if ((dev_key_size + dev_indir_size) == 0) + return -EOPNOTSUPP; + + if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) + return -EFAULT; + + /* Check that reserved fields are 0 for now */ + if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1]) + return -EINVAL; + + /* If either indir or hash key is valid, proceed further. + * It is not valid to request that both be unchanged. + */ + if ((rxfh.indir_size && + rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE && + rxfh.indir_size != dev_indir_size) || + (rxfh.key_size && (rxfh.key_size != dev_key_size)) || + (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE && + rxfh.key_size == 0)) + return -EINVAL; + + if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) + indir_bytes = dev_indir_size * sizeof(indir[0]); + + rss_config = kzalloc(indir_bytes + rxfh.key_size, GFP_USER); + if (!rss_config) + return -ENOMEM; + + rx_rings.cmd = ETHTOOL_GRXRINGS; + ret = ops->get_rxnfc(dev, &rx_rings, NULL); + if (ret) + goto out; + + /* rxfh.indir_size == 0 means reset the indir table to default. + * rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged. 
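The new ETHTOOL_GRSSH handler generalizes that protocol to the combined struct ethtool_rxfh: indir_size == 0 together with key_size == 0 queries the sizes, and the follow-up buffer carries indir_size u32 ring indices first, then key_size key bytes. A sketch of the userspace side, reusing the includes from the previous example; get_rss is a made-up helper:

    static struct ethtool_rxfh *get_rss(int fd, struct ifreq *ifr)
    {
            struct ethtool_rxfh head = { .cmd = ETHTOOL_GRSSH };
            struct ethtool_rxfh *rsp;
            size_t indir_bytes, total;

            ifr->ifr_data = (void *)&head;          /* sizes-only query */
            if (ioctl(fd, SIOCETHTOOL, ifr) < 0)
                    return NULL;

            indir_bytes = head.indir_size * sizeof(__u32);
            total = sizeof(*rsp) + indir_bytes + head.key_size;
            rsp = calloc(1, total);
            if (!rsp)
                    return NULL;
            rsp->cmd = ETHTOOL_GRSSH;
            rsp->indir_size = head.indir_size;      /* must match device */
            rsp->key_size = head.key_size;
            ifr->ifr_data = (void *)rsp;
            if (ioctl(fd, SIOCETHTOOL, ifr) < 0) {
                    free(rsp);
                    return NULL;
            }
            /* rss_config[]: indir_size u32 entries, then key bytes */
            return rsp;
    }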
+ */ + if (rxfh.indir_size && + rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) { + indir = (u32 *)rss_config; + ret = ethtool_copy_validate_indir(indir, + useraddr + rss_cfg_offset, + &rx_rings, + rxfh.indir_size); + if (ret) goto out; - } + } else if (rxfh.indir_size == 0) { + indir = (u32 *)rss_config; + for (i = 0; i < dev_indir_size; i++) + indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); + } - /* Validate ring indices */ - for (i = 0; i < dev_size; i++) { - if (indir[i] >= rx_rings.data) { - ret = -EINVAL; - goto out; - } + if (rxfh.key_size) { + hkey = rss_config + indir_bytes; + if (copy_from_user(hkey, + useraddr + rss_cfg_offset + indir_bytes, + rxfh.key_size)) { + ret = -EFAULT; + goto out; } } - ret = dev->ethtool_ops->set_rxfh_indir(dev, indir); + ret = ops->set_rxfh(dev, indir, hkey); out: - kfree(indir); + kfree(rss_config); return ret; } @@ -1082,9 +1278,10 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) { struct ethtool_value id; static bool busy; + const struct ethtool_ops *ops = dev->ethtool_ops; int rc; - if (!dev->ethtool_ops->set_phys_id) + if (!ops->set_phys_id) return -EOPNOTSUPP; if (busy) @@ -1093,7 +1290,7 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) if (copy_from_user(&id, useraddr, sizeof(id))) return -EFAULT; - rc = dev->ethtool_ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE); + rc = ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE); if (rc < 0) return rc; @@ -1118,7 +1315,7 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) i = n; do { rtnl_lock(); - rc = dev->ethtool_ops->set_phys_id(dev, + rc = ops->set_phys_id(dev, (i & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON); rtnl_unlock(); if (rc) @@ -1133,7 +1330,7 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) dev_put(dev); busy = false; - (void)dev->ethtool_ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE); + (void) ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE); return rc; } @@ -1275,7 +1472,7 @@ static int ethtool_get_dump_flag(struct net_device *dev, struct ethtool_dump dump; const struct ethtool_ops *ops = dev->ethtool_ops; - if (!dev->ethtool_ops->get_dump_flag) + if (!ops->get_dump_flag) return -EOPNOTSUPP; if (copy_from_user(&dump, useraddr, sizeof(dump))) @@ -1299,8 +1496,7 @@ static int ethtool_get_dump_data(struct net_device *dev, const struct ethtool_ops *ops = dev->ethtool_ops; void *data = NULL; - if (!dev->ethtool_ops->get_dump_data || - !dev->ethtool_ops->get_dump_flag) + if (!ops->get_dump_data || !ops->get_dump_flag) return -EOPNOTSUPP; if (copy_from_user(&dump, useraddr, sizeof(dump))) @@ -1312,10 +1508,19 @@ static int ethtool_get_dump_data(struct net_device *dev, if (ret) return ret; - len = (tmp.len > dump.len) ? dump.len : tmp.len; + len = min(tmp.len, dump.len); if (!len) return -EFAULT; + /* Don't ever let the driver think there's more space available + * than it requested with .get_dump_flag(). + */ + dump.len = len; + + /* Always allocate enough space to hold the whole thing so that the + * driver does not need to check the length and bother with partial + * dumping. + */ data = vzalloc(tmp.len); if (!data) return -ENOMEM; @@ -1323,6 +1528,16 @@ static int ethtool_get_dump_data(struct net_device *dev, if (ret) goto out; + /* There are two sane possibilities: + * 1. The driver's .get_dump_data() does not touch dump.len. + * 2. Or it may set dump.len to how much it really writes, which + * should be tmp.len (or len if it can do a partial dump). 
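As the comment above spells out, ethtool_set_rxfh() accepts three indirection-table modes: a full table of dev_indir_size entries, 0 to reset to the default spread, or ETH_RXFH_INDIR_NO_CHANGE to leave the table alone while still updating the hash key. A hedged sketch of a key-only update request; set_rss_key is a made-up helper using the same includes as the earlier ethtool sketches:

    static int set_rss_key(int fd, struct ifreq *ifr,
                           const void *key, __u32 key_size)
    {
            size_t len = sizeof(struct ethtool_rxfh) + key_size;
            struct ethtool_rxfh *req = calloc(1, len);
            int ret;

            if (!req)
                    return -1;
            req->cmd = ETHTOOL_SRSSH;
            req->indir_size = ETH_RXFH_INDIR_NO_CHANGE; /* keep table */
            req->key_size = key_size;
            memcpy(req->rss_config, key, key_size);     /* key bytes only */
            ifr->ifr_data = (void *)req;
            ret = ioctl(fd, SIOCETHTOOL, ifr);
            free(req);
            return ret;
    }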
+ * In any case respond to userspace with the actual length of data + * it's receiving. + */ + WARN_ON(dump.len != len && dump.len != tmp.len); + dump.len = len; + if (copy_to_user(useraddr, &dump, sizeof(dump))) { ret = -EFAULT; goto out; @@ -1346,13 +1561,9 @@ static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr) info.cmd = ETHTOOL_GET_TS_INFO; if (phydev && phydev->drv && phydev->drv->ts_info) { - err = phydev->drv->ts_info(phydev, &info); - - } else if (dev->ethtool_ops && dev->ethtool_ops->get_ts_info) { - + } else if (ops->get_ts_info) { err = ops->get_ts_info(dev, &info); - } else { info.so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE | @@ -1410,7 +1621,7 @@ static int ethtool_get_module_eeprom(struct net_device *dev, modinfo.eeprom_len); } -/* The main entry point in this file. Called from net/core/dev.c */ +/* The main entry point in this file. Called from net/core/dev_ioctl.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) { @@ -1418,7 +1629,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) void __user *useraddr = ifr->ifr_data; u32 ethcmd; int rc; - u32 old_features; + netdev_features_t old_features; if (!dev || !netif_device_present(dev)) return -ENODEV; @@ -1454,6 +1665,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: case ETHTOOL_GRXFHINDIR: + case ETHTOOL_GRSSH: case ETHTOOL_GFEATURES: case ETHTOOL_GCHANNELS: case ETHTOOL_GET_TS_INFO: @@ -1591,6 +1803,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_SRXFHINDIR: rc = ethtool_set_rxfh_indir(dev, useraddr); break; + case ETHTOOL_GRSSH: + rc = ethtool_get_rxfh(dev, useraddr); + break; + case ETHTOOL_SRSSH: + rc = ethtool_set_rxfh(dev, useraddr); + break; case ETHTOOL_GFEATURES: rc = ethtool_get_features(dev, useraddr); break; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 58a4ba27dfe..185c341fafb 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -33,6 +33,9 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->flags = flags; r->fr_net = hold_net(ops->fro_net); + r->suppress_prefixlen = -1; + r->suppress_ifgroup = -1; + /* The lock is not required here, the list in unreacheable * at the moment this function is called */ list_add_tail(&r->list, &ops->rules_list); @@ -226,6 +229,9 @@ jumped: else err = ops->action(rule, fl, flags, arg); + if (!err && ops->suppress && ops->suppress(rule, arg)) + continue; + if (err != -EAGAIN) { if ((arg->flags & FIB_LOOKUP_NOREF) || likely(atomic_inc_not_zero(&rule->refcnt))) { @@ -266,7 +272,7 @@ errout: return err; } -static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); @@ -337,6 +343,15 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) rule->action = frh->action; rule->flags = frh->flags; rule->table = frh_get_table(frh, tb); + if (tb[FRA_SUPPRESS_PREFIXLEN]) + rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); + else + rule->suppress_prefixlen = -1; + + if (tb[FRA_SUPPRESS_IFGROUP]) + rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); + else + rule->suppress_ifgroup = -1; if (!tb[FRA_PRIORITY] && ops->default_pref) rule->pref = ops->default_pref(ops); @@ -415,7 +430,7 @@ errout: return err; } -static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int fib_nl_delrule(struct 
sk_buff *skb, struct nlmsghdr* nlh) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); @@ -445,7 +460,8 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (frh->action && (frh->action != rule->action)) continue; - if (frh->table && (frh_get_table(frh, tb) != rule->table)) + if (frh_get_table(frh, tb) && + (frh_get_table(frh, tb) != rule->table)) continue; if (tb[FRA_PRIORITY] && @@ -523,6 +539,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ + + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4); /* FRA_FWMASK */ @@ -548,6 +566,8 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh->table = rule->table; if (nla_put_u32(skb, FRA_TABLE, rule->table)) goto nla_put_failure; + if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) + goto nla_put_failure; frh->res1 = 0; frh->res2 = 0; frh->action = rule->action; @@ -580,6 +600,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target))) goto nla_put_failure; + + if (rule->suppress_ifgroup != -1) { + if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup)) + goto nla_put_failure; + } + if (ops->fill(rule, skb, frh) < 0) goto nla_put_failure; @@ -705,9 +731,9 @@ static void detach_rules(struct list_head *rules, struct net_device *dev) static int fib_rules_event(struct notifier_block *this, unsigned long event, - void *ptr) + void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct fib_rules_ops *ops; @@ -719,6 +745,13 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event, attach_rules(&ops->rules_list, dev); break; + case NETDEV_CHANGENAME: + list_for_each_entry(ops, &net->rules_ops, list) { + detach_rules(&ops->rules_list, dev); + attach_rules(&ops->rules_list, dev); + } + break; + case NETDEV_UNREGISTER: list_for_each_entry(ops, &net->rules_ops, list) detach_rules(&ops->rules_list, dev); diff --git a/net/core/filter.c b/net/core/filter.c index c23543cba13..1dbf6462f76 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1,11 +1,16 @@ /* * Linux Socket Filter - Kernel level socket filtering * - * Author: - * Jay Schulist <jschlst@samba.org> + * Based on the design of the Berkeley Packet Filter. 
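In the lookup loop above, a true return from ops->suppress() means "pretend this rule did not match" and the walk continues with the next rule; this is what backs `ip rule ... suppress_prefixlength N`. A simplified sketch of such a hook, assuming the real address-family version derives the matched prefix length from the lookup result in arg:

    static bool my_suppress(struct fib_rule *rule,
                            struct fib_lookup_arg *arg)
    {
            int result_prefixlen = 0;   /* would come from arg's result */

            /* e.g. suppress_prefixlength 0 suppresses default routes */
            if (rule->suppress_prefixlen >= 0 &&
                result_prefixlen <= rule->suppress_prefixlen)
                    return true;        /* skip this rule's result */
            return false;
    }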
The new + * internal format has been designed by PLUMgrid: * - * Based on the design of: - * - The Berkeley Packet Filter + * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com + * + * Authors: + * + * Jay Schulist <jschlst@samba.org> + * Alexei Starovoitov <ast@plumgrid.com> + * Daniel Borkmann <dborkman@redhat.com> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -36,11 +41,31 @@ #include <asm/uaccess.h> #include <asm/unaligned.h> #include <linux/filter.h> -#include <linux/reciprocal_div.h> #include <linux/ratelimit.h> #include <linux/seccomp.h> #include <linux/if_vlan.h> +/* Registers */ +#define BPF_R0 regs[BPF_REG_0] +#define BPF_R1 regs[BPF_REG_1] +#define BPF_R2 regs[BPF_REG_2] +#define BPF_R3 regs[BPF_REG_3] +#define BPF_R4 regs[BPF_REG_4] +#define BPF_R5 regs[BPF_REG_5] +#define BPF_R6 regs[BPF_REG_6] +#define BPF_R7 regs[BPF_REG_7] +#define BPF_R8 regs[BPF_REG_8] +#define BPF_R9 regs[BPF_REG_9] +#define BPF_R10 regs[BPF_REG_10] + +/* Named registers */ +#define DST regs[insn->dst_reg] +#define SRC regs[insn->src_reg] +#define FP regs[BPF_REG_FP] +#define ARG1 regs[BPF_REG_ARG1] +#define CTX regs[BPF_REG_CTX] +#define IMM insn->imm + /* No hurry in this branch * * Exported for the bpf jit load helper. @@ -53,9 +78,9 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns ptr = skb_network_header(skb) + k - SKF_NET_OFF; else if (k >= SKF_LL_OFF) ptr = skb_mac_header(skb) + k - SKF_LL_OFF; - if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) return ptr; + return NULL; } @@ -64,6 +89,7 @@ static inline void *load_pointer(const struct sk_buff *skb, int k, { if (k >= 0) return skb_header_pointer(skb, k, size, buffer); + return bpf_internal_load_pointer_neg_helper(skb, k, size); } @@ -109,301 +135,960 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sk_filter); +/* Base function for offset calculation. Needs to go into .text section, + * therefore keeping it non-static as well; will also be used by JITs + * anyway later on, so do not let the compiler omit it. + */ +noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return 0; +} + /** - * sk_run_filter - run a filter on a socket - * @skb: buffer to run the filter on - * @fentry: filter to apply + * __sk_run_filter - run a filter on a given context + * @ctx: buffer to run the filter on + * @insn: filter to apply * - * Decode and apply filter instructions to the skb->data. - * Return length to keep, 0 for none. @skb is the data we are - * filtering, @filter is the array of filter instructions. - * Because all jumps are guaranteed to be before last instruction, - * and last instruction guaranteed to be a RET, we dont need to check - * flen. (We used to pass to this function the length of filter) + * Decode and apply filter instructions to the skb->data. Return length to + * keep, 0 for none. @ctx is the data we are operating on, @insn is the + * array of filter instructions. */ -unsigned int sk_run_filter(const struct sk_buff *skb, - const struct sock_filter *fentry) +static unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn) { + u64 stack[MAX_BPF_STACK / sizeof(u64)]; + u64 regs[MAX_BPF_REG], tmp; + static const void *jumptable[256] = { + [0 ... 255] = &&default_label, + /* Now overwrite non-defaults ... 
*/ + /* 32 bit ALU operations */ + [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, + [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, + [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, + [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, + [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, + [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, + [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, + [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, + [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, + [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, + [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, + [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, + [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, + [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, + [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, + [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, + [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, + [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, + [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, + [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, + [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, + [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, + [BPF_ALU | BPF_NEG] = &&ALU_NEG, + [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, + [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, + /* 64 bit ALU operations */ + [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, + [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, + [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, + [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, + [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, + [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, + [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, + [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, + [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, + [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, + [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, + [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, + [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, + [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, + [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, + [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, + [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, + [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, + [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, + [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, + [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, + [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, + [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, + [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, + [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, + /* Call instruction */ + [BPF_JMP | BPF_CALL] = &&JMP_CALL, + /* Jumps */ + [BPF_JMP | BPF_JA] = &&JMP_JA, + [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, + [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, + [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, + [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, + [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, + [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, + [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, + [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, + [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, + [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, + [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, + [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, + [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, + [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, + /* Program return */ + [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, + /* Store instructions */ + [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, + [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, + [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, + [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, + [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, + [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, + [BPF_ST | BPF_MEM | BPF_B] = 
&&ST_MEM_B, + [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, + [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, + [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, + /* Load instructions */ + [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, + [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, + [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, + [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, + [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, + [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, + [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, + [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, + [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, + [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, + }; void *ptr; - u32 A = 0; /* Accumulator */ - u32 X = 0; /* Index Register */ - u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ - u32 tmp; - int k; + int off; - /* - * Process array of filter instructions. - */ - for (;; fentry++) { -#if defined(CONFIG_X86_32) -#define K (fentry->k) -#else - const u32 K = fentry->k; -#endif +#define CONT ({ insn++; goto select_insn; }) +#define CONT_JMP ({ insn++; goto select_insn; }) - switch (fentry->code) { - case BPF_S_ALU_ADD_X: - A += X; - continue; - case BPF_S_ALU_ADD_K: - A += K; - continue; - case BPF_S_ALU_SUB_X: - A -= X; - continue; - case BPF_S_ALU_SUB_K: - A -= K; - continue; - case BPF_S_ALU_MUL_X: - A *= X; - continue; - case BPF_S_ALU_MUL_K: - A *= K; - continue; - case BPF_S_ALU_DIV_X: - if (X == 0) - return 0; - A /= X; - continue; - case BPF_S_ALU_DIV_K: - A = reciprocal_divide(A, K); - continue; - case BPF_S_ALU_MOD_X: - if (X == 0) - return 0; - A %= X; - continue; - case BPF_S_ALU_MOD_K: - A %= K; - continue; - case BPF_S_ALU_AND_X: - A &= X; - continue; - case BPF_S_ALU_AND_K: - A &= K; - continue; - case BPF_S_ALU_OR_X: - A |= X; - continue; - case BPF_S_ALU_OR_K: - A |= K; - continue; - case BPF_S_ANC_ALU_XOR_X: - case BPF_S_ALU_XOR_X: - A ^= X; - continue; - case BPF_S_ALU_XOR_K: - A ^= K; - continue; - case BPF_S_ALU_LSH_X: - A <<= X; - continue; - case BPF_S_ALU_LSH_K: - A <<= K; - continue; - case BPF_S_ALU_RSH_X: - A >>= X; - continue; - case BPF_S_ALU_RSH_K: - A >>= K; - continue; - case BPF_S_ALU_NEG: - A = -A; - continue; - case BPF_S_JMP_JA: - fentry += K; - continue; - case BPF_S_JMP_JGT_K: - fentry += (A > K) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JGE_K: - fentry += (A >= K) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JEQ_K: - fentry += (A == K) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JSET_K: - fentry += (A & K) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JGT_X: - fentry += (A > X) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JGE_X: - fentry += (A >= X) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JEQ_X: - fentry += (A == X) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JSET_X: - fentry += (A & X) ? fentry->jt : fentry->jf; - continue; - case BPF_S_LD_W_ABS: - k = K; -load_w: - ptr = load_pointer(skb, k, 4, &tmp); - if (ptr != NULL) { - A = get_unaligned_be32(ptr); - continue; - } + FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; + ARG1 = (u64) (unsigned long) ctx; + + /* Registers used in classic BPF programs need to be reset first. 
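The jumptable above drives a computed-goto (threaded) interpreter: every handler ends by jumping directly to the next instruction's handler rather than bouncing back through a switch, saving a branch per instruction. A toy two-opcode machine showing the technique; it relies on the same GCC extensions (labels as values, range designators) that the kernel already uses:

    /* Reduced sketch of threaded dispatch. Opcode 1 increments,
     * opcode 2 decrements, anything else halts.
     */
    static int run_toy(const unsigned char *insn)
    {
            static const void *jt[256] = {
                    [0 ... 255] = &&halt,   /* default */
                    [1] = &&inc,
                    [2] = &&dec,
            };
            int acc = 0;

            goto *jt[*insn];            /* dispatch on first opcode */
    inc:
            acc++;
            goto *jt[*++insn];          /* straight to next handler */
    dec:
            acc--;
            goto *jt[*++insn];
    halt:
            return acc;
    }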
*/ + regs[BPF_REG_A] = 0; + regs[BPF_REG_X] = 0; + +select_insn: + goto *jumptable[insn->code]; + + /* ALU */ +#define ALU(OPCODE, OP) \ + ALU64_##OPCODE##_X: \ + DST = DST OP SRC; \ + CONT; \ + ALU_##OPCODE##_X: \ + DST = (u32) DST OP (u32) SRC; \ + CONT; \ + ALU64_##OPCODE##_K: \ + DST = DST OP IMM; \ + CONT; \ + ALU_##OPCODE##_K: \ + DST = (u32) DST OP (u32) IMM; \ + CONT; + + ALU(ADD, +) + ALU(SUB, -) + ALU(AND, &) + ALU(OR, |) + ALU(LSH, <<) + ALU(RSH, >>) + ALU(XOR, ^) + ALU(MUL, *) +#undef ALU + ALU_NEG: + DST = (u32) -DST; + CONT; + ALU64_NEG: + DST = -DST; + CONT; + ALU_MOV_X: + DST = (u32) SRC; + CONT; + ALU_MOV_K: + DST = (u32) IMM; + CONT; + ALU64_MOV_X: + DST = SRC; + CONT; + ALU64_MOV_K: + DST = IMM; + CONT; + ALU64_ARSH_X: + (*(s64 *) &DST) >>= SRC; + CONT; + ALU64_ARSH_K: + (*(s64 *) &DST) >>= IMM; + CONT; + ALU64_MOD_X: + if (unlikely(SRC == 0)) return 0; - case BPF_S_LD_H_ABS: - k = K; -load_h: - ptr = load_pointer(skb, k, 2, &tmp); - if (ptr != NULL) { - A = get_unaligned_be16(ptr); - continue; - } + tmp = DST; + DST = do_div(tmp, SRC); + CONT; + ALU_MOD_X: + if (unlikely(SRC == 0)) return 0; - case BPF_S_LD_B_ABS: - k = K; -load_b: - ptr = load_pointer(skb, k, 1, &tmp); - if (ptr != NULL) { - A = *(u8 *)ptr; - continue; - } + tmp = (u32) DST; + DST = do_div(tmp, (u32) SRC); + CONT; + ALU64_MOD_K: + tmp = DST; + DST = do_div(tmp, IMM); + CONT; + ALU_MOD_K: + tmp = (u32) DST; + DST = do_div(tmp, (u32) IMM); + CONT; + ALU64_DIV_X: + if (unlikely(SRC == 0)) return 0; - case BPF_S_LD_W_LEN: - A = skb->len; - continue; - case BPF_S_LDX_W_LEN: - X = skb->len; - continue; - case BPF_S_LD_W_IND: - k = X + K; - goto load_w; - case BPF_S_LD_H_IND: - k = X + K; - goto load_h; - case BPF_S_LD_B_IND: - k = X + K; - goto load_b; - case BPF_S_LDX_B_MSH: - ptr = load_pointer(skb, K, 1, &tmp); - if (ptr != NULL) { - X = (*(u8 *)ptr & 0xf) << 2; - continue; - } + do_div(DST, SRC); + CONT; + ALU_DIV_X: + if (unlikely(SRC == 0)) return 0; - case BPF_S_LD_IMM: - A = K; - continue; - case BPF_S_LDX_IMM: - X = K; - continue; - case BPF_S_LD_MEM: - A = mem[K]; - continue; - case BPF_S_LDX_MEM: - X = mem[K]; - continue; - case BPF_S_MISC_TAX: - X = A; - continue; - case BPF_S_MISC_TXA: - A = X; - continue; - case BPF_S_RET_K: - return K; - case BPF_S_RET_A: - return A; - case BPF_S_ST: - mem[K] = A; - continue; - case BPF_S_STX: - mem[K] = X; - continue; - case BPF_S_ANC_PROTOCOL: - A = ntohs(skb->protocol); - continue; - case BPF_S_ANC_PKTTYPE: - A = skb->pkt_type; - continue; - case BPF_S_ANC_IFINDEX: - if (!skb->dev) - return 0; - A = skb->dev->ifindex; - continue; - case BPF_S_ANC_MARK: - A = skb->mark; - continue; - case BPF_S_ANC_QUEUE: - A = skb->queue_mapping; - continue; - case BPF_S_ANC_HATYPE: - if (!skb->dev) - return 0; - A = skb->dev->type; - continue; - case BPF_S_ANC_RXHASH: - A = skb->rxhash; - continue; - case BPF_S_ANC_CPU: - A = raw_smp_processor_id(); - continue; - case BPF_S_ANC_VLAN_TAG: - A = vlan_tx_tag_get(skb); - continue; - case BPF_S_ANC_VLAN_TAG_PRESENT: - A = !!vlan_tx_tag_present(skb); - continue; - case BPF_S_ANC_NLATTR: { - struct nlattr *nla; - - if (skb_is_nonlinear(skb)) - return 0; - if (A > skb->len - sizeof(struct nlattr)) - return 0; - - nla = nla_find((struct nlattr *)&skb->data[A], - skb->len - A, X); - if (nla) - A = (void *)nla - (void *)skb->data; - else - A = 0; - continue; + tmp = (u32) DST; + do_div(tmp, (u32) SRC); + DST = (u32) tmp; + CONT; + ALU64_DIV_K: + do_div(DST, IMM); + CONT; + ALU_DIV_K: + tmp = (u32) DST; + do_div(tmp, (u32) IMM); + DST 
= (u32) tmp; + CONT; + ALU_END_TO_BE: + switch (IMM) { + case 16: + DST = (__force u16) cpu_to_be16(DST); + break; + case 32: + DST = (__force u32) cpu_to_be32(DST); + break; + case 64: + DST = (__force u64) cpu_to_be64(DST); + break; + } + CONT; + ALU_END_TO_LE: + switch (IMM) { + case 16: + DST = (__force u16) cpu_to_le16(DST); + break; + case 32: + DST = (__force u32) cpu_to_le32(DST); + break; + case 64: + DST = (__force u64) cpu_to_le64(DST); + break; + } + CONT; + + /* CALL */ + JMP_CALL: + /* Function call scratches BPF_R1-BPF_R5 registers, + * preserves BPF_R6-BPF_R9, and stores return value + * into BPF_R0. + */ + BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3, + BPF_R4, BPF_R5); + CONT; + + /* JMP */ + JMP_JA: + insn += insn->off; + CONT; + JMP_JEQ_X: + if (DST == SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JEQ_K: + if (DST == IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JNE_X: + if (DST != SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JNE_K: + if (DST != IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGT_X: + if (DST > SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGT_K: + if (DST > IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGE_X: + if (DST >= SRC) { + insn += insn->off; + CONT_JMP; } - case BPF_S_ANC_NLATTR_NEST: { - struct nlattr *nla; - - if (skb_is_nonlinear(skb)) - return 0; - if (A > skb->len - sizeof(struct nlattr)) - return 0; - - nla = (struct nlattr *)&skb->data[A]; - if (nla->nla_len > A - skb->len) - return 0; - - nla = nla_find_nested(nla, X); - if (nla) - A = (void *)nla - (void *)skb->data; - else - A = 0; - continue; + CONT; + JMP_JGE_K: + if (DST >= IMM) { + insn += insn->off; + CONT_JMP; } -#ifdef CONFIG_SECCOMP_FILTER - case BPF_S_ANC_SECCOMP_LD_W: - A = seccomp_bpf_load(fentry->k); - continue; + CONT; + JMP_JSGT_X: + if (((s64) DST) > ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGT_K: + if (((s64) DST) > ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGE_X: + if (((s64) DST) >= ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGE_K: + if (((s64) DST) >= ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSET_X: + if (DST & SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSET_K: + if (DST & IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_EXIT: + return BPF_R0; + + /* STX and ST and LDX*/ +#define LDST(SIZEOP, SIZE) \ + STX_MEM_##SIZEOP: \ + *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ + CONT; \ + ST_MEM_##SIZEOP: \ + *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \ + CONT; \ + LDX_MEM_##SIZEOP: \ + DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ + CONT; + + LDST(B, u8) + LDST(H, u16) + LDST(W, u32) + LDST(DW, u64) +#undef LDST + STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ + atomic_add((u32) SRC, (atomic_t *)(unsigned long) + (DST + insn->off)); + CONT; + STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ + atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) + (DST + insn->off)); + CONT; + LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ + off = IMM; +load_word: + /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are + * only appearing in the programs where ctx == + * skb. All programs keep 'ctx' in regs[BPF_REG_CTX] + * == BPF_R6, sk_convert_filter() saves it in BPF_R6, + * internal BPF verifier will check that BPF_R6 == + * ctx. 
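One detail worth calling out in the DIV/MOD handlers above: the kernel's do_div(n, base) divides the 64-bit lvalue n in place and returns the remainder. That is why DIV keeps the quotient left behind in DST, while MOD copies DST into tmp and keeps the return value (kernel context, <asm/div64.h>):

    u64 q = 10;
    u32 r;

    r = do_div(q, 3);   /* afterwards q == 3 (quotient), r == 1 (remainder) */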
+ * + * BPF_ABS and BPF_IND are wrappers of function calls, + * so they scratch BPF_R1-BPF_R5 registers, preserve + * BPF_R6-BPF_R9, and store return value into BPF_R0. + * + * Implicit input: + * ctx == skb == BPF_R6 == CTX + * + * Explicit input: + * SRC == any register + * IMM == 32-bit immediate + * + * Output: + * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness + */ + + ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); + if (likely(ptr != NULL)) { + BPF_R0 = get_unaligned_be32(ptr); + CONT; + } + + return 0; + LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ + off = IMM; +load_half: + ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); + if (likely(ptr != NULL)) { + BPF_R0 = get_unaligned_be16(ptr); + CONT; + } + + return 0; + LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ + off = IMM; +load_byte: + ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); + if (likely(ptr != NULL)) { + BPF_R0 = *(u8 *)ptr; + CONT; + } + + return 0; + LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ + off = IMM + SRC; + goto load_word; + LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ + off = IMM + SRC; + goto load_half; + LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ + off = IMM + SRC; + goto load_byte; + + default_label: + /* If we ever reach this, we have a bug somewhere. */ + WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); + return 0; +} + +/* Helper to find the offset of pkt_type in sk_buff structure. We want + * to make sure its still a 3bit field starting at a byte boundary; + * taken from arch/x86/net/bpf_jit_comp.c. + */ +#ifdef __BIG_ENDIAN_BITFIELD +#define PKT_TYPE_MAX (7 << 5) +#else +#define PKT_TYPE_MAX 7 #endif +static unsigned int pkt_type_offset(void) +{ + struct sk_buff skb_probe = { .pkt_type = ~0, }; + u8 *ct = (u8 *) &skb_probe; + unsigned int off; + + for (off = 0; off < sizeof(struct sk_buff); off++) { + if (ct[off] == PKT_TYPE_MAX) + return off; + } + + pr_err_once("Please fix %s, as pkt_type couldn't be found!\n", __func__); + return -1; +} + +static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +{ + return __skb_get_poff((struct sk_buff *)(unsigned long) ctx); +} + +static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; + struct nlattr *nla; + + if (skb_is_nonlinear(skb)) + return 0; + + if (skb->len < sizeof(struct nlattr)) + return 0; + + if (a > skb->len - sizeof(struct nlattr)) + return 0; + + nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x); + if (nla) + return (void *) nla - (void *) skb->data; + + return 0; +} + +static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; + struct nlattr *nla; + + if (skb_is_nonlinear(skb)) + return 0; + + if (skb->len < sizeof(struct nlattr)) + return 0; + + if (a > skb->len - sizeof(struct nlattr)) + return 0; + + nla = (struct nlattr *) &skb->data[a]; + if (nla->nla_len > skb->len - a) + return 0; + + nla = nla_find_nested(nla, x); + if (nla) + return (void *) nla - (void *) skb->data; + + return 0; +} + +static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +{ + return raw_smp_processor_id(); +} + +/* note that this only generates 32-bit random numbers */ +static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +{ + return prandom_u32(); +} + +static bool 
convert_bpf_extensions(struct sock_filter *fp, + struct sock_filter_int **insnp) +{ + struct sock_filter_int *insn = *insnp; + + switch (fp->k) { + case SKF_AD_OFF + SKF_AD_PROTOCOL: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); + + /* A = *(u16 *) (CTX + offsetof(protocol)) */ + *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, protocol)); + /* A = ntohs(A) [emitting a nop or swap16] */ + *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); + break; + + case SKF_AD_OFF + SKF_AD_PKTTYPE: + *insn = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, + pkt_type_offset()); + if (insn->off < 0) + return false; + insn++; + *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX); +#ifdef __BIG_ENDIAN_BITFIELD + insn++; + *insn = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 5); +#endif + break; + + case SKF_AD_OFF + SKF_AD_IFINDEX: + case SKF_AD_OFF + SKF_AD_HATYPE: + BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); + BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); + BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0); + + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), + BPF_REG_TMP, BPF_REG_CTX, + offsetof(struct sk_buff, dev)); + /* if (tmp != 0) goto pc + 1 */ + *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1); + *insn++ = BPF_EXIT_INSN(); + if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) + *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP, + offsetof(struct net_device, ifindex)); + else + *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP, + offsetof(struct net_device, type)); + break; + + case SKF_AD_OFF + SKF_AD_MARK: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); + + *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, mark)); + break; + + case SKF_AD_OFF + SKF_AD_RXHASH: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); + + *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, hash)); + break; + + case SKF_AD_OFF + SKF_AD_QUEUE: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); + + *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, queue_mapping)); + break; + + case SKF_AD_OFF + SKF_AD_VLAN_TAG: + case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); + BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + + /* A = *(u16 *) (CTX + offsetof(vlan_tci)) */ + *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, vlan_tci)); + if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) { + *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, + ~VLAN_TAG_PRESENT); + } else { + /* A >>= 12 */ + *insn++ = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 12); + /* A &= 1 */ + *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 1); + } + break; + + case SKF_AD_OFF + SKF_AD_PAY_OFFSET: + case SKF_AD_OFF + SKF_AD_NLATTR: + case SKF_AD_OFF + SKF_AD_NLATTR_NEST: + case SKF_AD_OFF + SKF_AD_CPU: + case SKF_AD_OFF + SKF_AD_RANDOM: + /* arg1 = CTX */ + *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); + /* arg2 = A */ + *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A); + /* arg3 = X */ + *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X); + /* Emit call(arg1=CTX, arg2=A, arg3=X) */ + switch (fp->k) { + case SKF_AD_OFF + SKF_AD_PAY_OFFSET: + *insn = BPF_EMIT_CALL(__skb_get_pay_offset); + break; + case SKF_AD_OFF + SKF_AD_NLATTR: + *insn = BPF_EMIT_CALL(__skb_get_nlattr); + break; + case SKF_AD_OFF + SKF_AD_NLATTR_NEST: + *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest); + break; + case SKF_AD_OFF + SKF_AD_CPU: 
+ *insn = BPF_EMIT_CALL(__get_raw_cpu_id); + break; + case SKF_AD_OFF + SKF_AD_RANDOM: + *insn = BPF_EMIT_CALL(__get_random_u32); + break; + } + break; + + case SKF_AD_OFF + SKF_AD_ALU_XOR_X: + /* A ^= X */ + *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X); + break; + + default: + /* This is just a dummy call to avoid letting the compiler + * evict __bpf_call_base() as an optimization. Placed here + * where no-one bothers. + */ + BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0); + return false; + } + + *insnp = insn; + return true; +} + +/** + * sk_convert_filter - convert filter program + * @prog: the user passed filter program + * @len: the length of the user passed filter program + * @new_prog: buffer where converted program will be stored + * @new_len: pointer to store length of converted program + * + * Remap 'sock_filter' style BPF instruction set to 'sock_filter_ext' style. + * Conversion workflow: + * + * 1) First pass for calculating the new program length: + * sk_convert_filter(old_prog, old_len, NULL, &new_len) + * + * 2) 2nd pass to remap in two passes: 1st pass finds new + * jump offsets, 2nd pass remapping: + * new_prog = kmalloc(sizeof(struct sock_filter_int) * new_len); + * sk_convert_filter(old_prog, old_len, new_prog, &new_len); + * + * User BPF's register A is mapped to our BPF register 6, user BPF + * register X is mapped to BPF register 7; frame pointer is always + * register 10; Context 'void *ctx' is stored in register 1, that is, + * for socket filters: ctx == 'struct sk_buff *', for seccomp: + * ctx == 'struct seccomp_data *'. + */ +int sk_convert_filter(struct sock_filter *prog, int len, + struct sock_filter_int *new_prog, int *new_len) +{ + int new_flen = 0, pass = 0, target, i; + struct sock_filter_int *new_insn; + struct sock_filter *fp; + int *addrs = NULL; + u8 bpf_src; + + BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK); + BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); + + if (len <= 0 || len > BPF_MAXINSNS) + return -EINVAL; + + if (new_prog) { + addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL); + if (!addrs) + return -ENOMEM; + } + +do_pass: + new_insn = new_prog; + fp = prog; + + if (new_insn) + *new_insn = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); + new_insn++; + + for (i = 0; i < len; fp++, i++) { + struct sock_filter_int tmp_insns[6] = { }; + struct sock_filter_int *insn = tmp_insns; + + if (addrs) + addrs[i] = new_insn - new_prog; + + switch (fp->code) { + /* All arithmetic insns and skb loads map as-is. */ + case BPF_ALU | BPF_ADD | BPF_X: + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU | BPF_SUB | BPF_X: + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU | BPF_AND | BPF_X: + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU | BPF_OR | BPF_X: + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU | BPF_LSH | BPF_X: + case BPF_ALU | BPF_LSH | BPF_K: + case BPF_ALU | BPF_RSH | BPF_X: + case BPF_ALU | BPF_RSH | BPF_K: + case BPF_ALU | BPF_XOR | BPF_X: + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU | BPF_MUL | BPF_X: + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU | BPF_DIV | BPF_X: + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_MOD | BPF_X: + case BPF_ALU | BPF_MOD | BPF_K: + case BPF_ALU | BPF_NEG: + case BPF_LD | BPF_ABS | BPF_W: + case BPF_LD | BPF_ABS | BPF_H: + case BPF_LD | BPF_ABS | BPF_B: + case BPF_LD | BPF_IND | BPF_W: + case BPF_LD | BPF_IND | BPF_H: + case BPF_LD | BPF_IND | BPF_B: + /* Check for overloaded BPF extension and + * directly convert it if found, otherwise + * just move on with mapping. 
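sk_convert_filter() is therefore called twice, exactly as its kernel-doc above describes: once with a NULL output buffer to size the converted program, then again to emit it. A sketch of that calling convention with error handling trimmed:

    struct sock_filter_int *new_prog;
    int new_len;

    /* Pass 1: compute the converted length only */
    if (sk_convert_filter(old_prog, old_len, NULL, &new_len))
            return -EINVAL;         /* classic program is malformed */

    new_prog = kmalloc(sizeof(*new_prog) * new_len, GFP_KERNEL);
    if (!new_prog)
            return -ENOMEM;

    /* Pass 2: remap instructions and fix up jump offsets */
    if (sk_convert_filter(old_prog, old_len, new_prog, &new_len)) {
            kfree(new_prog);
            return -EINVAL;
    }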
+ */ + if (BPF_CLASS(fp->code) == BPF_LD && + BPF_MODE(fp->code) == BPF_ABS && + convert_bpf_extensions(fp, &insn)) + break; + + *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); + break; + + /* Jump transformation cannot use BPF block macros + * everywhere as offset calculation and target updates + * require a bit more work than the rest, i.e. jump + * opcodes map as-is, but offsets need adjustment. + */ + +#define BPF_EMIT_JMP \ + do { \ + if (target >= len || target < 0) \ + goto err; \ + insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0; \ + /* Adjust pc relative offset for 2nd or 3rd insn. */ \ + insn->off -= insn - tmp_insns; \ + } while (0) + + case BPF_JMP | BPF_JA: + target = i + fp->k + 1; + insn->code = fp->code; + BPF_EMIT_JMP; + break; + + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) { + /* BPF immediates are signed, zero extend + * immediate into tmp register and use it + * in compare insn. + */ + *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k); + + insn->dst_reg = BPF_REG_A; + insn->src_reg = BPF_REG_TMP; + bpf_src = BPF_X; + } else { + insn->dst_reg = BPF_REG_A; + insn->src_reg = BPF_REG_X; + insn->imm = fp->k; + bpf_src = BPF_SRC(fp->code); + } + + /* Common case where 'jump_false' is next insn. */ + if (fp->jf == 0) { + insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; + target = i + fp->jt + 1; + BPF_EMIT_JMP; + break; + } + + /* Convert JEQ into JNE when 'jump_true' is next insn. */ + if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) { + insn->code = BPF_JMP | BPF_JNE | bpf_src; + target = i + fp->jf + 1; + BPF_EMIT_JMP; + break; + } + + /* Other jumps are mapped into two insns: Jxx and JA. */ + target = i + fp->jt + 1; + insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; + BPF_EMIT_JMP; + insn++; + + insn->code = BPF_JMP | BPF_JA; + target = i + fp->jf + 1; + BPF_EMIT_JMP; + break; + + /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ + case BPF_LDX | BPF_MSH | BPF_B: + /* tmp = A */ + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A); + /* A = BPF_R0 = *(u8 *) (skb->data + K) */ + *insn++ = BPF_LD_ABS(BPF_B, fp->k); + /* A &= 0xf */ + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); + /* A <<= 2 */ + *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); + /* X = A */ + *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); + /* A = tmp */ + *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); + break; + + /* RET_K, RET_A are remaped into 2 insns. */ + case BPF_RET | BPF_A: + case BPF_RET | BPF_K: + *insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ? + BPF_K : BPF_X, BPF_REG_0, + BPF_REG_A, fp->k); + *insn = BPF_EXIT_INSN(); + break; + + /* Store to stack. */ + case BPF_ST: + case BPF_STX: + *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == + BPF_ST ? BPF_REG_A : BPF_REG_X, + -(BPF_MEMWORDS - fp->k) * 4); + break; + + /* Load from stack. */ + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: + *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? + BPF_REG_A : BPF_REG_X, BPF_REG_FP, + -(BPF_MEMWORDS - fp->k) * 4); + break; + + /* A = K or X = K */ + case BPF_LD | BPF_IMM: + case BPF_LDX | BPF_IMM: + *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ? 
+ BPF_REG_A : BPF_REG_X, fp->k); + break; + + /* X = A */ + case BPF_MISC | BPF_TAX: + *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); + break; + + /* A = X */ + case BPF_MISC | BPF_TXA: + *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X); + break; + + /* A = skb->len or X = skb->len */ + case BPF_LD | BPF_W | BPF_LEN: + case BPF_LDX | BPF_W | BPF_LEN: + *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? + BPF_REG_A : BPF_REG_X, BPF_REG_CTX, + offsetof(struct sk_buff, len)); + break; + + /* Access seccomp_data fields. */ + case BPF_LDX | BPF_ABS | BPF_W: + /* A = *(u32 *) (ctx + K) */ + *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k); + break; + + /* Unkown instruction. */ default: - WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n", - fentry->code, fentry->jt, - fentry->jf, fentry->k); - return 0; + goto err; } + + insn++; + if (new_prog) + memcpy(new_insn, tmp_insns, + sizeof(*insn) * (insn - tmp_insns)); + new_insn += insn - tmp_insns; } + if (!new_prog) { + /* Only calculating new length. */ + *new_len = new_insn - new_prog; + return 0; + } + + pass++; + if (new_flen != new_insn - new_prog) { + new_flen = new_insn - new_prog; + if (pass > 2) + goto err; + goto do_pass; + } + + kfree(addrs); + BUG_ON(*new_len != new_flen); return 0; +err: + kfree(addrs); + return -EINVAL; } -EXPORT_SYMBOL(sk_run_filter); -/* - * Security : +/* Security: + * * A BPF program is able to use 16 cells of memory to store intermediate - * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()) + * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()). + * * As we dont want to clear mem[] array for each packet going through * sk_run_filter(), we check that filter loaded by user never try to read * a cell if not previously written, and we check all branches to be sure @@ -411,44 +1096,46 @@ EXPORT_SYMBOL(sk_run_filter); */ static int check_load_and_stores(struct sock_filter *filter, int flen) { - u16 *masks, memvalid = 0; /* one bit per cell, 16 cells */ + u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */ int pc, ret = 0; BUILD_BUG_ON(BPF_MEMWORDS > 16); - masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL); + + masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL); if (!masks) return -ENOMEM; + memset(masks, 0xff, flen * sizeof(*masks)); for (pc = 0; pc < flen; pc++) { memvalid &= masks[pc]; switch (filter[pc].code) { - case BPF_S_ST: - case BPF_S_STX: + case BPF_ST: + case BPF_STX: memvalid |= (1 << filter[pc].k); break; - case BPF_S_LD_MEM: - case BPF_S_LDX_MEM: + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: if (!(memvalid & (1 << filter[pc].k))) { ret = -EINVAL; goto error; } break; - case BPF_S_JMP_JA: - /* a jump must set masks on target */ + case BPF_JMP | BPF_JA: + /* A jump must set masks on target */ masks[pc + 1 + filter[pc].k] &= memvalid; memvalid = ~0; break; - case BPF_S_JMP_JEQ_K: - case BPF_S_JMP_JEQ_X: - case BPF_S_JMP_JGE_K: - case BPF_S_JMP_JGE_X: - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGT_X: - case BPF_S_JMP_JSET_X: - case BPF_S_JMP_JSET_K: - /* a jump must set masks on targets */ + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + /* A jump must set masks on targets */ masks[pc + 1 + filter[pc].jt] &= memvalid; masks[pc + 1 + filter[pc].jf] &= memvalid; memvalid = ~0; @@ -460,6 +1147,72 @@ error: return ret; } +static bool 
chk_code_allowed(u16 code_to_probe) +{ + static const bool codes[] = { + /* 32 bit ALU operations */ + [BPF_ALU | BPF_ADD | BPF_K] = true, + [BPF_ALU | BPF_ADD | BPF_X] = true, + [BPF_ALU | BPF_SUB | BPF_K] = true, + [BPF_ALU | BPF_SUB | BPF_X] = true, + [BPF_ALU | BPF_MUL | BPF_K] = true, + [BPF_ALU | BPF_MUL | BPF_X] = true, + [BPF_ALU | BPF_DIV | BPF_K] = true, + [BPF_ALU | BPF_DIV | BPF_X] = true, + [BPF_ALU | BPF_MOD | BPF_K] = true, + [BPF_ALU | BPF_MOD | BPF_X] = true, + [BPF_ALU | BPF_AND | BPF_K] = true, + [BPF_ALU | BPF_AND | BPF_X] = true, + [BPF_ALU | BPF_OR | BPF_K] = true, + [BPF_ALU | BPF_OR | BPF_X] = true, + [BPF_ALU | BPF_XOR | BPF_K] = true, + [BPF_ALU | BPF_XOR | BPF_X] = true, + [BPF_ALU | BPF_LSH | BPF_K] = true, + [BPF_ALU | BPF_LSH | BPF_X] = true, + [BPF_ALU | BPF_RSH | BPF_K] = true, + [BPF_ALU | BPF_RSH | BPF_X] = true, + [BPF_ALU | BPF_NEG] = true, + /* Load instructions */ + [BPF_LD | BPF_W | BPF_ABS] = true, + [BPF_LD | BPF_H | BPF_ABS] = true, + [BPF_LD | BPF_B | BPF_ABS] = true, + [BPF_LD | BPF_W | BPF_LEN] = true, + [BPF_LD | BPF_W | BPF_IND] = true, + [BPF_LD | BPF_H | BPF_IND] = true, + [BPF_LD | BPF_B | BPF_IND] = true, + [BPF_LD | BPF_IMM] = true, + [BPF_LD | BPF_MEM] = true, + [BPF_LDX | BPF_W | BPF_LEN] = true, + [BPF_LDX | BPF_B | BPF_MSH] = true, + [BPF_LDX | BPF_IMM] = true, + [BPF_LDX | BPF_MEM] = true, + /* Store instructions */ + [BPF_ST] = true, + [BPF_STX] = true, + /* Misc instructions */ + [BPF_MISC | BPF_TAX] = true, + [BPF_MISC | BPF_TXA] = true, + /* Return instructions */ + [BPF_RET | BPF_K] = true, + [BPF_RET | BPF_A] = true, + /* Jump instructions */ + [BPF_JMP | BPF_JA] = true, + [BPF_JMP | BPF_JEQ | BPF_K] = true, + [BPF_JMP | BPF_JEQ | BPF_X] = true, + [BPF_JMP | BPF_JGE | BPF_K] = true, + [BPF_JMP | BPF_JGE | BPF_X] = true, + [BPF_JMP | BPF_JGT | BPF_K] = true, + [BPF_JMP | BPF_JGT | BPF_X] = true, + [BPF_JMP | BPF_JSET | BPF_K] = true, + [BPF_JMP | BPF_JSET | BPF_X] = true, + }; + + if (code_to_probe >= ARRAY_SIZE(codes)) + return false; + + return codes[code_to_probe]; +} + /** * sk_chk_filter - verify socket filter code * @filter: filter to verify @@ -476,185 +1229,303 @@ error: */ int sk_chk_filter(struct sock_filter *filter, unsigned int flen) { - /* - * Valid instructions are initialized to non-0. - * Invalid instructions are initialized to 0. 
- */ - static const u8 codes[] = { - [BPF_ALU|BPF_ADD|BPF_K] = BPF_S_ALU_ADD_K, - [BPF_ALU|BPF_ADD|BPF_X] = BPF_S_ALU_ADD_X, - [BPF_ALU|BPF_SUB|BPF_K] = BPF_S_ALU_SUB_K, - [BPF_ALU|BPF_SUB|BPF_X] = BPF_S_ALU_SUB_X, - [BPF_ALU|BPF_MUL|BPF_K] = BPF_S_ALU_MUL_K, - [BPF_ALU|BPF_MUL|BPF_X] = BPF_S_ALU_MUL_X, - [BPF_ALU|BPF_DIV|BPF_X] = BPF_S_ALU_DIV_X, - [BPF_ALU|BPF_MOD|BPF_K] = BPF_S_ALU_MOD_K, - [BPF_ALU|BPF_MOD|BPF_X] = BPF_S_ALU_MOD_X, - [BPF_ALU|BPF_AND|BPF_K] = BPF_S_ALU_AND_K, - [BPF_ALU|BPF_AND|BPF_X] = BPF_S_ALU_AND_X, - [BPF_ALU|BPF_OR|BPF_K] = BPF_S_ALU_OR_K, - [BPF_ALU|BPF_OR|BPF_X] = BPF_S_ALU_OR_X, - [BPF_ALU|BPF_XOR|BPF_K] = BPF_S_ALU_XOR_K, - [BPF_ALU|BPF_XOR|BPF_X] = BPF_S_ALU_XOR_X, - [BPF_ALU|BPF_LSH|BPF_K] = BPF_S_ALU_LSH_K, - [BPF_ALU|BPF_LSH|BPF_X] = BPF_S_ALU_LSH_X, - [BPF_ALU|BPF_RSH|BPF_K] = BPF_S_ALU_RSH_K, - [BPF_ALU|BPF_RSH|BPF_X] = BPF_S_ALU_RSH_X, - [BPF_ALU|BPF_NEG] = BPF_S_ALU_NEG, - [BPF_LD|BPF_W|BPF_ABS] = BPF_S_LD_W_ABS, - [BPF_LD|BPF_H|BPF_ABS] = BPF_S_LD_H_ABS, - [BPF_LD|BPF_B|BPF_ABS] = BPF_S_LD_B_ABS, - [BPF_LD|BPF_W|BPF_LEN] = BPF_S_LD_W_LEN, - [BPF_LD|BPF_W|BPF_IND] = BPF_S_LD_W_IND, - [BPF_LD|BPF_H|BPF_IND] = BPF_S_LD_H_IND, - [BPF_LD|BPF_B|BPF_IND] = BPF_S_LD_B_IND, - [BPF_LD|BPF_IMM] = BPF_S_LD_IMM, - [BPF_LDX|BPF_W|BPF_LEN] = BPF_S_LDX_W_LEN, - [BPF_LDX|BPF_B|BPF_MSH] = BPF_S_LDX_B_MSH, - [BPF_LDX|BPF_IMM] = BPF_S_LDX_IMM, - [BPF_MISC|BPF_TAX] = BPF_S_MISC_TAX, - [BPF_MISC|BPF_TXA] = BPF_S_MISC_TXA, - [BPF_RET|BPF_K] = BPF_S_RET_K, - [BPF_RET|BPF_A] = BPF_S_RET_A, - [BPF_ALU|BPF_DIV|BPF_K] = BPF_S_ALU_DIV_K, - [BPF_LD|BPF_MEM] = BPF_S_LD_MEM, - [BPF_LDX|BPF_MEM] = BPF_S_LDX_MEM, - [BPF_ST] = BPF_S_ST, - [BPF_STX] = BPF_S_STX, - [BPF_JMP|BPF_JA] = BPF_S_JMP_JA, - [BPF_JMP|BPF_JEQ|BPF_K] = BPF_S_JMP_JEQ_K, - [BPF_JMP|BPF_JEQ|BPF_X] = BPF_S_JMP_JEQ_X, - [BPF_JMP|BPF_JGE|BPF_K] = BPF_S_JMP_JGE_K, - [BPF_JMP|BPF_JGE|BPF_X] = BPF_S_JMP_JGE_X, - [BPF_JMP|BPF_JGT|BPF_K] = BPF_S_JMP_JGT_K, - [BPF_JMP|BPF_JGT|BPF_X] = BPF_S_JMP_JGT_X, - [BPF_JMP|BPF_JSET|BPF_K] = BPF_S_JMP_JSET_K, - [BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X, - }; + bool anc_found; int pc; if (flen == 0 || flen > BPF_MAXINSNS) return -EINVAL; - /* check the filter code now */ + /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { struct sock_filter *ftest = &filter[pc]; - u16 code = ftest->code; - if (code >= ARRAY_SIZE(codes)) - return -EINVAL; - code = codes[code]; - if (!code) + /* May we actually operate on this code? */ + if (!chk_code_allowed(ftest->code)) return -EINVAL; + /* Some instructions need special checks */ - switch (code) { - case BPF_S_ALU_DIV_K: - /* check for division by zero */ + switch (ftest->code) { + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_MOD | BPF_K: + /* Check for division by zero */ if (ftest->k == 0) return -EINVAL; - ftest->k = reciprocal_value(ftest->k); break; - case BPF_S_ALU_MOD_K: - /* check for division by zero */ - if (ftest->k == 0) - return -EINVAL; - break; - case BPF_S_LD_MEM: - case BPF_S_LDX_MEM: - case BPF_S_ST: - case BPF_S_STX: - /* check for invalid memory addresses */ + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: + case BPF_ST: + case BPF_STX: + /* Check for invalid memory addresses */ if (ftest->k >= BPF_MEMWORDS) return -EINVAL; break; - case BPF_S_JMP_JA: - /* - * Note, the large ftest->k might cause loops. + case BPF_JMP | BPF_JA: + /* Note, the large ftest->k might cause loops. * Compare this with conditional jumps below, * where offsets are limited. 
--ANK (981016) */ - if (ftest->k >= (unsigned int)(flen-pc-1)) + if (ftest->k >= (unsigned int)(flen - pc - 1)) return -EINVAL; break; - case BPF_S_JMP_JEQ_K: - case BPF_S_JMP_JEQ_X: - case BPF_S_JMP_JGE_K: - case BPF_S_JMP_JGE_X: - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGT_X: - case BPF_S_JMP_JSET_X: - case BPF_S_JMP_JSET_K: - /* for conditionals both must be safe */ + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + /* Both conditionals must be safe */ if (pc + ftest->jt + 1 >= flen || pc + ftest->jf + 1 >= flen) return -EINVAL; break; - case BPF_S_LD_W_ABS: - case BPF_S_LD_H_ABS: - case BPF_S_LD_B_ABS: -#define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \ - code = BPF_S_ANC_##CODE; \ - break - switch (ftest->k) { - ANCILLARY(PROTOCOL); - ANCILLARY(PKTTYPE); - ANCILLARY(IFINDEX); - ANCILLARY(NLATTR); - ANCILLARY(NLATTR_NEST); - ANCILLARY(MARK); - ANCILLARY(QUEUE); - ANCILLARY(HATYPE); - ANCILLARY(RXHASH); - ANCILLARY(CPU); - ANCILLARY(ALU_XOR_X); - ANCILLARY(VLAN_TAG); - ANCILLARY(VLAN_TAG_PRESENT); - } + case BPF_LD | BPF_W | BPF_ABS: + case BPF_LD | BPF_H | BPF_ABS: + case BPF_LD | BPF_B | BPF_ABS: + anc_found = false; + if (bpf_anc_helper(ftest) & BPF_ANC) + anc_found = true; + /* Ancillary operation unknown or unsupported */ + if (anc_found == false && ftest->k >= SKF_AD_OFF) + return -EINVAL; } - ftest->code = code; } - /* last instruction must be a RET code */ + /* Last instruction must be a RET code */ switch (filter[flen - 1].code) { - case BPF_S_RET_K: - case BPF_S_RET_A: + case BPF_RET | BPF_K: + case BPF_RET | BPF_A: return check_load_and_stores(filter, flen); } + return -EINVAL; } EXPORT_SYMBOL(sk_chk_filter); +static int sk_store_orig_filter(struct sk_filter *fp, + const struct sock_fprog *fprog) +{ + unsigned int fsize = sk_filter_proglen(fprog); + struct sock_fprog_kern *fkprog; + + fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL); + if (!fp->orig_prog) + return -ENOMEM; + + fkprog = fp->orig_prog; + fkprog->len = fprog->len; + fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL); + if (!fkprog->filter) { + kfree(fp->orig_prog); + return -ENOMEM; + } + + return 0; +} + +static void sk_release_orig_filter(struct sk_filter *fp) +{ + struct sock_fprog_kern *fprog = fp->orig_prog; + + if (fprog) { + kfree(fprog->filter); + kfree(fprog); + } +} + /** * sk_filter_release_rcu - Release a socket filter by rcu_head * @rcu: rcu_head that contains the sk_filter to free */ -void sk_filter_release_rcu(struct rcu_head *rcu) +static void sk_filter_release_rcu(struct rcu_head *rcu) { struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); + sk_release_orig_filter(fp); + sk_filter_free(fp); +} + +/** + * sk_filter_release - release a socket filter + * @fp: filter to remove + * + * Remove a filter from a socket and release its resources. 
+ */ +static void sk_filter_release(struct sk_filter *fp) +{ + if (atomic_dec_and_test(&fp->refcnt)) + call_rcu(&fp->rcu, sk_filter_release_rcu); +} + +void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) +{ + atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc); + sk_filter_release(fp); +} + +void sk_filter_charge(struct sock *sk, struct sk_filter *fp) +{ + atomic_inc(&fp->refcnt); + atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc); +} + +static struct sk_filter *__sk_migrate_realloc(struct sk_filter *fp, + struct sock *sk, + unsigned int len) +{ + struct sk_filter *fp_new; + + if (sk == NULL) + return krealloc(fp, len, GFP_KERNEL); + + fp_new = sock_kmalloc(sk, len, GFP_KERNEL); + if (fp_new) { + *fp_new = *fp; + /* As we're keeping orig_prog in fp_new along, + * we need to make sure we're not evicting it + * from the old fp. + */ + fp->orig_prog = NULL; + sk_filter_uncharge(sk, fp); + } + + return fp_new; +} + +static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, + struct sock *sk) +{ + struct sock_filter *old_prog; + struct sk_filter *old_fp; + int err, new_len, old_len = fp->len; + + /* We are free to overwrite insns et al right here as it + * won't be used at this point in time anymore internally + * after the migration to the internal BPF instruction + * representation. + */ + BUILD_BUG_ON(sizeof(struct sock_filter) != + sizeof(struct sock_filter_int)); + + /* Conversion cannot happen on overlapping memory areas, + * so we need to keep the user BPF around until the 2nd + * pass. At this time, the user BPF is stored in fp->insns. + */ + old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter), + GFP_KERNEL); + if (!old_prog) { + err = -ENOMEM; + goto out_err; + } + + /* 1st pass: calculate the new program length. */ + err = sk_convert_filter(old_prog, old_len, NULL, &new_len); + if (err) + goto out_err_free; + + /* Expand fp for appending the new filter representation. */ + old_fp = fp; + fp = __sk_migrate_realloc(old_fp, sk, sk_filter_size(new_len)); + if (!fp) { + /* The old_fp is still around in case we couldn't + * allocate new memory, so uncharge on that one. + */ + fp = old_fp; + err = -ENOMEM; + goto out_err_free; + } + + fp->len = new_len; + + /* 2nd pass: remap sock_filter insns into sock_filter_int insns. */ + err = sk_convert_filter(old_prog, old_len, fp->insnsi, &new_len); + if (err) + /* 2nd sk_convert_filter() can fail only if it fails + * to allocate memory, remapping must succeed. Note, + * that at this time old_fp has already been released + * by __sk_migrate_realloc(). + */ + goto out_err_free; + + sk_filter_select_runtime(fp); + + kfree(old_prog); + return fp; + +out_err_free: + kfree(old_prog); +out_err: + /* Rollback filter setup. 
*/ + if (sk != NULL) + sk_filter_uncharge(sk, fp); + else + kfree(fp); + return ERR_PTR(err); +} + +void __weak bpf_int_jit_compile(struct sk_filter *prog) +{ +} + +/** + * sk_filter_select_runtime - select execution runtime for BPF program + * @fp: sk_filter populated with internal BPF program + * + * try to JIT internal BPF program, if JIT is not available select interpreter + * BPF program will be executed via SK_RUN_FILTER() macro + */ +void sk_filter_select_runtime(struct sk_filter *fp) +{ + fp->bpf_func = (void *) __sk_run_filter; + + /* Probe if internal BPF can be JITed */ + bpf_int_jit_compile(fp); +} +EXPORT_SYMBOL_GPL(sk_filter_select_runtime); + +/* free internal BPF program */ +void sk_filter_free(struct sk_filter *fp) +{ bpf_jit_free(fp); - kfree(fp); } -EXPORT_SYMBOL(sk_filter_release_rcu); +EXPORT_SYMBOL_GPL(sk_filter_free); -static int __sk_prepare_filter(struct sk_filter *fp) +static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, + struct sock *sk) { int err; - fp->bpf_func = sk_run_filter; + fp->bpf_func = NULL; + fp->jited = 0; err = sk_chk_filter(fp->insns, fp->len); - if (err) - return err; + if (err) { + if (sk != NULL) + sk_filter_uncharge(sk, fp); + else + kfree(fp); + return ERR_PTR(err); + } + /* Probe if we can JIT compile the filter and if so, do + * the compilation of the filter. + */ bpf_jit_compile(fp); - return 0; + + /* JIT compiler couldn't process this filter, so do the + * internal BPF translation for the optimized interpreter. + */ + if (!fp->jited) + fp = __sk_migrate_filter(fp, sk); + + return fp; } /** * sk_unattached_filter_create - create an unattached filter - * @fprog: the filter program * @pfp: the unattached filter that is created + * @fprog: the filter program * * Create a filter independent of any socket. We first run some * sanity checks on it to make sure it does not explode on us later. @@ -662,33 +1533,38 @@ static int __sk_prepare_filter(struct sk_filter *fp) * a negative errno code is returned. On success the return is zero. */ int sk_unattached_filter_create(struct sk_filter **pfp, - struct sock_fprog *fprog) + struct sock_fprog_kern *fprog) { + unsigned int fsize = sk_filter_proglen(fprog); struct sk_filter *fp; - unsigned int fsize = sizeof(struct sock_filter) * fprog->len; - int err; /* Make sure new filter is there and in the right amounts. */ if (fprog->filter == NULL) return -EINVAL; - fp = kmalloc(fsize + sizeof(*fp), GFP_KERNEL); + fp = kmalloc(sk_filter_size(fprog->len), GFP_KERNEL); if (!fp) return -ENOMEM; + memcpy(fp->insns, fprog->filter, fsize); atomic_set(&fp->refcnt, 1); fp->len = fprog->len; + /* Since unattached filters are not copied back to user + * space through sk_get_filter(), we do not need to hold + * a copy here, and can spare us the work. + */ + fp->orig_prog = NULL; - err = __sk_prepare_filter(fp); - if (err) - goto free_mem; + /* __sk_prepare_filter() already takes care of uncharging + * memory in case something goes wrong. 
+ */ + fp = __sk_prepare_filter(fp, NULL); + if (IS_ERR(fp)) + return PTR_ERR(fp); *pfp = fp; return 0; -free_mem: - kfree(fp); - return err; } EXPORT_SYMBOL_GPL(sk_unattached_filter_create); @@ -711,36 +1587,49 @@ EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct sk_filter *fp, *old_fp; - unsigned int fsize = sizeof(struct sock_filter) * fprog->len; + unsigned int fsize = sk_filter_proglen(fprog); + unsigned int sk_fsize = sk_filter_size(fprog->len); int err; + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return -EPERM; + /* Make sure new filter is there and in the right amounts. */ if (fprog->filter == NULL) return -EINVAL; - fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); + fp = sock_kmalloc(sk, sk_fsize, GFP_KERNEL); if (!fp) return -ENOMEM; + if (copy_from_user(fp->insns, fprog->filter, fsize)) { - sock_kfree_s(sk, fp, fsize+sizeof(*fp)); + sock_kfree_s(sk, fp, sk_fsize); return -EFAULT; } atomic_set(&fp->refcnt, 1); fp->len = fprog->len; - err = __sk_prepare_filter(fp); + err = sk_store_orig_filter(fp, fprog); if (err) { sk_filter_uncharge(sk, fp); - return err; + return -ENOMEM; } + /* __sk_prepare_filter() already takes care of uncharging + * memory in case something goes wrong. + */ + fp = __sk_prepare_filter(fp, sk); + if (IS_ERR(fp)) + return PTR_ERR(fp); + old_fp = rcu_dereference_protected(sk->sk_filter, sock_owned_by_user(sk)); rcu_assign_pointer(sk->sk_filter, fp); if (old_fp) sk_filter_uncharge(sk, old_fp); + return 0; } EXPORT_SYMBOL_GPL(sk_attach_filter); @@ -750,6 +1639,9 @@ int sk_detach_filter(struct sock *sk) int ret = -ENOENT; struct sk_filter *filter; + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return -EPERM; + filter = rcu_dereference_protected(sk->sk_filter, sock_owned_by_user(sk)); if (filter) { @@ -757,135 +1649,46 @@ int sk_detach_filter(struct sock *sk) sk_filter_uncharge(sk, filter); ret = 0; } + return ret; } EXPORT_SYMBOL_GPL(sk_detach_filter); -static void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to) -{ - static const u16 decodes[] = { - [BPF_S_ALU_ADD_K] = BPF_ALU|BPF_ADD|BPF_K, - [BPF_S_ALU_ADD_X] = BPF_ALU|BPF_ADD|BPF_X, - [BPF_S_ALU_SUB_K] = BPF_ALU|BPF_SUB|BPF_K, - [BPF_S_ALU_SUB_X] = BPF_ALU|BPF_SUB|BPF_X, - [BPF_S_ALU_MUL_K] = BPF_ALU|BPF_MUL|BPF_K, - [BPF_S_ALU_MUL_X] = BPF_ALU|BPF_MUL|BPF_X, - [BPF_S_ALU_DIV_X] = BPF_ALU|BPF_DIV|BPF_X, - [BPF_S_ALU_MOD_K] = BPF_ALU|BPF_MOD|BPF_K, - [BPF_S_ALU_MOD_X] = BPF_ALU|BPF_MOD|BPF_X, - [BPF_S_ALU_AND_K] = BPF_ALU|BPF_AND|BPF_K, - [BPF_S_ALU_AND_X] = BPF_ALU|BPF_AND|BPF_X, - [BPF_S_ALU_OR_K] = BPF_ALU|BPF_OR|BPF_K, - [BPF_S_ALU_OR_X] = BPF_ALU|BPF_OR|BPF_X, - [BPF_S_ALU_XOR_K] = BPF_ALU|BPF_XOR|BPF_K, - [BPF_S_ALU_XOR_X] = BPF_ALU|BPF_XOR|BPF_X, - [BPF_S_ALU_LSH_K] = BPF_ALU|BPF_LSH|BPF_K, - [BPF_S_ALU_LSH_X] = BPF_ALU|BPF_LSH|BPF_X, - [BPF_S_ALU_RSH_K] = BPF_ALU|BPF_RSH|BPF_K, - [BPF_S_ALU_RSH_X] = BPF_ALU|BPF_RSH|BPF_X, - [BPF_S_ALU_NEG] = BPF_ALU|BPF_NEG, - [BPF_S_LD_W_ABS] = BPF_LD|BPF_W|BPF_ABS, - [BPF_S_LD_H_ABS] = BPF_LD|BPF_H|BPF_ABS, - [BPF_S_LD_B_ABS] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_PROTOCOL] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_PKTTYPE] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_IFINDEX] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_NLATTR] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_NLATTR_NEST] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_MARK] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_QUEUE] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_HATYPE] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_RXHASH] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_CPU] = 
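/* This entire decode table is now dead: sk_chk_filter() no longer
 * rewrites ftest->code into the private BPF_S_* opcode space, and no
 * longer replaces K with reciprocal_value(K) for divides, so the image
 * the user loaded survives verification unmodified. sk_attach_filter()
 * snapshots it through sk_store_orig_filter(), and sk_get_filter()
 * below simply copies that snapshot back -- no decode step required.
 */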
BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_ALU_XOR_X] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_SECCOMP_LD_W] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_VLAN_TAG] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_LD_W_LEN] = BPF_LD|BPF_W|BPF_LEN, - [BPF_S_LD_W_IND] = BPF_LD|BPF_W|BPF_IND, - [BPF_S_LD_H_IND] = BPF_LD|BPF_H|BPF_IND, - [BPF_S_LD_B_IND] = BPF_LD|BPF_B|BPF_IND, - [BPF_S_LD_IMM] = BPF_LD|BPF_IMM, - [BPF_S_LDX_W_LEN] = BPF_LDX|BPF_W|BPF_LEN, - [BPF_S_LDX_B_MSH] = BPF_LDX|BPF_B|BPF_MSH, - [BPF_S_LDX_IMM] = BPF_LDX|BPF_IMM, - [BPF_S_MISC_TAX] = BPF_MISC|BPF_TAX, - [BPF_S_MISC_TXA] = BPF_MISC|BPF_TXA, - [BPF_S_RET_K] = BPF_RET|BPF_K, - [BPF_S_RET_A] = BPF_RET|BPF_A, - [BPF_S_ALU_DIV_K] = BPF_ALU|BPF_DIV|BPF_K, - [BPF_S_LD_MEM] = BPF_LD|BPF_MEM, - [BPF_S_LDX_MEM] = BPF_LDX|BPF_MEM, - [BPF_S_ST] = BPF_ST, - [BPF_S_STX] = BPF_STX, - [BPF_S_JMP_JA] = BPF_JMP|BPF_JA, - [BPF_S_JMP_JEQ_K] = BPF_JMP|BPF_JEQ|BPF_K, - [BPF_S_JMP_JEQ_X] = BPF_JMP|BPF_JEQ|BPF_X, - [BPF_S_JMP_JGE_K] = BPF_JMP|BPF_JGE|BPF_K, - [BPF_S_JMP_JGE_X] = BPF_JMP|BPF_JGE|BPF_X, - [BPF_S_JMP_JGT_K] = BPF_JMP|BPF_JGT|BPF_K, - [BPF_S_JMP_JGT_X] = BPF_JMP|BPF_JGT|BPF_X, - [BPF_S_JMP_JSET_K] = BPF_JMP|BPF_JSET|BPF_K, - [BPF_S_JMP_JSET_X] = BPF_JMP|BPF_JSET|BPF_X, - }; - u16 code; - - code = filt->code; - - to->code = decodes[code]; - to->jt = filt->jt; - to->jf = filt->jf; - - if (code == BPF_S_ALU_DIV_K) { - /* - * When loaded this rule user gave us X, which was - * translated into R = r(X). Now we calculate the - * RR = r(R) and report it back. If next time this - * value is loaded and RRR = r(RR) is calculated - * then the R == RRR will be true. - * - * One exception. X == 1 translates into R == 0 and - * we can't calculate RR out of it with r(). - */ - - if (filt->k == 0) - to->k = 1; - else - to->k = reciprocal_value(filt->k); - - BUG_ON(reciprocal_value(to->k) != filt->k); - } else - to->k = filt->k; -} - -int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int len) +int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, + unsigned int len) { + struct sock_fprog_kern *fprog; struct sk_filter *filter; - int i, ret; + int ret = 0; lock_sock(sk); filter = rcu_dereference_protected(sk->sk_filter, - sock_owned_by_user(sk)); - ret = 0; + sock_owned_by_user(sk)); if (!filter) goto out; - ret = filter->len; + + /* We're copying the filter that has been originally attached, + * so no conversion/decode needed anymore. + */ + fprog = filter->orig_prog; + + ret = fprog->len; if (!len) + /* User space only enquires number of filter blocks. */ goto out; + ret = -EINVAL; - if (len < filter->len) + if (len < fprog->len) goto out; ret = -EFAULT; - for (i = 0; i < filter->len; i++) { - struct sock_filter fb; - - sk_decode_filter(&filter->insns[i], &fb); - if (copy_to_user(&ubuf[i], &fb, sizeof(fb))) - goto out; - } + if (copy_to_user(ubuf, fprog->filter, sk_filter_proglen(fprog))) + goto out; - ret = filter->len; + /* Instead of bytes, the API requests to return the number + * of filter blocks. 
+ */ + ret = fprog->len; out: release_sock(sk); return ret; diff --git a/net/core/flow.c b/net/core/flow.c index b0901ee5a00..a0348fde1fd 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -24,6 +24,7 @@ #include <net/flow.h> #include <linux/atomic.h> #include <linux/security.h> +#include <net/net_namespace.h> struct flow_cache_entry { union { @@ -38,37 +39,14 @@ struct flow_cache_entry { struct flow_cache_object *object; }; -struct flow_cache_percpu { - struct hlist_head *hash_table; - int hash_count; - u32 hash_rnd; - int hash_rnd_recalc; - struct tasklet_struct flush_tasklet; -}; - struct flow_flush_info { struct flow_cache *cache; atomic_t cpuleft; struct completion completion; }; -struct flow_cache { - u32 hash_shift; - struct flow_cache_percpu __percpu *percpu; - struct notifier_block hotcpu_notifier; - int low_watermark; - int high_watermark; - struct timer_list rnd_timer; -}; - -atomic_t flow_cache_genid = ATOMIC_INIT(0); -EXPORT_SYMBOL(flow_cache_genid); -static struct flow_cache flow_cache_global; static struct kmem_cache *flow_cachep __read_mostly; -static DEFINE_SPINLOCK(flow_cache_gc_lock); -static LIST_HEAD(flow_cache_gc_list); - #define flow_cache_hash_size(cache) (1 << (cache)->hash_shift) #define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) @@ -84,16 +62,18 @@ static void flow_cache_new_hashrnd(unsigned long arg) add_timer(&fc->rnd_timer); } -static int flow_entry_valid(struct flow_cache_entry *fle) +static int flow_entry_valid(struct flow_cache_entry *fle, + struct netns_xfrm *xfrm) { - if (atomic_read(&flow_cache_genid) != fle->genid) + if (atomic_read(&xfrm->flow_cache_genid) != fle->genid) return 0; if (fle->object && !fle->object->ops->check(fle->object)) return 0; return 1; } -static void flow_entry_kill(struct flow_cache_entry *fle) +static void flow_entry_kill(struct flow_cache_entry *fle, + struct netns_xfrm *xfrm) { if (fle->object) fle->object->ops->delete(fle->object); @@ -104,26 +84,28 @@ static void flow_cache_gc_task(struct work_struct *work) { struct list_head gc_list; struct flow_cache_entry *fce, *n; + struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm, + flow_cache_gc_work); INIT_LIST_HEAD(&gc_list); - spin_lock_bh(&flow_cache_gc_lock); - list_splice_tail_init(&flow_cache_gc_list, &gc_list); - spin_unlock_bh(&flow_cache_gc_lock); + spin_lock_bh(&xfrm->flow_cache_gc_lock); + list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list); + spin_unlock_bh(&xfrm->flow_cache_gc_lock); list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) - flow_entry_kill(fce); + flow_entry_kill(fce, xfrm); } -static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task); static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, - int deleted, struct list_head *gc_list) + int deleted, struct list_head *gc_list, + struct netns_xfrm *xfrm) { if (deleted) { fcp->hash_count -= deleted; - spin_lock_bh(&flow_cache_gc_lock); - list_splice_tail(gc_list, &flow_cache_gc_list); - spin_unlock_bh(&flow_cache_gc_lock); - schedule_work(&flow_cache_gc_work); + spin_lock_bh(&xfrm->flow_cache_gc_lock); + list_splice_tail(gc_list, &xfrm->flow_cache_gc_list); + spin_unlock_bh(&xfrm->flow_cache_gc_lock); + schedule_work(&xfrm->flow_cache_gc_work); } } @@ -132,17 +114,19 @@ static void __flow_cache_shrink(struct flow_cache *fc, int shrink_to) { struct flow_cache_entry *fle; - struct hlist_node *entry, *tmp; + struct hlist_node *tmp; LIST_HEAD(gc_list); int i, deleted = 0; + struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm, + flow_cache_global); for (i = 0; i < 
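/* Per-bucket shrink: flow_cache_hash_size(fc) is 1 << hash_shift, i.e.
 * 1024 buckets with the default shift of 10 set in flow_cache_init().
 * At most shrink_to still-valid entries survive per bucket; the rest
 * move to gc_list and are freed asynchronously by flow_cache_gc_task().
 * The owning netns is recovered via container_of() because the cache
 * is now embedded in struct netns_xfrm instead of being a global.
 */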
flow_cache_hash_size(fc); i++) { int saved = 0; - hlist_for_each_entry_safe(fle, entry, tmp, + hlist_for_each_entry_safe(fle, tmp, &fcp->hash_table[i], u.hlist) { if (saved < shrink_to && - flow_entry_valid(fle)) { + flow_entry_valid(fle, xfrm)) { saved++; } else { deleted++; @@ -152,7 +136,7 @@ static void __flow_cache_shrink(struct flow_cache *fc, } } - flow_cache_queue_garbage(fcp, deleted, &gc_list); + flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm); } static void flow_cache_shrink(struct flow_cache *fc, @@ -208,10 +192,9 @@ struct flow_cache_object * flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, flow_resolve_t resolver, void *ctx) { - struct flow_cache *fc = &flow_cache_global; + struct flow_cache *fc = &net->xfrm.flow_cache_global; struct flow_cache_percpu *fcp; struct flow_cache_entry *fle, *tfle; - struct hlist_node *entry; struct flow_cache_object *flo; size_t keysize; unsigned int hash; @@ -235,7 +218,7 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, flow_new_hash_rnd(fc, fcp); hash = flow_hash_code(fc, fcp, key, keysize); - hlist_for_each_entry(tfle, entry, &fcp->hash_table[hash], u.hlist) { + hlist_for_each_entry(tfle, &fcp->hash_table[hash], u.hlist) { if (tfle->net == net && tfle->family == family && tfle->dir == dir && @@ -259,7 +242,7 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]); fcp->hash_count++; } - } else if (likely(fle->genid == atomic_read(&flow_cache_genid))) { + } else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) { flo = fle->object; if (!flo) goto ret_object; @@ -280,13 +263,13 @@ nocache: } flo = resolver(net, key, family, dir, flo, ctx); if (fle) { - fle->genid = atomic_read(&flow_cache_genid); + fle->genid = atomic_read(&net->xfrm.flow_cache_genid); if (!IS_ERR(flo)) fle->object = flo; else fle->genid--; } else { - if (flo && !IS_ERR(flo)) + if (!IS_ERR_OR_NULL(flo)) flo->ops->delete(flo); } ret_object: @@ -301,15 +284,17 @@ static void flow_cache_flush_tasklet(unsigned long data) struct flow_cache *fc = info->cache; struct flow_cache_percpu *fcp; struct flow_cache_entry *fle; - struct hlist_node *entry, *tmp; + struct hlist_node *tmp; LIST_HEAD(gc_list); int i, deleted = 0; + struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm, + flow_cache_global); fcp = this_cpu_ptr(fc->percpu); for (i = 0; i < flow_cache_hash_size(fc); i++) { - hlist_for_each_entry_safe(fle, entry, tmp, + hlist_for_each_entry_safe(fle, tmp, &fcp->hash_table[i], u.hlist) { - if (flow_entry_valid(fle)) + if (flow_entry_valid(fle, xfrm)) continue; deleted++; @@ -318,57 +303,94 @@ static void flow_cache_flush_tasklet(unsigned long data) } } - flow_cache_queue_garbage(fcp, deleted, &gc_list); + flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm); if (atomic_dec_and_test(&info->cpuleft)) complete(&info->completion); } +/* + * Return whether a cpu needs flushing. Conservatively, we assume + * the presence of any entries means the core may require flushing, + * since the flow_cache_ops.check() function may assume it's running + * on the same core as the per-cpu cache component. 
+ */ +static int flow_cache_percpu_empty(struct flow_cache *fc, int cpu) +{ + struct flow_cache_percpu *fcp; + int i; + + fcp = per_cpu_ptr(fc->percpu, cpu); + for (i = 0; i < flow_cache_hash_size(fc); i++) + if (!hlist_empty(&fcp->hash_table[i])) + return 0; + return 1; +} + static void flow_cache_flush_per_cpu(void *data) { struct flow_flush_info *info = data; struct tasklet_struct *tasklet; - tasklet = this_cpu_ptr(&info->cache->percpu->flush_tasklet); + tasklet = &this_cpu_ptr(info->cache->percpu)->flush_tasklet; tasklet->data = (unsigned long)info; tasklet_schedule(tasklet); } -void flow_cache_flush(void) +void flow_cache_flush(struct net *net) { struct flow_flush_info info; - static DEFINE_MUTEX(flow_flush_sem); + cpumask_var_t mask; + int i, self; + + /* Track which cpus need flushing to avoid disturbing all cores. */ + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return; + cpumask_clear(mask); /* Don't want cpus going down or up during this. */ get_online_cpus(); - mutex_lock(&flow_flush_sem); - info.cache = &flow_cache_global; - atomic_set(&info.cpuleft, num_online_cpus()); + mutex_lock(&net->xfrm.flow_flush_sem); + info.cache = &net->xfrm.flow_cache_global; + for_each_online_cpu(i) + if (!flow_cache_percpu_empty(info.cache, i)) + cpumask_set_cpu(i, mask); + atomic_set(&info.cpuleft, cpumask_weight(mask)); + if (atomic_read(&info.cpuleft) == 0) + goto done; + init_completion(&info.completion); local_bh_disable(); - smp_call_function(flow_cache_flush_per_cpu, &info, 0); - flow_cache_flush_tasklet((unsigned long)&info); + self = cpumask_test_and_clear_cpu(smp_processor_id(), mask); + on_each_cpu_mask(mask, flow_cache_flush_per_cpu, &info, 0); + if (self) + flow_cache_flush_tasklet((unsigned long)&info); local_bh_enable(); wait_for_completion(&info.completion); - mutex_unlock(&flow_flush_sem); + +done: + mutex_unlock(&net->xfrm.flow_flush_sem); put_online_cpus(); + free_cpumask_var(mask); } static void flow_cache_flush_task(struct work_struct *work) { - flow_cache_flush(); -} + struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm, + flow_cache_gc_work); + struct net *net = container_of(xfrm, struct net, xfrm); -static DECLARE_WORK(flow_cache_flush_work, flow_cache_flush_task); + flow_cache_flush(net); +} -void flow_cache_flush_deferred(void) +void flow_cache_flush_deferred(struct net *net) { - schedule_work(&flow_cache_flush_work); + schedule_work(&net->xfrm.flow_cache_flush_work); } -static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu) +static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu) { struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc); @@ -386,11 +408,12 @@ static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu) return 0; } -static int __cpuinit flow_cache_cpu(struct notifier_block *nfb, +static int flow_cache_cpu(struct notifier_block *nfb, unsigned long action, void *hcpu) { - struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier); + struct flow_cache *fc = container_of(nfb, struct flow_cache, + hotcpu_notifier); int res, cpu = (unsigned long) hcpu; struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); @@ -409,9 +432,20 @@ static int __cpuinit flow_cache_cpu(struct notifier_block *nfb, return NOTIFY_OK; } -static int __init flow_cache_init(struct flow_cache *fc) +int flow_cache_init(struct net *net) { int i; + struct flow_cache *fc = &net->xfrm.flow_cache_global; + + if (!flow_cachep) + flow_cachep = 
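/* flow_cache_init() used to be a single module initcall; it now runs
 * once per network namespace, so the slab cache below is created
 * lazily on the first namespace and shared by all of them (SLAB_PANIC
 * keeps the old fail-hard behaviour of the initcall).
 */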
kmem_cache_create("flow_cache", + sizeof(struct flow_cache_entry), + 0, SLAB_PANIC, NULL); + spin_lock_init(&net->xfrm.flow_cache_gc_lock); + INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list); + INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task); + INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task); + mutex_init(&net->xfrm.flow_flush_sem); fc->hash_shift = 10; fc->low_watermark = 2 * flow_cache_hash_size(fc); @@ -421,6 +455,8 @@ static int __init flow_cache_init(struct flow_cache *fc) if (!fc->percpu) return -ENOMEM; + cpu_notifier_register_begin(); + for_each_online_cpu(i) { if (flow_cache_cpu_prepare(fc, i)) goto err; @@ -428,7 +464,9 @@ static int __init flow_cache_init(struct flow_cache *fc) fc->hotcpu_notifier = (struct notifier_block){ .notifier_call = flow_cache_cpu, }; - register_hotcpu_notifier(&fc->hotcpu_notifier); + __register_hotcpu_notifier(&fc->hotcpu_notifier); + + cpu_notifier_register_done(); setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, (unsigned long) fc); @@ -444,19 +482,30 @@ err: fcp->hash_table = NULL; } + cpu_notifier_register_done(); + free_percpu(fc->percpu); fc->percpu = NULL; return -ENOMEM; } +EXPORT_SYMBOL(flow_cache_init); -static int __init flow_cache_init_global(void) +void flow_cache_fini(struct net *net) { - flow_cachep = kmem_cache_create("flow_cache", - sizeof(struct flow_cache_entry), - 0, SLAB_PANIC, NULL); + int i; + struct flow_cache *fc = &net->xfrm.flow_cache_global; - return flow_cache_init(&flow_cache_global); -} + del_timer_sync(&fc->rnd_timer); + unregister_hotcpu_notifier(&fc->hotcpu_notifier); + + for_each_possible_cpu(i) { + struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i); + kfree(fcp->hash_table); + fcp->hash_table = NULL; + } -module_init(flow_cache_init_global); + free_percpu(fc->percpu); + fc->percpu = NULL; +} +EXPORT_SYMBOL(flow_cache_fini); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 466820b6e34..107ed12a532 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -5,6 +5,10 @@ #include <linux/if_vlan.h> #include <net/ip.h> #include <net/ipv6.h> +#include <linux/igmp.h> +#include <linux/icmp.h> +#include <linux/sctp.h> +#include <linux/dccp.h> #include <linux/if_tunnel.h> #include <linux/if_pppox.h> #include <linux/ppp_defs.h> @@ -21,9 +25,35 @@ static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *i memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst)); } +/** + * skb_flow_get_ports - extract the upper layer ports and return them + * @skb: buffer to extract the ports from + * @thoff: transport header offset + * @ip_proto: protocol for which to get port offset + * + * The function will try to retrieve the ports at offset thoff + poff where poff + * is the protocol port offset returned from proto_ports_offset + */ +__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) +{ + int poff = proto_ports_offset(ip_proto); + + if (poff >= 0) { + __be32 *ports, _ports; + + ports = skb_header_pointer(skb, thoff + poff, + sizeof(_ports), &_ports); + if (ports) + return *ports; + } + + return 0; +} +EXPORT_SYMBOL(skb_flow_get_ports); + bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) { - int poff, nhoff = skb_network_offset(skb); + int nhoff = skb_network_offset(skb); u8 ip_proto; __be16 proto = skb->protocol; @@ -31,23 +61,23 @@ bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) again: switch (proto) { - case __constant_htons(ETH_P_IP): { + case 
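/* Style cleanup throughout this switch: plain htons() replaces
 * __constant_htons() in the case labels. The kernel's htons() already
 * evaluates to an integer constant expression when its argument is
 * constant, so the explicit __constant_ variant is redundant here.
 */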
htons(ETH_P_IP): { const struct iphdr *iph; struct iphdr _iph; ip: iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); - if (!iph) + if (!iph || iph->ihl < 5) return false; + nhoff += iph->ihl * 4; + ip_proto = iph->protocol; if (ip_is_fragment(iph)) ip_proto = 0; - else - ip_proto = iph->protocol; + iph_to_flow_copy_addrs(flow, iph); - nhoff += iph->ihl * 4; break; } - case __constant_htons(ETH_P_IPV6): { + case htons(ETH_P_IPV6): { const struct ipv6hdr *iph; struct ipv6hdr _iph; ipv6: @@ -61,7 +91,8 @@ ipv6: nhoff += sizeof(struct ipv6hdr); break; } - case __constant_htons(ETH_P_8021Q): { + case htons(ETH_P_8021AD): + case htons(ETH_P_8021Q): { const struct vlan_hdr *vlan; struct vlan_hdr _vlan; @@ -73,7 +104,7 @@ ipv6: nhoff += sizeof(*vlan); goto again; } - case __constant_htons(ETH_P_PPP_SES): { + case htons(ETH_P_PPP_SES): { struct { struct pppoe_hdr hdr; __be16 proto; @@ -84,9 +115,9 @@ ipv6: proto = hdr->proto; nhoff += PPPOE_SES_HLEN; switch (proto) { - case __constant_htons(PPP_IP): + case htons(PPP_IP): goto ip; - case __constant_htons(PPP_IPV6): + case htons(PPP_IPV6): goto ipv6; default: return false; @@ -119,27 +150,256 @@ ipv6: nhoff += 4; if (hdr->flags & GRE_SEQ) nhoff += 4; + if (proto == htons(ETH_P_TEB)) { + const struct ethhdr *eth; + struct ethhdr _eth; + + eth = skb_header_pointer(skb, nhoff, + sizeof(_eth), &_eth); + if (!eth) + return false; + proto = eth->h_proto; + nhoff += sizeof(*eth); + } goto again; } break; } case IPPROTO_IPIP: - goto again; + proto = htons(ETH_P_IP); + goto ip; + case IPPROTO_IPV6: + proto = htons(ETH_P_IPV6); + goto ipv6; default: break; } flow->ip_proto = ip_proto; - poff = proto_ports_offset(ip_proto); - if (poff >= 0) { - __be32 *ports, _ports; - - nhoff += poff; - ports = skb_header_pointer(skb, nhoff, sizeof(_ports), &_ports); - if (ports) - flow->ports = *ports; - } + flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto); + flow->thoff = (u16) nhoff; return true; } EXPORT_SYMBOL(skb_flow_dissect); + +static u32 hashrnd __read_mostly; +static __always_inline void __flow_hash_secret_init(void) +{ + net_get_random_once(&hashrnd, sizeof(hashrnd)); +} + +static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c) +{ + __flow_hash_secret_init(); + return jhash_3words(a, b, c, hashrnd); +} + +static __always_inline u32 __flow_hash_1word(u32 a) +{ + __flow_hash_secret_init(); + return jhash_1word(a, hashrnd); +} + +/* + * __skb_get_hash: calculate a flow hash based on src/dst addresses + * and src/dst port numbers. Sets hash in skb to non-zero hash value + * on success, zero indicates no valid hash. Also, sets l4_hash in skb + * if hash is a canonical 4-tuple hash over transport ports. + */ +void __skb_get_hash(struct sk_buff *skb) +{ + struct flow_keys keys; + u32 hash; + + if (!skb_flow_dissect(skb, &keys)) + return; + + if (keys.ports) + skb->l4_hash = 1; + + /* get a consistent hash (same value on both flow directions) */ + if (((__force u32)keys.dst < (__force u32)keys.src) || + (((__force u32)keys.dst == (__force u32)keys.src) && + ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) { + swap(keys.dst, keys.src); + swap(keys.port16[0], keys.port16[1]); + } + + hash = __flow_hash_3words((__force u32)keys.dst, + (__force u32)keys.src, + (__force u32)keys.ports); + if (!hash) + hash = 1; + + skb->hash = hash; +} +EXPORT_SYMBOL(__skb_get_hash); + +/* + * Returns a Tx hash based on the given packet descriptor a Tx queues' number + * to be used as a distribution range. 
+ */ +u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, + unsigned int num_tx_queues) +{ + u32 hash; + u16 qoffset = 0; + u16 qcount = num_tx_queues; + + if (skb_rx_queue_recorded(skb)) { + hash = skb_get_rx_queue(skb); + while (unlikely(hash >= num_tx_queues)) + hash -= num_tx_queues; + return hash; + } + + if (dev->num_tc) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + qoffset = dev->tc_to_txq[tc].offset; + qcount = dev->tc_to_txq[tc].count; + } + + if (skb->sk && skb->sk->sk_hash) + hash = skb->sk->sk_hash; + else + hash = (__force u16) skb->protocol; + hash = __flow_hash_1word(hash); + + return (u16) (((u64) hash * qcount) >> 32) + qoffset; +} +EXPORT_SYMBOL(__skb_tx_hash); + +/* __skb_get_poff() returns the offset to the payload as far as it could + * be dissected. The main user is currently BPF, so that we can dynamically + * truncate packets without needing to push actual payload to the user + * space and can analyze headers only, instead. + */ +u32 __skb_get_poff(const struct sk_buff *skb) +{ + struct flow_keys keys; + u32 poff = 0; + + if (!skb_flow_dissect(skb, &keys)) + return 0; + + poff += keys.thoff; + switch (keys.ip_proto) { + case IPPROTO_TCP: { + const struct tcphdr *tcph; + struct tcphdr _tcph; + + tcph = skb_header_pointer(skb, poff, sizeof(_tcph), &_tcph); + if (!tcph) + return poff; + + poff += max_t(u32, sizeof(struct tcphdr), tcph->doff * 4); + break; + } + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + poff += sizeof(struct udphdr); + break; + /* For the rest, we do not really care about header + * extensions at this point for now. + */ + case IPPROTO_ICMP: + poff += sizeof(struct icmphdr); + break; + case IPPROTO_ICMPV6: + poff += sizeof(struct icmp6hdr); + break; + case IPPROTO_IGMP: + poff += sizeof(struct igmphdr); + break; + case IPPROTO_DCCP: + poff += sizeof(struct dccp_hdr); + break; + case IPPROTO_SCTP: + poff += sizeof(struct sctphdr); + break; + } + + return poff; +} + +static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +{ +#ifdef CONFIG_XPS + struct xps_dev_maps *dev_maps; + struct xps_map *map; + int queue_index = -1; + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_maps); + if (dev_maps) { + map = rcu_dereference( + dev_maps->cpu_map[raw_smp_processor_id()]); + if (map) { + if (map->len == 1) + queue_index = map->queues[0]; + else { + u32 hash; + if (skb->sk && skb->sk->sk_hash) + hash = skb->sk->sk_hash; + else + hash = (__force u16) skb->protocol ^ + skb->hash; + hash = __flow_hash_1word(hash); + queue_index = map->queues[ + ((u64)hash * map->len) >> 32]; + } + if (unlikely(queue_index >= dev->real_num_tx_queues)) + queue_index = -1; + } + } + rcu_read_unlock(); + + return queue_index; +#else + return -1; +#endif +} + +static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + int queue_index = sk_tx_queue_get(sk); + + if (queue_index < 0 || skb->ooo_okay || + queue_index >= dev->real_num_tx_queues) { + int new_index = get_xps_queue(dev, skb); + if (new_index < 0) + new_index = skb_tx_hash(dev, skb); + + if (queue_index != new_index && sk && + rcu_access_pointer(sk->sk_dst_cache)) + sk_tx_queue_set(sk, new_index); + + queue_index = new_index; + } + + return queue_index; +} + +struct netdev_queue *netdev_pick_tx(struct net_device *dev, + struct sk_buff *skb, + void *accel_priv) +{ + int queue_index = 0; + + if (dev->real_num_tx_queues != 1) { + const struct net_device_ops *ops = dev->netdev_ops; + if (ops->ndo_select_queue) + 
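/* ndo_select_queue() now takes __netdev_pick_tx as an explicit
 * fallback, so a driver with no queue policy of its own for a given
 * skb can just return fallback(dev, skb) rather than duplicating the
 * hash-based selection. accel_priv is only non-NULL on the L2
 * forwarding offload (macvlan) transmit path, where the driver may
 * legitimately return a queue beyond real_num_tx_queues -- hence the
 * netdev_cap_txqueue() clamp below is skipped in that case.
 */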
queue_index = ops->ndo_select_queue(dev, skb, accel_priv, + __netdev_pick_tx); + else + queue_index = __netdev_pick_tx(dev, skb); + + if (!accel_priv) + queue_index = netdev_cap_txqueue(dev, queue_index); + } + + skb_set_queue_mapping(skb, queue_index); + return netdev_get_tx_queue(dev, queue_index); +} diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index d9d198aa9fe..6b5b6e7013c 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -82,7 +82,7 @@ struct gen_estimator { struct list_head list; struct gnet_stats_basic_packed *bstats; - struct gnet_stats_rate_est *rate_est; + struct gnet_stats_rate_est64 *rate_est; spinlock_t *stats_lock; int ewma_log; u64 last_bytes; @@ -167,7 +167,7 @@ static void gen_add_node(struct gen_estimator *est) static struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats, - const struct gnet_stats_rate_est *rate_est) + const struct gnet_stats_rate_est64 *rate_est) { struct rb_node *p = est_root.rb_node; @@ -203,7 +203,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats * */ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est *rate_est, + struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt) { @@ -258,7 +258,7 @@ EXPORT_SYMBOL(gen_new_estimator); * Note : Caller should respect an RCU grace period before freeing stats_lock */ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est *rate_est) + struct gnet_stats_rate_est64 *rate_est) { struct gen_estimator *e; @@ -290,7 +290,7 @@ EXPORT_SYMBOL(gen_kill_estimator); * Returns 0 on success or a negative error code. */ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est *rate_est, + struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt) { gen_kill_estimator(bstats, rate_est); @@ -306,7 +306,7 @@ EXPORT_SYMBOL(gen_replace_estimator); * Returns true if estimator is active, and false if not. 
*/ bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, - const struct gnet_stats_rate_est *rate_est) + const struct gnet_stats_rate_est64 *rate_est) { bool res; diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index ddedf211e58..9d3d9e78397 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -143,18 +143,30 @@ EXPORT_SYMBOL(gnet_stats_copy_basic); int gnet_stats_copy_rate_est(struct gnet_dump *d, const struct gnet_stats_basic_packed *b, - struct gnet_stats_rate_est *r) + struct gnet_stats_rate_est64 *r) { + struct gnet_stats_rate_est est; + int res; + if (b && !gen_estimator_active(b, r)) return 0; + est.bps = min_t(u64, UINT_MAX, r->bps); + /* we have some time before reaching 2^32 packets per second */ + est.pps = r->pps; + if (d->compat_tc_stats) { - d->tc_stats.bps = r->bps; - d->tc_stats.pps = r->pps; + d->tc_stats.bps = est.bps; + d->tc_stats.pps = est.pps; } - if (d->tail) - return gnet_stats_copy(d, TCA_STATS_RATE_EST, r, sizeof(*r)); + if (d->tail) { + res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est)); + if (res < 0 || est.bps == r->bps) + return res; + /* emit 64bit stats only if needed */ + return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r)); + } return 0; } diff --git a/net/core/iovec.c b/net/core/iovec.c index 7e7aeb01de4..e1ec45ab1e6 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -39,7 +39,7 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a { int size, ct, err; - if (m->msg_namelen) { + if (m->msg_name && m->msg_namelen) { if (mode == VERIFY_READ) { void __user *namep; namep = (void __user __force *) m->msg_name; @@ -51,6 +51,7 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a m->msg_name = address; } else { m->msg_name = NULL; + m->msg_namelen = 0; } size = m->msg_iovlen * sizeof(struct iovec); @@ -74,111 +75,6 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a } /* - * Copy kernel to iovec. Returns -EFAULT on error. - * - * Note: this modifies the original iovec. - */ - -int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) -{ - while (len > 0) { - if (iov->iov_len) { - int copy = min_t(unsigned int, iov->iov_len, len); - if (copy_to_user(iov->iov_base, kdata, copy)) - return -EFAULT; - kdata += copy; - len -= copy; - iov->iov_len -= copy; - iov->iov_base += copy; - } - iov++; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_toiovec); - -/* - * Copy kernel to iovec. Returns -EFAULT on error. - */ - -int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, - int offset, int len) -{ - int copy; - for (; len > 0; ++iov) { - /* Skip over the finished iovecs */ - if (unlikely(offset >= iov->iov_len)) { - offset -= iov->iov_len; - continue; - } - copy = min_t(unsigned int, iov->iov_len - offset, len); - if (copy_to_user(iov->iov_base + offset, kdata, copy)) - return -EFAULT; - offset = 0; - kdata += copy; - len -= copy; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_toiovecend); - -/* - * Copy iovec to kernel. Returns -EFAULT on error. - * - * Note: this modifies the original iovec. 
- */ - -int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) -{ - while (len > 0) { - if (iov->iov_len) { - int copy = min_t(unsigned int, len, iov->iov_len); - if (copy_from_user(kdata, iov->iov_base, copy)) - return -EFAULT; - len -= copy; - kdata += copy; - iov->iov_base += copy; - iov->iov_len -= copy; - } - iov++; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_fromiovec); - -/* - * Copy iovec from kernel. Returns -EFAULT on error. - */ - -int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, - int offset, int len) -{ - /* Skip over the finished iovecs */ - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - } - - while (len > 0) { - u8 __user *base = iov->iov_base + offset; - int copy = min_t(unsigned int, len, iov->iov_len - offset); - - offset = 0; - if (copy_from_user(kdata, base, copy)) - return -EFAULT; - len -= copy; - kdata += copy; - iov++; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_fromiovecend); - -/* * And now for the all-in-one: copy and checksum from a user iovec * directly to a datagram * Calls to csum_partial but the last must be in 32 bit chunks @@ -262,3 +158,27 @@ out_fault: goto out; } EXPORT_SYMBOL(csum_partial_copy_fromiovecend); + +unsigned long iov_pages(const struct iovec *iov, int offset, + unsigned long nr_segs) +{ + unsigned long seg, base; + int pages = 0, len, size; + + while (nr_segs && (offset >= iov->iov_len)) { + offset -= iov->iov_len; + ++iov; + --nr_segs; + } + + for (seg = 0; seg < nr_segs; seg++) { + base = (unsigned long)iov[seg].iov_base + offset; + len = iov[seg].iov_len - offset; + size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; + pages += size; + offset = 0; + } + + return pages; +} +EXPORT_SYMBOL(iov_pages); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 8f82a5cc385..bd0767e6b2b 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -92,6 +92,9 @@ static bool linkwatch_urgent_event(struct net_device *dev) if (dev->ifindex != dev->iflink) return true; + if (dev->priv_flags & IFF_TEAM_PORT) + return true; + return netif_carrier_ok(dev) && qdisc_tx_changing(dev); } @@ -144,7 +147,7 @@ static void linkwatch_do_dev(struct net_device *dev) * Make sure the above read is complete since it can be * rewritten as soon as we clear the bit below. */ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); /* We are about to handle this device, * so new events can be accepted diff --git a/net/core/neighbour.c b/net/core/neighbour.c index c815f285e5a..ef31fef25e5 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -38,22 +38,16 @@ #include <linux/random.h> #include <linux/string.h> #include <linux/log2.h> +#include <linux/inetdevice.h> +#include <net/addrconf.h> +#define DEBUG #define NEIGH_DEBUG 1 - -#define NEIGH_PRINTK(x...) printk(x) -#define NEIGH_NOPRINTK(x...) do { ; } while(0) -#define NEIGH_PRINTK1 NEIGH_NOPRINTK -#define NEIGH_PRINTK2 NEIGH_NOPRINTK - -#if NEIGH_DEBUG >= 1 -#undef NEIGH_PRINTK1 -#define NEIGH_PRINTK1 NEIGH_PRINTK -#endif -#if NEIGH_DEBUG >= 2 -#undef NEIGH_PRINTK2 -#define NEIGH_PRINTK2 NEIGH_PRINTK -#endif +#define neigh_dbg(level, fmt, ...) \ +do { \ + if (level <= NEIGH_DEBUG) \ + pr_debug(fmt, ##__VA_ARGS__); \ +} while (0) #define PNEIGH_HASHMASK 0xF @@ -123,7 +117,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh) unsigned long neigh_rand_reach_time(unsigned long base) { - return base ? (net_random() % base) + (base >> 1) : 0; + return base ? 
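/* Only the RNG changes here (net_random() -> prandom_u32()); the math
 * is untouched: the result is uniform in [base/2, 3*base/2), the 0.5
 * to 1.5 randomization factor RFC 2461 prescribes for ND
 * ReachableTime.
 */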
(prandom_u32() % base) + (base >> 1) : 0; } EXPORT_SYMBOL(neigh_rand_reach_time); @@ -239,14 +233,14 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) we must kill timers etc. and move it to safe state. */ - skb_queue_purge(&n->arp_queue); + __skb_queue_purge(&n->arp_queue); n->arp_queue_len_bytes = 0; n->output = neigh_blackhole; if (n->nud_state & NUD_VALID) n->nud_state = NUD_NOARP; else n->nud_state = NUD_NONE; - NEIGH_PRINTK2("neigh %p is stray.\n", n); + neigh_dbg(2, "neigh %p is stray\n", n); } write_unlock(&n->lock); neigh_cleanup_and_release(n); @@ -290,19 +284,11 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device goto out_entries; } - if (tbl->entry_size) - n = kzalloc(tbl->entry_size, GFP_ATOMIC); - else { - int sz = sizeof(*n) + tbl->key_len; - - sz = ALIGN(sz, NEIGH_PRIV_ALIGN); - sz += dev->neigh_priv_len; - n = kzalloc(sz, GFP_ATOMIC); - } + n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); if (!n) goto out_entries; - skb_queue_head_init(&n->arp_queue); + __skb_queue_head_init(&n->arp_queue); rwlock_init(&n->lock); seqlock_init(&n->ha_lock); n->updated = n->used = now; @@ -513,7 +499,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, goto out_neigh_release; } - n->confirmed = jiffies - (n->parms->base_reachable_time << 1); + n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1); write_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, @@ -550,7 +536,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, lockdep_is_held(&tbl->lock))); rcu_assign_pointer(nht->hash_buckets[hash_val], n); write_unlock_bh(&tbl->lock); - NEIGH_PRINTK2("neigh %p is created.\n", n); + neigh_dbg(2, "neigh %p is created\n", n); rc = n; out: return rc; @@ -724,7 +710,9 @@ void neigh_destroy(struct neighbour *neigh) if (neigh_del_timer(neigh)) pr_warn("Impossible event\n"); - skb_queue_purge(&neigh->arp_queue); + write_lock_bh(&neigh->lock); + __skb_queue_purge(&neigh->arp_queue); + write_unlock_bh(&neigh->lock); neigh->arp_queue_len_bytes = 0; if (dev->netdev_ops->ndo_neigh_destroy) @@ -733,7 +721,7 @@ void neigh_destroy(struct neighbour *neigh) dev_put(dev); neigh_parms_put(neigh->parms); - NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); + neigh_dbg(2, "neigh %p is destroyed\n", neigh); atomic_dec(&neigh->tbl->entries); kfree_rcu(neigh, rcu); @@ -747,7 +735,7 @@ EXPORT_SYMBOL(neigh_destroy); */ static void neigh_suspect(struct neighbour *neigh) { - NEIGH_PRINTK2("neigh %p is suspected.\n", neigh); + neigh_dbg(2, "neigh %p is suspected\n", neigh); neigh->output = neigh->ops->output; } @@ -759,7 +747,7 @@ static void neigh_suspect(struct neighbour *neigh) */ static void neigh_connect(struct neighbour *neigh) { - NEIGH_PRINTK2("neigh %p is connected.\n", neigh); + neigh_dbg(2, "neigh %p is connected\n", neigh); neigh->output = neigh->ops->connected_output; } @@ -787,9 +775,12 @@ static void neigh_periodic_work(struct work_struct *work) tbl->last_rand = jiffies; for (p = &tbl->parms; p; p = p->next) p->reachable_time = - neigh_rand_reach_time(p->base_reachable_time); + neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); } + if (atomic_read(&tbl->entries) < tbl->gc_thresh1) + goto out; + for (i = 0 ; i < (1 << nht->hash_shift); i++) { np = &nht->hash_buckets[i]; @@ -810,7 +801,7 @@ static void neigh_periodic_work(struct work_struct *work) if (atomic_read(&n->refcnt) == 1 && (state == NUD_FAILED || - time_after(jiffies, n->used + 
n->parms->gc_staletime))) { + time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { *np = n->next; n->dead = 1; write_unlock(&n->lock); @@ -832,21 +823,23 @@ next_elt: nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); } - /* Cycle through all hash buckets every base_reachable_time/2 ticks. - * ARP entry timeouts range from 1/2 base_reachable_time to 3/2 - * base_reachable_time. +out: + /* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks. + * ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2 + * BASE_REACHABLE_TIME. */ - schedule_delayed_work(&tbl->gc_work, - tbl->parms.base_reachable_time >> 1); + queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, + NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1); write_unlock_bh(&tbl->lock); } static __inline__ int neigh_max_probes(struct neighbour *n) { struct neigh_parms *p = n->parms; - return (n->nud_state & NUD_PROBE) ? - p->ucast_probes : - p->ucast_probes + p->app_probes + p->mcast_probes; + int max_probes = NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES); + if (!(n->nud_state & NUD_PROBE)) + max_probes += NEIGH_VAR(p, MCAST_PROBES); + return max_probes; } static void neigh_invalidate(struct neighbour *neigh) @@ -856,7 +849,7 @@ static void neigh_invalidate(struct neighbour *neigh) struct sk_buff *skb; NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed); - NEIGH_PRINTK2("neigh %p is failed.\n", neigh); + neigh_dbg(2, "neigh %p is failed\n", neigh); neigh->updated = jiffies; /* It is very thin place. report_unreachable is very complicated @@ -870,14 +863,14 @@ static void neigh_invalidate(struct neighbour *neigh) neigh->ops->error_report(neigh, skb); write_lock(&neigh->lock); } - skb_queue_purge(&neigh->arp_queue); + __skb_queue_purge(&neigh->arp_queue); neigh->arp_queue_len_bytes = 0; } static void neigh_probe(struct neighbour *neigh) __releases(neigh->lock) { - struct sk_buff *skb = skb_peek(&neigh->arp_queue); + struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); /* keep skb alive even if arp_queue overflows */ if (skb) skb = skb_copy(skb, GFP_ATOMIC); @@ -908,17 +901,18 @@ static void neigh_timer_handler(unsigned long arg) if (state & NUD_REACHABLE) { if (time_before_eq(now, neigh->confirmed + neigh->parms->reachable_time)) { - NEIGH_PRINTK2("neigh %p is still alive.\n", neigh); + neigh_dbg(2, "neigh %p is still alive\n", neigh); next = neigh->confirmed + neigh->parms->reachable_time; } else if (time_before_eq(now, - neigh->used + neigh->parms->delay_probe_time)) { - NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); + neigh->used + + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { + neigh_dbg(2, "neigh %p is delayed\n", neigh); neigh->nud_state = NUD_DELAY; neigh->updated = jiffies; neigh_suspect(neigh); - next = now + neigh->parms->delay_probe_time; + next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME); } else { - NEIGH_PRINTK2("neigh %p is suspected.\n", neigh); + neigh_dbg(2, "neigh %p is suspected\n", neigh); neigh->nud_state = NUD_STALE; neigh->updated = jiffies; neigh_suspect(neigh); @@ -926,23 +920,24 @@ static void neigh_timer_handler(unsigned long arg) } } else if (state & NUD_DELAY) { if (time_before_eq(now, - neigh->confirmed + neigh->parms->delay_probe_time)) { - NEIGH_PRINTK2("neigh %p is now reachable.\n", neigh); + neigh->confirmed + + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { + neigh_dbg(2, "neigh %p is now reachable\n", neigh); neigh->nud_state = NUD_REACHABLE; neigh->updated = jiffies; neigh_connect(neigh); notify = 1; next = neigh->confirmed + 
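/* The timer state machine in brief: an entry stays REACHABLE until
 * confirmed + reachable_time, then drops to DELAY (if recently used)
 * or STALE. In DELAY, a confirmation arriving within DELAY_PROBE_TIME
 * promotes it straight back to REACHABLE, as in this branch; otherwise
 * it enters PROBE and retries every RETRANS_TIME until
 * neigh_max_probes() attempts have failed, at which point the newly
 * added "goto out" below skips the rest of the timer handling for the
 * now-FAILED entry.
 */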
neigh->parms->reachable_time; } else { - NEIGH_PRINTK2("neigh %p is probed.\n", neigh); + neigh_dbg(2, "neigh %p is probed\n", neigh); neigh->nud_state = NUD_PROBE; neigh->updated = jiffies; atomic_set(&neigh->probes, 0); - next = now + neigh->parms->retrans_time; + next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); } } else { /* NUD_PROBE|NUD_INCOMPLETE */ - next = now + neigh->parms->retrans_time; + next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); } if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && @@ -950,6 +945,7 @@ static void neigh_timer_handler(unsigned long arg) neigh->nud_state = NUD_FAILED; notify = 1; neigh_invalidate(neigh); + goto out; } if (neigh->nud_state & NUD_IN_TIMER) { @@ -983,13 +979,16 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) goto out_unlock_bh; if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { - if (neigh->parms->mcast_probes + neigh->parms->app_probes) { + if (NEIGH_VAR(neigh->parms, MCAST_PROBES) + + NEIGH_VAR(neigh->parms, APP_PROBES)) { unsigned long next, now = jiffies; - atomic_set(&neigh->probes, neigh->parms->ucast_probes); + atomic_set(&neigh->probes, + NEIGH_VAR(neigh->parms, UCAST_PROBES)); neigh->nud_state = NUD_INCOMPLETE; neigh->updated = now; - next = now + max(neigh->parms->retrans_time, HZ/2); + next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), + HZ/2); neigh_add_timer(neigh, next); immediate_probe = true; } else { @@ -1001,17 +1000,17 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) return 1; } } else if (neigh->nud_state & NUD_STALE) { - NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); + neigh_dbg(2, "neigh %p is delayed\n", neigh); neigh->nud_state = NUD_DELAY; neigh->updated = jiffies; - neigh_add_timer(neigh, - jiffies + neigh->parms->delay_probe_time); + neigh_add_timer(neigh, jiffies + + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME)); } if (neigh->nud_state == NUD_INCOMPLETE) { if (skb) { while (neigh->arp_queue_len_bytes + skb->truesize > - neigh->parms->queue_len_bytes) { + NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) { struct sk_buff *buff; buff = __skb_dequeue(&neigh->arp_queue); @@ -1171,6 +1170,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, neigh->parms->reachable_time : 0))); neigh->nud_state = new; + notify = 1; } if (lladdr != neigh->ha) { @@ -1180,7 +1180,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, neigh_update_hhs(neigh); if (!(new & NUD_CONNECTED)) neigh->confirmed = jiffies - - (neigh->parms->base_reachable_time << 1); + (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1); notify = 1; } if (new == old) @@ -1222,7 +1222,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, write_lock_bh(&neigh->lock); } - skb_queue_purge(&neigh->arp_queue); + __skb_queue_purge(&neigh->arp_queue); neigh->arp_queue_len_bytes = 0; } out: @@ -1240,6 +1240,21 @@ out: } EXPORT_SYMBOL(neigh_update); +/* Update the neigh to listen temporarily for probe responses, even if it is + * in a NUD_FAILED state. The caller has to hold neigh->lock for writing. 
+ */ +void __neigh_set_probe_once(struct neighbour *neigh) +{ + neigh->updated = jiffies; + if (!(neigh->nud_state & NUD_FAILED)) + return; + neigh->nud_state = NUD_INCOMPLETE; + atomic_set(&neigh->probes, neigh_max_probes(neigh)); + neigh_add_timer(neigh, + jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME)); +} +EXPORT_SYMBOL(__neigh_set_probe_once); + struct neighbour *neigh_event_ns(struct neigh_table *tbl, u8 *lladdr, void *saddr, struct net_device *dev) @@ -1284,7 +1299,7 @@ int neigh_compat_output(struct neighbour *neigh, struct sk_buff *skb) if (dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, skb->len) < 0 && - dev->header_ops->rebuild(skb)) + dev_rebuild_header(skb)) return 0; return dev_queue_xmit(skb); @@ -1324,8 +1339,7 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) out: return rc; discard: - NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n", - dst, neigh); + neigh_dbg(1, "%s: dst=%p neigh=%p\n", __func__, dst, neigh); out_kfree_skb: rc = -EINVAL; kfree_skb(skb); @@ -1402,9 +1416,11 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb) { unsigned long now = jiffies; - unsigned long sched_next = now + (net_random() % p->proxy_delay); - if (tbl->proxy_queue.qlen > p->proxy_qlen) { + unsigned long sched_next = now + (prandom_u32() % + NEIGH_VAR(p, PROXY_DELAY)); + + if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); return; } @@ -1432,7 +1448,7 @@ static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl, for (p = &tbl->parms; p; p = p->next) { if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) || - (!p->dev && !ifindex)) + (!p->dev && !ifindex && net_eq(net, &init_net))) return p; } @@ -1442,34 +1458,34 @@ static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl, struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl) { - struct neigh_parms *p, *ref; + struct neigh_parms *p; struct net *net = dev_net(dev); const struct net_device_ops *ops = dev->netdev_ops; - ref = lookup_neigh_parms(tbl, net, 0); - if (!ref) - return NULL; - - p = kmemdup(ref, sizeof(*p), GFP_KERNEL); + p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL); if (p) { p->tbl = tbl; atomic_set(&p->refcnt, 1); p->reachable_time = - neigh_rand_reach_time(p->base_reachable_time); + neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + dev_hold(dev); + p->dev = dev; + write_pnet(&p->net, hold_net(net)); + p->sysctl_table = NULL; if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { + release_net(net); + dev_put(dev); kfree(p); return NULL; } - dev_hold(dev); - p->dev = dev; - write_pnet(&p->net, hold_net(net)); - p->sysctl_table = NULL; write_lock_bh(&tbl->lock); p->next = tbl->parms.next; tbl->parms.next = p; write_unlock_bh(&tbl->lock); + + neigh_parms_data_state_cleanall(p); } return p; } @@ -1502,7 +1518,7 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) } } write_unlock_bh(&tbl->lock); - NEIGH_PRINTK1("neigh_parms_release: not found\n"); + neigh_dbg(1, "%s: not found\n", __func__); } EXPORT_SYMBOL(neigh_parms_release); @@ -1522,7 +1538,7 @@ static void neigh_table_init_no_netlink(struct neigh_table *tbl) write_pnet(&tbl->parms.net, &init_net); atomic_set(&tbl->parms.refcnt, 1); tbl->parms.reachable_time = - neigh_rand_reach_time(tbl->parms.base_reachable_time); + neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME)); tbl->stats = alloc_percpu(struct neigh_statistics); 
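/* Annotation: __neigh_set_probe_once() is exported, presumably for protocol
 * input paths (ARP/NDISC) to re-arm probing on an entry stuck in NUD_FAILED
 * instead of waiting for garbage collection. A hypothetical caller sketch —
 * the write-lock requirement is the point:
 */
static void example_reprobe(struct neighbour *neigh)
{
	write_lock_bh(&neigh->lock);
	/* refreshes ->updated; only re-arms the timer if NUD_FAILED */
	__neigh_set_probe_once(neigh);
	write_unlock_bh(&neigh->lock);
}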
if (!tbl->stats) @@ -1542,9 +1558,16 @@ static void neigh_table_init_no_netlink(struct neigh_table *tbl) if (!tbl->nht || !tbl->phash_buckets) panic("cannot allocate neighbour cache hashes"); + if (!tbl->entry_size) + tbl->entry_size = ALIGN(offsetof(struct neighbour, primary_key) + + tbl->key_len, NEIGH_PRIV_ALIGN); + else + WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); + rwlock_init(&tbl->lock); INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); - schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time); + queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, + tbl->parms.reachable_time); setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl); skb_queue_head_init_class(&tbl->proxy_queue, &neigh_table_proxy_queue_class); @@ -1611,7 +1634,7 @@ int neigh_table_clear(struct neigh_table *tbl) } EXPORT_SYMBOL(neigh_table_clear); -static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; @@ -1675,7 +1698,7 @@ out: return err; } -static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; @@ -1784,24 +1807,32 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) if ((parms->dev && nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) || nla_put_u32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)) || - nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, parms->queue_len_bytes) || + nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, + NEIGH_VAR(parms, QUEUE_LEN_BYTES)) || /* approximative value for deprecated QUEUE_LEN (in packets) */ nla_put_u32(skb, NDTPA_QUEUE_LEN, - parms->queue_len_bytes / SKB_TRUESIZE(ETH_FRAME_LEN)) || - nla_put_u32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen) || - nla_put_u32(skb, NDTPA_APP_PROBES, parms->app_probes) || - nla_put_u32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes) || - nla_put_u32(skb, NDTPA_MCAST_PROBES, parms->mcast_probes) || + NEIGH_VAR(parms, QUEUE_LEN_BYTES) / SKB_TRUESIZE(ETH_FRAME_LEN)) || + nla_put_u32(skb, NDTPA_PROXY_QLEN, NEIGH_VAR(parms, PROXY_QLEN)) || + nla_put_u32(skb, NDTPA_APP_PROBES, NEIGH_VAR(parms, APP_PROBES)) || + nla_put_u32(skb, NDTPA_UCAST_PROBES, + NEIGH_VAR(parms, UCAST_PROBES)) || + nla_put_u32(skb, NDTPA_MCAST_PROBES, + NEIGH_VAR(parms, MCAST_PROBES)) || nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time) || nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, - parms->base_reachable_time) || - nla_put_msecs(skb, NDTPA_GC_STALETIME, parms->gc_staletime) || + NEIGH_VAR(parms, BASE_REACHABLE_TIME)) || + nla_put_msecs(skb, NDTPA_GC_STALETIME, + NEIGH_VAR(parms, GC_STALETIME)) || nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME, - parms->delay_probe_time) || - nla_put_msecs(skb, NDTPA_RETRANS_TIME, parms->retrans_time) || - nla_put_msecs(skb, NDTPA_ANYCAST_DELAY, parms->anycast_delay) || - nla_put_msecs(skb, NDTPA_PROXY_DELAY, parms->proxy_delay) || - nla_put_msecs(skb, NDTPA_LOCKTIME, parms->locktime)) + NEIGH_VAR(parms, DELAY_PROBE_TIME)) || + nla_put_msecs(skb, NDTPA_RETRANS_TIME, + NEIGH_VAR(parms, RETRANS_TIME)) || + nla_put_msecs(skb, NDTPA_ANYCAST_DELAY, + NEIGH_VAR(parms, ANYCAST_DELAY)) || + nla_put_msecs(skb, NDTPA_PROXY_DELAY, + NEIGH_VAR(parms, PROXY_DELAY)) || + nla_put_msecs(skb, NDTPA_LOCKTIME, + NEIGH_VAR(parms, LOCKTIME))) goto nla_put_failure; return nla_nest_end(skb, nest); @@ -1953,7 +1984,7 @@ static 
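/* Annotation: entry_size is now computed once per table here, which is what
 * let neigh_alloc() earlier in this patch collapse to a single kzalloc().
 * The layout being sized, given that primary_key[0] is the trailing member
 * of struct neighbour:
 *
 *	entry_size = ALIGN(offsetof(struct neighbour, primary_key)
 *			   + tbl->key_len, NEIGH_PRIV_ALIGN);
 *	allocation = entry_size + dev->neigh_priv_len;
 */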
const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_LOCKTIME] = { .type = NLA_U64 }, }; -static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct neigh_table *tbl; @@ -2017,49 +2048,68 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) switch (i) { case NDTPA_QUEUE_LEN: - p->queue_len_bytes = nla_get_u32(tbp[i]) * - SKB_TRUESIZE(ETH_FRAME_LEN); + NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, + nla_get_u32(tbp[i]) * + SKB_TRUESIZE(ETH_FRAME_LEN)); break; case NDTPA_QUEUE_LENBYTES: - p->queue_len_bytes = nla_get_u32(tbp[i]); + NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, + nla_get_u32(tbp[i])); break; case NDTPA_PROXY_QLEN: - p->proxy_qlen = nla_get_u32(tbp[i]); + NEIGH_VAR_SET(p, PROXY_QLEN, + nla_get_u32(tbp[i])); break; case NDTPA_APP_PROBES: - p->app_probes = nla_get_u32(tbp[i]); + NEIGH_VAR_SET(p, APP_PROBES, + nla_get_u32(tbp[i])); break; case NDTPA_UCAST_PROBES: - p->ucast_probes = nla_get_u32(tbp[i]); + NEIGH_VAR_SET(p, UCAST_PROBES, + nla_get_u32(tbp[i])); break; case NDTPA_MCAST_PROBES: - p->mcast_probes = nla_get_u32(tbp[i]); + NEIGH_VAR_SET(p, MCAST_PROBES, + nla_get_u32(tbp[i])); break; case NDTPA_BASE_REACHABLE_TIME: - p->base_reachable_time = nla_get_msecs(tbp[i]); + NEIGH_VAR_SET(p, BASE_REACHABLE_TIME, + nla_get_msecs(tbp[i])); break; case NDTPA_GC_STALETIME: - p->gc_staletime = nla_get_msecs(tbp[i]); + NEIGH_VAR_SET(p, GC_STALETIME, + nla_get_msecs(tbp[i])); break; case NDTPA_DELAY_PROBE_TIME: - p->delay_probe_time = nla_get_msecs(tbp[i]); + NEIGH_VAR_SET(p, DELAY_PROBE_TIME, + nla_get_msecs(tbp[i])); break; case NDTPA_RETRANS_TIME: - p->retrans_time = nla_get_msecs(tbp[i]); + NEIGH_VAR_SET(p, RETRANS_TIME, + nla_get_msecs(tbp[i])); break; case NDTPA_ANYCAST_DELAY: - p->anycast_delay = nla_get_msecs(tbp[i]); + NEIGH_VAR_SET(p, ANYCAST_DELAY, + nla_get_msecs(tbp[i])); break; case NDTPA_PROXY_DELAY: - p->proxy_delay = nla_get_msecs(tbp[i]); + NEIGH_VAR_SET(p, PROXY_DELAY, + nla_get_msecs(tbp[i])); break; case NDTPA_LOCKTIME: - p->locktime = nla_get_msecs(tbp[i]); + NEIGH_VAR_SET(p, LOCKTIME, + nla_get_msecs(tbp[i])); break; } } } + err = -ENOENT; + if ((tb[NDTA_THRESH1] || tb[NDTA_THRESH2] || + tb[NDTA_THRESH3] || tb[NDTA_GC_INTERVAL]) && + !net_eq(net, &init_net)) + goto errout_tbl_lock; + if (tb[NDTA_THRESH1]) tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]); @@ -2199,7 +2249,7 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = pn->flags | NTF_PROXY; - ndm->ndm_type = NDA_DST; + ndm->ndm_type = RTN_UNICAST; ndm->ndm_ifindex = pn->dev->ifindex; ndm->ndm_state = NUD_NONE; @@ -2712,7 +2762,7 @@ static int neigh_stat_seq_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *sf = file->private_data; - sf->private = PDE(inode)->data; + sf->private = PDE_DATA(inode); } return ret; }; @@ -2760,23 +2810,22 @@ errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } -#ifdef CONFIG_ARPD void neigh_app_ns(struct neighbour *n) { __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST); } EXPORT_SYMBOL(neigh_app_ns); -#endif /* CONFIG_ARPD */ #ifdef CONFIG_SYSCTL static int zero; +static int int_max = INT_MAX; static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); -static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) +static int proc_unres_qlen(struct ctl_table *ctl, int write, + void 
__user *buffer, size_t *lenp, loff_t *ppos) { int size, ret; - ctl_table tmp = *ctl; + struct ctl_table tmp = *ctl; tmp.extra1 = &zero; tmp.extra2 = &unres_qlen_max; @@ -2790,125 +2839,167 @@ static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer, return ret; } -enum { - NEIGH_VAR_MCAST_PROBE, - NEIGH_VAR_UCAST_PROBE, - NEIGH_VAR_APP_PROBE, - NEIGH_VAR_RETRANS_TIME, - NEIGH_VAR_BASE_REACHABLE_TIME, - NEIGH_VAR_DELAY_PROBE_TIME, - NEIGH_VAR_GC_STALETIME, - NEIGH_VAR_QUEUE_LEN, - NEIGH_VAR_QUEUE_LEN_BYTES, - NEIGH_VAR_PROXY_QLEN, - NEIGH_VAR_ANYCAST_DELAY, - NEIGH_VAR_PROXY_DELAY, - NEIGH_VAR_LOCKTIME, - NEIGH_VAR_RETRANS_TIME_MS, - NEIGH_VAR_BASE_REACHABLE_TIME_MS, - NEIGH_VAR_GC_INTERVAL, - NEIGH_VAR_GC_THRESH1, - NEIGH_VAR_GC_THRESH2, - NEIGH_VAR_GC_THRESH3, - NEIGH_VAR_MAX -}; +static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, + int family) +{ + switch (family) { + case AF_INET: + return __in_dev_arp_parms_get_rcu(dev); + case AF_INET6: + return __in6_dev_nd_parms_get_rcu(dev); + } + return NULL; +} + +static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, + int index) +{ + struct net_device *dev; + int family = neigh_parms_family(p); + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + struct neigh_parms *dst_p = + neigh_get_dev_parms_rcu(dev, family); + + if (dst_p && !test_bit(index, dst_p->data_state)) + dst_p->data[index] = p->data[index]; + } + rcu_read_unlock(); +} + +static void neigh_proc_update(struct ctl_table *ctl, int write) +{ + struct net_device *dev = ctl->extra1; + struct neigh_parms *p = ctl->extra2; + struct net *net = neigh_parms_net(p); + int index = (int *) ctl->data - p->data; + + if (!write) + return; + + set_bit(index, p->data_state); + if (!dev) /* NULL dev means this is default value */ + neigh_copy_dflt_parms(net, p, index); +} + +static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + struct ctl_table tmp = *ctl; + int ret; + + tmp.extra1 = &zero; + tmp.extra2 = &int_max; + + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + neigh_proc_update(ctl, write); + return ret; +} + +int neigh_proc_dointvec(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + + neigh_proc_update(ctl, write); + return ret; +} +EXPORT_SYMBOL(neigh_proc_dointvec); + +int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); + + neigh_proc_update(ctl, write); + return ret; +} +EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); + +static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos); + + neigh_proc_update(ctl, write); + return ret; +} + +int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); + + neigh_proc_update(ctl, write); + return ret; +} +EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); + +static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos); + + neigh_proc_update(ctl, write); + return 
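/* Annotation: every neigh_proc_* wrapper funnels through neigh_proc_update():
 * a write sets the field's data_state bit, and a write to the "default" parms
 * (ctl->extra1 == NULL, i.e. no device) is copied to every device that never
 * overrode that field itself. Sketch of a handler following the same pattern
 * (example_proc_handler is illustrative, not part of the patch):
 */
static int example_proc_handler(struct ctl_table *ctl, int write,
				void __user *buffer, size_t *lenp,
				loff_t *ppos)
{
	int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);

	neigh_proc_update(ctl, write);	/* record write, propagate default */
	return ret;
}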
ret; +} + +#define NEIGH_PARMS_DATA_OFFSET(index) \ + (&((struct neigh_parms *) 0)->data[index]) + +#define NEIGH_SYSCTL_ENTRY(attr, data_attr, name, mval, proc) \ + [NEIGH_VAR_ ## attr] = { \ + .procname = name, \ + .data = NEIGH_PARMS_DATA_OFFSET(NEIGH_VAR_ ## data_attr), \ + .maxlen = sizeof(int), \ + .mode = mval, \ + .proc_handler = proc, \ + } + +#define NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_zero_intmax) + +#define NEIGH_SYSCTL_JIFFIES_ENTRY(attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_jiffies) + +#define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies) + +#define NEIGH_SYSCTL_MS_JIFFIES_ENTRY(attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies) + +#define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies) + +#define NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(attr, data_attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_unres_qlen) static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1]; } neigh_sysctl_template __read_mostly = { .neigh_vars = { - [NEIGH_VAR_MCAST_PROBE] = { - .procname = "mcast_solicit", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - [NEIGH_VAR_UCAST_PROBE] = { - .procname = "ucast_solicit", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - [NEIGH_VAR_APP_PROBE] = { - .procname = "app_solicit", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - [NEIGH_VAR_RETRANS_TIME] = { - .procname = "retrans_time", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_userhz_jiffies, - }, - [NEIGH_VAR_BASE_REACHABLE_TIME] = { - .procname = "base_reachable_time", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NEIGH_VAR_DELAY_PROBE_TIME] = { - .procname = "delay_first_probe_time", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NEIGH_VAR_GC_STALETIME] = { - .procname = "gc_stale_time", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NEIGH_VAR_QUEUE_LEN] = { - .procname = "unres_qlen", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_unres_qlen, - }, - [NEIGH_VAR_QUEUE_LEN_BYTES] = { - .procname = "unres_qlen_bytes", - .maxlen = sizeof(int), - .mode = 0644, - .extra1 = &zero, - .proc_handler = proc_dointvec_minmax, - }, - [NEIGH_VAR_PROXY_QLEN] = { - .procname = "proxy_qlen", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - [NEIGH_VAR_ANYCAST_DELAY] = { - .procname = "anycast_delay", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_userhz_jiffies, - }, - [NEIGH_VAR_PROXY_DELAY] = { - .procname = "proxy_delay", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_userhz_jiffies, - }, - [NEIGH_VAR_LOCKTIME] = { - .procname = "locktime", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_userhz_jiffies, - }, - [NEIGH_VAR_RETRANS_TIME_MS] = { - .procname = "retrans_time_ms", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, - }, - [NEIGH_VAR_BASE_REACHABLE_TIME_MS] = { - .procname = 
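/* Annotation: NEIGH_PARMS_DATA_OFFSET() stores a byte offset disguised as a
 * pointer (the offsetof-against-NULL idiom). It only becomes a real address
 * in neigh_sysctl_register(), which rebases every template entry onto the
 * live parms instance:
 *
 *	for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) {
 *		t->neigh_vars[i].data += (long) p;
 *		t->neigh_vars[i].extra1 = dev;
 *		t->neigh_vars[i].extra2 = p;
 *	}
 */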
"base_reachable_time_ms", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, - }, + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"), + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"), + NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"), + NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"), + NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"), + NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"), + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"), + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"), + NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(ANYCAST_DELAY, "anycast_delay"), + NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(PROXY_DELAY, "proxy_delay"), + NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(LOCKTIME, "locktime"), + NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(QUEUE_LEN, QUEUE_LEN_BYTES, "unres_qlen"), + NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(RETRANS_TIME_MS, RETRANS_TIME, "retrans_time_ms"), + NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(BASE_REACHABLE_TIME_MS, BASE_REACHABLE_TIME, "base_reachable_time_ms"), [NEIGH_VAR_GC_INTERVAL] = { .procname = "gc_interval", .maxlen = sizeof(int), @@ -2919,50 +3010,48 @@ static struct neigh_sysctl_table { .procname = "gc_thresh1", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .extra1 = &zero, + .extra2 = &int_max, + .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH2] = { .procname = "gc_thresh2", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .extra1 = &zero, + .extra2 = &int_max, + .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH3] = { .procname = "gc_thresh3", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .extra1 = &zero, + .extra2 = &int_max, + .proc_handler = proc_dointvec_minmax, }, {}, }, }; int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, - char *p_name, proc_handler *handler) + proc_handler *handler) { + int i; struct neigh_sysctl_table *t; - const char *dev_name_source = NULL; + const char *dev_name_source; char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ]; + char *p_name; t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL); if (!t) goto err; - t->neigh_vars[NEIGH_VAR_MCAST_PROBE].data = &p->mcast_probes; - t->neigh_vars[NEIGH_VAR_UCAST_PROBE].data = &p->ucast_probes; - t->neigh_vars[NEIGH_VAR_APP_PROBE].data = &p->app_probes; - t->neigh_vars[NEIGH_VAR_RETRANS_TIME].data = &p->retrans_time; - t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].data = &p->base_reachable_time; - t->neigh_vars[NEIGH_VAR_DELAY_PROBE_TIME].data = &p->delay_probe_time; - t->neigh_vars[NEIGH_VAR_GC_STALETIME].data = &p->gc_staletime; - t->neigh_vars[NEIGH_VAR_QUEUE_LEN].data = &p->queue_len_bytes; - t->neigh_vars[NEIGH_VAR_QUEUE_LEN_BYTES].data = &p->queue_len_bytes; - t->neigh_vars[NEIGH_VAR_PROXY_QLEN].data = &p->proxy_qlen; - t->neigh_vars[NEIGH_VAR_ANYCAST_DELAY].data = &p->anycast_delay; - t->neigh_vars[NEIGH_VAR_PROXY_DELAY].data = &p->proxy_delay; - t->neigh_vars[NEIGH_VAR_LOCKTIME].data = &p->locktime; - t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].data = &p->retrans_time; - t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].data = &p->base_reachable_time; + for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) { + t->neigh_vars[i].data += (long) p; + t->neigh_vars[i].extra1 = dev; + t->neigh_vars[i].extra2 = p; + } if (dev) { dev_name_source = dev->name; @@ -2970,33 +3059,40 
@@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL])); } else { + struct neigh_table *tbl = p->tbl; dev_name_source = "default"; - t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = (int *)(p + 1); - t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = (int *)(p + 1) + 1; - t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = (int *)(p + 1) + 2; - t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = (int *)(p + 1) + 3; + t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval; + t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1; + t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2; + t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3; } - if (handler) { /* RetransTime */ t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler; - t->neigh_vars[NEIGH_VAR_RETRANS_TIME].extra1 = dev; /* ReachableTime */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler; - t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].extra1 = dev; /* RetransTime (in milliseconds)*/ t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler; - t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].extra1 = dev; /* ReachableTime (in milliseconds) */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler; - t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].extra1 = dev; } /* Don't export sysctls to unprivileged users */ if (neigh_parms_net(p)->user_ns != &init_user_ns) t->neigh_vars[0].procname = NULL; + switch (neigh_parms_family(p)) { + case AF_INET: + p_name = "ipv4"; + break; + case AF_INET6: + p_name = "ipv6"; + break; + default: + BUG(); + } + snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s", p_name, dev_name_source); t->sysctl_header = diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c new file mode 100644 index 00000000000..2bf83299600 --- /dev/null +++ b/net/core/net-procfs.c @@ -0,0 +1,423 @@ +#include <linux/netdevice.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <net/wext.h> + +#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1) + +#define get_bucket(x) ((x) >> BUCKET_SPACE) +#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) +#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) + +extern struct list_head ptype_all __read_mostly; +extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; + +static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos) +{ + struct net *net = seq_file_net(seq); + struct net_device *dev; + struct hlist_head *h; + unsigned int count = 0, offset = get_offset(*pos); + + h = &net->dev_name_head[get_bucket(*pos)]; + hlist_for_each_entry_rcu(dev, h, name_hlist) { + if (++count == offset) + return dev; + } + + return NULL; +} + +static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos) +{ + struct net_device *dev; + unsigned int bucket; + + do { + dev = dev_from_same_bucket(seq, pos); + if (dev) + return dev; + + bucket = get_bucket(*pos) + 1; + *pos = set_bucket_offset(bucket, 1); + } while (bucket < NETDEV_HASHENTRIES); + + return NULL; +} + +/* + * This is invoked by the /proc filesystem handler to display a device + * in detail. 
+ */ +static void *dev_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + if (!*pos) + return SEQ_START_TOKEN; + + if (get_bucket(*pos) >= NETDEV_HASHENTRIES) + return NULL; + + return dev_from_bucket(seq, pos); +} + +static void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return dev_from_bucket(seq, pos); +} + +static void dev_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) +{ + struct rtnl_link_stats64 temp; + const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); + + seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " + "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", + dev->name, stats->rx_bytes, stats->rx_packets, + stats->rx_errors, + stats->rx_dropped + stats->rx_missed_errors, + stats->rx_fifo_errors, + stats->rx_length_errors + stats->rx_over_errors + + stats->rx_crc_errors + stats->rx_frame_errors, + stats->rx_compressed, stats->multicast, + stats->tx_bytes, stats->tx_packets, + stats->tx_errors, stats->tx_dropped, + stats->tx_fifo_errors, stats->collisions, + stats->tx_carrier_errors + + stats->tx_aborted_errors + + stats->tx_window_errors + + stats->tx_heartbeat_errors, + stats->tx_compressed); +} + +/* + * Called from the PROCfs module. This now uses the new arbitrary sized + * /proc/net interface to create /proc/net/dev + */ +static int dev_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Inter-| Receive " + " | Transmit\n" + " face |bytes packets errs drop fifo frame " + "compressed multicast|bytes packets errs " + "drop fifo colls carrier compressed\n"); + else + dev_seq_printf_stats(seq, v); + return 0; +} + +static struct softnet_data *softnet_get_online(loff_t *pos) +{ + struct softnet_data *sd = NULL; + + while (*pos < nr_cpu_ids) + if (cpu_online(*pos)) { + sd = &per_cpu(softnet_data, *pos); + break; + } else + ++*pos; + return sd; +} + +static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) +{ + return softnet_get_online(pos); +} + +static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return softnet_get_online(pos); +} + +static void softnet_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int softnet_seq_show(struct seq_file *seq, void *v) +{ + struct softnet_data *sd = v; + unsigned int flow_limit_count = 0; + +#ifdef CONFIG_NET_FLOW_LIMIT + struct sd_flow_limit *fl; + + rcu_read_lock(); + fl = rcu_dereference(sd->flow_limit); + if (fl) + flow_limit_count = fl->count; + rcu_read_unlock(); +#endif + + seq_printf(seq, + "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", + sd->processed, sd->dropped, sd->time_squeeze, 0, + 0, 0, 0, 0, /* was fastroute */ + sd->cpu_collision, sd->received_rps, flow_limit_count); + return 0; +} + +static const struct seq_operations dev_seq_ops = { + .start = dev_seq_start, + .next = dev_seq_next, + .stop = dev_seq_stop, + .show = dev_seq_show, +}; + +static int dev_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &dev_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations dev_seq_fops = { + .owner = THIS_MODULE, + .open = dev_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static const struct seq_operations softnet_seq_ops = { + .start = softnet_seq_start, + .next = softnet_seq_next, + .stop = softnet_seq_stop, 
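/* Annotation: the /proc/net/dev iterator moved in from dev.c resumes partial
 * reads by packing the seq_file position into (bucket, offset); assuming
 * NETDEV_HASHBITS is 8 as in dev.c, BUCKET_SPACE leaves 23 bits of offset:
 *
 *	pos    = set_bucket_offset(bucket, offset);
 *	bucket = get_bucket(pos);	// pos >> BUCKET_SPACE
 *	offset = get_offset(pos);	// pos & ((1 << BUCKET_SPACE) - 1)
 *
 * Offsets count from 1; pos 0 is reserved for the SEQ_START_TOKEN header
 * line.
 */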
+ .show = softnet_seq_show, +}; + +static int softnet_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &softnet_seq_ops); +} + +static const struct file_operations softnet_seq_fops = { + .owner = THIS_MODULE, + .open = softnet_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void *ptype_get_idx(loff_t pos) +{ + struct packet_type *pt = NULL; + loff_t i = 0; + int t; + + list_for_each_entry_rcu(pt, &ptype_all, list) { + if (i == pos) + return pt; + ++i; + } + + for (t = 0; t < PTYPE_HASH_SIZE; t++) { + list_for_each_entry_rcu(pt, &ptype_base[t], list) { + if (i == pos) + return pt; + ++i; + } + } + return NULL; +} + +static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct packet_type *pt; + struct list_head *nxt; + int hash; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ptype_get_idx(0); + + pt = v; + nxt = pt->list.next; + if (pt->type == htons(ETH_P_ALL)) { + if (nxt != &ptype_all) + goto found; + hash = 0; + nxt = ptype_base[0].next; + } else + hash = ntohs(pt->type) & PTYPE_HASH_MASK; + + while (nxt == &ptype_base[hash]) { + if (++hash >= PTYPE_HASH_SIZE) + return NULL; + nxt = ptype_base[hash].next; + } +found: + return list_entry(nxt, struct packet_type, list); +} + +static void ptype_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static int ptype_seq_show(struct seq_file *seq, void *v) +{ + struct packet_type *pt = v; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Type Device Function\n"); + else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { + if (pt->type == htons(ETH_P_ALL)) + seq_puts(seq, "ALL "); + else + seq_printf(seq, "%04x", ntohs(pt->type)); + + seq_printf(seq, " %-8s %pf\n", + pt->dev ? 
pt->dev->name : "", pt->func); + } + + return 0; +} + +static const struct seq_operations ptype_seq_ops = { + .start = ptype_seq_start, + .next = ptype_seq_next, + .stop = ptype_seq_stop, + .show = ptype_seq_show, +}; + +static int ptype_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ptype_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations ptype_seq_fops = { + .owner = THIS_MODULE, + .open = ptype_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + + +static int __net_init dev_proc_net_init(struct net *net) +{ + int rc = -ENOMEM; + + if (!proc_create("dev", S_IRUGO, net->proc_net, &dev_seq_fops)) + goto out; + if (!proc_create("softnet_stat", S_IRUGO, net->proc_net, + &softnet_seq_fops)) + goto out_dev; + if (!proc_create("ptype", S_IRUGO, net->proc_net, &ptype_seq_fops)) + goto out_softnet; + + if (wext_proc_init(net)) + goto out_ptype; + rc = 0; +out: + return rc; +out_ptype: + remove_proc_entry("ptype", net->proc_net); +out_softnet: + remove_proc_entry("softnet_stat", net->proc_net); +out_dev: + remove_proc_entry("dev", net->proc_net); + goto out; +} + +static void __net_exit dev_proc_net_exit(struct net *net) +{ + wext_proc_exit(net); + + remove_proc_entry("ptype", net->proc_net); + remove_proc_entry("softnet_stat", net->proc_net); + remove_proc_entry("dev", net->proc_net); +} + +static struct pernet_operations __net_initdata dev_proc_ops = { + .init = dev_proc_net_init, + .exit = dev_proc_net_exit, +}; + +static int dev_mc_seq_show(struct seq_file *seq, void *v) +{ + struct netdev_hw_addr *ha; + struct net_device *dev = v; + + if (v == SEQ_START_TOKEN) + return 0; + + netif_addr_lock_bh(dev); + netdev_for_each_mc_addr(ha, dev) { + int i; + + seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex, + dev->name, ha->refcount, ha->global_use); + + for (i = 0; i < dev->addr_len; i++) + seq_printf(seq, "%02x", ha->addr[i]); + + seq_putc(seq, '\n'); + } + netif_addr_unlock_bh(dev); + return 0; +} + +static const struct seq_operations dev_mc_seq_ops = { + .start = dev_seq_start, + .next = dev_seq_next, + .stop = dev_seq_stop, + .show = dev_mc_seq_show, +}; + +static int dev_mc_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &dev_mc_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations dev_mc_seq_fops = { + .owner = THIS_MODULE, + .open = dev_mc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int __net_init dev_mc_net_init(struct net *net) +{ + if (!proc_create("dev_mcast", 0, net->proc_net, &dev_mc_seq_fops)) + return -ENOMEM; + return 0; +} + +static void __net_exit dev_mc_net_exit(struct net *net) +{ + remove_proc_entry("dev_mcast", net->proc_net); +} + +static struct pernet_operations __net_initdata dev_mc_net_ops = { + .init = dev_mc_net_init, + .exit = dev_mc_net_exit, +}; + +int __init dev_proc_init(void) +{ + int ret = register_pernet_subsys(&dev_proc_ops); + if (!ret) + return register_pernet_subsys(&dev_mc_net_ops); + return ret; +} diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 28c5f5aa7ca..1cac29ebb05 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -21,6 +21,7 @@ #include <linux/vmalloc.h> #include <linux/export.h> #include <linux/jiffies.h> +#include <linux/pm_runtime.h> #include "net-sysfs.h" @@ -59,12 +60,19 @@ static ssize_t format_##field(const struct net_device *net, char *buf) \ { \ return sprintf(buf, 
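/* Annotation: dev_proc_net_init() above is the canonical goto-unwind shape:
 * each proc_create() failure removes only what was already created, in
 * reverse order, before falling through to the single exit. Minimal sketch
 * (a_fops/b_fops are placeholders, not real file_operations):
 */
static int __net_init example_proc_init(struct net *net)
{
	int rc = -ENOMEM;

	if (!proc_create("a", S_IRUGO, net->proc_net, &a_fops))
		goto out;
	if (!proc_create("b", S_IRUGO, net->proc_net, &b_fops))
		goto out_a;
	rc = 0;
out:
	return rc;
out_a:
	remove_proc_entry("a", net->proc_net);
	goto out;
}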
format_string, net->field); \ } \ -static ssize_t show_##field(struct device *dev, \ +static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ return netdev_show(dev, attr, buf, format_##field); \ -} +} \ + +#define NETDEVICE_SHOW_RO(field, format_string) \ +NETDEVICE_SHOW(field, format_string); \ +static DEVICE_ATTR_RO(field) +#define NETDEVICE_SHOW_RW(field, format_string) \ +NETDEVICE_SHOW(field, format_string); \ +static DEVICE_ATTR_RW(field) /* use same locking and permission rules as SIF* ioctl's */ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, @@ -95,16 +103,17 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, return ret; } -NETDEVICE_SHOW(dev_id, fmt_hex); -NETDEVICE_SHOW(addr_assign_type, fmt_dec); -NETDEVICE_SHOW(addr_len, fmt_dec); -NETDEVICE_SHOW(iflink, fmt_dec); -NETDEVICE_SHOW(ifindex, fmt_dec); -NETDEVICE_SHOW(type, fmt_dec); -NETDEVICE_SHOW(link_mode, fmt_dec); +NETDEVICE_SHOW_RO(dev_id, fmt_hex); +NETDEVICE_SHOW_RO(dev_port, fmt_dec); +NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec); +NETDEVICE_SHOW_RO(addr_len, fmt_dec); +NETDEVICE_SHOW_RO(iflink, fmt_dec); +NETDEVICE_SHOW_RO(ifindex, fmt_dec); +NETDEVICE_SHOW_RO(type, fmt_dec); +NETDEVICE_SHOW_RO(link_mode, fmt_dec); /* use same locking rules as GIFHWADDR ioctl's */ -static ssize_t show_address(struct device *dev, struct device_attribute *attr, +static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *net = to_net_dev(dev); @@ -116,17 +125,32 @@ static ssize_t show_address(struct device *dev, struct device_attribute *attr, read_unlock(&dev_base_lock); return ret; } +static DEVICE_ATTR_RO(address); -static ssize_t show_broadcast(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t broadcast_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct net_device *net = to_net_dev(dev); if (dev_isalive(net)) return sysfs_format_mac(buf, net->broadcast, net->addr_len); return -EINVAL; } +static DEVICE_ATTR_RO(broadcast); -static ssize_t show_carrier(struct device *dev, +static int change_carrier(struct net_device *net, unsigned long new_carrier) +{ + if (!netif_running(net)) + return -EINVAL; + return dev_change_carrier(net, (bool) new_carrier); +} + +static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_carrier); +} + +static ssize_t carrier_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); @@ -135,8 +159,9 @@ static ssize_t show_carrier(struct device *dev, } return -EINVAL; } +static DEVICE_ATTR_RW(carrier); -static ssize_t show_speed(struct device *dev, +static ssize_t speed_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); @@ -153,8 +178,9 @@ static ssize_t show_speed(struct device *dev, rtnl_unlock(); return ret; } +static DEVICE_ATTR_RO(speed); -static ssize_t show_duplex(struct device *dev, +static ssize_t duplex_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); @@ -184,8 +210,9 @@ static ssize_t show_duplex(struct device *dev, rtnl_unlock(); return ret; } +static DEVICE_ATTR_RO(duplex); -static ssize_t show_dormant(struct device *dev, +static ssize_t dormant_show(struct device *dev, struct device_attribute *attr, char 
*buf) { struct net_device *netdev = to_net_dev(dev); @@ -195,6 +222,7 @@ static ssize_t show_dormant(struct device *dev, return -EINVAL; } +static DEVICE_ATTR_RO(dormant); static const char *const operstates[] = { "unknown", @@ -206,7 +234,7 @@ static const char *const operstates[] = { "up" }; -static ssize_t show_operstate(struct device *dev, +static ssize_t operstate_show(struct device *dev, struct device_attribute *attr, char *buf) { const struct net_device *netdev = to_net_dev(dev); @@ -223,35 +251,43 @@ static ssize_t show_operstate(struct device *dev, return sprintf(buf, "%s\n", operstates[operstate]); } +static DEVICE_ATTR_RO(operstate); + +static ssize_t carrier_changes_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + return sprintf(buf, fmt_dec, + atomic_read(&netdev->carrier_changes)); +} +static DEVICE_ATTR_RO(carrier_changes); /* read-write attributes */ -NETDEVICE_SHOW(mtu, fmt_dec); static int change_mtu(struct net_device *net, unsigned long new_mtu) { return dev_set_mtu(net, (int) new_mtu); } -static ssize_t store_mtu(struct device *dev, struct device_attribute *attr, +static ssize_t mtu_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, change_mtu); } - -NETDEVICE_SHOW(flags, fmt_hex); +NETDEVICE_SHOW_RW(mtu, fmt_dec); static int change_flags(struct net_device *net, unsigned long new_flags) { return dev_change_flags(net, (unsigned int) new_flags); } -static ssize_t store_flags(struct device *dev, struct device_attribute *attr, +static ssize_t flags_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, change_flags); } - -NETDEVICE_SHOW(tx_queue_len, fmt_ulong); +NETDEVICE_SHOW_RW(flags, fmt_hex); static int change_tx_queue_len(struct net_device *net, unsigned long new_len) { @@ -259,7 +295,7 @@ static int change_tx_queue_len(struct net_device *net, unsigned long new_len) return 0; } -static ssize_t store_tx_queue_len(struct device *dev, +static ssize_t tx_queue_len_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -268,8 +304,9 @@ static ssize_t store_tx_queue_len(struct device *dev, return netdev_store(dev, attr, buf, len, change_tx_queue_len); } +NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong); -static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr, +static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct net_device *netdev = to_net_dev(dev); @@ -292,7 +329,7 @@ static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr, return ret < 0 ? 
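/* Annotation: the show_foo()/store_foo() -> foo_show()/foo_store() renames
 * in this file are not cosmetic. DEVICE_ATTR_RO()/DEVICE_ATTR_RW() generate
 * a struct device_attribute named dev_attr_<foo> bound to exactly those
 * function names; slightly simplified:
 *
 *	#define DEVICE_ATTR_RW(_name)					\
 *		struct device_attribute dev_attr_##_name =		\
 *			__ATTR(_name, (S_IWUSR | S_IRUGO),		\
 *			       _name##_show, _name##_store)
 *
 * which is what lets the attribute array reference &dev_attr_mtu.attr and
 * friends directly.
 */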
ret : len; } -static ssize_t show_ifalias(struct device *dev, +static ssize_t ifalias_show(struct device *dev, struct device_attribute *attr, char *buf) { const struct net_device *netdev = to_net_dev(dev); @@ -305,8 +342,7 @@ static ssize_t show_ifalias(struct device *dev, rtnl_unlock(); return ret; } - -NETDEVICE_SHOW(group, fmt_dec); +static DEVICE_ATTR_RW(ifalias); static int change_group(struct net_device *net, unsigned long new_group) { @@ -314,35 +350,62 @@ static int change_group(struct net_device *net, unsigned long new_group) return 0; } -static ssize_t store_group(struct device *dev, struct device_attribute *attr, - const char *buf, size_t len) +static ssize_t group_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, change_group); } +NETDEVICE_SHOW(group, fmt_dec); +static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store); + +static ssize_t phys_port_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; -static struct device_attribute net_class_attributes[] = { - __ATTR(addr_assign_type, S_IRUGO, show_addr_assign_type, NULL), - __ATTR(addr_len, S_IRUGO, show_addr_len, NULL), - __ATTR(dev_id, S_IRUGO, show_dev_id, NULL), - __ATTR(ifalias, S_IRUGO | S_IWUSR, show_ifalias, store_ifalias), - __ATTR(iflink, S_IRUGO, show_iflink, NULL), - __ATTR(ifindex, S_IRUGO, show_ifindex, NULL), - __ATTR(type, S_IRUGO, show_type, NULL), - __ATTR(link_mode, S_IRUGO, show_link_mode, NULL), - __ATTR(address, S_IRUGO, show_address, NULL), - __ATTR(broadcast, S_IRUGO, show_broadcast, NULL), - __ATTR(carrier, S_IRUGO, show_carrier, NULL), - __ATTR(speed, S_IRUGO, show_speed, NULL), - __ATTR(duplex, S_IRUGO, show_duplex, NULL), - __ATTR(dormant, S_IRUGO, show_dormant, NULL), - __ATTR(operstate, S_IRUGO, show_operstate, NULL), - __ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu), - __ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags), - __ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len, - store_tx_queue_len), - __ATTR(netdev_group, S_IRUGO | S_IWUSR, show_group, store_group), - {} + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) { + struct netdev_phys_port_id ppid; + + ret = dev_get_phys_port_id(netdev, &ppid); + if (!ret) + ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); + } + rtnl_unlock(); + + return ret; +} +static DEVICE_ATTR_RO(phys_port_id); + +static struct attribute *net_class_attrs[] = { + &dev_attr_netdev_group.attr, + &dev_attr_type.attr, + &dev_attr_dev_id.attr, + &dev_attr_dev_port.attr, + &dev_attr_iflink.attr, + &dev_attr_ifindex.attr, + &dev_attr_addr_assign_type.attr, + &dev_attr_addr_len.attr, + &dev_attr_link_mode.attr, + &dev_attr_address.attr, + &dev_attr_broadcast.attr, + &dev_attr_speed.attr, + &dev_attr_duplex.attr, + &dev_attr_dormant.attr, + &dev_attr_operstate.attr, + &dev_attr_carrier_changes.attr, + &dev_attr_ifalias.attr, + &dev_attr_carrier.attr, + &dev_attr_mtu.attr, + &dev_attr_flags.attr, + &dev_attr_tx_queue_len.attr, + &dev_attr_phys_port_id.attr, + NULL, }; +ATTRIBUTE_GROUPS(net_class); /* Show a given an attribute in the statistics group */ static ssize_t netstat_show(const struct device *d, @@ -368,13 +431,13 @@ static ssize_t netstat_show(const struct device *d, /* generate a read-only statistics attribute */ #define NETSTAT_ENTRY(name) \ -static ssize_t show_##name(struct device *d, \ +static ssize_t name##_show(struct device 
*d, \ struct device_attribute *attr, char *buf) \ { \ return netstat_show(d, attr, buf, \ offsetof(struct rtnl_link_stats64, name)); \ } \ -static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) +static DEVICE_ATTR_RO(name) NETSTAT_ENTRY(rx_packets); NETSTAT_ENTRY(tx_packets); @@ -443,19 +506,12 @@ static struct attribute_group wireless_group = { .attrs = wireless_attrs, }; #endif + +#else /* CONFIG_SYSFS */ +#define net_class_groups NULL #endif /* CONFIG_SYSFS */ -#ifdef CONFIG_RPS -/* - * RX queue sysfs structures and functions. - */ -struct rx_queue_attribute { - struct attribute attr; - ssize_t (*show)(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attr, char *buf); - ssize_t (*store)(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attr, const char *buf, size_t len); -}; +#ifdef CONFIG_SYSFS #define to_rx_queue_attr(_attr) container_of(_attr, \ struct rx_queue_attribute, attr) @@ -490,6 +546,7 @@ static const struct sysfs_ops rx_queue_sysfs_ops = { .store = rx_queue_attr_store, }; +#ifdef CONFIG_RPS static ssize_t show_rps_map(struct netdev_rx_queue *queue, struct rx_queue_attribute *attribute, char *buf) { @@ -592,21 +649,11 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, return sprintf(buf, "%lu\n", val); } -static void rps_dev_flow_table_release_work(struct work_struct *work) -{ - struct rps_dev_flow_table *table = container_of(work, - struct rps_dev_flow_table, free_work); - - vfree(table); -} - static void rps_dev_flow_table_release(struct rcu_head *rcu) { struct rps_dev_flow_table *table = container_of(rcu, struct rps_dev_flow_table, rcu); - - INIT_WORK(&table->free_work, rps_dev_flow_table_release_work); - schedule_work(&table->free_work); + vfree(table); } static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, @@ -633,8 +680,8 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, while ((mask | (mask >> 1)) != mask) mask |= (mask >> 1); /* On 64 bit arches, must check mask fits in table->mask (u32), - * and on 32bit arches, must check RPS_DEV_FLOW_TABLE_SIZE(mask + 1) - * doesnt overflow. + * and on 32bit arches, must check + * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow. 
*/ #if BITS_PER_LONG > 32 if (mask > (unsigned long)(u32)mask) @@ -675,16 +722,20 @@ static struct rx_queue_attribute rps_cpus_attribute = static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute = __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR, show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt); +#endif /* CONFIG_RPS */ static struct attribute *rx_queue_default_attrs[] = { +#ifdef CONFIG_RPS &rps_cpus_attribute.attr, &rps_dev_flow_table_cnt_attribute.attr, +#endif NULL }; static void rx_queue_release(struct kobject *kobj) { struct netdev_rx_queue *queue = to_rx_queue(kobj); +#ifdef CONFIG_RPS struct rps_map *map; struct rps_dev_flow_table *flow_table; @@ -700,15 +751,29 @@ static void rx_queue_release(struct kobject *kobj) RCU_INIT_POINTER(queue->rps_flow_table, NULL); call_rcu(&flow_table->rcu, rps_dev_flow_table_release); } +#endif memset(kobj, 0, sizeof(*kobj)); dev_put(queue->dev); } +static const void *rx_queue_namespace(struct kobject *kobj) +{ + struct netdev_rx_queue *queue = to_rx_queue(kobj); + struct device *dev = &queue->dev->dev; + const void *ns = NULL; + + if (dev->class && dev->class->ns_type) + ns = dev->class->namespace(dev); + + return ns; +} + static struct kobj_type rx_queue_ktype = { .sysfs_ops = &rx_queue_sysfs_ops, .release = rx_queue_release, .default_attrs = rx_queue_default_attrs, + .namespace = rx_queue_namespace }; static int rx_queue_add_kobject(struct net_device *net, int index) @@ -720,25 +785,36 @@ static int rx_queue_add_kobject(struct net_device *net, int index) kobj->kset = net->queues_kset; error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, "rx-%u", index); - if (error) { - kobject_put(kobj); - return error; + if (error) + goto exit; + + if (net->sysfs_rx_queue_group) { + error = sysfs_create_group(kobj, net->sysfs_rx_queue_group); + if (error) + goto exit; } kobject_uevent(kobj, KOBJ_ADD); dev_hold(queue->dev); return error; +exit: + kobject_put(kobj); + return error; } -#endif /* CONFIG_RPS */ +#endif /* CONFIG_SYSFS */ int net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) { -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS int i; int error = 0; +#ifndef CONFIG_RPS + if (!net->sysfs_rx_queue_group) + return 0; +#endif for (i = old_num; i < new_num; i++) { error = rx_queue_add_kobject(net, i); if (error) { @@ -747,8 +823,12 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) } } - while (--i >= new_num) + while (--i >= new_num) { + if (net->sysfs_rx_queue_group) + sysfs_remove_group(&net->_rx[i].kobj, + net->sysfs_rx_queue_group); kobject_put(&net->_rx[i].kobj); + } return error; #else @@ -929,15 +1009,12 @@ static struct attribute_group dql_group = { #endif /* CONFIG_BQL */ #ifdef CONFIG_XPS -static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) +static unsigned int get_netdev_queue_index(struct netdev_queue *queue) { struct net_device *dev = queue->dev; - int i; - - for (i = 0; i < dev->num_tx_queues; i++) - if (queue == &dev->_tx[i]) - break; + unsigned int i; + i = queue - dev->_tx; BUG_ON(i >= dev->num_tx_queues); return i; @@ -989,68 +1066,14 @@ static ssize_t show_xps_map(struct netdev_queue *queue, return len; } -static DEFINE_MUTEX(xps_map_mutex); -#define xmap_dereference(P) \ - rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) - -static void xps_queue_release(struct netdev_queue *queue) -{ - struct net_device *dev = queue->dev; - struct xps_dev_maps *dev_maps; - struct xps_map *map; - unsigned long index; - int i, pos, nonempty = 0; - - 
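/* Annotation: get_netdev_queue_index() drops its linear search; since
 * dev->_tx is one contiguous array, the index is plain pointer subtraction
 * (the compiler scales by the element size):
 *
 *	unsigned int i = queue - dev->_tx;
 *	BUG_ON(i >= dev->num_tx_queues);
 */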
index = get_netdev_queue_index(queue); - - mutex_lock(&xps_map_mutex); - dev_maps = xmap_dereference(dev->xps_maps); - - if (dev_maps) { - for_each_possible_cpu(i) { - map = xmap_dereference(dev_maps->cpu_map[i]); - if (!map) - continue; - - for (pos = 0; pos < map->len; pos++) - if (map->queues[pos] == index) - break; - - if (pos < map->len) { - if (map->len > 1) - map->queues[pos] = - map->queues[--map->len]; - else { - RCU_INIT_POINTER(dev_maps->cpu_map[i], - NULL); - kfree_rcu(map, rcu); - map = NULL; - } - } - if (map) - nonempty = 1; - } - - if (!nonempty) { - RCU_INIT_POINTER(dev->xps_maps, NULL); - kfree_rcu(dev_maps, rcu); - } - } - mutex_unlock(&xps_map_mutex); -} - static ssize_t store_xps_map(struct netdev_queue *queue, struct netdev_queue_attribute *attribute, const char *buf, size_t len) { struct net_device *dev = queue->dev; - cpumask_var_t mask; - int err, i, cpu, pos, map_len, alloc_len, need_set; unsigned long index; - struct xps_map *map, *new_map; - struct xps_dev_maps *dev_maps, *new_dev_maps; - int nonempty = 0; - int numa_node_id = -2; + cpumask_var_t mask; + int err; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -1066,105 +1089,11 @@ static ssize_t store_xps_map(struct netdev_queue *queue, return err; } - new_dev_maps = kzalloc(max_t(unsigned int, - XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES), GFP_KERNEL); - if (!new_dev_maps) { - free_cpumask_var(mask); - return -ENOMEM; - } - - mutex_lock(&xps_map_mutex); - - dev_maps = xmap_dereference(dev->xps_maps); - - for_each_possible_cpu(cpu) { - map = dev_maps ? - xmap_dereference(dev_maps->cpu_map[cpu]) : NULL; - new_map = map; - if (map) { - for (pos = 0; pos < map->len; pos++) - if (map->queues[pos] == index) - break; - map_len = map->len; - alloc_len = map->alloc_len; - } else - pos = map_len = alloc_len = 0; - - need_set = cpumask_test_cpu(cpu, mask) && cpu_online(cpu); -#ifdef CONFIG_NUMA - if (need_set) { - if (numa_node_id == -2) - numa_node_id = cpu_to_node(cpu); - else if (numa_node_id != cpu_to_node(cpu)) - numa_node_id = -1; - } -#endif - if (need_set && pos >= map_len) { - /* Need to add queue to this CPU's map */ - if (map_len >= alloc_len) { - alloc_len = alloc_len ? - 2 * alloc_len : XPS_MIN_MAP_ALLOC; - new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), - GFP_KERNEL, - cpu_to_node(cpu)); - if (!new_map) - goto error; - new_map->alloc_len = alloc_len; - for (i = 0; i < map_len; i++) - new_map->queues[i] = map->queues[i]; - new_map->len = map_len; - } - new_map->queues[new_map->len++] = index; - } else if (!need_set && pos < map_len) { - /* Need to remove queue from this CPU's map */ - if (map_len > 1) - new_map->queues[pos] = - new_map->queues[--new_map->len]; - else - new_map = NULL; - } - RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], new_map); - } - - /* Cleanup old maps */ - for_each_possible_cpu(cpu) { - map = dev_maps ? - xmap_dereference(dev_maps->cpu_map[cpu]) : NULL; - if (map && xmap_dereference(new_dev_maps->cpu_map[cpu]) != map) - kfree_rcu(map, rcu); - if (new_dev_maps->cpu_map[cpu]) - nonempty = 1; - } - - if (nonempty) { - rcu_assign_pointer(dev->xps_maps, new_dev_maps); - } else { - kfree(new_dev_maps); - RCU_INIT_POINTER(dev->xps_maps, NULL); - } - - if (dev_maps) - kfree_rcu(dev_maps, rcu); - - netdev_queue_numa_node_write(queue, (numa_node_id >= 0) ? 
numa_node_id : - NUMA_NO_NODE); - - mutex_unlock(&xps_map_mutex); + err = netif_set_xps_queue(dev, mask, index); free_cpumask_var(mask); - return len; -error: - mutex_unlock(&xps_map_mutex); - - if (new_dev_maps) - for_each_possible_cpu(i) - kfree(rcu_dereference_protected( - new_dev_maps->cpu_map[i], - 1)); - kfree(new_dev_maps); - free_cpumask_var(mask); - return -ENOMEM; + return err ? : len; } static struct netdev_queue_attribute xps_cpus_attribute = @@ -1183,18 +1112,27 @@ static void netdev_queue_release(struct kobject *kobj) { struct netdev_queue *queue = to_netdev_queue(kobj); -#ifdef CONFIG_XPS - xps_queue_release(queue); -#endif - memset(kobj, 0, sizeof(*kobj)); dev_put(queue->dev); } +static const void *netdev_queue_namespace(struct kobject *kobj) +{ + struct netdev_queue *queue = to_netdev_queue(kobj); + struct device *dev = &queue->dev->dev; + const void *ns = NULL; + + if (dev->class && dev->class->ns_type) + ns = dev->class->namespace(dev); + + return ns; +} + static struct kobj_type netdev_queue_ktype = { .sysfs_ops = &netdev_queue_sysfs_ops, .release = netdev_queue_release, .default_attrs = netdev_queue_default_attrs, + .namespace = netdev_queue_namespace, }; static int netdev_queue_add_kobject(struct net_device *net, int index) @@ -1264,9 +1202,6 @@ static int register_queue_kobjects(struct net_device *net) NULL, &net->dev.kobj); if (!net->queues_kset) return -ENOMEM; -#endif - -#ifdef CONFIG_RPS real_rx = net->real_num_rx_queues; #endif real_tx = net->real_num_tx_queues; @@ -1293,7 +1228,7 @@ static void remove_queue_kobjects(struct net_device *net) { int real_rx = 0, real_tx = 0; -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS real_rx = net->real_num_rx_queues; #endif real_tx = net->real_num_tx_queues; @@ -1305,6 +1240,13 @@ static void remove_queue_kobjects(struct net_device *net) #endif } +static bool net_current_may_mount(void) +{ + struct net *net = current->nsproxy->net_ns; + + return ns_capable(net->user_ns, CAP_SYS_ADMIN); +} + static void *net_grab_current_ns(void) { struct net *ns = current->nsproxy->net_ns; @@ -1327,6 +1269,7 @@ static const void *net_netlink_ns(struct sock *sk) struct kobj_ns_type_operations net_ns_type_operations = { .type = KOBJ_NS_TYPE_NET, + .current_may_mount = net_current_may_mount, .grab_current_ns = net_grab_current_ns, .netlink_ns = net_netlink_ns, .initial_ns = net_initial_ns, @@ -1364,7 +1307,7 @@ static void netdev_release(struct device *d) BUG_ON(dev->reg_state != NETREG_RELEASED); kfree(dev->ifalias); - kfree((char *)dev - dev->padded); + netdev_freemem(dev); } static const void *net_namespace(struct device *d) @@ -1377,9 +1320,7 @@ static const void *net_namespace(struct device *d) static struct class net_class = { .name = "net", .dev_release = netdev_release, -#ifdef CONFIG_SYSFS - .dev_attrs = net_class_attributes, -#endif /* CONFIG_SYSFS */ + .dev_groups = net_class_groups, .dev_uevent = netdev_uevent, .ns_type = &net_ns_type_operations, .namespace = net_namespace, @@ -1396,6 +1337,8 @@ void netdev_unregister_kobject(struct net_device * net) remove_queue_kobjects(net); + pm_runtime_set_memalloc_noio(dev, false); + device_del(dev); } @@ -1440,22 +1383,26 @@ int netdev_register_kobject(struct net_device *net) return error; } + pm_runtime_set_memalloc_noio(dev, true); + return error; } -int netdev_class_create_file(struct class_attribute *class_attr) +int netdev_class_create_file_ns(struct class_attribute *class_attr, + const void *ns) { - return class_create_file(&net_class, class_attr); + return class_create_file_ns(&net_class, 
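/* Annotation: the open-coded XPS map rebuild deleted above —
 * xps_queue_release() and most of store_xps_map() — moved into the core
 * helper netif_set_xps_queue() in net/core/dev.c, so the sysfs store is now
 * parse-and-delegate. Note "err ? : len" is the GNU ?: extension; the
 * portable equivalent is:
 *
 *	return err ? err : len;
 */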
class_attr, ns); } -EXPORT_SYMBOL(netdev_class_create_file); +EXPORT_SYMBOL(netdev_class_create_file_ns); -void netdev_class_remove_file(struct class_attribute *class_attr) +void netdev_class_remove_file_ns(struct class_attribute *class_attr, + const void *ns) { - class_remove_file(&net_class, class_attr); + class_remove_file_ns(&net_class, class_attr, ns); } -EXPORT_SYMBOL(netdev_class_remove_file); +EXPORT_SYMBOL(netdev_class_remove_file_ns); -int netdev_kobject_init(void) +int __init netdev_kobject_init(void) { kobj_ns_type_register(&net_ns_type_operations); return class_register(&net_class); diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h index bd7751ec1c4..2745a1b51e0 100644 --- a/net/core/net-sysfs.h +++ b/net/core/net-sysfs.h @@ -1,7 +1,7 @@ #ifndef __NET_SYSFS_H__ #define __NET_SYSFS_H__ -int netdev_kobject_init(void); +int __init netdev_kobject_init(void); int netdev_register_kobject(struct net_device *); void netdev_unregister_kobject(struct net_device *); int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 8acce01b6da..85b62691f4f 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -10,7 +10,8 @@ #include <linux/idr.h> #include <linux/rculist.h> #include <linux/nsproxy.h> -#include <linux/proc_fs.h> +#include <linux/fs.h> +#include <linux/proc_ns.h> #include <linux/file.h> #include <linux/export.h> #include <linux/user_namespace.h> @@ -23,7 +24,7 @@ static LIST_HEAD(pernet_list); static struct list_head *first_device = &pernet_list; -static DEFINE_MUTEX(net_mutex); +DEFINE_MUTEX(net_mutex); LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); @@ -272,7 +273,7 @@ static void cleanup_net(struct work_struct *work) { const struct pernet_operations *ops; struct net *net, *tmp; - LIST_HEAD(net_kill_list); + struct list_head net_kill_list; LIST_HEAD(net_exit_list); /* Atomically snapshot the list of namespaces to cleanup */ @@ -336,7 +337,7 @@ EXPORT_SYMBOL_GPL(__put_net); struct net *get_net_ns_by_fd(int fd) { - struct proc_inode *ei; + struct proc_ns *ei; struct file *file; struct net *net; @@ -344,7 +345,7 @@ struct net *get_net_ns_by_fd(int fd) if (IS_ERR(file)) return ERR_CAST(file); - ei = PROC_I(file->f_dentry->d_inode); + ei = get_proc_ns(file_inode(file)); if (ei->ns_ops == &netns_operations) net = get_net(ei->ns); else @@ -650,7 +651,7 @@ static int netns_install(struct nsproxy *nsproxy, void *ns) struct net *net = ns; if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) || - !nsown_capable(CAP_SYS_ADMIN)) + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; put_net(nsproxy->net_ns); diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c new file mode 100644 index 00000000000..30d903b19c6 --- /dev/null +++ b/net/core/netclassid_cgroup.c @@ -0,0 +1,111 @@ +/* + * net/core/netclassid_cgroup.c Classid Cgroupfs Handling + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf <tgraf@suug.ch> + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/cgroup.h> +#include <linux/fdtable.h> +#include <net/cls_cgroup.h> +#include <net/sock.h> + +static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css) +{ + return css ? 
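
css_cls_state() is the stock container_of() accessor from the generic cgroup_subsys_state embedded in a subsystem-private structure back to that structure; the NULL check lets a missing css pass straight through. Consumers such as the cls_cgroup packet classifier reach it via task_cls_state(); a minimal read-side sketch, assuming RCU protects the css lookup:

    static u32 current_classid_sketch(void)
    {
            u32 classid;

            rcu_read_lock();        /* pins the css returned by task_css() */
            classid = task_cls_state(current)->classid;
            rcu_read_unlock();

            return classid;
    }
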
container_of(css, struct cgroup_cls_state, css) : NULL; +} + +struct cgroup_cls_state *task_cls_state(struct task_struct *p) +{ + return css_cls_state(task_css(p, net_cls_cgrp_id)); +} +EXPORT_SYMBOL_GPL(task_cls_state); + +static struct cgroup_subsys_state * +cgrp_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct cgroup_cls_state *cs; + + cs = kzalloc(sizeof(*cs), GFP_KERNEL); + if (!cs) + return ERR_PTR(-ENOMEM); + + return &cs->css; +} + +static int cgrp_css_online(struct cgroup_subsys_state *css) +{ + struct cgroup_cls_state *cs = css_cls_state(css); + struct cgroup_cls_state *parent = css_cls_state(css->parent); + + if (parent) + cs->classid = parent->classid; + + return 0; +} + +static void cgrp_css_free(struct cgroup_subsys_state *css) +{ + kfree(css_cls_state(css)); +} + +static int update_classid(const void *v, struct file *file, unsigned n) +{ + int err; + struct socket *sock = sock_from_file(file, &err); + + if (sock) + sock->sk->sk_classid = (u32)(unsigned long)v; + + return 0; +} + +static void cgrp_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + struct cgroup_cls_state *cs = css_cls_state(css); + void *v = (void *)(unsigned long)cs->classid; + struct task_struct *p; + + cgroup_taskset_for_each(p, tset) { + task_lock(p); + iterate_fd(p->files, 0, update_classid, v); + task_unlock(p); + } +} + +static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return css_cls_state(css)->classid; +} + +static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, + u64 value) +{ + css_cls_state(css)->classid = (u32) value; + + return 0; +} + +static struct cftype ss_files[] = { + { + .name = "classid", + .read_u64 = read_classid, + .write_u64 = write_classid, + }, + { } /* terminate */ +}; + +struct cgroup_subsys net_cls_cgrp_subsys = { + .css_alloc = cgrp_css_alloc, + .css_online = cgrp_css_online, + .css_free = cgrp_css_free, + .attach = cgrp_attach, + .base_cftypes = ss_files, +}; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 3151acf5ec1..e33937fb32a 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/moduleparam.h> +#include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/string.h> @@ -29,6 +30,9 @@ #include <linux/if_vlan.h> #include <net/tcp.h> #include <net/udp.h> +#include <net/addrconf.h> +#include <net/ndisc.h> +#include <net/ip6_checksum.h> #include <asm/unaligned.h> #include <trace/events/napi.h> @@ -42,11 +46,9 @@ static struct sk_buff_head skb_pool; -static atomic_t trapped; +DEFINE_STATIC_SRCU(netpoll_srcu); #define USEC_PER_POLL 50 -#define NETPOLL_RX_ENABLED 1 -#define NETPOLL_RX_DROP 2 #define MAX_SKB_SIZE \ (sizeof(struct ethhdr) + \ @@ -55,7 +57,7 @@ static atomic_t trapped; MAX_UDP_CHUNK) static void zap_completion_queue(void); -static void netpoll_arp_reply(struct sk_buff *skb, struct netpoll_info *npinfo); +static void netpoll_async_cleanup(struct work_struct *work); static unsigned int carrier_timeout = 4; module_param(carrier_timeout, uint, 0644); @@ -67,6 +69,37 @@ module_param(carrier_timeout, uint, 0644); #define np_notice(np, fmt, ...) 
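
Note the problem cgrp_attach() above is solving: sockets opened before a task migrated would keep the stale classid, so the attach path walks every open fd with iterate_fd() and re-tags the ones that resolve to sockets. The callback contract, in sketch form with a hypothetical name:

    /* invoked once per fd; 'v' smuggles the new classid through a void *;
     * returning 0 keeps the walk going */
    static int retag_fd_sketch(const void *v, struct file *file, unsigned int fd)
    {
            int err;
            struct socket *sock = sock_from_file(file, &err);

            if (sock)       /* anything that is not a socket is skipped */
                    sock->sk->sk_classid = (u32)(unsigned long)v;

            return 0;
    }

    /* under task_lock(p):
     *         iterate_fd(p->files, 0, retag_fd_sketch, v);
     */
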
\ pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) +static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int status = NETDEV_TX_OK; + netdev_features_t features; + + features = netif_skb_features(skb); + + if (vlan_tx_tag_present(skb) && + !vlan_hw_offload_capable(features, skb->vlan_proto)) { + skb = __vlan_put_tag(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); + if (unlikely(!skb)) { + /* This is actually a packet drop, but we + * don't want the code that calls this + * function to try and operate on a NULL skb. + */ + goto out; + } + skb->vlan_tci = 0; + } + + status = ops->ndo_start_xmit(skb, dev); + if (status == NETDEV_TX_OK) + txq_trans_update(txq); + +out: + return status; +} + static void queue_process(struct work_struct *work) { struct netpoll_info *npinfo = @@ -76,51 +109,31 @@ static void queue_process(struct work_struct *work) while ((skb = skb_dequeue(&npinfo->txq))) { struct net_device *dev = skb->dev; - const struct net_device_ops *ops = dev->netdev_ops; struct netdev_queue *txq; if (!netif_device_present(dev) || !netif_running(dev)) { - __kfree_skb(skb); + kfree_skb(skb); continue; } txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); local_irq_save(flags); - __netif_tx_lock(txq, smp_processor_id()); + HARD_TX_LOCK(dev, txq, smp_processor_id()); if (netif_xmit_frozen_or_stopped(txq) || - ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) { + netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) { skb_queue_head(&npinfo->txq, skb); - __netif_tx_unlock(txq); + HARD_TX_UNLOCK(dev, txq); local_irq_restore(flags); schedule_delayed_work(&npinfo->tx_work, HZ/10); return; } - __netif_tx_unlock(txq); + HARD_TX_UNLOCK(dev, txq); local_irq_restore(flags); } } -static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh, - unsigned short ulen, __be32 saddr, __be32 daddr) -{ - __wsum psum; - - if (uh->check == 0 || skb_csum_unnecessary(skb)) - return 0; - - psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); - - if (skb->ip_summed == CHECKSUM_COMPLETE && - !csum_fold(csum_add(psum, skb->csum))) - return 0; - - skb->csum = psum; - - return __skb_checksum_complete(skb); -} - /* * Check whether delayed processing was scheduled for our NIC. If so, * we attempt to grab the poll lock and use ->poll() to pump the card. @@ -131,14 +144,8 @@ static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh, * trylock here and interrupts are already disabled in the softirq * case. Further, we test the poll_owner to avoid recursion on UP * systems where the lock doesn't exist. - * - * In cases where there is bi-directional communications, reading only - * one message at a time can lead to packets being dropped by the - * network adapter, forcing superfluous retries and possibly timeouts. - * Thus, we set our budget to greater than 1. 
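
netpoll_start_xmit() centralizes a fallback that both transmit paths previously open-coded: when the device cannot hardware-insert a VLAN tag for this protocol, the tag is pushed into the frame in software before ->ndo_start_xmit(). The heart of it, assuming this era's __vlan_put_tag(), which frees the skb on failure:

    if (vlan_tx_tag_present(skb) &&
        !vlan_hw_offload_capable(netif_skb_features(skb), skb->vlan_proto)) {
            skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb));
            if (unlikely(!skb))
                    return NETDEV_TX_OK;    /* already freed: a quiet drop */
            skb->vlan_tci = 0;      /* the tag now lives in the payload */
    }
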
*/ -static int poll_one_napi(struct netpoll_info *npinfo, - struct napi_struct *napi, int budget) +static int poll_one_napi(struct napi_struct *napi, int budget) { int work; @@ -149,82 +156,88 @@ static int poll_one_napi(struct netpoll_info *npinfo, if (!test_bit(NAPI_STATE_SCHED, &napi->state)) return budget; - npinfo->rx_flags |= NETPOLL_RX_DROP; - atomic_inc(&trapped); set_bit(NAPI_STATE_NPSVC, &napi->state); work = napi->poll(napi, budget); + WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll); trace_napi_poll(napi); clear_bit(NAPI_STATE_NPSVC, &napi->state); - atomic_dec(&trapped); - npinfo->rx_flags &= ~NETPOLL_RX_DROP; return budget - work; } -static void poll_napi(struct net_device *dev) +static void poll_napi(struct net_device *dev, int budget) { struct napi_struct *napi; - int budget = 16; list_for_each_entry(napi, &dev->napi_list, dev_list) { if (napi->poll_owner != smp_processor_id() && spin_trylock(&napi->poll_lock)) { - budget = poll_one_napi(rcu_dereference_bh(dev->npinfo), - napi, budget); + budget = poll_one_napi(napi, budget); spin_unlock(&napi->poll_lock); - - if (!budget) - break; } } } -static void service_arp_queue(struct netpoll_info *npi) -{ - if (npi) { - struct sk_buff *skb; - - while ((skb = skb_dequeue(&npi->arp_tx))) - netpoll_arp_reply(skb, npi); - } -} - static void netpoll_poll_dev(struct net_device *dev) { const struct net_device_ops *ops; struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); + int budget = 0; + + /* Don't do any rx activity if the dev_lock mutex is held + * the dev_open/close paths use this to block netpoll activity + * while changing device state + */ + if (down_trylock(&ni->dev_lock)) + return; - if (!dev || !netif_running(dev)) + if (!netif_running(dev)) { + up(&ni->dev_lock); return; + } ops = dev->netdev_ops; - if (!ops->ndo_poll_controller) + if (!ops->ndo_poll_controller) { + up(&ni->dev_lock); return; + } /* Process pending work on NIC */ ops->ndo_poll_controller(dev); - poll_napi(dev); - - if (dev->flags & IFF_SLAVE) { - if (ni) { - struct net_device *bond_dev = dev->master; - struct sk_buff *skb; - struct netpoll_info *bond_ni = rcu_dereference_bh(bond_dev->npinfo); - while ((skb = skb_dequeue(&ni->arp_tx))) { - skb->dev = bond_dev; - skb_queue_tail(&bond_ni->arp_tx, skb); - } - } - } + poll_napi(dev, budget); - service_arp_queue(ni); + up(&ni->dev_lock); zap_completion_queue(); } +void netpoll_poll_disable(struct net_device *dev) +{ + struct netpoll_info *ni; + int idx; + might_sleep(); + idx = srcu_read_lock(&netpoll_srcu); + ni = srcu_dereference(dev->npinfo, &netpoll_srcu); + if (ni) + down(&ni->dev_lock); + srcu_read_unlock(&netpoll_srcu, idx); +} +EXPORT_SYMBOL(netpoll_poll_disable); + +void netpoll_poll_enable(struct net_device *dev) +{ + struct netpoll_info *ni; + rcu_read_lock(); + ni = rcu_dereference(dev->npinfo); + if (ni) + up(&ni->dev_lock); + rcu_read_unlock(); +} +EXPORT_SYMBOL(netpoll_poll_enable); + static void refill_skbs(void) { struct sk_buff *skb; @@ -257,7 +270,7 @@ static void zap_completion_queue(void) while (clist != NULL) { struct sk_buff *skb = clist; clist = clist->next; - if (skb->destructor) { + if (!skb_irq_freeable(skb)) { atomic_inc(&skb->users); dev_kfree_skb_any(skb); /* put this one back */ } else { @@ -312,7 +325,6 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, { int status = NETDEV_TX_BUSY; unsigned long tries; - const struct net_device_ops *ops = dev->netdev_ops; /* It is up to the caller to keep npinfo alive. 
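
The trapped counter and NETPOLL_RX_* flags give way to a two-part fence: ni->dev_lock is a semaphore the open/close paths hold while device state is in flux, and netpoll_srcu is SRCU rather than plain RCU because the disable side may sleep on that semaphore inside its read-side section, which plain RCU forbids. The expected pairing, in sketch form:

    /* device-state paths (dev_open/dev_close), process context: */
    netpoll_poll_disable(dev);      /* may sleep on ni->dev_lock */
    /* ... change device state ... */
    netpoll_poll_enable(dev);

    /* poll path, atomic context, hence the trylock seen above: */
    if (down_trylock(&ni->dev_lock))
            return;         /* an open/close is in flight: skip this poll */
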
*/ struct netpoll_info *npinfo; @@ -320,7 +332,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, npinfo = rcu_dereference_bh(np->dev->npinfo); if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { - __kfree_skb(skb); + dev_kfree_skb_irq(skb); return; } @@ -328,26 +340,16 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { struct netdev_queue *txq; - txq = netdev_pick_tx(dev, skb); + txq = netdev_pick_tx(dev, skb, NULL); /* try until next clock tick */ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { - if (__netif_tx_trylock(txq)) { - if (!netif_xmit_stopped(txq)) { - if (vlan_tx_tag_present(skb) && - !(netif_skb_features(skb) & NETIF_F_HW_VLAN_TX)) { - skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); - if (unlikely(!skb)) - break; - skb->vlan_tci = 0; - } - - status = ops->ndo_start_xmit(skb, dev); - if (status == NETDEV_TX_OK) - txq_trans_update(txq); - } - __netif_tx_unlock(txq); + if (HARD_TX_TRYLOCK(dev, txq)) { + if (!netif_xmit_stopped(txq)) + status = netpoll_start_xmit(skb, dev, txq); + + HARD_TX_UNLOCK(dev, txq); if (status == NETDEV_TX_OK) break; @@ -362,7 +364,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, WARN_ONCE(!irqs_disabled(), "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n", - dev->name, ops->ndo_start_xmit); + dev->name, dev->netdev_ops->ndo_start_xmit); } @@ -381,9 +383,14 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) struct iphdr *iph; struct ethhdr *eth; static atomic_t ip_ident; + struct ipv6hdr *ip6h; udp_len = len + sizeof(*udph); - ip_len = udp_len + sizeof(*iph); + if (np->ipv6) + ip_len = udp_len + sizeof(*ip6h); + else + ip_len = udp_len + sizeof(*iph); + total_len = ip_len + LL_RESERVED_SPACE(np->dev); skb = find_skb(np, total_len + np->dev->needed_tailroom, @@ -400,275 +407,117 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) udph->source = htons(np->local_port); udph->dest = htons(np->remote_port); udph->len = htons(udp_len); - udph->check = 0; - udph->check = csum_tcpudp_magic(np->local_ip, - np->remote_ip, - udp_len, IPPROTO_UDP, - csum_partial(udph, udp_len, 0)); - if (udph->check == 0) - udph->check = CSUM_MANGLED_0; - - skb_push(skb, sizeof(*iph)); - skb_reset_network_header(skb); - iph = ip_hdr(skb); - - /* iph->version = 4; iph->ihl = 5; */ - put_unaligned(0x45, (unsigned char *)iph); - iph->tos = 0; - put_unaligned(htons(ip_len), &(iph->tot_len)); - iph->id = htons(atomic_inc_return(&ip_ident)); - iph->frag_off = 0; - iph->ttl = 64; - iph->protocol = IPPROTO_UDP; - iph->check = 0; - put_unaligned(np->local_ip, &(iph->saddr)); - put_unaligned(np->remote_ip, &(iph->daddr)); - iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); - - eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb->protocol = eth->h_proto = htons(ETH_P_IP); - memcpy(eth->h_source, np->dev->dev_addr, ETH_ALEN); - memcpy(eth->h_dest, np->remote_mac, ETH_ALEN); - - skb->dev = np->dev; - - netpoll_send_skb(np, skb); -} -EXPORT_SYMBOL(netpoll_send_udp); - -static void netpoll_arp_reply(struct sk_buff *skb, struct netpoll_info *npinfo) -{ - struct arphdr *arp; - unsigned char *arp_ptr; - int size, type = ARPOP_REPLY, ptype = ETH_P_ARP; - __be32 sip, tip; - unsigned char *sha; - struct sk_buff *send_skb; - struct netpoll *np, *tmp; - unsigned long flags; - int hlen, tlen; - int hits = 0; - - if 
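
The send path remains a bounded busy-wait: roughly one jiffy of attempts spaced USEC_PER_POLL apart, never sleeping, since netpoll may be emitting console output from hard IRQ context. What changes is only that the VLAN and xmit details collapse into netpoll_start_xmit() behind HARD_TX_TRYLOCK. The loop's shape:

    for (tries = jiffies_to_usecs(1) / USEC_PER_POLL; tries > 0; --tries) {
            if (HARD_TX_TRYLOCK(dev, txq)) {
                    if (!netif_xmit_stopped(txq))
                            status = netpoll_start_xmit(skb, dev, txq);
                    HARD_TX_UNLOCK(dev, txq);
                    if (status == NETDEV_TX_OK)
                            break;
            }
            /* the full function also re-polls the NIC between attempts
             * so completed descriptors can free up ring space */
            udelay(USEC_PER_POLL);
    }
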
(list_empty(&npinfo->rx_np)) - return; - - /* Before checking the packet, we do some early - inspection whether this is interesting at all */ - spin_lock_irqsave(&npinfo->rx_lock, flags); - list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { - if (np->dev == skb->dev) - hits++; - } - spin_unlock_irqrestore(&npinfo->rx_lock, flags); - - /* No netpoll struct is using this dev */ - if (!hits) - return; - - /* No arp on this interface */ - if (skb->dev->flags & IFF_NOARP) - return; - - if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) - return; - - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - arp = arp_hdr(skb); - - if ((arp->ar_hrd != htons(ARPHRD_ETHER) && - arp->ar_hrd != htons(ARPHRD_IEEE802)) || - arp->ar_pro != htons(ETH_P_IP) || - arp->ar_op != htons(ARPOP_REQUEST)) - return; - - arp_ptr = (unsigned char *)(arp+1); - /* save the location of the src hw addr */ - sha = arp_ptr; - arp_ptr += skb->dev->addr_len; - memcpy(&sip, arp_ptr, 4); - arp_ptr += 4; - /* If we actually cared about dst hw addr, - it would get copied here */ - arp_ptr += skb->dev->addr_len; - memcpy(&tip, arp_ptr, 4); - - /* Should we ignore arp? */ - if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) - return; - - size = arp_hdr_len(skb->dev); - - spin_lock_irqsave(&npinfo->rx_lock, flags); - list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { - if (tip != np->local_ip) - continue; - - hlen = LL_RESERVED_SPACE(np->dev); - tlen = np->dev->needed_tailroom; - send_skb = find_skb(np, size + hlen + tlen, hlen); - if (!send_skb) - continue; - - skb_reset_network_header(send_skb); - arp = (struct arphdr *) skb_put(send_skb, size); - send_skb->dev = skb->dev; - send_skb->protocol = htons(ETH_P_ARP); - - /* Fill the device header for the ARP frame */ - if (dev_hard_header(send_skb, skb->dev, ptype, - sha, np->dev->dev_addr, - send_skb->len) < 0) { - kfree_skb(send_skb); - continue; - } - - /* - * Fill out the arp protocol part. - * - * we only support ethernet device type, - * which (according to RFC 1390) should - * always equal 1 (Ethernet). 
- */ - - arp->ar_hrd = htons(np->dev->type); - arp->ar_pro = htons(ETH_P_IP); - arp->ar_hln = np->dev->addr_len; - arp->ar_pln = 4; - arp->ar_op = htons(type); - - arp_ptr = (unsigned char *)(arp + 1); - memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); - arp_ptr += np->dev->addr_len; - memcpy(arp_ptr, &tip, 4); - arp_ptr += 4; - memcpy(arp_ptr, sha, np->dev->addr_len); - arp_ptr += np->dev->addr_len; - memcpy(arp_ptr, &sip, 4); - - netpoll_send_skb(np, send_skb); - - /* If there are several rx_hooks for the same address, - we're fine by sending a single reply */ - break; - } - spin_unlock_irqrestore(&npinfo->rx_lock, flags); -} - -int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo) -{ - int proto, len, ulen; - int hits = 0; - const struct iphdr *iph; - struct udphdr *uh; - struct netpoll *np, *tmp; - - if (list_empty(&npinfo->rx_np)) - goto out; - - if (skb->dev->type != ARPHRD_ETHER) - goto out; - - /* check if netpoll clients need ARP */ - if (skb->protocol == htons(ETH_P_ARP) && - atomic_read(&trapped)) { - skb_queue_tail(&npinfo->arp_tx, skb); - return 1; - } - - if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { - skb = vlan_untag(skb); - if (unlikely(!skb)) - goto out; - } - - proto = ntohs(eth_hdr(skb)->h_proto); - if (proto != ETH_P_IP) - goto out; - if (skb->pkt_type == PACKET_OTHERHOST) - goto out; - if (skb_shared(skb)) - goto out; - - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - goto out; - iph = (struct iphdr *)skb->data; - if (iph->ihl < 5 || iph->version != 4) - goto out; - if (!pskb_may_pull(skb, iph->ihl*4)) - goto out; - iph = (struct iphdr *)skb->data; - if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) - goto out; - - len = ntohs(iph->tot_len); - if (skb->len < len || len < iph->ihl*4) - goto out; - - /* - * Our transport medium may have padded the buffer out. - * Now We trim to the true length of the frame. 
- */ - if (pskb_trim_rcsum(skb, len)) - goto out; - - iph = (struct iphdr *)skb->data; - if (iph->protocol != IPPROTO_UDP) - goto out; - len -= iph->ihl*4; - uh = (struct udphdr *)(((char *)iph) + iph->ihl*4); - ulen = ntohs(uh->len); - - if (ulen != len) - goto out; - if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr)) - goto out; - - list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { - if (np->local_ip && np->local_ip != iph->daddr) - continue; - if (np->remote_ip && np->remote_ip != iph->saddr) - continue; - if (np->local_port && np->local_port != ntohs(uh->dest)) - continue; - - np->rx_hook(np, ntohs(uh->source), - (char *)(uh+1), - ulen - sizeof(struct udphdr)); - hits++; + if (np->ipv6) { + udph->check = 0; + udph->check = csum_ipv6_magic(&np->local_ip.in6, + &np->remote_ip.in6, + udp_len, IPPROTO_UDP, + csum_partial(udph, udp_len, 0)); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + + skb_push(skb, sizeof(*ip6h)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); + + /* ip6h->version = 6; ip6h->priority = 0; */ + put_unaligned(0x60, (unsigned char *)ip6h); + ip6h->flow_lbl[0] = 0; + ip6h->flow_lbl[1] = 0; + ip6h->flow_lbl[2] = 0; + + ip6h->payload_len = htons(sizeof(struct udphdr) + len); + ip6h->nexthdr = IPPROTO_UDP; + ip6h->hop_limit = 32; + ip6h->saddr = np->local_ip.in6; + ip6h->daddr = np->remote_ip.in6; + + eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + skb->protocol = eth->h_proto = htons(ETH_P_IPV6); + } else { + udph->check = 0; + udph->check = csum_tcpudp_magic(np->local_ip.ip, + np->remote_ip.ip, + udp_len, IPPROTO_UDP, + csum_partial(udph, udp_len, 0)); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + + skb_push(skb, sizeof(*iph)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + + /* iph->version = 4; iph->ihl = 5; */ + put_unaligned(0x45, (unsigned char *)iph); + iph->tos = 0; + put_unaligned(htons(ip_len), &(iph->tot_len)); + iph->id = htons(atomic_inc_return(&ip_ident)); + iph->frag_off = 0; + iph->ttl = 64; + iph->protocol = IPPROTO_UDP; + iph->check = 0; + put_unaligned(np->local_ip.ip, &(iph->saddr)); + put_unaligned(np->remote_ip.ip, &(iph->daddr)); + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + + eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + skb->protocol = eth->h_proto = htons(ETH_P_IP); } - if (!hits) - goto out; - - kfree_skb(skb); - return 1; + ether_addr_copy(eth->h_source, np->dev->dev_addr); + ether_addr_copy(eth->h_dest, np->remote_mac); -out: - if (atomic_read(&trapped)) { - kfree_skb(skb); - return 1; - } + skb->dev = np->dev; - return 0; + netpoll_send_skb(np, skb); } +EXPORT_SYMBOL(netpoll_send_udp); void netpoll_print_options(struct netpoll *np) { np_info(np, "local port %d\n", np->local_port); - np_info(np, "local IP %pI4\n", &np->local_ip); + if (np->ipv6) + np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6); + else + np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip); np_info(np, "interface '%s'\n", np->dev_name); np_info(np, "remote port %d\n", np->remote_port); - np_info(np, "remote IP %pI4\n", &np->remote_ip); + if (np->ipv6) + np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6); + else + np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip); np_info(np, "remote ethernet address %pM\n", np->remote_mac); } EXPORT_SYMBOL(netpoll_print_options); +static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr) +{ + const char *end; + + if (!strchr(str, ':') && + in4_pton(str, 
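
One detail in the new IPv6 branch is easy to miss: a UDP checksum of zero is legal over IPv4 (it means "no checksum") but forbidden over IPv6, so a computed sum of zero must be transmitted as 0xffff. CSUM_MANGLED_0 encodes exactly that substitution:

    udph->check = 0;
    udph->check = csum_ipv6_magic(&np->local_ip.in6, &np->remote_ip.in6,
                                  udp_len, IPPROTO_UDP,
                                  csum_partial(udph, udp_len, 0));
    if (udph->check == 0)
            udph->check = CSUM_MANGLED_0;   /* 0 would mean "no checksum" */
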
-1, (void *)addr, -1, &end) > 0) { + if (!*end) + return 0; + } + if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) { +#if IS_ENABLED(CONFIG_IPV6) + if (!*end) + return 1; +#else + return -1; +#endif + } + return -1; +} + int netpoll_parse_options(struct netpoll *np, char *opt) { char *cur=opt, *delim; + int ipv6; + bool ipversion_set = false; if (*cur != '@') { if ((delim = strchr(cur, '@')) == NULL) @@ -681,10 +530,15 @@ int netpoll_parse_options(struct netpoll *np, char *opt) cur++; if (*cur != '/') { + ipversion_set = true; if ((delim = strchr(cur, '/')) == NULL) goto parse_failed; *delim = 0; - np->local_ip = in_aton(cur); + ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip); + if (ipv6 < 0) + goto parse_failed; + else + np->ipv6 = (bool)ipv6; cur = delim; } cur++; @@ -716,7 +570,13 @@ int netpoll_parse_options(struct netpoll *np, char *opt) if ((delim = strchr(cur, '/')) == NULL) goto parse_failed; *delim = 0; - np->remote_ip = in_aton(cur); + ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip); + if (ipv6 < 0) + goto parse_failed; + else if (ipversion_set && np->ipv6 != (bool)ipv6) + goto parse_failed; + else + np->ipv6 = (bool)ipv6; cur = delim + 1; if (*cur != 0) { @@ -735,15 +595,15 @@ int netpoll_parse_options(struct netpoll *np, char *opt) } EXPORT_SYMBOL(netpoll_parse_options); -int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp) +int __netpoll_setup(struct netpoll *np, struct net_device *ndev) { struct netpoll_info *npinfo; const struct net_device_ops *ops; - unsigned long flags; int err; np->dev = ndev; strlcpy(np->dev_name, ndev->name, IFNAMSIZ); + INIT_WORK(&np->cleanup_work, netpoll_async_cleanup); if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) || !ndev->netdev_ops->ndo_poll_controller) { @@ -754,17 +614,13 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp) } if (!ndev->npinfo) { - npinfo = kmalloc(sizeof(*npinfo), gfp); + npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); if (!npinfo) { err = -ENOMEM; goto out; } - npinfo->rx_flags = 0; - INIT_LIST_HEAD(&npinfo->rx_np); - - spin_lock_init(&npinfo->rx_lock); - skb_queue_head_init(&npinfo->arp_tx); + sema_init(&npinfo->dev_lock, 1); skb_queue_head_init(&npinfo->txq); INIT_DELAYED_WORK(&npinfo->tx_work, queue_process); @@ -772,24 +628,17 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp) ops = np->dev->netdev_ops; if (ops->ndo_netpoll_setup) { - err = ops->ndo_netpoll_setup(ndev, npinfo, gfp); + err = ops->ndo_netpoll_setup(ndev, npinfo); if (err) goto free_npinfo; } } else { - npinfo = ndev->npinfo; + npinfo = rtnl_dereference(ndev->npinfo); atomic_inc(&npinfo->refcnt); } npinfo->netpoll = np; - if (np->rx_hook) { - spin_lock_irqsave(&npinfo->rx_lock, flags); - npinfo->rx_flags |= NETPOLL_RX_ENABLED; - list_add_tail(&np->rx, &npinfo->rx_np); - spin_unlock_irqrestore(&npinfo->rx_lock, flags); - } - /* last thing to do is link it to the net device structure */ rcu_assign_pointer(ndev->npinfo, npinfo); @@ -808,14 +657,19 @@ int netpoll_setup(struct netpoll *np) struct in_device *in_dev; int err; - if (np->dev_name) - ndev = dev_get_by_name(&init_net, np->dev_name); + rtnl_lock(); + if (np->dev_name) { + struct net *net = current->nsproxy->net_ns; + ndev = __dev_get_by_name(net, np->dev_name); + } if (!ndev) { np_err(np, "%s doesn't exist, aborting\n", np->dev_name); - return -ENODEV; + err = -ENODEV; + goto unlock; } + dev_hold(ndev); - if (ndev->master) { + if (netdev_master_upper_dev_get(ndev)) { np_err(np, "%s is a slave device, aborting\n", 
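
netpoll_parse_ip_addr() folds both address families into one helper with a three-way result: 0 for a parsed IPv4 address, 1 for IPv6, -1 on failure (including IPv6 text on a CONFIG_IPV6=n build). That return value is what lets the option parser reject a configuration whose local and remote addresses disagree on family:

    ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip);
    if (ipv6 < 0)
            goto parse_failed;      /* unparseable in either family */
    else if (ipversion_set && np->ipv6 != (bool)ipv6)
            goto parse_failed;      /* local v4 but remote v6, or vice versa */
    else
            np->ipv6 = (bool)ipv6;
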
np->dev_name); err = -EBUSY; goto put; @@ -826,15 +680,14 @@ int netpoll_setup(struct netpoll *np) np_info(np, "device %s not up yet, forcing it\n", np->dev_name); - rtnl_lock(); err = dev_open(ndev); - rtnl_unlock(); if (err) { np_err(np, "failed to open %s\n", ndev->name); goto put; } + rtnl_unlock(); atleast = jiffies + HZ/10; atmost = jiffies + carrier_timeout * HZ; while (!netif_carrier_ok(ndev)) { @@ -854,39 +707,70 @@ int netpoll_setup(struct netpoll *np) np_notice(np, "carrier detect appears untrustworthy, waiting 4 seconds\n"); msleep(4000); } + rtnl_lock(); } - if (!np->local_ip) { - rcu_read_lock(); - in_dev = __in_dev_get_rcu(ndev); + if (!np->local_ip.ip) { + if (!np->ipv6) { + in_dev = __in_dev_get_rtnl(ndev); + + if (!in_dev || !in_dev->ifa_list) { + np_err(np, "no IP address for %s, aborting\n", + np->dev_name); + err = -EDESTADDRREQ; + goto put; + } + + np->local_ip.ip = in_dev->ifa_list->ifa_local; + np_info(np, "local IP %pI4\n", &np->local_ip.ip); + } else { +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *idev; - if (!in_dev || !in_dev->ifa_list) { - rcu_read_unlock(); - np_err(np, "no IP address for %s, aborting\n", - np->dev_name); err = -EDESTADDRREQ; + idev = __in6_dev_get(ndev); + if (idev) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) + continue; + np->local_ip.in6 = ifp->addr; + err = 0; + break; + } + read_unlock_bh(&idev->lock); + } + if (err) { + np_err(np, "no IPv6 address for %s, aborting\n", + np->dev_name); + goto put; + } else + np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6); +#else + np_err(np, "IPv6 is not supported %s, aborting\n", + np->dev_name); + err = -EINVAL; goto put; +#endif } - - np->local_ip = in_dev->ifa_list->ifa_local; - rcu_read_unlock(); - np_info(np, "local IP %pI4\n", &np->local_ip); } /* fill up the skb queue */ refill_skbs(); - rtnl_lock(); - err = __netpoll_setup(np, ndev, GFP_KERNEL); - rtnl_unlock(); - + err = __netpoll_setup(np, ndev); if (err) goto put; + rtnl_unlock(); return 0; put: dev_put(ndev); +unlock: + rtnl_unlock(); return err; } EXPORT_SYMBOL(netpoll_setup); @@ -903,7 +787,6 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) struct netpoll_info *npinfo = container_of(rcu_head, struct netpoll_info, rcu); - skb_queue_purge(&npinfo->arp_tx); skb_queue_purge(&npinfo->txq); /* we can't call cancel_delayed_work_sync here, as we are in softirq */ @@ -919,19 +802,16 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) void __netpoll_cleanup(struct netpoll *np) { struct netpoll_info *npinfo; - unsigned long flags; - npinfo = np->dev->npinfo; + /* rtnl_dereference would be preferable here but + * rcu_cleanup_netpoll path can put us in here safely without + * holding the rtnl, so plain rcu_dereference it is + */ + npinfo = rtnl_dereference(np->dev->npinfo); if (!npinfo) return; - if (!list_empty(&npinfo->rx_np)) { - spin_lock_irqsave(&npinfo->rx_lock, flags); - list_del(&np->rx); - if (list_empty(&npinfo->rx_np)) - npinfo->rx_flags &= ~NETPOLL_RX_ENABLED; - spin_unlock_irqrestore(&npinfo->rx_lock, flags); - } + synchronize_srcu(&netpoll_srcu); if (atomic_dec_and_test(&npinfo->refcnt)) { const struct net_device_ops *ops; @@ -946,45 +826,31 @@ void __netpoll_cleanup(struct netpoll *np) } EXPORT_SYMBOL_GPL(__netpoll_cleanup); -static void rcu_cleanup_netpoll(struct rcu_head *rcu_head) +static void netpoll_async_cleanup(struct work_struct *work) { - struct netpoll *np = 
container_of(rcu_head, struct netpoll, rcu); + struct netpoll *np = container_of(work, struct netpoll, cleanup_work); + rtnl_lock(); __netpoll_cleanup(np); + rtnl_unlock(); kfree(np); } -void __netpoll_free_rcu(struct netpoll *np) +void __netpoll_free_async(struct netpoll *np) { - call_rcu_bh(&np->rcu, rcu_cleanup_netpoll); + schedule_work(&np->cleanup_work); } -EXPORT_SYMBOL_GPL(__netpoll_free_rcu); +EXPORT_SYMBOL_GPL(__netpoll_free_async); void netpoll_cleanup(struct netpoll *np) { - if (!np->dev) - return; - rtnl_lock(); + if (!np->dev) + goto out; __netpoll_cleanup(np); - rtnl_unlock(); - dev_put(np->dev); np->dev = NULL; +out: + rtnl_unlock(); } EXPORT_SYMBOL(netpoll_cleanup); - -int netpoll_trap(void) -{ - return atomic_read(&trapped); -} -EXPORT_SYMBOL(netpoll_trap); - -void netpoll_set_trap(int trap) -{ - if (trap) - atomic_inc(&trapped); - else - atomic_dec(&trapped); -} -EXPORT_SYMBOL(netpoll_set_trap); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 5e67defe2cb..2f385b9bccc 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -29,14 +29,8 @@ #define PRIOMAP_MIN_SZ 128 -static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp) -{ - return container_of(cgroup_subsys_state(cgrp, net_prio_subsys_id), - struct cgroup_netprio_state, css); -} - /* - * Extend @dev->priomap so that it's large enough to accomodate + * Extend @dev->priomap so that it's large enough to accommodate * @target_idx. @dev->priomap.priomap_len > @target_idx after successful * return. Must be called under rtnl lock. */ @@ -69,10 +63,8 @@ static int extend_netdev_table(struct net_device *dev, u32 target_idx) /* allocate & copy */ new = kzalloc(new_sz, GFP_KERNEL); - if (!new) { - pr_warn("Unable to alloc new priomap!\n"); + if (!new) return -ENOMEM; - } if (old) memcpy(new->priomap, old->priomap, @@ -89,67 +81,70 @@ static int extend_netdev_table(struct net_device *dev, u32 target_idx) /** * netprio_prio - return the effective netprio of a cgroup-net_device pair - * @cgrp: cgroup part of the target pair + * @css: css part of the target pair * @dev: net_device part of the target pair * * Should be called under RCU read or rtnl lock. */ -static u32 netprio_prio(struct cgroup *cgrp, struct net_device *dev) +static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev) { struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); + int id = css->cgroup->id; - if (map && cgrp->id < map->priomap_len) - return map->priomap[cgrp->id]; + if (map && id < map->priomap_len) + return map->priomap[id]; return 0; } /** * netprio_set_prio - set netprio on a cgroup-net_device pair - * @cgrp: cgroup part of the target pair + * @css: css part of the target pair * @dev: net_device part of the target pair * @prio: prio to set * - * Set netprio to @prio on @cgrp-@dev pair. Should be called under rtnl + * Set netprio to @prio on @css-@dev pair. Should be called under rtnl * lock and may fail under memory pressure for non-zero @prio. 
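
__netpoll_free_rcu() becoming __netpoll_free_async() is a locking consequence: __netpoll_cleanup() now wants the rtnl lock, and rtnl_lock() can sleep, which is illegal inside a call_rcu_bh() callback. Bouncing the free to process context through a work item is the standard escape hatch; the pattern, distilled with hypothetical names:

    struct deferred_obj {
            struct work_struct cleanup_work;
            /* ... payload ... */
    };

    static void deferred_cleanup(struct work_struct *work)
    {
            struct deferred_obj *obj = container_of(work, struct deferred_obj,
                                                    cleanup_work);

            rtnl_lock();    /* fine here, never fine in an RCU callback */
            /* ... teardown requiring rtnl ... */
            rtnl_unlock();
            kfree(obj);
    }

    /* setup:   INIT_WORK(&obj->cleanup_work, deferred_cleanup);
     * release: schedule_work(&obj->cleanup_work); */
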
*/ -static int netprio_set_prio(struct cgroup *cgrp, struct net_device *dev, - u32 prio) +static int netprio_set_prio(struct cgroup_subsys_state *css, + struct net_device *dev, u32 prio) { struct netprio_map *map; + int id = css->cgroup->id; int ret; /* avoid extending priomap for zero writes */ map = rtnl_dereference(dev->priomap); - if (!prio && (!map || map->priomap_len <= cgrp->id)) + if (!prio && (!map || map->priomap_len <= id)) return 0; - ret = extend_netdev_table(dev, cgrp->id); + ret = extend_netdev_table(dev, id); if (ret) return ret; map = rtnl_dereference(dev->priomap); - map->priomap[cgrp->id] = prio; + map->priomap[id] = prio; return 0; } -static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cgrp_css_alloc(struct cgroup_subsys_state *parent_css) { - struct cgroup_netprio_state *cs; + struct cgroup_subsys_state *css; - cs = kzalloc(sizeof(*cs), GFP_KERNEL); - if (!cs) + css = kzalloc(sizeof(*css), GFP_KERNEL); + if (!css) return ERR_PTR(-ENOMEM); - return &cs->css; + return css; } -static int cgrp_css_online(struct cgroup *cgrp) +static int cgrp_css_online(struct cgroup_subsys_state *css) { - struct cgroup *parent = cgrp->parent; + struct cgroup_subsys_state *parent_css = css->parent; struct net_device *dev; int ret = 0; - if (!parent) + if (!parent_css) return 0; rtnl_lock(); @@ -158,9 +153,9 @@ static int cgrp_css_online(struct cgroup *cgrp) * onlining, there is no need to clear them on offline. */ for_each_netdev(&init_net, dev) { - u32 prio = netprio_prio(parent, dev); + u32 prio = netprio_prio(parent_css, dev); - ret = netprio_set_prio(cgrp, dev, prio); + ret = netprio_set_prio(css, dev, prio); if (ret) break; } @@ -168,37 +163,37 @@ static int cgrp_css_online(struct cgroup *cgrp) return ret; } -static void cgrp_css_free(struct cgroup *cgrp) +static void cgrp_css_free(struct cgroup_subsys_state *css) { - kfree(cgrp_netprio_state(cgrp)); + kfree(css); } -static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) +static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) { - return cgrp->id; + return css->cgroup->id; } -static int read_priomap(struct cgroup *cont, struct cftype *cft, - struct cgroup_map_cb *cb) +static int read_priomap(struct seq_file *sf, void *v) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) - cb->fill(cb, dev->name, netprio_prio(cont, dev)); + seq_printf(sf, "%s %u\n", dev->name, + netprio_prio(seq_css(sf), dev)); rcu_read_unlock(); return 0; } -static int write_priomap(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +static ssize_t write_priomap(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { char devname[IFNAMSIZ + 1]; struct net_device *dev; u32 prio; int ret; - if (sscanf(buffer, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) + if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) return -EINVAL; dev = dev_get_by_name(&init_net, devname); @@ -207,11 +202,11 @@ static int write_priomap(struct cgroup *cgrp, struct cftype *cft, rtnl_lock(); - ret = netprio_set_prio(cgrp, dev, prio); + ret = netprio_set_prio(of_css(of), dev, prio); rtnl_unlock(); dev_put(dev); - return ret; + return ret ?: nbytes; } static int update_netprio(const void *v, struct file *file, unsigned n) @@ -223,14 +218,14 @@ static int update_netprio(const void *v, struct file *file, unsigned n) return 0; } -static void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void 
net_prio_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { struct task_struct *p; - void *v; + void *v = (void *)(unsigned long)css->cgroup->id; - cgroup_taskset_for_each(p, cgrp, tset) { + cgroup_taskset_for_each(p, tset) { task_lock(p); - v = (void *)(unsigned long)task_netprioidx(p); iterate_fd(p->files, 0, update_netprio, v); task_unlock(p); } @@ -243,27 +238,24 @@ static struct cftype ss_files[] = { }, { .name = "ifpriomap", - .read_map = read_priomap, - .write_string = write_priomap, + .seq_show = read_priomap, + .write = write_priomap, }, { } /* terminate */ }; -struct cgroup_subsys net_prio_subsys = { - .name = "net_prio", +struct cgroup_subsys net_prio_cgrp_subsys = { .css_alloc = cgrp_css_alloc, .css_online = cgrp_css_online, .css_free = cgrp_css_free, .attach = net_prio_attach, - .subsys_id = net_prio_subsys_id, .base_cftypes = ss_files, - .module = THIS_MODULE, }; static int netprio_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netprio_map *old; /* @@ -288,37 +280,9 @@ static struct notifier_block netprio_device_notifier = { static int __init init_cgroup_netprio(void) { - int ret; - - ret = cgroup_load_subsys(&net_prio_subsys); - if (ret) - goto out; - register_netdevice_notifier(&netprio_device_notifier); - -out: - return ret; -} - -static void __exit exit_cgroup_netprio(void) -{ - struct netprio_map *old; - struct net_device *dev; - - unregister_netdevice_notifier(&netprio_device_notifier); - - cgroup_unload_subsys(&net_prio_subsys); - - rtnl_lock(); - for_each_netdev(&init_net, dev) { - old = rtnl_dereference(dev->priomap); - RCU_INIT_POINTER(dev->priomap, NULL); - if (old) - kfree_rcu(old, rcu); - } - rtnl_unlock(); + return 0; } -module_init(init_cgroup_netprio); -module_exit(exit_cgroup_netprio); +subsys_initcall(init_cgroup_netprio); MODULE_LICENSE("GPL v2"); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index b29dacf900f..fc17a9d309a 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -160,10 +160,13 @@ #include <net/net_namespace.h> #include <net/checksum.h> #include <net/ipv6.h> +#include <net/udp.h> +#include <net/ip6_checksum.h> #include <net/addrconf.h> #ifdef CONFIG_XFRM #include <net/xfrm.h> #endif +#include <net/netns/generic.h> #include <asm/byteorder.h> #include <linux/rcupdate.h> #include <linux/bitops.h> @@ -197,6 +200,7 @@ #define F_QUEUE_MAP_RND (1<<13) /* queue map Random */ #define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */ #define F_NODE (1<<15) /* Node memory alloc*/ +#define F_UDPCSUM (1<<16) /* Include UDP checksum */ /* Thread control flag bits */ #define T_STOP (1<<0) /* Stop run */ @@ -212,7 +216,6 @@ #define PKTGEN_MAGIC 0xbe9be955 #define PG_PROC_DIR "pktgen" #define PGCTRL "pgctrl" -static struct proc_dir_entry *pg_proc_dir; #define MAX_CFLOWS 65536 @@ -386,6 +389,9 @@ struct pktgen_dev { #ifdef CONFIG_XFRM __u8 ipsmode; /* IPSEC mode (config) */ __u8 ipsproto; /* IPSEC type (config) */ + __u32 spi; + struct dst_entry dst; + struct dst_ops dstops; #endif char result[512]; }; @@ -397,7 +403,15 @@ struct pktgen_hdr { __be32 tv_usec; }; -static bool pktgen_exiting __read_mostly; + +static int pg_net_id __read_mostly; + +struct pktgen_net { + struct net *net; + struct proc_dir_entry *proc_dir; + struct list_head pktgen_threads; + bool pktgen_exiting; +}; struct pktgen_thread { spinlock_t if_lock; /* for list of devices */ @@ -414,6 +428,7 @@ struct pktgen_thread 
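
struct pktgen_net is where pktgen's former globals (proc dir, thread list, exiting flag) land, one instance per network namespace, reached through the pernet-generic mechanism: register a pernet_operations with .id and .size set and the core allocates and indexes the storage for every struct net. A minimal lifecycle sketch under those assumptions:

    static int example_net_id __read_mostly;

    struct example_net {
            struct list_head threads;
    };

    static int __net_init example_net_init(struct net *net)
    {
            struct example_net *en = net_generic(net, example_net_id);

            INIT_LIST_HEAD(&en->threads);
            return 0;
    }

    static struct pernet_operations example_net_ops = {
            .init = example_net_init,
            .id   = &example_net_id,            /* core hands back an index */
            .size = sizeof(struct example_net), /* core allocates per netns */
    };

    /* register_pernet_subsys(&example_net_ops); then, from any context
     * that knows its netns:
     *         struct example_net *en = net_generic(net, example_net_id);
     */
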
{ wait_queue_head_t queue; struct completion start_done; + struct pktgen_net *net; }; #define REMOVE 1 @@ -428,9 +443,9 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname); static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, const char *ifname, bool exact); static int pktgen_device_event(struct notifier_block *, unsigned long, void *); -static void pktgen_run_all_threads(void); -static void pktgen_reset_all_threads(void); -static void pktgen_stop_all_threads_ifs(void); +static void pktgen_run_all_threads(struct pktgen_net *pn); +static void pktgen_reset_all_threads(struct pktgen_net *pn); +static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn); static void pktgen_stop(struct pktgen_thread *t); static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); @@ -442,7 +457,6 @@ static int pg_clone_skb_d __read_mostly; static int debug __read_mostly; static DEFINE_MUTEX(pktgen_thread_lock); -static LIST_HEAD(pktgen_threads); static struct notifier_block pktgen_notifier_block = { .notifier_call = pktgen_device_event, @@ -462,44 +476,41 @@ static int pgctrl_show(struct seq_file *seq, void *v) static ssize_t pgctrl_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - int err = 0; char data[128]; + struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id); - if (!capable(CAP_NET_ADMIN)) { - err = -EPERM; - goto out; - } + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (count == 0) + return -EINVAL; if (count > sizeof(data)) count = sizeof(data); - if (copy_from_user(data, buf, count)) { - err = -EFAULT; - goto out; - } - data[count - 1] = 0; /* Make string */ + if (copy_from_user(data, buf, count)) + return -EFAULT; + + data[count - 1] = 0; /* Strip trailing '\n' and terminate string */ if (!strcmp(data, "stop")) - pktgen_stop_all_threads_ifs(); + pktgen_stop_all_threads_ifs(pn); else if (!strcmp(data, "start")) - pktgen_run_all_threads(); + pktgen_run_all_threads(pn); else if (!strcmp(data, "reset")) - pktgen_reset_all_threads(); + pktgen_reset_all_threads(pn); else pr_warning("Unknown command: %s\n", data); - err = count; - -out: - return err; + return count; } static int pgctrl_open(struct inode *inode, struct file *file) { - return single_open(file, pgctrl_show, PDE(inode)->data); + return single_open(file, pgctrl_show, PDE_DATA(inode)); } static const struct file_operations pktgen_fops = { @@ -562,7 +573,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) is_zero_ether_addr(pkt_dev->src_mac) ? pkt_dev->odev->dev_addr : pkt_dev->src_mac); - seq_printf(seq, "dst_mac: "); + seq_puts(seq, "dst_mac: "); seq_printf(seq, "%pM\n", pkt_dev->dst_mac); seq_printf(seq, @@ -577,7 +588,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->nr_labels) { unsigned int i; - seq_printf(seq, " mpls: "); + seq_puts(seq, " mpls: "); for (i = 0; i < pkt_dev->nr_labels; i++) seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]), i == pkt_dev->nr_labels-1 ? 
"\n" : ", "); @@ -602,61 +613,67 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->node >= 0) seq_printf(seq, " node: %d\n", pkt_dev->node); - seq_printf(seq, " Flags: "); + seq_puts(seq, " Flags: "); if (pkt_dev->flags & F_IPV6) - seq_printf(seq, "IPV6 "); + seq_puts(seq, "IPV6 "); if (pkt_dev->flags & F_IPSRC_RND) - seq_printf(seq, "IPSRC_RND "); + seq_puts(seq, "IPSRC_RND "); if (pkt_dev->flags & F_IPDST_RND) - seq_printf(seq, "IPDST_RND "); + seq_puts(seq, "IPDST_RND "); if (pkt_dev->flags & F_TXSIZE_RND) - seq_printf(seq, "TXSIZE_RND "); + seq_puts(seq, "TXSIZE_RND "); if (pkt_dev->flags & F_UDPSRC_RND) - seq_printf(seq, "UDPSRC_RND "); + seq_puts(seq, "UDPSRC_RND "); if (pkt_dev->flags & F_UDPDST_RND) - seq_printf(seq, "UDPDST_RND "); + seq_puts(seq, "UDPDST_RND "); + + if (pkt_dev->flags & F_UDPCSUM) + seq_puts(seq, "UDPCSUM "); if (pkt_dev->flags & F_MPLS_RND) - seq_printf(seq, "MPLS_RND "); + seq_puts(seq, "MPLS_RND "); if (pkt_dev->flags & F_QUEUE_MAP_RND) - seq_printf(seq, "QUEUE_MAP_RND "); + seq_puts(seq, "QUEUE_MAP_RND "); if (pkt_dev->flags & F_QUEUE_MAP_CPU) - seq_printf(seq, "QUEUE_MAP_CPU "); + seq_puts(seq, "QUEUE_MAP_CPU "); if (pkt_dev->cflows) { if (pkt_dev->flags & F_FLOW_SEQ) - seq_printf(seq, "FLOW_SEQ "); /*in sequence flows*/ + seq_puts(seq, "FLOW_SEQ "); /*in sequence flows*/ else - seq_printf(seq, "FLOW_RND "); + seq_puts(seq, "FLOW_RND "); } #ifdef CONFIG_XFRM - if (pkt_dev->flags & F_IPSEC_ON) - seq_printf(seq, "IPSEC "); + if (pkt_dev->flags & F_IPSEC_ON) { + seq_puts(seq, "IPSEC "); + if (pkt_dev->spi) + seq_printf(seq, "spi:%u", pkt_dev->spi); + } #endif if (pkt_dev->flags & F_MACSRC_RND) - seq_printf(seq, "MACSRC_RND "); + seq_puts(seq, "MACSRC_RND "); if (pkt_dev->flags & F_MACDST_RND) - seq_printf(seq, "MACDST_RND "); + seq_puts(seq, "MACDST_RND "); if (pkt_dev->flags & F_VID_RND) - seq_printf(seq, "VID_RND "); + seq_puts(seq, "VID_RND "); if (pkt_dev->flags & F_SVID_RND) - seq_printf(seq, "SVID_RND "); + seq_puts(seq, "SVID_RND "); if (pkt_dev->flags & F_NODE) - seq_printf(seq, "NODE_ALLOC "); + seq_puts(seq, "NODE_ALLOC "); seq_puts(seq, "\n"); @@ -699,7 +716,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->result[0]) seq_printf(seq, "Result: %s\n", pkt_dev->result); else - seq_printf(seq, "Result: Idle\n"); + seq_puts(seq, "Result: Idle\n"); return 0; } @@ -1219,12 +1236,24 @@ static ssize_t pktgen_if_write(struct file *file, else if (strcmp(f, "!NODE_ALLOC") == 0) pkt_dev->flags &= ~F_NODE; + else if (strcmp(f, "UDPCSUM") == 0) + pkt_dev->flags |= F_UDPCSUM; + + else if (strcmp(f, "!UDPCSUM") == 0) + pkt_dev->flags &= ~F_UDPCSUM; + else { sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! 
to un-set flag):\n%s", f, "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, " - "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC, NODE_ALLOC\n"); + "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, " + "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, " + "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, " +#ifdef CONFIG_XFRM + "IPSEC, " +#endif + "NODE_ALLOC\n"); return count; } sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags); @@ -1413,7 +1442,7 @@ static ssize_t pktgen_if_write(struct file *file, if (!mac_pton(valstr, pkt_dev->dst_mac)) return -EINVAL; /* Set up Dest MAC */ - memcpy(&pkt_dev->hh[0], pkt_dev->dst_mac, ETH_ALEN); + ether_addr_copy(&pkt_dev->hh[0], pkt_dev->dst_mac); sprintf(pg_result, "OK: dstmac %pM", pkt_dev->dst_mac); return count; @@ -1430,7 +1459,7 @@ static ssize_t pktgen_if_write(struct file *file, if (!mac_pton(valstr, pkt_dev->src_mac)) return -EINVAL; /* Set up Src MAC */ - memcpy(&pkt_dev->hh[6], pkt_dev->src_mac, ETH_ALEN); + ether_addr_copy(&pkt_dev->hh[6], pkt_dev->src_mac); sprintf(pg_result, "OK: srcmac %pM", pkt_dev->src_mac); return count; @@ -1455,7 +1484,18 @@ static ssize_t pktgen_if_write(struct file *file, sprintf(pg_result, "OK: flows=%u", pkt_dev->cflows); return count; } +#ifdef CONFIG_XFRM + if (!strcmp(name, "spi")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + i += len; + pkt_dev->spi = value; + sprintf(pg_result, "OK: spi=%u", pkt_dev->spi); + return count; + } +#endif if (!strcmp(name, "flowlen")) { len = num_arg(&user_buffer[i], 10, &value); if (len < 0) @@ -1676,7 +1716,7 @@ static ssize_t pktgen_if_write(struct file *file, static int pktgen_if_open(struct inode *inode, struct file *file) { - return single_open(file, pktgen_if_show, PDE(inode)->data); + return single_open(file, pktgen_if_show, PDE_DATA(inode)); } static const struct file_operations pktgen_if_fops = { @@ -1695,14 +1735,14 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) BUG_ON(!t); - seq_printf(seq, "Running: "); + seq_puts(seq, "Running: "); if_lock(t); list_for_each_entry(pkt_dev, &t->if_list, list) if (pkt_dev->running) seq_printf(seq, "%s ", pkt_dev->odevname); - seq_printf(seq, "\nStopped: "); + seq_puts(seq, "\nStopped: "); list_for_each_entry(pkt_dev, &t->if_list, list) if (!pkt_dev->running) @@ -1711,7 +1751,7 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) if (t->result[0]) seq_printf(seq, "\nResult: %s\n", t->result); else - seq_printf(seq, "\nResult: NA\n"); + seq_puts(seq, "\nResult: NA\n"); if_unlock(t); @@ -1781,10 +1821,13 @@ static ssize_t pktgen_thread_write(struct file *file, return -EFAULT; i += len; mutex_lock(&pktgen_thread_lock); - pktgen_add_device(t, f); + ret = pktgen_add_device(t, f); mutex_unlock(&pktgen_thread_lock); - ret = count; - sprintf(pg_result, "OK: add_device=%s", f); + if (!ret) { + ret = count; + sprintf(pg_result, "OK: add_device=%s", f); + } else + sprintf(pg_result, "ERROR: can not add device %s", f); goto out; } @@ -1811,7 +1854,7 @@ out: static int pktgen_thread_open(struct inode *inode, struct file *file) { - return single_open(file, pktgen_thread_show, PDE(inode)->data); + return single_open(file, pktgen_thread_show, PDE_DATA(inode)); } static const struct file_operations pktgen_thread_fops = { @@ -1824,13 +1867,14 @@ static const struct file_operations pktgen_thread_fops = { }; /* Think find or remove for NN */ -static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove) +static struct pktgen_dev *__pktgen_NN_threads(const struct 
pktgen_net *pn, + const char *ifname, int remove) { struct pktgen_thread *t; struct pktgen_dev *pkt_dev = NULL; bool exact = (remove == FIND); - list_for_each_entry(t, &pktgen_threads, th_list) { + list_for_each_entry(t, &pn->pktgen_threads, th_list) { pkt_dev = pktgen_find_dev(t, ifname, exact); if (pkt_dev) { if (remove) { @@ -1848,7 +1892,7 @@ static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove) /* * mark a device for removal */ -static void pktgen_mark_device(const char *ifname) +static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname) { struct pktgen_dev *pkt_dev = NULL; const int max_tries = 10, msec_per_try = 125; @@ -1859,7 +1903,7 @@ static void pktgen_mark_device(const char *ifname) while (1) { - pkt_dev = __pktgen_NN_threads(ifname, REMOVE); + pkt_dev = __pktgen_NN_threads(pn, ifname, REMOVE); if (pkt_dev == NULL) break; /* success */ @@ -1880,21 +1924,21 @@ static void pktgen_mark_device(const char *ifname) mutex_unlock(&pktgen_thread_lock); } -static void pktgen_change_name(struct net_device *dev) +static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *dev) { struct pktgen_thread *t; - list_for_each_entry(t, &pktgen_threads, th_list) { + list_for_each_entry(t, &pn->pktgen_threads, th_list) { struct pktgen_dev *pkt_dev; list_for_each_entry(pkt_dev, &t->if_list, list) { if (pkt_dev->odev != dev) continue; - remove_proc_entry(pkt_dev->entry->name, pg_proc_dir); + proc_remove(pkt_dev->entry); pkt_dev->entry = proc_create_data(dev->name, 0600, - pg_proc_dir, + pn->proc_dir, &pktgen_if_fops, pkt_dev); if (!pkt_dev->entry) @@ -1908,9 +1952,10 @@ static void pktgen_change_name(struct net_device *dev) static int pktgen_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct pktgen_net *pn = net_generic(dev_net(dev), pg_net_id); - if (!net_eq(dev_net(dev), &init_net) || pktgen_exiting) + if (pn->pktgen_exiting) return NOTIFY_DONE; /* It is OK that we do not hold the group lock right now, @@ -1919,18 +1964,19 @@ static int pktgen_device_event(struct notifier_block *unused, switch (event) { case NETDEV_CHANGENAME: - pktgen_change_name(dev); + pktgen_change_name(pn, dev); break; case NETDEV_UNREGISTER: - pktgen_mark_device(dev->name); + pktgen_mark_device(pn, dev->name); break; } return NOTIFY_DONE; } -static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev, +static struct net_device *pktgen_dev_get_by_name(const struct pktgen_net *pn, + struct pktgen_dev *pkt_dev, const char *ifname) { char b[IFNAMSIZ+5]; @@ -1944,13 +1990,14 @@ static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev, } b[i] = 0; - return dev_get_by_name(&init_net, b); + return dev_get_by_name(pn->net, b); } /* Associate pktgen_dev with a device. 
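
netdev_notifier_info_to_dev() reflects the newer netdevice-notifier calling convention: callbacks receive a struct netdev_notifier_info * rather than the bare net_device, so every handler's first line is updated the same way. The resulting shape:

    static int example_device_event(struct notifier_block *unused,
                                    unsigned long event, void *ptr)
    {
            /* 'ptr' now wraps the device instead of being the device */
            struct net_device *dev = netdev_notifier_info_to_dev(ptr);

            switch (event) {
            case NETDEV_UNREGISTER:
                    /* ... drop any references held on 'dev' ... */
                    break;
            }

            return NOTIFY_DONE;
    }
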
*/ -static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname) +static int pktgen_setup_dev(const struct pktgen_net *pn, + struct pktgen_dev *pkt_dev, const char *ifname) { struct net_device *odev; int err; @@ -1961,7 +2008,7 @@ static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname) pkt_dev->odev = NULL; } - odev = pktgen_dev_get_by_name(pkt_dev, ifname); + odev = pktgen_dev_get_by_name(pn, pkt_dev, ifname); if (!odev) { pr_err("no such netdevice: \"%s\"\n", ifname); return -ENODEV; @@ -2015,10 +2062,10 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) /* Default to the interface's mac if not explicitly set. */ if (is_zero_ether_addr(pkt_dev->src_mac)) - memcpy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr, ETH_ALEN); + ether_addr_copy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr); /* Set up Dest MAC */ - memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN); + ether_addr_copy(&(pkt_dev->hh[0]), pkt_dev->dst_mac); if (pkt_dev->flags & F_IPV6) { int i, set = 0, err = 1; @@ -2182,7 +2229,7 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) pkt_dev->curfl = 0; /*reset */ } } else { - flow = random32() % pkt_dev->cflows; + flow = prandom_u32() % pkt_dev->cflows; pkt_dev->curfl = flow; if (pkt_dev->flows[flow].count > pkt_dev->lflow) { @@ -2203,14 +2250,23 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) { struct xfrm_state *x = pkt_dev->flows[flow].x; + struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id); if (!x) { - /*slow path: we dont already have xfrm_state*/ - x = xfrm_stateonly_find(&init_net, DUMMY_MARK, - (xfrm_address_t *)&pkt_dev->cur_daddr, - (xfrm_address_t *)&pkt_dev->cur_saddr, - AF_INET, - pkt_dev->ipsmode, - pkt_dev->ipsproto, 0); + + if (pkt_dev->spi) { + /* We need as quick as possible to find the right SA + * Searching with minimum criteria to archieve this. 
+ */ + x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET); + } else { + /* slow path: we dont already have xfrm_state */ + x = xfrm_stateonly_find(pn->net, DUMMY_MARK, + (xfrm_address_t *)&pkt_dev->cur_daddr, + (xfrm_address_t *)&pkt_dev->cur_saddr, + AF_INET, + pkt_dev->ipsmode, + pkt_dev->ipsproto, 0); + } if (x) { pkt_dev->flows[flow].x = x; set_pkt_overhead(pkt_dev); @@ -2229,7 +2285,7 @@ static void set_cur_queue_map(struct pktgen_dev *pkt_dev) else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) { __u16 t; if (pkt_dev->flags & F_QUEUE_MAP_RND) { - t = random32() % + t = prandom_u32() % (pkt_dev->queue_map_max - pkt_dev->queue_map_min + 1) + pkt_dev->queue_map_min; @@ -2261,7 +2317,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 tmp; if (pkt_dev->flags & F_MACSRC_RND) - mc = random32() % pkt_dev->src_mac_count; + mc = prandom_u32() % pkt_dev->src_mac_count; else { mc = pkt_dev->cur_src_mac_offset++; if (pkt_dev->cur_src_mac_offset >= @@ -2287,7 +2343,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 tmp; if (pkt_dev->flags & F_MACDST_RND) - mc = random32() % pkt_dev->dst_mac_count; + mc = prandom_u32() % pkt_dev->dst_mac_count; else { mc = pkt_dev->cur_dst_mac_offset++; @@ -2314,21 +2370,21 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) for (i = 0; i < pkt_dev->nr_labels; i++) if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM) pkt_dev->labels[i] = MPLS_STACK_BOTTOM | - ((__force __be32)random32() & + ((__force __be32)prandom_u32() & htonl(0x000fffff)); } if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) { - pkt_dev->vlan_id = random32() & (4096-1); + pkt_dev->vlan_id = prandom_u32() & (4096 - 1); } if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) { - pkt_dev->svlan_id = random32() & (4096 - 1); + pkt_dev->svlan_id = prandom_u32() & (4096 - 1); } if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) { if (pkt_dev->flags & F_UDPSRC_RND) - pkt_dev->cur_udp_src = random32() % + pkt_dev->cur_udp_src = prandom_u32() % (pkt_dev->udp_src_max - pkt_dev->udp_src_min) + pkt_dev->udp_src_min; @@ -2341,7 +2397,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) { if (pkt_dev->flags & F_UDPDST_RND) { - pkt_dev->cur_udp_dst = random32() % + pkt_dev->cur_udp_dst = prandom_u32() % (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min) + pkt_dev->udp_dst_min; } else { @@ -2358,7 +2414,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (imn < imx) { __u32 t; if (pkt_dev->flags & F_IPSRC_RND) - t = random32() % (imx - imn) + imn; + t = prandom_u32() % (imx - imn) + imn; else { t = ntohl(pkt_dev->cur_saddr); t++; @@ -2379,17 +2435,15 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __be32 s; if (pkt_dev->flags & F_IPDST_RND) { - t = random32() % (imx - imn) + imn; - s = htonl(t); - - while (ipv4_is_loopback(s) || - ipv4_is_multicast(s) || - ipv4_is_lbcast(s) || - ipv4_is_zeronet(s) || - ipv4_is_local_multicast(s)) { - t = random32() % (imx - imn) + imn; + do { + t = prandom_u32() % + (imx - imn) + imn; s = htonl(t); - } + } while (ipv4_is_loopback(s) || + ipv4_is_multicast(s) || + ipv4_is_lbcast(s) || + ipv4_is_zeronet(s) || + ipv4_is_local_multicast(s)); pkt_dev->cur_daddr = s; } else { t = ntohl(pkt_dev->cur_daddr); @@ -2420,7 +2474,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) for (i = 0; i < 4; i++) { pkt_dev->cur_in6_daddr.s6_addr32[i] = - (((__force __be32)random32() | + (((__force __be32)prandom_u32() | 
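
random32() to prandom_u32() is the tree-wide rename of the non-cryptographic PRNG; the generator itself is unchanged. The destination-address change just above also deserves a second look: the old pick-then-fix loop becomes plain rejection sampling, drawing uniformly in [imn, imx) and retrying until the result is a usable unicast address:

    do {
            t = prandom_u32() % (imx - imn) + imn;
            s = htonl(t);
    } while (ipv4_is_loopback(s) || ipv4_is_multicast(s) ||
             ipv4_is_lbcast(s) || ipv4_is_zeronet(s) ||
             ipv4_is_local_multicast(s));
    pkt_dev->cur_daddr = s;
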
pkt_dev->min_in6_daddr.s6_addr32[i]) & pkt_dev->max_in6_daddr.s6_addr32[i]); } @@ -2430,7 +2484,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { __u32 t; if (pkt_dev->flags & F_TXSIZE_RND) { - t = random32() % + t = prandom_u32() % (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size) + pkt_dev->min_pkt_size; } else { @@ -2448,31 +2502,47 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) #ifdef CONFIG_XFRM +static u32 pktgen_dst_metrics[RTAX_MAX + 1] = { + + [RTAX_HOPLIMIT] = 0x5, /* Set a static hoplimit */ +}; + static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) { struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; int err = 0; + struct net *net = dev_net(pkt_dev->odev); if (!x) return 0; /* XXX: we dont support tunnel mode for now until * we resolve the dst issue */ - if (x->props.mode != XFRM_MODE_TRANSPORT) + if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0)) return 0; - spin_lock(&x->lock); + /* But when the user specifies a valid SPI, the transformation + * supports both transport/tunnel mode and ESP/AH types. + */ + if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0)) + skb->_skb_refdst = (unsigned long)&pkt_dev->dst | SKB_DST_NOREF; + rcu_read_lock_bh(); err = x->outer_mode->output(x, skb); - if (err) + rcu_read_unlock_bh(); + if (err) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR); goto error; + } err = x->type->output(x, skb); - if (err) + if (err) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEPROTOERROR); goto error; - + } + spin_lock_bh(&x->lock); x->curlft.bytes += skb->len; x->curlft.packets++; + spin_unlock_bh(&x->lock); error: - spin_unlock(&x->lock); return err; } @@ -2500,6 +2570,8 @@ static int process_ipsec(struct pktgen_dev *pkt_dev, if (x) { int ret; __u8 *eth; + struct iphdr *iph; + nhead = x->props.header_len - skb_headroom(skb); if (nhead > 0) { ret = pskb_expand_head(skb, nhead, 0, GFP_ATOMIC); @@ -2521,6 +2593,11 @@ static int process_ipsec(struct pktgen_dev *pkt_dev, eth = (__u8 *) skb_push(skb, ETH_HLEN); memcpy(eth, pkt_dev->hh, 12); *(u16 *) &eth[12] = protocol; + + /* Update IPv4 header len as well as checksum value */ + iph = ip_hdr(skb); + iph->tot_len = htons(skb->len - ETH_HLEN); + ip_send_check(iph); } } return 1; @@ -2612,6 +2689,29 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, pgh->tv_usec = htonl(timestamp.tv_usec); } +static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, + struct pktgen_dev *pkt_dev, + unsigned int extralen) +{ + struct sk_buff *skb = NULL; + unsigned int size = pkt_dev->cur_pkt_size + 64 + extralen + + pkt_dev->pkt_overhead; + + if (pkt_dev->flags & F_NODE) { + int node = pkt_dev->node >= 0 ? 
pkt_dev->node : numa_node_id(); + + skb = __alloc_skb(NET_SKB_PAD + size, GFP_NOWAIT, 0, node); + if (likely(skb)) { + skb_reserve(skb, NET_SKB_PAD); + skb->dev = dev; + } + } else { + skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); + } + + return skb; +} + static struct sk_buff *fill_packet_ipv4(struct net_device *odev, struct pktgen_dev *pkt_dev) { @@ -2642,32 +2742,13 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, datalen = (odev->hard_header_len + 16) & ~0xf; - if (pkt_dev->flags & F_NODE) { - int node; - - if (pkt_dev->node >= 0) - node = pkt_dev->node; - else - node = numa_node_id(); - - skb = __alloc_skb(NET_SKB_PAD + pkt_dev->cur_pkt_size + 64 - + datalen + pkt_dev->pkt_overhead, GFP_NOWAIT, 0, node); - if (likely(skb)) { - skb_reserve(skb, NET_SKB_PAD); - skb->dev = odev; - } - } - else - skb = __netdev_alloc_skb(odev, - pkt_dev->cur_pkt_size + 64 - + datalen + pkt_dev->pkt_overhead, GFP_NOWAIT); - + skb = pktgen_alloc_skb(odev, pkt_dev, datalen); if (!skb) { sprintf(pkt_dev->result, "No memory"); return NULL; } - prefetchw(skb->data); + prefetchw(skb->data); skb_reserve(skb, datalen); /* Reserve for ethernet and IP header */ @@ -2693,15 +2774,15 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, *vlan_encapsulated_proto = htons(ETH_P_IP); } - skb->network_header = skb->tail; - skb->transport_header = skb->network_header + sizeof(struct iphdr); - skb_put(skb, sizeof(struct iphdr) + sizeof(struct udphdr)); + skb_set_mac_header(skb, 0); + skb_set_network_header(skb, skb->len); + iph = (struct iphdr *) skb_put(skb, sizeof(struct iphdr)); + + skb_set_transport_header(skb, skb->len); + udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr)); skb_set_queue_mapping(skb, queue_map); skb->priority = pkt_dev->skb_priority; - iph = ip_hdr(skb); - udph = udp_hdr(skb); - memcpy(eth, pkt_dev->hh, 12); *(__be16 *) & eth[12] = protocol; @@ -2714,7 +2795,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, udph->source = htons(pkt_dev->cur_udp_src); udph->dest = htons(pkt_dev->cur_udp_dst); udph->len = htons(datalen + 8); /* DATA + udphdr */ - udph->check = 0; /* No checksum */ + udph->check = 0; iph->ihl = 5; iph->version = 4; @@ -2728,13 +2809,28 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, iph->frag_off = 0; iplen = 20 + 8 + datalen; iph->tot_len = htons(iplen); - iph->check = 0; - iph->check = ip_fast_csum((void *)iph, iph->ihl); + ip_send_check(iph); skb->protocol = protocol; - skb->mac_header = (skb->network_header - ETH_HLEN - - pkt_dev->pkt_overhead); skb->dev = odev; skb->pkt_type = PACKET_HOST; + + if (!(pkt_dev->flags & F_UDPCSUM)) { + skb->ip_summed = CHECKSUM_NONE; + } else if (odev->features & NETIF_F_V4_CSUM) { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum = 0; + udp4_hwcsum(skb, udph->source, udph->dest); + } else { + __wsum csum = udp_csum(skb); + + /* add protocol-dependent pseudo-header */ + udph->check = csum_tcpudp_magic(udph->source, udph->dest, + datalen + 8, IPPROTO_UDP, csum); + + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + } + pktgen_finalize_skb(pkt_dev, skb, datalen); #ifdef CONFIG_XFRM @@ -2751,7 +2847,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, struct sk_buff *skb = NULL; __u8 *eth; struct udphdr *udph; - int datalen; + int datalen, udplen; struct ipv6hdr *iph; __be16 protocol = htons(ETH_P_IPV6); __be32 *mpls; @@ -2773,15 +2869,13 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, mod_cur_headers(pkt_dev); queue_map = 
pkt_dev->cur_queue_map; - skb = __netdev_alloc_skb(odev, - pkt_dev->cur_pkt_size + 64 - + 16 + pkt_dev->pkt_overhead, GFP_NOWAIT); + skb = pktgen_alloc_skb(odev, pkt_dev, 16); if (!skb) { sprintf(pkt_dev->result, "No memory"); return NULL; } - prefetchw(skb->data); + prefetchw(skb->data); skb_reserve(skb, 16); /* Reserve for ethernet and IP header */ @@ -2807,13 +2901,14 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, *vlan_encapsulated_proto = htons(ETH_P_IPV6); } - skb->network_header = skb->tail; - skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); - skb_put(skb, sizeof(struct ipv6hdr) + sizeof(struct udphdr)); + skb_set_mac_header(skb, 0); + skb_set_network_header(skb, skb->len); + iph = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr)); + + skb_set_transport_header(skb, skb->len); + udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr)); skb_set_queue_mapping(skb, queue_map); skb->priority = pkt_dev->skb_priority; - iph = ipv6_hdr(skb); - udph = udp_hdr(skb); memcpy(eth, pkt_dev->hh, 12); *(__be16 *) &eth[12] = protocol; @@ -2828,10 +2923,11 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, net_info_ratelimited("increased datalen to %d\n", datalen); } + udplen = datalen + sizeof(struct udphdr); udph->source = htons(pkt_dev->cur_udp_src); udph->dest = htons(pkt_dev->cur_udp_dst); - udph->len = htons(datalen + sizeof(struct udphdr)); - udph->check = 0; /* No checksum */ + udph->len = htons(udplen); + udph->check = 0; *(__be32 *) iph = htonl(0x60000000); /* Version + flow */ @@ -2842,18 +2938,33 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, iph->hop_limit = 32; - iph->payload_len = htons(sizeof(struct udphdr) + datalen); + iph->payload_len = htons(udplen); iph->nexthdr = IPPROTO_UDP; iph->daddr = pkt_dev->cur_in6_daddr; iph->saddr = pkt_dev->cur_in6_saddr; - skb->mac_header = (skb->network_header - ETH_HLEN - - pkt_dev->pkt_overhead); skb->protocol = protocol; skb->dev = odev; skb->pkt_type = PACKET_HOST; + if (!(pkt_dev->flags & F_UDPCSUM)) { + skb->ip_summed = CHECKSUM_NONE; + } else if (odev->features & NETIF_F_V6_CSUM) { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + udph->check = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, 0); + } else { + __wsum csum = udp_csum(skb); + + /* add protocol-dependent pseudo-header */ + udph->check = csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, csum); + + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + } + pktgen_finalize_skb(pkt_dev, skb, datalen); return skb; @@ -2912,7 +3023,7 @@ static void pktgen_run(struct pktgen_thread *t) t->control &= ~(T_STOP); } -static void pktgen_stop_all_threads_ifs(void) +static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn) { struct pktgen_thread *t; @@ -2920,7 +3031,7 @@ mutex_lock(&pktgen_thread_lock); - list_for_each_entry(t, &pktgen_threads, th_list) + list_for_each_entry(t, &pn->pktgen_threads, th_list) t->control |= T_STOP; mutex_unlock(&pktgen_thread_lock); @@ -2956,28 +3067,28 @@ signal: return 0; } -static int pktgen_wait_all_threads_run(void) +static int pktgen_wait_all_threads_run(struct pktgen_net *pn) { struct pktgen_thread *t; int sig = 1; mutex_lock(&pktgen_thread_lock); - list_for_each_entry(t, &pktgen_threads, th_list) { + list_for_each_entry(t, &pn->pktgen_threads, th_list) { sig = pktgen_wait_thread_run(t); if (sig 
== 0) break; } if (sig == 0) - list_for_each_entry(t, &pktgen_threads, th_list) + list_for_each_entry(t, &pn->pktgen_threads, th_list) t->control |= (T_STOP); mutex_unlock(&pktgen_thread_lock); return sig; } -static void pktgen_run_all_threads(void) +static void pktgen_run_all_threads(struct pktgen_net *pn) { struct pktgen_thread *t; @@ -2985,7 +3096,7 @@ static void pktgen_run_all_threads(void) mutex_lock(&pktgen_thread_lock); - list_for_each_entry(t, &pktgen_threads, th_list) + list_for_each_entry(t, &pn->pktgen_threads, th_list) t->control |= (T_RUN); mutex_unlock(&pktgen_thread_lock); @@ -2993,10 +3104,10 @@ static void pktgen_run_all_threads(void) /* Propagate thread->control */ schedule_timeout_interruptible(msecs_to_jiffies(125)); - pktgen_wait_all_threads_run(); + pktgen_wait_all_threads_run(pn); } -static void pktgen_reset_all_threads(void) +static void pktgen_reset_all_threads(struct pktgen_net *pn) { struct pktgen_thread *t; @@ -3004,7 +3115,7 @@ static void pktgen_reset_all_threads(void) mutex_lock(&pktgen_thread_lock); - list_for_each_entry(t, &pktgen_threads, th_list) + list_for_each_entry(t, &pn->pktgen_threads, th_list) t->control |= (T_REMDEVALL); mutex_unlock(&pktgen_thread_lock); @@ -3012,7 +3123,7 @@ static void pktgen_reset_all_threads(void) /* Propagate thread->control */ schedule_timeout_interruptible(msecs_to_jiffies(125)); - pktgen_wait_all_threads_run(); + pktgen_wait_all_threads_run(pn); } static void show_results(struct pktgen_dev *pkt_dev, int nr_frags) @@ -3154,9 +3265,7 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t) static void pktgen_rem_thread(struct pktgen_thread *t) { /* Remove from the thread list */ - - remove_proc_entry(t->tsk->comm, pg_proc_dir); - + remove_proc_entry(t->tsk->comm, t->net->proc_dir); } static void pktgen_resched(struct pktgen_dev *pkt_dev) @@ -3229,9 +3338,11 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) queue_map = skb_get_queue_mapping(pkt_dev->skb); txq = netdev_get_tx_queue(odev, queue_map); - __netif_tx_lock_bh(txq); + local_bh_disable(); + + HARD_TX_LOCK(odev, txq, smp_processor_id()); - if (unlikely(netif_xmit_frozen_or_stopped(txq))) { + if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) { ret = NETDEV_TX_BUSY; pkt_dev->last_ok = 0; goto unlock; @@ -3265,7 +3376,9 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->last_ok = 0; } unlock: - __netif_tx_unlock_bh(txq); + HARD_TX_UNLOCK(odev, txq); + + local_bh_enable(); /* If pkt_dev->count is zero, then run forever */ if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { @@ -3302,7 +3415,7 @@ static int pktgen_thread_worker(void *arg) pkt_dev = next_to_run(t); if (unlikely(!pkt_dev && t->control == 0)) { - if (pktgen_exiting) + if (t->net->pktgen_exiting) break; wait_event_interruptible_timeout(t->queue, t->control != 0, @@ -3424,7 +3537,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) /* We don't allow a device to be on several threads */ - pkt_dev = __pktgen_NN_threads(ifname, FIND); + pkt_dev = __pktgen_NN_threads(t->net, ifname, FIND); if (pkt_dev) { pr_err("ERROR: interface already used\n"); return -EBUSY; @@ -3459,13 +3572,13 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->svlan_id = 0xffff; pkt_dev->node = -1; - err = pktgen_setup_dev(pkt_dev, ifname); + err = pktgen_setup_dev(t->net, pkt_dev, ifname); if (err) goto out1; if (pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING) pkt_dev->clone_skb = pg_clone_skb_d; - pkt_dev->entry = proc_create_data(ifname, 0600, 
pg_proc_dir, + pkt_dev->entry = proc_create_data(ifname, 0600, t->net->proc_dir, &pktgen_if_fops, pkt_dev); if (!pkt_dev->entry) { pr_err("cannot create %s/%s procfs entry\n", @@ -3476,6 +3589,17 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) #ifdef CONFIG_XFRM pkt_dev->ipsmode = XFRM_MODE_TRANSPORT; pkt_dev->ipsproto = IPPROTO_ESP; + + /* xfrm tunnel mode needs an additional dst to extract the outer + * ip header protocol/ttl/id fields, so create a phony one here + * instead of looking up a valid rt, which would definitely hurt + * performance in this case. + */ + pkt_dev->dstops.family = AF_INET; + pkt_dev->dst.dev = pkt_dev->odev; + dst_init_metrics(&pkt_dev->dst, pktgen_dst_metrics, false); + pkt_dev->dst.child = &pkt_dev->dst; + pkt_dev->dst.ops = &pkt_dev->dstops; #endif return add_dev_to_thread(t, pkt_dev); @@ -3490,7 +3614,7 @@ out1: return err; } -static int __init pktgen_create_thread(int cpu) +static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn) { struct pktgen_thread *t; struct proc_dir_entry *pe; @@ -3508,7 +3632,7 @@ static int __init pktgen_create_thread(int cpu) INIT_LIST_HEAD(&t->if_list); - list_add_tail(&t->th_list, &pktgen_threads); + list_add_tail(&t->th_list, &pn->pktgen_threads); init_completion(&t->start_done); p = kthread_create_on_node(pktgen_thread_worker, @@ -3524,7 +3648,7 @@ kthread_bind(p, cpu); t->tsk = p; - pe = proc_create_data(t->tsk->comm, 0600, pg_proc_dir, + pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir, &pktgen_thread_fops, t); if (!pe) { pr_err("cannot create %s/%s procfs entry\n", @@ -3535,6 +3659,7 @@ return -EINVAL; } + t->net = pn; wake_up_process(p); wait_for_completion(&t->start_done); @@ -3560,7 +3685,6 @@ static void _rem_dev_from_if_list(struct pktgen_thread *t, static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *pkt_dev) { - pr_debug("remove_device pkt_dev=%p\n", pkt_dev); if (pkt_dev->running) { @@ -3580,7 +3704,7 @@ static int pktgen_remove_device(struct pktgen_thread *t, _rem_dev_from_if_list(t, pkt_dev); if (pkt_dev->entry) - remove_proc_entry(pkt_dev->entry->name, pg_proc_dir); + proc_remove(pkt_dev->entry); #ifdef CONFIG_XFRM free_SAs(pkt_dev); @@ -3592,63 +3716,63 @@ static int pktgen_remove_device(struct pktgen_thread *t, return 0; } -static int __init pg_init(void) +static int __net_init pg_net_init(struct net *net) { - int cpu; + struct pktgen_net *pn = net_generic(net, pg_net_id); struct proc_dir_entry *pe; - int ret = 0; - - pr_info("%s", version); - - pg_proc_dir = proc_mkdir(PG_PROC_DIR, init_net.proc_net); - if (!pg_proc_dir) + int cpu, ret = 0; + + pn->net = net; + INIT_LIST_HEAD(&pn->pktgen_threads); + pn->pktgen_exiting = false; + pn->proc_dir = proc_mkdir(PG_PROC_DIR, pn->net->proc_net); + if (!pn->proc_dir) { + pr_warn("cannot create /proc/net/%s\n", PG_PROC_DIR); return -ENODEV; - - pe = proc_create(PGCTRL, 0600, pg_proc_dir, &pktgen_fops); + } + pe = proc_create(PGCTRL, 0600, pn->proc_dir, &pktgen_fops); if (pe == NULL) { - pr_err("ERROR: cannot create %s procfs entry\n", PGCTRL); + pr_err("cannot create %s procfs entry\n", PGCTRL); ret = -EINVAL; - goto remove_dir; + goto remove; } - register_netdevice_notifier(&pktgen_notifier_block); - for_each_online_cpu(cpu) { int err; - err = pktgen_create_thread(cpu); + err = pktgen_create_thread(cpu, pn); if (err) - pr_warning("WARNING: Cannot create thread for cpu %d (%d)\n", + pr_warn("Cannot create 
thread for cpu %d (%d)\n", cpu, err); } - if (list_empty(&pktgen_threads)) { - pr_err("ERROR: Initialization failed for all threads\n"); + if (list_empty(&pn->pktgen_threads)) { + pr_err("Initialization failed for all threads\n"); ret = -ENODEV; - goto unregister; + goto remove_entry; } return 0; - unregister: - unregister_netdevice_notifier(&pktgen_notifier_block); - remove_proc_entry(PGCTRL, pg_proc_dir); - remove_dir: - proc_net_remove(&init_net, PG_PROC_DIR); +remove_entry: + remove_proc_entry(PGCTRL, pn->proc_dir); +remove: + remove_proc_entry(PG_PROC_DIR, pn->net->proc_net); return ret; } -static void __exit pg_cleanup(void) +static void __net_exit pg_net_exit(struct net *net) { + struct pktgen_net *pn = net_generic(net, pg_net_id); struct pktgen_thread *t; struct list_head *q, *n; LIST_HEAD(list); /* Stop all interfaces & threads */ - pktgen_exiting = true; + pn->pktgen_exiting = true; mutex_lock(&pktgen_thread_lock); - list_splice_init(&pktgen_threads, &list); + list_splice_init(&pn->pktgen_threads, &list); mutex_unlock(&pktgen_thread_lock); list_for_each_safe(q, n, &list) { @@ -3658,12 +3782,36 @@ static void __exit pg_cleanup(void) kfree(t); } - /* Un-register us from receiving netdevice events */ - unregister_netdevice_notifier(&pktgen_notifier_block); + remove_proc_entry(PGCTRL, pn->proc_dir); + remove_proc_entry(PG_PROC_DIR, pn->net->proc_net); +} + +static struct pernet_operations pg_net_ops = { + .init = pg_net_init, + .exit = pg_net_exit, + .id = &pg_net_id, + .size = sizeof(struct pktgen_net), +}; + +static int __init pg_init(void) +{ + int ret = 0; - /* Clean up proc file system */ - remove_proc_entry(PGCTRL, pg_proc_dir); - proc_net_remove(&init_net, PG_PROC_DIR); + pr_info("%s", version); + ret = register_pernet_subsys(&pg_net_ops); + if (ret) + return ret; + ret = register_netdevice_notifier(&pktgen_notifier_block); + if (ret) + unregister_pernet_subsys(&pg_net_ops); + + return ret; +} + +static void __exit pg_cleanup(void) +{ + unregister_netdevice_notifier(&pktgen_notifier_block); + unregister_pernet_subsys(&pg_net_ops); } module_init(pg_init); diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c new file mode 100644 index 00000000000..d3027a73fd4 --- /dev/null +++ b/net/core/ptp_classifier.c @@ -0,0 +1,141 @@ +/* PTP classifier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +/* The below program is the bpf_asm (tools/net/) representation of + * the opcode array in the ptp_filter structure. + * + * For convenience, this can easily be altered and reviewed with + * bpf_asm and bpf_dbg, e.g. `./bpf_asm -c prog` where prog is a + * simple file containing the below program: + * + * ldh [12] ; load ethertype + * + * ; PTP over UDP over IPv4 over Ethernet + * test_ipv4: + * jneq #0x800, test_ipv6 ; ETH_P_IP ? + * ldb [23] ; load proto + * jneq #17, drop_ipv4 ; IPPROTO_UDP ? + * ldh [20] ; load frag offset field + * jset #0x1fff, drop_ipv4 ; don't allow fragments + * ldxb 4*([14]&0xf) ; load IP header len + * ldh [x + 16] ; load UDP dst port + * jneq #319, drop_ipv4 ; is port PTP_EV_PORT ? 
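+ * ; (editor's note: 319 is PTP_EV_PORT, which appears as + * ; 0x0000013f in the opcode array below) 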
+ * ldh [x + 22] ; load payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x10 ; PTP_CLASS_IPV4 + * ret a ; return PTP class + * drop_ipv4: ret #0x0 ; PTP_CLASS_NONE + * + * ; PTP over UDP over IPv6 over Ethernet + * test_ipv6: + * jneq #0x86dd, test_8021q ; ETH_P_IPV6 ? + * ldb [20] ; load proto + * jneq #17, drop_ipv6 ; IPPROTO_UDP ? + * ldh [56] ; load UDP dst port + * jneq #319, drop_ipv6 ; is port PTP_EV_PORT ? + * ldh [62] ; load payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x20 ; PTP_CLASS_IPV6 + * ret a ; return PTP class + * drop_ipv6: ret #0x0 ; PTP_CLASS_NONE + * + * ; PTP over 802.1Q over Ethernet + * test_8021q: + * jneq #0x8100, test_ieee1588 ; ETH_P_8021Q ? + * ldh [16] ; load inner type + * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ? + * ldb [18] ; load payload + * and #0x8 ; as we don't have ports here, test + * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these + * ldh [18] ; reload payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x40 ; PTP_CLASS_V2_VLAN + * ret a ; return PTP class + * + * ; PTP over Ethernet + * test_ieee1588: + * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ? + * ldb [14] ; load payload + * and #0x8 ; as we don't have ports here, test + * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these + * ldh [14] ; reload payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x30 ; PTP_CLASS_L2 + * ret a ; return PTP class + * drop_ieee1588: ret #0x0 ; PTP_CLASS_NONE + */ + +#include <linux/skbuff.h> +#include <linux/filter.h> +#include <linux/ptp_classify.h> + +static struct sk_filter *ptp_insns __read_mostly; + +unsigned int ptp_classify_raw(const struct sk_buff *skb) +{ + return SK_RUN_FILTER(ptp_insns, skb); +} +EXPORT_SYMBOL_GPL(ptp_classify_raw); + +void __init ptp_classifier_init(void) +{ + static struct sock_filter ptp_filter[] __initdata = { + { 0x28, 0, 0, 0x0000000c }, + { 0x15, 0, 12, 0x00000800 }, + { 0x30, 0, 0, 0x00000017 }, + { 0x15, 0, 9, 0x00000011 }, + { 0x28, 0, 0, 0x00000014 }, + { 0x45, 7, 0, 0x00001fff }, + { 0xb1, 0, 0, 0x0000000e }, + { 0x48, 0, 0, 0x00000010 }, + { 0x15, 0, 4, 0x0000013f }, + { 0x48, 0, 0, 0x00000016 }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000010 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, + { 0x15, 0, 9, 0x000086dd }, + { 0x30, 0, 0, 0x00000014 }, + { 0x15, 0, 6, 0x00000011 }, + { 0x28, 0, 0, 0x00000038 }, + { 0x15, 0, 4, 0x0000013f }, + { 0x28, 0, 0, 0x0000003e }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000020 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, + { 0x15, 0, 9, 0x00008100 }, + { 0x28, 0, 0, 0x00000010 }, + { 0x15, 0, 15, 0x000088f7 }, + { 0x30, 0, 0, 0x00000012 }, + { 0x54, 0, 0, 0x00000008 }, + { 0x15, 0, 12, 0x00000000 }, + { 0x28, 0, 0, 0x00000012 }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000040 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x15, 0, 7, 0x000088f7 }, + { 0x30, 0, 0, 0x0000000e }, + { 0x54, 0, 0, 0x00000008 }, + { 0x15, 0, 4, 0x00000000 }, + { 0x28, 0, 0, 0x0000000e }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000030 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, + }; + struct sock_fprog_kern ptp_prog = { + .len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter, + }; + + BUG_ON(sk_unattached_filter_create(&ptp_insns, &ptp_prog)); +} diff --git a/net/core/request_sock.c b/net/core/request_sock.c index c31d9e8668c..467f326126e 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -186,8 +186,6 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, struct 
fastopen_queue *fastopenq = inet_csk(lsk)->icsk_accept_queue.fastopenq; - BUG_ON(!spin_is_locked(&sk->sk_lock.slock) && !sock_owned_by_user(sk)); - tcp_sk(sk)->fastopen_rsk = NULL; spin_lock_bh(&fastopenq->lock); fastopenq->qlen--; @@ -223,5 +221,4 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, out: spin_unlock_bh(&fastopenq->lock); sock_put(lsk); - return; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1868625af25..1063996f831 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -353,18 +353,65 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops) } EXPORT_SYMBOL_GPL(__rtnl_link_unregister); +/* Return with the rtnl_lock held when there are no network + * devices unregistering in any network namespace. + */ +static void rtnl_lock_unregistering_all(void) +{ + struct net *net; + bool unregistering; + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&netdev_unregistering_wq, &wait, + TASK_UNINTERRUPTIBLE); + unregistering = false; + rtnl_lock(); + for_each_net(net) { + if (net->dev_unreg_count > 0) { + unregistering = true; + break; + } + } + if (!unregistering) + break; + __rtnl_unlock(); + schedule(); + } + finish_wait(&netdev_unregistering_wq, &wait); +} + /** * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. * @ops: struct rtnl_link_ops * to unregister */ void rtnl_link_unregister(struct rtnl_link_ops *ops) { - rtnl_lock(); + /* Close the race with cleanup_net() */ + mutex_lock(&net_mutex); + rtnl_lock_unregistering_all(); __rtnl_link_unregister(ops); rtnl_unlock(); + mutex_unlock(&net_mutex); } EXPORT_SYMBOL_GPL(rtnl_link_unregister); +static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev) +{ + struct net_device *master_dev; + const struct rtnl_link_ops *ops; + + master_dev = netdev_master_upper_dev_get((struct net_device *) dev); + if (!master_dev) + return 0; + ops = master_dev->rtnl_link_ops; + if (!ops || !ops->get_slave_size) + return 0; + /* IFLA_INFO_SLAVE_DATA + nested data */ + return nla_total_size(sizeof(struct nlattr)) + + ops->get_slave_size(master_dev, dev); +} + static size_t rtnl_link_get_size(const struct net_device *dev) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; @@ -385,6 +432,8 @@ static size_t rtnl_link_get_size(const struct net_device *dev) /* IFLA_INFO_XSTATS */ size += nla_total_size(ops->get_xstats_size(dev)); + size += rtnl_link_get_slave_info_data_size(dev); + return size; } @@ -403,34 +452,16 @@ static const struct rtnl_af_ops *rtnl_af_lookup(const int family) } /** - * __rtnl_af_register - Register rtnl_af_ops with rtnetlink. - * @ops: struct rtnl_af_ops * to register - * - * The caller must hold the rtnl_mutex. - * - * Returns 0 on success or a negative error code. - */ -int __rtnl_af_register(struct rtnl_af_ops *ops) -{ - list_add_tail(&ops->list, &rtnl_af_ops); - return 0; -} -EXPORT_SYMBOL_GPL(__rtnl_af_register); - -/** * rtnl_af_register - Register rtnl_af_ops with rtnetlink. * @ops: struct rtnl_af_ops * to register * - * Returns 0 on success or a negative error code. + * Takes the rtnl mutex itself and cannot fail. 
*/ -int rtnl_af_register(struct rtnl_af_ops *ops) +void rtnl_af_register(struct rtnl_af_ops *ops) { - int err; - rtnl_lock(); - err = __rtnl_af_register(ops); + list_add_tail(&ops->list, &rtnl_af_ops); rtnl_unlock(); - return err; } EXPORT_SYMBOL_GPL(rtnl_af_register); @@ -477,69 +508,105 @@ static size_t rtnl_link_get_af_size(const struct net_device *dev) return size; } -static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) +static bool rtnl_have_link_slave_info(const struct net_device *dev) { - const struct rtnl_link_ops *ops = dev->rtnl_link_ops; - struct nlattr *linkinfo, *data; - int err = -EMSGSIZE; + struct net_device *master_dev; - linkinfo = nla_nest_start(skb, IFLA_LINKINFO); - if (linkinfo == NULL) - goto out; + master_dev = netdev_master_upper_dev_get((struct net_device *) dev); + if (master_dev && master_dev->rtnl_link_ops) + return true; + return false; +} +static int rtnl_link_slave_info_fill(struct sk_buff *skb, + const struct net_device *dev) +{ + struct net_device *master_dev; + const struct rtnl_link_ops *ops; + struct nlattr *slave_data; + int err; + + master_dev = netdev_master_upper_dev_get((struct net_device *) dev); + if (!master_dev) + return 0; + ops = master_dev->rtnl_link_ops; + if (!ops) + return 0; + if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, ops->kind) < 0) + return -EMSGSIZE; + if (ops->fill_slave_info) { + slave_data = nla_nest_start(skb, IFLA_INFO_SLAVE_DATA); + if (!slave_data) + return -EMSGSIZE; + err = ops->fill_slave_info(skb, master_dev, dev); + if (err < 0) + goto err_cancel_slave_data; + nla_nest_end(skb, slave_data); + } + return 0; + +err_cancel_slave_data: + nla_nest_cancel(skb, slave_data); + return err; +} + +static int rtnl_link_info_fill(struct sk_buff *skb, + const struct net_device *dev) +{ + const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + struct nlattr *data; + int err; + + if (!ops) + return 0; if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0) - goto err_cancel_link; + return -EMSGSIZE; if (ops->fill_xstats) { err = ops->fill_xstats(skb, dev); if (err < 0) - goto err_cancel_link; + return err; } if (ops->fill_info) { data = nla_nest_start(skb, IFLA_INFO_DATA); if (data == NULL) - goto err_cancel_link; + return -EMSGSIZE; err = ops->fill_info(skb, dev); if (err < 0) goto err_cancel_data; nla_nest_end(skb, data); } - - nla_nest_end(skb, linkinfo); return 0; err_cancel_data: nla_nest_cancel(skb, data); -err_cancel_link: - nla_nest_cancel(skb, linkinfo); -out: return err; } -static const int rtm_min[RTM_NR_FAMILIES] = +static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) { - [RTM_FAM(RTM_NEWLINK)] = NLMSG_LENGTH(sizeof(struct ifinfomsg)), - [RTM_FAM(RTM_NEWADDR)] = NLMSG_LENGTH(sizeof(struct ifaddrmsg)), - [RTM_FAM(RTM_NEWROUTE)] = NLMSG_LENGTH(sizeof(struct rtmsg)), - [RTM_FAM(RTM_NEWRULE)] = NLMSG_LENGTH(sizeof(struct fib_rule_hdr)), - [RTM_FAM(RTM_NEWQDISC)] = NLMSG_LENGTH(sizeof(struct tcmsg)), - [RTM_FAM(RTM_NEWTCLASS)] = NLMSG_LENGTH(sizeof(struct tcmsg)), - [RTM_FAM(RTM_NEWTFILTER)] = NLMSG_LENGTH(sizeof(struct tcmsg)), - [RTM_FAM(RTM_NEWACTION)] = NLMSG_LENGTH(sizeof(struct tcamsg)), - [RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), - [RTM_FAM(RTM_GETANYCAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), -}; + struct nlattr *linkinfo; + int err = -EMSGSIZE; -static const int rta_max[RTM_NR_FAMILIES] = -{ - [RTM_FAM(RTM_NEWLINK)] = IFLA_MAX, - [RTM_FAM(RTM_NEWADDR)] = IFA_MAX, - [RTM_FAM(RTM_NEWROUTE)] = RTA_MAX, - [RTM_FAM(RTM_NEWRULE)] = FRA_MAX, - 
[RTM_FAM(RTM_NEWQDISC)] = TCA_MAX, - [RTM_FAM(RTM_NEWTCLASS)] = TCA_MAX, - [RTM_FAM(RTM_NEWTFILTER)] = TCA_MAX, - [RTM_FAM(RTM_NEWACTION)] = TCAA_MAX, -}; + linkinfo = nla_nest_start(skb, IFLA_LINKINFO); + if (linkinfo == NULL) + goto out; + + err = rtnl_link_info_fill(skb, dev); + if (err < 0) + goto err_cancel_link; + + err = rtnl_link_slave_info_fill(skb, dev); + if (err < 0) + goto err_cancel_link; + + nla_nest_end(skb, linkinfo); + return 0; + +err_cancel_link: + nla_nest_cancel(skb, linkinfo); +out: + return err; +} int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo) { @@ -731,14 +798,15 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, size += num_vfs * (nla_total_size(sizeof(struct ifla_vf_mac)) + nla_total_size(sizeof(struct ifla_vf_vlan)) + - nla_total_size(sizeof(struct ifla_vf_tx_rate)) + - nla_total_size(sizeof(struct ifla_vf_spoofchk))); + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + + nla_total_size(sizeof(struct ifla_vf_rate))); return size; } else return 0; } -static size_t rtnl_port_size(const struct net_device *dev) +static size_t rtnl_port_size(const struct net_device *dev, + u32 ext_filter_mask) { size_t port_size = nla_total_size(4) /* PORT_VF */ + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */ @@ -754,7 +822,8 @@ static size_t rtnl_port_size(const struct net_device *dev) size_t port_self_size = nla_total_size(sizeof(struct nlattr)) + port_size; - if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent) + if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || + !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; if (dev_num_vf(dev->dev.parent)) return port_self_size + vf_ports_size + @@ -780,17 +849,20 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_MTU */ + nla_total_size(4) /* IFLA_LINK */ + nla_total_size(4) /* IFLA_MASTER */ + + nla_total_size(1) /* IFLA_CARRIER */ + nla_total_size(4) /* IFLA_PROMISCUITY */ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ + nla_total_size(ext_filter_mask & RTEXT_FILTER_VF ? 
4 : 0) /* IFLA_NUM_VF */ + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ - + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ - + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */ + + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */ + + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */ } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -849,11 +921,13 @@ static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev) return 0; } -static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev) +static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev, + u32 ext_filter_mask) { int err; - if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent) + if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || + !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; err = rtnl_port_self_fill(skb, dev); @@ -869,6 +943,24 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev) return 0; } +static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) +{ + int err; + struct netdev_phys_port_id ppid; + + err = dev_get_phys_port_id(dev, &ppid); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id)) + return -EMSGSIZE; + + return 0; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask) @@ -879,6 +971,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, const struct rtnl_link_stats64 *stats; struct nlattr *attr, *af_spec; struct rtnl_af_ops *af_ops; + struct net_device *upper_dev = netdev_master_upper_dev_get(dev); ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); @@ -907,12 +1000,15 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, #endif (dev->ifindex != dev->iflink && nla_put_u32(skb, IFLA_LINK, dev->iflink)) || - (dev->master && - nla_put_u32(skb, IFLA_MASTER, dev->master->ifindex)) || + (upper_dev && + nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex)) || + nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || (dev->qdisc && nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || (dev->ifalias && - nla_put_string(skb, IFLA_IFALIAS, dev->ifalias))) + nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) || + nla_put_u32(skb, IFLA_CARRIER_CHANGES, + atomic_read(&dev->carrier_changes))) goto nla_put_failure; if (1) { @@ -934,6 +1030,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, goto nla_put_failure; } + if (rtnl_phys_port_id_fill(skb, dev)) + goto nla_put_failure; + attr = nla_reserve(skb, IFLA_STATS, sizeof(struct rtnl_link_stats)); if (attr == NULL) @@ -966,8 +1065,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct ifla_vf_info ivi; struct ifla_vf_mac vf_mac; struct ifla_vf_vlan vf_vlan; + struct ifla_vf_rate vf_rate; struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_spoofchk vf_spoofchk; + struct ifla_vf_link_state vf_linkstate; /* * Not all SR-IOV capable drivers support the @@ -976,18 +1077,28 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, * report anything. 
*/ ivi.spoofchk = -1; + memset(ivi.mac, 0, sizeof(ivi.mac)); + /* The default value for VF link state is "auto" + * IFLA_VF_LINK_STATE_AUTO which equals zero + */ + ivi.linkstate = 0; if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi)) break; vf_mac.vf = vf_vlan.vf = + vf_rate.vf = vf_tx_rate.vf = - vf_spoofchk.vf = ivi.vf; + vf_spoofchk.vf = + vf_linkstate.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; - vf_tx_rate.rate = ivi.tx_rate; + vf_tx_rate.rate = ivi.max_tx_rate; + vf_rate.min_tx_rate = ivi.min_tx_rate; + vf_rate.max_tx_rate = ivi.max_tx_rate; vf_spoofchk.setting = ivi.spoofchk; + vf_linkstate.link_state = ivi.linkstate; vf = nla_nest_start(skb, IFLA_VF_INFO); if (!vf) { nla_nest_cancel(skb, vfinfo); @@ -995,20 +1106,24 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, } if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || + nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), + &vf_rate) || nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate) || nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), - &vf_spoofchk)) + &vf_spoofchk) || + nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), + &vf_linkstate)) goto nla_put_failure; nla_nest_end(skb, vf); } nla_nest_end(skb, vfinfo); } - if (rtnl_port_fill(skb, dev)) + if (rtnl_port_fill(skb, dev, ext_filter_mask)) goto nla_put_failure; - if (dev->rtnl_link_ops) { + if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) { if (rtnl_link_fill(skb, dev) < 0) goto nla_put_failure; } @@ -1050,57 +1165,7 @@ nla_put_failure: return -EMSGSIZE; } -static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct net *net = sock_net(skb->sk); - int h, s_h; - int idx = 0, s_idx; - struct net_device *dev; - struct hlist_head *head; - struct hlist_node *node; - struct nlattr *tb[IFLA_MAX+1]; - u32 ext_filter_mask = 0; - - s_h = cb->args[0]; - s_idx = cb->args[1]; - - rcu_read_lock(); - cb->seq = net->dev_base_seq; - - if (nlmsg_parse(cb->nlh, sizeof(struct rtgenmsg), tb, IFLA_MAX, - ifla_policy) >= 0) { - - if (tb[IFLA_EXT_MASK]) - ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); - } - - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &net->dev_index_head[h]; - hlist_for_each_entry_rcu(dev, node, head, index_hlist) { - if (idx < s_idx) - goto cont; - if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, - NLM_F_MULTI, - ext_filter_mask) <= 0) - goto out; - - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); -cont: - idx++; - } - } -out: - rcu_read_unlock(); - cb->args[1] = idx; - cb->args[0] = h; - - return skb->len; -} - -const struct nla_policy ifla_policy[IFLA_MAX+1] = { +static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, @@ -1108,6 +1173,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_MTU] = { .type = NLA_U32 }, [IFLA_LINK] = { .type = NLA_U32 }, [IFLA_MASTER] = { .type = NLA_U32 }, + [IFLA_CARRIER] = { .type = NLA_U8 }, [IFLA_TXQLEN] = { .type = NLA_U32 }, [IFLA_WEIGHT] = { .type = NLA_U32 }, [IFLA_OPERSTATE] = { .type = NLA_U8 }, @@ -1124,12 +1190,15 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PROMISCUITY] = { .type = NLA_U32 }, [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, 
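/* (editor's note: each entry in this policy lets nla_parse() type- and length-check an attribute before handlers read it; NLA_BINARY entries such as the IFLA_PHYS_PORT_ID added below are capped at .len) */ 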
[IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, + [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN }, + [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ }; -EXPORT_SYMBOL(ifla_policy); static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { [IFLA_INFO_KIND] = { .type = NLA_STRING }, [IFLA_INFO_DATA] = { .type = NLA_NESTED }, + [IFLA_INFO_SLAVE_KIND] = { .type = NLA_STRING }, + [IFLA_INFO_SLAVE_DATA] = { .type = NLA_NESTED }, }; static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = { @@ -1145,6 +1214,10 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { .len = sizeof(struct ifla_vf_tx_rate) }, [IFLA_VF_SPOOFCHK] = { .type = NLA_BINARY, .len = sizeof(struct ifla_vf_spoofchk) }, + [IFLA_VF_RATE] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_rate) }, + [IFLA_VF_LINK_STATE] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_link_state) }, }; static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { @@ -1161,6 +1234,78 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, }; +static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int h, s_h; + int idx = 0, s_idx; + struct net_device *dev; + struct hlist_head *head; + struct nlattr *tb[IFLA_MAX+1]; + u32 ext_filter_mask = 0; + int err; + int hdrlen; + + s_h = cb->args[0]; + s_idx = cb->args[1]; + + rcu_read_lock(); + cb->seq = net->dev_base_seq; + + /* A hack to preserve kernel<->userspace interface. + * The correct header is ifinfomsg. It is consistent with rtnl_getlink. + * However, before Linux v3.9 the code here assumed rtgenmsg and that's + * what iproute2 < v3.9.0 used. + * We can detect the old iproute2. Even including the IFLA_EXT_MASK + * attribute, its netlink message is shorter than struct ifinfomsg. + */ + hdrlen = nlmsg_len(cb->nlh) < sizeof(struct ifinfomsg) ? 
+ sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); + + if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) { + + if (tb[IFLA_EXT_MASK]) + ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + } + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + hlist_for_each_entry_rcu(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, 0, + NLM_F_MULTI, + ext_filter_mask); + /* If we ran out of room on the first message, + * we're in trouble + */ + WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); + + if (err <= 0) + goto out; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + } +out: + rcu_read_unlock(); + cb->args[1] = idx; + cb->args[0] = h; + + return skb->len; +} + +int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len) +{ + return nla_parse(tb, IFLA_MAX, head, len, ifla_policy); +} +EXPORT_SYMBOL(rtnl_nla_parse_ifla); + struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) { struct net *net; @@ -1242,11 +1387,29 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr) } case IFLA_VF_TX_RATE: { struct ifla_vf_tx_rate *ivt; + struct ifla_vf_info ivf; ivt = nla_data(vf); err = -EOPNOTSUPP; - if (ops->ndo_set_vf_tx_rate) - err = ops->ndo_set_vf_tx_rate(dev, ivt->vf, - ivt->rate); + if (ops->ndo_get_vf_config) + err = ops->ndo_get_vf_config(dev, ivt->vf, + &ivf); + if (err) + break; + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_rate) + err = ops->ndo_set_vf_rate(dev, ivt->vf, + ivf.min_tx_rate, + ivt->rate); + break; + } + case IFLA_VF_RATE: { + struct ifla_vf_rate *ivt; + ivt = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_rate) + err = ops->ndo_set_vf_rate(dev, ivt->vf, + ivt->min_tx_rate, + ivt->max_tx_rate); break; } case IFLA_VF_SPOOFCHK: { @@ -1258,6 +1421,15 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr) ivs->setting); break; } + case IFLA_VF_LINK_STATE: { + struct ifla_vf_link_state *ivl; + ivl = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_link_state) + err = ops->ndo_set_vf_link_state(dev, ivl->vf, + ivl->link_state); + break; + } default: err = -EINVAL; break; @@ -1270,16 +1442,16 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr) static int do_set_master(struct net_device *dev, int ifindex) { - struct net_device *master_dev; + struct net_device *upper_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops; int err; - if (dev->master) { - if (dev->master->ifindex == ifindex) + if (upper_dev) { + if (upper_dev->ifindex == ifindex) return 0; - ops = dev->master->netdev_ops; + ops = upper_dev->netdev_ops; if (ops->ndo_del_slave) { - err = ops->ndo_del_slave(dev->master, dev); + err = ops->ndo_del_slave(upper_dev, dev); if (err) return err; } else { @@ -1288,12 +1460,12 @@ static int do_set_master(struct net_device *dev, int ifindex) } if (ifindex) { - master_dev = __dev_get_by_index(dev_net(dev), ifindex); - if (!master_dev) + upper_dev = __dev_get_by_index(dev_net(dev), ifindex); + if (!upper_dev) return -EINVAL; - ops = master_dev->netdev_ops; + ops = upper_dev->netdev_ops; if (ops->ndo_add_slave) { - err = ops->ndo_add_slave(master_dev, dev); + err = ops->ndo_add_slave(upper_dev, dev); if (err) return err; } else { @@ -1303,11 +1475,11 @@ static int do_set_master(struct net_device *dev, int ifindex) return 0; } -static int do_setlink(struct net_device *dev, 
struct ifinfomsg *ifm, +static int do_setlink(const struct sk_buff *skb, + struct net_device *dev, struct ifinfomsg *ifm, struct nlattr **tb, char *ifname, int modified) { const struct net_device_ops *ops = dev->netdev_ops; - int send_addr_notify = 0; int err; if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) { @@ -1316,7 +1488,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, err = PTR_ERR(net); goto errout; } - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) { + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { err = -EPERM; goto errout; } @@ -1360,16 +1532,6 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, struct sockaddr *sa; int len; - if (!ops->ndo_set_mac_address) { - err = -EOPNOTSUPP; - goto errout; - } - - if (!netif_device_present(dev)) { - err = -ENODEV; - goto errout; - } - len = sizeof(sa_family_t) + dev->addr_len; sa = kmalloc(len, GFP_KERNEL); if (!sa) { @@ -1379,13 +1541,11 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, sa->sa_family = dev->type; memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); - err = ops->ndo_set_mac_address(dev, sa); + err = dev_set_mac_address(dev, sa); kfree(sa); if (err) goto errout; - send_addr_notify = 1; modified = 1; - add_device_randomness(dev->dev_addr, dev->addr_len); } if (tb[IFLA_MTU]) { @@ -1422,7 +1582,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, if (tb[IFLA_BROADCAST]) { nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len); - send_addr_notify = 1; + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); } if (ifm->ifi_flags || ifm->ifi_change) { @@ -1438,6 +1598,13 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, modified = 1; } + if (tb[IFLA_CARRIER]) { + err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); + if (err) + goto errout; + modified = 1; + } + if (tb[IFLA_TXQLEN]) dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); @@ -1536,13 +1703,10 @@ errout: net_warn_ratelimited("A link change request failed with some changes committed already. 
Interface %s may have been left with an inconsistent configuration, please check.\n", dev->name); - if (send_addr_notify) - call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); - return err; } -static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; @@ -1578,12 +1742,12 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) if (err < 0) goto errout; - err = do_setlink(dev, ifm, tb, ifname, 0); + err = do_setlink(skb, dev, ifm, tb, ifname, 0); errout: return err; } -static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; @@ -1618,7 +1782,6 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) ops->dellink(dev, &list_kill); unregister_netdevice_many(&list_kill); - list_del(&list_kill); return 0; } @@ -1635,9 +1798,8 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) } dev->rtnl_link_state = RTNL_LINK_INITIALIZED; - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); - __dev_notify_flags(dev, old_flags); + __dev_notify_flags(dev, old_flags, ~0U); return 0; } EXPORT_SYMBOL(rtnl_configure_link); @@ -1672,9 +1834,11 @@ struct net_device *rtnl_create_link(struct net *net, if (tb[IFLA_MTU]) dev->mtu = nla_get_u32(tb[IFLA_MTU]); - if (tb[IFLA_ADDRESS]) + if (tb[IFLA_ADDRESS]) { memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]), nla_len(tb[IFLA_ADDRESS])); + dev->addr_assign_type = NET_ADDR_SET; + } if (tb[IFLA_BROADCAST]) memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]), nla_len(tb[IFLA_BROADCAST])); @@ -1694,7 +1858,8 @@ err: } EXPORT_SYMBOL(rtnl_create_link); -static int rtnl_group_changelink(struct net *net, int group, +static int rtnl_group_changelink(const struct sk_buff *skb, + struct net *net, int group, struct ifinfomsg *ifm, struct nlattr **tb) { @@ -1703,7 +1868,7 @@ static int rtnl_group_changelink(struct net *net, int group, for_each_netdev(net, dev) { if (dev->group == group) { - err = do_setlink(dev, ifm, tb, NULL, 0); + err = do_setlink(skb, dev, ifm, tb, NULL, 0); if (err < 0) return err; } @@ -1712,11 +1877,13 @@ static int rtnl_group_changelink(struct net *net, int group, return 0; } -static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; + const struct rtnl_link_ops *m_ops = NULL; struct net_device *dev; + struct net_device *master_dev = NULL; struct ifinfomsg *ifm; char kind[MODULE_NAME_LEN]; char ifname[IFNAMSIZ]; @@ -1746,6 +1913,12 @@ replay: dev = NULL; } + if (dev) { + master_dev = netdev_master_upper_dev_get(dev); + if (master_dev) + m_ops = master_dev->rtnl_link_ops; + } + err = validate_linkmsg(dev, tb); if (err < 0) return err; @@ -1767,7 +1940,10 @@ replay: } if (1) { - struct nlattr *attr[ops ? ops->maxtype + 1 : 0], **data = NULL; + struct nlattr *attr[ops ? ops->maxtype + 1 : 0]; + struct nlattr *slave_attr[m_ops ? 
m_ops->slave_maxtype + 1 : 0]; + struct nlattr **data = NULL; + struct nlattr **slave_data = NULL; struct net *dest_net; if (ops) { @@ -1786,6 +1962,24 @@ replay: } } + if (m_ops) { + if (m_ops->slave_maxtype && + linkinfo[IFLA_INFO_SLAVE_DATA]) { + err = nla_parse_nested(slave_attr, + m_ops->slave_maxtype, + linkinfo[IFLA_INFO_SLAVE_DATA], + m_ops->slave_policy); + if (err < 0) + return err; + slave_data = slave_attr; + } + if (m_ops->slave_validate) { + err = m_ops->slave_validate(tb, slave_data); + if (err < 0) + return err; + } + } + if (dev) { int modified = 0; @@ -1805,12 +1999,23 @@ replay: modified = 1; } - return do_setlink(dev, ifm, tb, ifname, modified); + if (linkinfo[IFLA_INFO_SLAVE_DATA]) { + if (!m_ops || !m_ops->slave_changelink) + return -EOPNOTSUPP; + + err = m_ops->slave_changelink(master_dev, dev, + tb, slave_data); + if (err < 0) + return err; + modified = 1; + } + + return do_setlink(skb, dev, ifm, tb, ifname, modified); } if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) - return rtnl_group_changelink(net, + return rtnl_group_changelink(skb, net, nla_get_u32(tb[IFLA_GROUP]), ifm, tb); return -ENODEV; @@ -1848,16 +2053,25 @@ replay: dev->ifindex = ifm->ifi_index; - if (ops->newlink) + if (ops->newlink) { err = ops->newlink(net, dev, tb, data); - else + /* Drivers should call free_netdev() in ->destructor + * and unregister it on failure after registration, + * so that the device can finally be freed in rtnl_unlock. + */ + if (err < 0) { + /* If device is not registered at all, free it now */ + if (dev->reg_state == NETREG_UNINITIALIZED) + free_netdev(dev); + goto out; + } + } else { err = register_netdevice(dev); - - if (err < 0 && !IS_ERR(dev)) - free_netdev(dev); - if (err < 0) - goto out; - + if (err < 0) { + free_netdev(dev); + goto out; + } + } err = rtnl_configure_link(dev, ifm); if (err < 0) unregister_netdevice(dev); @@ -1867,7 +2081,7 @@ out: } } -static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; @@ -1922,9 +2136,13 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) struct nlattr *tb[IFLA_MAX+1]; u32 ext_filter_mask = 0; u16 min_ifinfo_dump_size = 0; + int hdrlen; + + /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */ + hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? 
+ sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); - if (nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, IFLA_MAX, - ifla_policy) >= 0) { + if (nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) { if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); } @@ -1958,8 +2176,11 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) if (rtnl_msg_handlers[idx] == NULL || rtnl_msg_handlers[idx][type].dumpit == NULL) continue; - if (idx > s_idx) + if (idx > s_idx) { memset(&cb->args[0], 0, sizeof(cb->args)); + cb->prev_seq = 0; + cb->seq = 0; + } if (rtnl_msg_handlers[idx][type].dumpit(skb, cb)) break; } @@ -1968,14 +2189,15 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change) +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, + gfp_t flags) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; size_t if_info_size; - skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), GFP_KERNEL); + skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), flags); if (skb == NULL) goto errout; @@ -1986,22 +2208,24 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change) kfree_skb(skb); goto errout; } - rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_LINK, err); } +EXPORT_SYMBOL(rtmsg_ifinfo); static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, u8 *addr, u32 pid, u32 seq, - int type, unsigned int flags) + int type, unsigned int flags, + int nlflags) { struct nlmsghdr *nlh; struct ndmsg *ndm; - nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), NLM_F_MULTI); + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), nlflags); if (!nlh) return -EMSGSIZE; @@ -2039,7 +2263,7 @@ static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, int type) if (!skb) goto errout; - err = nlmsg_populate_fdb_fill(skb, dev, addr, 0, 0, type, NTF_SELF); + err = nlmsg_populate_fdb_fill(skb, dev, addr, 0, 0, type, NTF_SELF, 0); if (err < 0) { kfree_skb(skb); goto errout; @@ -2051,19 +2275,47 @@ errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } -static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +/** + * ndo_dflt_fdb_add - default netdevice operation to add an FDB entry + */ +int ndo_dflt_fdb_add(struct ndmsg *ndm, + struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, + u16 flags) +{ + int err = -EINVAL; + + /* If aging addresses are supported device will need to + * implement its own handler for this. 
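+ * (Editor's illustration: in practice only static entries qualify, + * e.g. ones added with `bridge fdb add <MAC> dev <dev>`, which are + * installed as NUD_PERMANENT and therefore pass the check below.) 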
+ */ + if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) { + pr_info("%s: FDB only supports static addresses\n", dev->name); + return err; + } + + if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) + err = dev_uc_add_excl(dev, addr); + else if (is_multicast_ether_addr(addr)) + err = dev_mc_add_excl(dev, addr); + + /* Only return duplicate errors if NLM_F_EXCL is set */ + if (err == -EEXIST && !(flags & NLM_F_EXCL)) + err = 0; + + return err; +} +EXPORT_SYMBOL(ndo_dflt_fdb_add); + +static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); - struct net_device *master = NULL; struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; u8 *addr; int err; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); if (err < 0) return err; @@ -2086,20 +2338,16 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) } addr = nla_data(tb[NDA_LLADDR]); - if (!is_valid_ether_addr(addr)) { - pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid ether address\n"); - return -EINVAL; - } err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && (dev->priv_flags & IFF_BRIDGE_PORT)) { - master = dev->master; - err = master->netdev_ops->ndo_fdb_add(ndm, tb, - dev, addr, - nlh->nlmsg_flags); + struct net_device *br_dev = netdev_master_upper_dev_get(dev); + const struct net_device_ops *ops = br_dev->netdev_ops; + + err = ops->ndo_fdb_add(ndm, tb, dev, addr, nlh->nlmsg_flags); if (err) goto out; else @@ -2107,10 +2355,13 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) } /* Embedded bridge, macvlan, and any other device support */ - if ((ndm->ndm_flags & NTF_SELF) && dev->netdev_ops->ndo_fdb_add) { - err = dev->netdev_ops->ndo_fdb_add(ndm, tb, - dev, addr, - nlh->nlmsg_flags); + if ((ndm->ndm_flags & NTF_SELF)) { + if (dev->netdev_ops->ndo_fdb_add) + err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, + nlh->nlmsg_flags); + else + err = ndo_dflt_fdb_add(ndm, tb, dev, addr, + nlh->nlmsg_flags); if (!err) { rtnl_fdb_notify(dev, addr, RTM_NEWNEIGH); @@ -2121,20 +2372,50 @@ out: return err; } -static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +/** + * ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry + */ +int ndo_dflt_fdb_del(struct ndmsg *ndm, + struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr) +{ + int err = -EOPNOTSUPP; + + /* If aging addresses are supported device will need to + * implement its own handler for this. 
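+ * (Editor's note: this mirrors ndo_dflt_fdb_add above; only + * NUD_PERMANENT entries may be removed through the default path.) 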
+ */ + if (!(ndm->ndm_state & NUD_PERMANENT)) { + pr_info("%s: FDB only supports static addresses\n", dev->name); + return -EINVAL; + } + + if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) + err = dev_uc_del(dev, addr); + else if (is_multicast_ether_addr(addr)) + err = dev_mc_del(dev, addr); + else + err = -EINVAL; + + return err; +} +EXPORT_SYMBOL(ndo_dflt_fdb_del); + +static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; - struct nlattr *llattr; + struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; int err = -EINVAL; __u8 *addr; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; - if (nlmsg_len(nlh) < sizeof(*ndm)) - return -EINVAL; + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); + if (err < 0) + return err; ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { @@ -2148,22 +2429,23 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return -ENODEV; } - llattr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_LLADDR); - if (llattr == NULL || nla_len(llattr) != ETH_ALEN) { - pr_info("PF_BRIGDE: RTM_DELNEIGH with invalid address\n"); + if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { + pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid address\n"); return -EINVAL; } - addr = nla_data(llattr); + addr = nla_data(tb[NDA_LLADDR]); + err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && (dev->priv_flags & IFF_BRIDGE_PORT)) { - struct net_device *master = dev->master; + struct net_device *br_dev = netdev_master_upper_dev_get(dev); + const struct net_device_ops *ops = br_dev->netdev_ops; - if (master->netdev_ops->ndo_fdb_del) - err = master->netdev_ops->ndo_fdb_del(ndm, dev, addr); + if (ops->ndo_fdb_del) + err = ops->ndo_fdb_del(ndm, tb, dev, addr); if (err) goto out; @@ -2172,8 +2454,11 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) } /* Embedded bridge, macvlan, and any other device support */ - if ((ndm->ndm_flags & NTF_SELF) && dev->netdev_ops->ndo_fdb_del) { - err = dev->netdev_ops->ndo_fdb_del(ndm, dev, addr); + if (ndm->ndm_flags & NTF_SELF) { + if (dev->netdev_ops->ndo_fdb_del) + err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr); + else + err = ndo_dflt_fdb_del(ndm, tb, dev, addr); if (!err) { rtnl_fdb_notify(dev, addr, RTM_DELNEIGH); @@ -2203,7 +2488,8 @@ static int nlmsg_populate_fdb(struct sk_buff *skb, err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, portid, seq, - RTM_NEWNEIGH, NTF_SELF); + RTM_NEWNEIGH, NTF_SELF, + NLM_F_MULTI); if (err < 0) return err; skip: @@ -2218,7 +2504,7 @@ skip: * @dev: netdevice * * Default netdevice operation to dump the existing unicast address list. - * Returns zero on success. + * Returns number of addresses from list put in skb. 
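Note the return convention spelled out above: ndo_dflt_fdb_dump() hands back the updated index, not zero, so rtnl_fdb_dump() can resume correctly across messages. A driver combining the default software list with its own tables might look like this (hypothetical op):

static int example_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
                            struct net_device *dev, int idx)
{
        idx = ndo_dflt_fdb_dump(skb, cb, dev, idx);

        /* append hardware-table entries here, incrementing idx per
         * address emitted, and return the final index
         */
        return idx;
}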
*/ int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, @@ -2247,15 +2533,19 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); for_each_netdev_rcu(net, dev) { if (dev->priv_flags & IFF_BRIDGE_PORT) { - struct net_device *master = dev->master; - const struct net_device_ops *ops = master->netdev_ops; + struct net_device *br_dev; + const struct net_device_ops *ops; + br_dev = netdev_master_upper_dev_get(dev); + ops = br_dev->netdev_ops; if (ops->ndo_fdb_dump) idx = ops->ndo_fdb_dump(skb, cb, dev, idx); } if (dev->netdev_ops->ndo_fdb_dump) idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, idx); + else + idx = ndo_dflt_fdb_dump(skb, cb, dev, idx); } rcu_read_unlock(); @@ -2270,6 +2560,7 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct ifinfomsg *ifm; struct nlattr *br_afspec; u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; + struct net_device *br_dev = netdev_master_upper_dev_get(dev); nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), NLM_F_MULTI); if (nlh == NULL) @@ -2287,8 +2578,8 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_MTU, dev->mtu) || nla_put_u8(skb, IFLA_OPERSTATE, operstate) || - (dev->master && - nla_put_u32(skb, IFLA_MASTER, dev->master->ifindex)) || + (br_dev && + nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) || (dev->addr_len && nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || (dev->ifindex != dev->iflink && @@ -2320,23 +2611,31 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) int idx = 0; u32 portid = NETLINK_CB(cb->skb).portid; u32 seq = cb->nlh->nlmsg_seq; + struct nlattr *extfilt; + u32 filter_mask = 0; + + extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg), + IFLA_EXT_MASK); + if (extfilt) + filter_mask = nla_get_u32(extfilt); rcu_read_lock(); for_each_netdev_rcu(net, dev) { const struct net_device_ops *ops = dev->netdev_ops; - struct net_device *master = dev->master; + struct net_device *br_dev = netdev_master_upper_dev_get(dev); - if (master && master->netdev_ops->ndo_bridge_getlink) { + if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { if (idx >= cb->args[0] && - master->netdev_ops->ndo_bridge_getlink( - skb, portid, seq, dev) < 0) + br_dev->netdev_ops->ndo_bridge_getlink( + skb, portid, seq, dev, filter_mask) < 0) break; idx++; } if (ops->ndo_bridge_getlink) { if (idx >= cb->args[0] && - ops->ndo_bridge_getlink(skb, portid, seq, dev) < 0) + ops->ndo_bridge_getlink(skb, portid, seq, dev, + filter_mask) < 0) break; idx++; } @@ -2365,7 +2664,7 @@ static inline size_t bridge_nlmsg_size(void) static int rtnl_bridge_notify(struct net_device *dev, u16 flags) { struct net *net = dev_net(dev); - struct net_device *master = dev->master; + struct net_device *br_dev = netdev_master_upper_dev_get(dev); struct sk_buff *skb; int err = -EOPNOTSUPP; @@ -2376,15 +2675,15 @@ static int rtnl_bridge_notify(struct net_device *dev, u16 flags) } if ((!flags || (flags & BRIDGE_FLAGS_MASTER)) && - master && master->netdev_ops->ndo_bridge_getlink) { - err = master->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev); + br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { + err = br_dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0); if (err < 0) goto errout; } if ((flags & BRIDGE_FLAGS_SELF) && dev->netdev_ops->ndo_bridge_getlink) { - err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev); + err = 
dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0); if (err < 0) goto errout; } @@ -2398,8 +2697,7 @@ errout: return err; } -static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, - void *arg) +static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; @@ -2436,13 +2734,14 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, oflags = flags; if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { - if (!dev->master || - !dev->master->netdev_ops->ndo_bridge_setlink) { + struct net_device *br_dev = netdev_master_upper_dev_get(dev); + + if (!br_dev || !br_dev->netdev_ops->ndo_bridge_setlink) { err = -EOPNOTSUPP; goto out; } - err = dev->master->netdev_ops->ndo_bridge_setlink(dev, nlh); + err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh); if (err) goto out; @@ -2468,9 +2767,75 @@ out: return err; } -/* Protected by RTNL sempahore. */ -static struct rtattr **rta_buf; -static int rtattr_max; +static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct net *net = sock_net(skb->sk); + struct ifinfomsg *ifm; + struct net_device *dev; + struct nlattr *br_spec, *attr = NULL; + int rem, err = -EOPNOTSUPP; + u16 oflags, flags = 0; + bool have_flags = false; + + if (nlmsg_len(nlh) < sizeof(*ifm)) + return -EINVAL; + + ifm = nlmsg_data(nlh); + if (ifm->ifi_family != AF_BRIDGE) + return -EPFNOSUPPORT; + + dev = __dev_get_by_index(net, ifm->ifi_index); + if (!dev) { + pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n"); + return -ENODEV; + } + + br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); + if (br_spec) { + nla_for_each_nested(attr, br_spec, rem) { + if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { + have_flags = true; + flags = nla_get_u16(attr); + break; + } + } + } + + oflags = flags; + + if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { + struct net_device *br_dev = netdev_master_upper_dev_get(dev); + + if (!br_dev || !br_dev->netdev_ops->ndo_bridge_dellink) { + err = -EOPNOTSUPP; + goto out; + } + + err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh); + if (err) + goto out; + + flags &= ~BRIDGE_FLAGS_MASTER; + } + + if ((flags & BRIDGE_FLAGS_SELF)) { + if (!dev->netdev_ops->ndo_bridge_dellink) + err = -EOPNOTSUPP; + else + err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh); + + if (!err) + flags &= ~BRIDGE_FLAGS_SELF; + } + + if (have_flags) + memcpy(nla_data(attr), &flags, sizeof(flags)); + /* Generate event to notify upper layer of bridge change */ + if (!err) + err = rtnl_bridge_notify(dev, oflags); +out: + return err; +} /* Process one rtnetlink message. 
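rtnl_bridge_setlink() and the new rtnl_bridge_dellink() share one dispatch pattern: unless userspace restricted the request via IFLA_BRIDGE_FLAGS, the bridge master is tried first, then the port device itself, and each flag is cleared as it is handled so the survivors can be written back. The skeleton, reduced for clarity (helper name illustrative, parsing and the ndo calls elided):

static u16 example_bridge_flags_walk(u16 flags)
{
        if (!flags || (flags & BRIDGE_FLAGS_MASTER)) {
                /* call the bridge master's ndo_bridge_setlink()/dellink() */
                flags &= ~BRIDGE_FLAGS_MASTER;
        }

        if (flags & BRIDGE_FLAGS_SELF) {
                /* let the port itself (e.g. an embedded switch) handle it */
                flags &= ~BRIDGE_FLAGS_SELF;
        }

        return flags;   /* remaining bits are copied back for userspace */
}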
*/ @@ -2479,7 +2844,6 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct net *net = sock_net(skb->sk); rtnl_doit_func doit; int sz_idx, kind; - int min_len; int family; int type; int err; @@ -2491,14 +2855,14 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) type -= RTM_BASE; /* All the messages must have at least 1 byte length */ - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg))) + if (nlmsg_len(nlh) < sizeof(struct rtgenmsg)) return 0; - family = ((struct rtgenmsg *)NLMSG_DATA(nlh))->rtgen_family; + family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; sz_idx = type>>2; kind = type&3; - if (kind != 2 && !ns_capable(net->user_ns, CAP_NET_ADMIN)) + if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { @@ -2527,32 +2891,11 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } - memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *))); - - min_len = rtm_min[sz_idx]; - if (nlh->nlmsg_len < min_len) - return -EINVAL; - - if (nlh->nlmsg_len > min_len) { - int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); - struct rtattr *attr = (void *)nlh + NLMSG_ALIGN(min_len); - - while (RTA_OK(attr, attrlen)) { - unsigned int flavor = attr->rta_type; - if (flavor) { - if (flavor > rta_max[sz_idx]) - return -EINVAL; - rta_buf[flavor-1] = attr; - } - attr = RTA_NEXT(attr, attrlen); - } - } - doit = rtnl_get_doit(family, type); if (doit == NULL) return -EOPNOTSUPP; - return doit(skb, nlh, (void *)&rta_buf[0]); + return doit(skb, nlh); } static void rtnetlink_rcv(struct sk_buff *skb) @@ -2564,7 +2907,7 @@ static void rtnetlink_rcv(struct sk_buff *skb) static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_UP: @@ -2581,7 +2924,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_JOIN: break; default: - rtmsg_ifinfo(RTM_NEWLINK, dev, 0); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); break; } return NOTIFY_DONE; @@ -2622,16 +2965,6 @@ static struct pernet_operations rtnetlink_net_ops = { void __init rtnetlink_init(void) { - int i; - - rtattr_max = 0; - for (i = 0; i < ARRAY_SIZE(rta_max); i++) - if (rta_max[i] > rtattr_max) - rtattr_max = rta_max[i]; - rta_buf = kmalloc(rtattr_max * sizeof(struct rtattr *), GFP_KERNEL); - if (!rta_buf) - panic("rtnetlink_init: cannot allocate rta_buf\n"); - if (register_pernet_subsys(&rtnetlink_net_ops)) panic("rtnetlink_init: cannot initialize rtnetlink\n"); @@ -2651,6 +2984,7 @@ void __init rtnetlink_init(void) rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, NULL); rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL); + rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, NULL); rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL); } diff --git a/net/core/scm.c b/net/core/scm.c index 57fb1ee6649..b442e7e25e6 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -24,6 +24,7 @@ #include <linux/interrupt.h> #include <linux/netdevice.h> #include <linux/security.h> +#include <linux/pid_namespace.h> #include <linux/pid.h> #include <linux/nsproxy.h> #include <linux/slab.h> @@ -35,6 +36,7 @@ #include <net/sock.h> #include <net/compat.h> #include <net/scm.h> +#include <net/cls_cgroup.h> /* @@ -51,11 +53,12 @@ static __inline__ int 
scm_check_creds(struct ucred *creds) if (!uid_valid(uid) || !gid_valid(gid)) return -EINVAL; - if ((creds->pid == task_tgid_vnr(current) || nsown_capable(CAP_SYS_ADMIN)) && + if ((creds->pid == task_tgid_vnr(current) || + ns_capable(task_active_pid_ns(current)->user_ns, CAP_SYS_ADMIN)) && ((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) || - uid_eq(uid, cred->suid)) || nsown_capable(CAP_SETUID)) && + uid_eq(uid, cred->suid)) || ns_capable(cred->user_ns, CAP_SETUID)) && ((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) || - gid_eq(gid, cred->sgid)) || nsown_capable(CAP_SETGID))) { + gid_eq(gid, cred->sgid)) || ns_capable(cred->user_ns, CAP_SETGID))) { return 0; } return -EPERM; @@ -184,22 +187,6 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) p->creds.uid = uid; p->creds.gid = gid; - - if (!p->cred || - !uid_eq(p->cred->euid, uid) || - !gid_eq(p->cred->egid, gid)) { - struct cred *cred; - err = -ENOMEM; - cred = prepare_creds(); - if (!cred) - goto error; - - cred->uid = cred->euid = uid; - cred->gid = cred->egid = gid; - if (p->cred) - put_cred(p->cred); - p->cred = cred; - } break; } default: @@ -302,8 +289,10 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) } /* Bump the usage count and install the file. */ sock = sock_from_file(fp[i], &err); - if (sock) - sock_update_netprioidx(sock->sk, current); + if (sock) { + sock_update_netprioidx(sock->sk); + sock_update_classid(sock->sk); + } fd_install(new_fd, get_file(fp[i])); } diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index e61a8bb7fce..ba71212f025 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -7,17 +7,20 @@ #include <linux/hrtimer.h> #include <linux/ktime.h> #include <linux/string.h> +#include <linux/net.h> #include <net/secure_seq.h> -static u32 net_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned; +#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET) +#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4) -static int __init net_secret_init(void) +static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned; + +static __always_inline void net_secret_init(void) { - get_random_bytes(net_secret, sizeof(net_secret)); - return 0; + net_get_random_once(net_secret, sizeof(net_secret)); } -late_initcall(net_secret_init); +#endif #ifdef CONFIG_INET static u32 seq_scale(u32 seq) @@ -44,6 +47,7 @@ __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, u32 hash[MD5_DIGEST_WORDS]; u32 i; + net_secret_init(); memcpy(hash, saddr, 16); for (i = 0; i < 4; i++) secret[i] = net_secret[i] + (__force u32)daddr[i]; @@ -65,6 +69,7 @@ u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, u32 hash[MD5_DIGEST_WORDS]; u32 i; + net_secret_init(); memcpy(hash, saddr, 16); for (i = 0; i < 4; i++) secret[i] = net_secret[i] + (__force u32) daddr[i]; @@ -80,35 +85,13 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #endif #ifdef CONFIG_INET -__u32 secure_ip_id(__be32 daddr) -{ - u32 hash[MD5_DIGEST_WORDS]; - - hash[0] = (__force __u32) daddr; - hash[1] = net_secret[13]; - hash[2] = net_secret[14]; - hash[3] = net_secret[15]; - - md5_transform(hash, net_secret); - - return hash[0]; -} - -__u32 secure_ipv6_id(const __be32 daddr[4]) -{ - __u32 hash[4]; - - memcpy(hash, daddr, 16); - md5_transform(hash, net_secret); - - return hash[0]; -} __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport) { u32 hash[MD5_DIGEST_WORDS]; + net_secret_init(); hash[0] = (__force u32)saddr; hash[1] = (__force u32)daddr; hash[2] = 
((__force u16)sport << 16) + (__force u16)dport; @@ -123,6 +106,7 @@ u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) { u32 hash[MD5_DIGEST_WORDS]; + net_secret_init(); hash[0] = (__force u32)saddr; hash[1] = (__force u32)daddr; hash[2] = (__force u32)dport ^ net_secret[14]; @@ -142,6 +126,7 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, u32 hash[MD5_DIGEST_WORDS]; u64 seq; + net_secret_init(); hash[0] = (__force u32)saddr; hash[1] = (__force u32)daddr; hash[2] = ((__force u16)sport << 16) + (__force u16)dport; @@ -166,6 +151,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, u64 seq; u32 i; + net_secret_init(); memcpy(hash, saddr, 16); for (i = 0; i < 4; i++) secret[i] = net_secret[i] + daddr[i]; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3ab989b0de4..c1a33033cbe 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -47,6 +47,8 @@ #include <linux/in.h> #include <linux/inet.h> #include <linux/slab.h> +#include <linux/tcp.h> +#include <linux/udp.h> #include <linux/netdevice.h> #ifdef CONFIG_NET_CLS_ACT #include <net/pkt_sched.h> @@ -65,6 +67,7 @@ #include <net/dst.h> #include <net/sock.h> #include <net/checksum.h> +#include <net/ip6_checksum.h> #include <net/xfrm.h> #include <asm/uaccess.h> @@ -74,77 +77,37 @@ struct kmem_cache *skbuff_head_cache __read_mostly; static struct kmem_cache *skbuff_fclone_cache __read_mostly; -static void sock_pipe_buf_release(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - put_page(buf->page); -} - -static void sock_pipe_buf_get(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - get_page(buf->page); -} - -static int sock_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - return 1; -} - - -/* Pipe buffer operations for a socket. */ -static const struct pipe_buf_operations sock_pipe_buf_ops = { - .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, - .confirm = generic_pipe_buf_confirm, - .release = sock_pipe_buf_release, - .steal = sock_pipe_buf_steal, - .get = sock_pipe_buf_get, -}; - -/* - * Keep out-of-line to prevent kernel bloat. - * __builtin_return_address is not used because it is not always - * reliable. - */ - /** - * skb_over_panic - private function - * @skb: buffer - * @sz: size - * @here: address - * - * Out of line support code for skb_put(). Not user callable. + * skb_panic - private function for out-of-line support + * @skb: buffer + * @sz: size + * @addr: address + * @msg: skb_over_panic or skb_under_panic + * + * Out-of-line support for skb_put() and skb_push(). + * Called via the wrapper skb_over_panic() or skb_under_panic(). + * Keep out of line to prevent kernel bloat. + * __builtin_return_address is not used because it is not always reliable. */ -static void skb_over_panic(struct sk_buff *skb, int sz, void *here) +static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, + const char msg[]) { pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n", - __func__, here, skb->len, sz, skb->head, skb->data, + msg, addr, skb->len, sz, skb->head, skb->data, (unsigned long)skb->tail, (unsigned long)skb->end, skb->dev ? skb->dev->name : "<NULL>"); BUG(); } -/** - * skb_under_panic - private function - * @skb: buffer - * @sz: size - * @here: address - * - * Out of line support code for skb_push(). Not user callable. 
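The secure_seq.c conversion above replaces a late_initcall with net_get_random_once(), which seeds the buffer on first use behind a static key, so every call after the first costs only a patched-out branch; it also means the secret is drawn after the entropy pool has had time to fill. The same idiom, applied to a hypothetical secret:

static u32 example_secret[4];

static u32 example_hash(__be32 addr)
{
        /* seeds example_secret exactly once, on first use */
        net_get_random_once(example_secret, sizeof(example_secret));

        return jhash_1word((__force u32)addr, example_secret[0]);
}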
- */ - -static void skb_under_panic(struct sk_buff *skb, int sz, void *here) +static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr) { - pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n", - __func__, here, skb->len, sz, skb->head, skb->data, - (unsigned long)skb->tail, (unsigned long)skb->end, - skb->dev ? skb->dev->name : "<NULL>"); - BUG(); + skb_panic(skb, sz, addr, __func__); } +static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) +{ + skb_panic(skb, sz, addr, __func__); +} /* * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells @@ -155,8 +118,9 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here) */ #define kmalloc_reserve(size, gfp, node, pfmemalloc) \ __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc) -void *__kmalloc_reserve(size_t size, gfp_t flags, int node, unsigned long ip, - bool *pfmemalloc) + +static void *__kmalloc_reserve(size_t size, gfp_t flags, int node, + unsigned long ip, bool *pfmemalloc) { void *obj; bool ret_pfmemalloc = false; @@ -188,6 +152,31 @@ out: * */ +struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node) +{ + struct sk_buff *skb; + + /* Get the HEAD */ + skb = kmem_cache_alloc_node(skbuff_head_cache, + gfp_mask & ~__GFP_DMA, node); + if (!skb) + goto out; + + /* + * Only clear those fields we need to clear, not those that we will + * actually initialise below. Hence, don't put any more fields after + * the tail pointer in struct sk_buff! + */ + memset(skb, 0, offsetof(struct sk_buff, tail)); + skb->head = NULL; + skb->truesize = sizeof(struct sk_buff); + atomic_set(&skb->users, 1); + + skb->mac_header = (typeof(skb->mac_header))~0U; +out: + return skb; +} + /** * __alloc_skb - allocate a network buffer * @size: size to allocate @@ -257,9 +246,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->data = data; skb_reset_tail_pointer(skb); skb->end = skb->tail + size; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->mac_header = ~0U; -#endif + skb->mac_header = (typeof(skb->mac_header))~0U; + skb->transport_header = (typeof(skb->transport_header))~0U; /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); @@ -294,7 +282,8 @@ EXPORT_SYMBOL(__alloc_skb); * @frag_size: size of fragment, or 0 if head was kmalloced * * Allocate a new &sk_buff. Caller provides space holding head and - * skb_shared_info. @data must have been allocated by kmalloc() + * skb_shared_info. @data must have been allocated by kmalloc() only if + * @frag_size is 0, otherwise data should come from the page allocator. * The return is the new skb buffer. * On a failure the return is %NULL, and @data is not freed. 
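The build_skb() comment now states the contract explicitly: @frag_size == 0 means @data was kmalloc()ed, anything else means page-allocator memory of at least that size. A typical driver receive path under that contract; names are illustrative:

static struct sk_buff *example_rx_to_skb(void *frag, unsigned int frag_size)
{
        struct sk_buff *skb = build_skb(frag, frag_size);

        if (unlikely(!skb))
                return NULL;            /* caller still owns the fragment */

        skb_reserve(skb, NET_SKB_PAD);  /* headroom for protocol headers */
        return skb;
}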
* Notes : @@ -325,9 +314,8 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) skb->data = data; skb_reset_tail_pointer(skb); skb->end = skb->tail + size; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->mac_header = ~0U; -#endif + skb->mac_header = (typeof(skb->mac_header))~0U; + skb->transport_header = (typeof(skb->transport_header))~0U; /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); @@ -348,10 +336,6 @@ struct netdev_alloc_cache { }; static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); -#define NETDEV_FRAG_PAGE_MAX_ORDER get_order(32768) -#define NETDEV_FRAG_PAGE_MAX_SIZE (PAGE_SIZE << NETDEV_FRAG_PAGE_MAX_ORDER) -#define NETDEV_PAGECNT_MAX_BIAS NETDEV_FRAG_PAGE_MAX_SIZE - static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { struct netdev_alloc_cache *nc; @@ -465,17 +449,22 @@ void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, } EXPORT_SYMBOL(skb_add_rx_frag); -static void skb_drop_list(struct sk_buff **listp) +void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, + unsigned int truesize) { - struct sk_buff *list = *listp; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - *listp = NULL; + skb_frag_size_add(frag, size); + skb->len += size; + skb->data_len += size; + skb->truesize += truesize; +} +EXPORT_SYMBOL(skb_coalesce_rx_frag); - do { - struct sk_buff *this = list; - list = list->next; - kfree_skb(this); - } while (list); +static void skb_drop_list(struct sk_buff **listp) +{ + kfree_skb_list(*listp); + *listp = NULL; } static inline void skb_drop_fraglist(struct sk_buff *skb) @@ -576,9 +565,6 @@ static void skb_release_head_state(struct sk_buff *skb) #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_conntrack_put(skb->nfct); #endif -#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED - nf_conntrack_put_reasm(skb->nfct_reasm); -#endif #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(skb->nf_bridge); #endif @@ -595,7 +581,8 @@ static void skb_release_head_state(struct sk_buff *skb) static void skb_release_all(struct sk_buff *skb) { skb_release_head_state(skb); - skb_release_data(skb); + if (likely(skb->head)) + skb_release_data(skb); } /** @@ -634,6 +621,17 @@ void kfree_skb(struct sk_buff *skb) } EXPORT_SYMBOL(kfree_skb); +void kfree_skb_list(struct sk_buff *segs) +{ + while (segs) { + struct sk_buff *next = segs->next; + + kfree_skb(segs); + segs = next; + } +} +EXPORT_SYMBOL(kfree_skb_list); + /** * skb_tx_error - report an sk_buff xmit error * @skb: buffer that triggered an error @@ -682,20 +680,24 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->transport_header = old->transport_header; new->network_header = old->network_header; new->mac_header = old->mac_header; + new->inner_protocol = old->inner_protocol; new->inner_transport_header = old->inner_transport_header; - new->inner_network_header = old->inner_transport_header; + new->inner_network_header = old->inner_network_header; + new->inner_mac_header = old->inner_mac_header; skb_dst_copy(new, old); - new->rxhash = old->rxhash; + skb_copy_hash(new, old); new->ooo_okay = old->ooo_okay; - new->l4_rxhash = old->l4_rxhash; new->no_fcs = old->no_fcs; new->encapsulation = old->encapsulation; + new->encap_hdr_csum = old->encap_hdr_csum; + new->csum_valid = old->csum_valid; + new->csum_complete_sw = old->csum_complete_sw; #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); #endif memcpy(new->cb, old->cb, sizeof(old->cb)); new->csum = old->csum; - new->local_df = old->local_df; + new->ignore_df = old->ignore_df; new->pkt_type 
= old->pkt_type; new->ip_summed = old->ip_summed; skb_copy_queue_mapping(new, old); @@ -708,18 +710,20 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->mark = old->mark; new->skb_iif = old->skb_iif; __nf_copy(new, old); -#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) - new->nf_trace = old->nf_trace; -#endif #ifdef CONFIG_NET_SCHED new->tc_index = old->tc_index; #ifdef CONFIG_NET_CLS_ACT new->tc_verd = old->tc_verd; #endif #endif + new->vlan_proto = old->vlan_proto; new->vlan_tci = old->vlan_tci; skb_copy_secmark(new, old); + +#ifdef CONFIG_NET_RX_BUSY_POLL + new->napi_id = old->napi_id; +#endif } /* @@ -802,7 +806,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) page = alloc_page(gfp_mask); if (!page) { while (head) { - struct page *next = (struct page *)head->private; + struct page *next = (struct page *)page_private(head); put_page(head); head = next; } @@ -812,7 +816,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) memcpy(page_address(page), vaddr + f->page_offset, skb_frag_size(f)); kunmap_atomic(vaddr); - page->private = (unsigned long)head; + set_page_private(page, (unsigned long)head); head = page; } @@ -826,7 +830,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) for (i = num_frags - 1; i >= 0; i--) { __skb_fill_page_desc(skb, i, head, 0, skb_shinfo(skb)->frags[i].size); - head = (struct page *)head->private; + head = (struct page *)page_private(head); } skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; @@ -878,26 +882,25 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) } EXPORT_SYMBOL(skb_clone); -static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +static void skb_headers_offset_update(struct sk_buff *skb, int off) { -#ifndef NET_SKBUFF_DATA_USES_OFFSET - /* - * Shift between the two data areas in bytes - */ - unsigned long offset = new->data - old->data; -#endif + /* Only adjust this if it actually is csum_start rather than csum */ + if (skb->ip_summed == CHECKSUM_PARTIAL) + skb->csum_start += off; + /* {transport,network,mac}_header and tail are relative to skb->head */ + skb->transport_header += off; + skb->network_header += off; + if (skb_mac_header_was_set(skb)) + skb->mac_header += off; + skb->inner_transport_header += off; + skb->inner_network_header += off; + skb->inner_mac_header += off; +} +static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +{ __copy_skb_header(new, old); -#ifndef NET_SKBUFF_DATA_USES_OFFSET - /* {transport,network,mac}_header are relative to skb->head */ - new->transport_header += offset; - new->network_header += offset; - if (skb_mac_header_was_set(new)) - new->mac_header += offset; - new->inner_transport_header += offset; - new->inner_network_header += offset; -#endif skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; @@ -951,10 +954,13 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) EXPORT_SYMBOL(skb_copy); /** - * __pskb_copy - create copy of an sk_buff with private head. + * __pskb_copy_fclone - create copy of an sk_buff with private head. 
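skb_headers_offset_update() centralizes fixups that pskb_expand_head() and skb_copy_expand() used to open-code, and now also covers the inner_* tunnel offsets. Since every header field is an offset from skb->head, a consumer that grows headroom gets all of them adjusted for free; a sketch under that assumption:

static int example_grow_headroom(struct sk_buff *skb, unsigned int needed)
{
        if (skb_headroom(skb) >= needed)
                return 0;

        /* csum_start, mac/network/transport and the inner_* offsets
         * are all corrected by the helper on success
         */
        return pskb_expand_head(skb, needed - skb_headroom(skb), 0,
                                GFP_ATOMIC);
}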
* @skb: buffer to copy * @headroom: headroom of new skb * @gfp_mask: allocation priority + * @fclone: if true allocate the copy of the skb from the fclone + * cache instead of the head cache; it is recommended to set this + * to true for the cases where the copy will likely be cloned * * Make a copy of both an &sk_buff and part of its data, located * in header. Fragmented data remain shared. This is used when @@ -964,11 +970,12 @@ EXPORT_SYMBOL(skb_copy); * The returned buffer has a reference count of 1. */ -struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) +struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, + gfp_t gfp_mask, bool fclone) { unsigned int size = skb_headlen(skb) + headroom; - struct sk_buff *n = __alloc_skb(size, gfp_mask, - skb_alloc_rx_flag(skb), NUMA_NO_NODE); + int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0); + struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE); if (!n) goto out; @@ -1008,7 +1015,7 @@ struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) out: return n; } -EXPORT_SYMBOL(__pskb_copy); +EXPORT_SYMBOL(__pskb_copy_fclone); /** * pskb_expand_head - reallocate header of &sk_buff @@ -1017,8 +1024,8 @@ EXPORT_SYMBOL(__pskb_copy); * @ntail: room to add at tail * @gfp_mask: allocation priority * - * Expands (or creates identical copy, if &nhead and &ntail are zero) - * header of skb. &sk_buff itself is not changed. &sk_buff MUST have + * Expands (or creates identical copy, if @nhead and @ntail are zero) + * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have * reference count of 1. Returns zero in the case of success or error, * if expansion failed. In the last case, &sk_buff is not changed. * @@ -1088,17 +1095,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, #else skb->end = skb->head + size; #endif - /* {transport,network,mac}_header and tail are relative to skb->head */ skb->tail += off; - skb->transport_header += off; - skb->network_header += off; - if (skb_mac_header_was_set(skb)) - skb->mac_header += off; - skb->inner_transport_header += off; - skb->inner_network_header += off; - /* Only adjust this if it actually is csum_start rather than csum */ - if (skb->ip_summed == CHECKSUM_PARTIAL) - skb->csum_start += nhead; + skb_headers_offset_update(skb, nhead); skb->cloned = 0; skb->hdr_len = 0; skb->nohdr = 0; @@ -1163,7 +1161,6 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, NUMA_NO_NODE); int oldheadroom = skb_headroom(skb); int head_copy_len, head_copy_off; - int off; if (!n) return NULL; @@ -1187,17 +1184,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, copy_skb_header(n, skb); - off = newheadroom - oldheadroom; - if (n->ip_summed == CHECKSUM_PARTIAL) - n->csum_start += off; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - n->transport_header += off; - n->network_header += off; - if (skb_mac_header_was_set(skb)) - n->mac_header += off; - n->inner_transport_header += off; - n->inner_network_header += off; -#endif + skb_headers_offset_update(n, newheadroom - oldheadroom); return n; } @@ -1250,6 +1237,29 @@ free_skb: EXPORT_SYMBOL(skb_pad); /** + * pskb_put - add data to the tail of a potentially fragmented buffer + * @skb: start of the buffer to use + * @tail: tail fragment of the buffer to use + * @len: amount of data to add + * + * This function extends the used data area of the potentially + * fragmented buffer. @tail must be the last fragment of @skb -- or + * @skb itself. 
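__pskb_copy_fclone() exposes the allocation hint described in the kernel-doc: a caller that knows the copy is about to be cloned (a retransmit queue, say) can ask for the fast-clone cache up front. Illustrative call site:

static struct sk_buff *example_copy_for_retransmit(struct sk_buff *skb)
{
        /* true selects the fclone cache; the copy is expected to be
         * cloned again shortly
         */
        return __pskb_copy_fclone(skb, skb_headroom(skb), GFP_ATOMIC, true);
}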
If this would exceed the total buffer size the kernel + * will panic. A pointer to the first byte of the extra data is + * returned. + */ + +unsigned char *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) +{ + if (tail != skb) { + skb->data_len += len; + skb->len += len; + } + return skb_put(tail, len); +} +EXPORT_SYMBOL_GPL(pskb_put); + +/** * skb_put - add data to a buffer * @skb: buffer to use * @len: amount of data to add @@ -1649,7 +1659,7 @@ static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) static struct page *linear_to_page(struct page *page, unsigned int *len, unsigned int *offset, - struct sk_buff *skb, struct sock *sk) + struct sock *sk) { struct page_frag *pfrag = sk_page_frag(sk); @@ -1682,14 +1692,14 @@ static bool spd_can_coalesce(const struct splice_pipe_desc *spd, static bool spd_fill_page(struct splice_pipe_desc *spd, struct pipe_inode_info *pipe, struct page *page, unsigned int *len, unsigned int offset, - struct sk_buff *skb, bool linear, + bool linear, struct sock *sk) { if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) return true; if (linear) { - page = linear_to_page(page, len, &offset, skb, sk); + page = linear_to_page(page, len, &offset, sk); if (!page) return true; } @@ -1706,23 +1716,9 @@ static bool spd_fill_page(struct splice_pipe_desc *spd, return false; } -static inline void __segment_seek(struct page **page, unsigned int *poff, - unsigned int *plen, unsigned int off) -{ - unsigned long n; - - *poff += off; - n = *poff / PAGE_SIZE; - if (n) - *page = nth_page(*page, n); - - *poff = *poff % PAGE_SIZE; - *plen -= off; -} - static bool __splice_segment(struct page *page, unsigned int poff, unsigned int plen, unsigned int *off, - unsigned int *len, struct sk_buff *skb, + unsigned int *len, struct splice_pipe_desc *spd, bool linear, struct sock *sk, struct pipe_inode_info *pipe) @@ -1737,23 +1733,19 @@ static bool __splice_segment(struct page *page, unsigned int poff, } /* ignore any bits we already processed */ - if (*off) { - __segment_seek(&page, &poff, &plen, *off); - *off = 0; - } + poff += *off; + plen -= *off; + *off = 0; do { unsigned int flen = min(*len, plen); - /* the linear region may spread across several pages */ - flen = min_t(unsigned int, flen, PAGE_SIZE - poff); - - if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk)) + if (spd_fill_page(spd, pipe, page, &flen, poff, + linear, sk)) return true; - - __segment_seek(&page, &poff, &plen, flen); + poff += flen; + plen -= flen; *len -= flen; - } while (*len && plen); return false; @@ -1777,7 +1769,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, if (__splice_segment(virt_to_page(skb->data), (unsigned long) skb->data & (PAGE_SIZE - 1), skb_headlen(skb), - offset, len, skb, spd, + offset, len, spd, skb_head_is_locked(skb), sk, pipe)) return true; @@ -1790,7 +1782,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, if (__splice_segment(skb_frag_page(f), f->page_offset, skb_frag_size(f), - offset, len, skb, spd, false, sk, pipe)) + offset, len, spd, false, sk, pipe)) return true; } @@ -1814,7 +1806,7 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, .partial = partial, .nr_pages_max = MAX_SKB_FRAGS, .flags = flags, - .ops = &sock_pipe_buf_ops, + .ops = &nosteal_pipe_buf_ops, .spd_release = sock_spd_release, }; struct sk_buff *frag_iter; @@ -1944,9 +1936,8 @@ fault: EXPORT_SYMBOL(skb_store_bits); /* Checksum skb data. 
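__segment_seek() could be deleted because spliced linear data is no longer walked page by page: linear_to_page() copies it into the socket's page frag, leaving only simple offset arithmetic. The underlying idiom, as a sketch (error handling trimmed; assumes len fits the refilled frag):

static struct page *example_copy_linear(struct sock *sk,
                                        const void *data, unsigned int len)
{
        struct page_frag *pfrag = sk_page_frag(sk);

        if (!sk_page_frag_refill(sk, pfrag))
                return NULL;

        memcpy(page_address(pfrag->page) + pfrag->offset, data, len);
        pfrag->offset += len;   /* caller records page and old offset */

        return pfrag->page;
}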
*/ - -__wsum skb_checksum(const struct sk_buff *skb, int offset, - int len, __wsum csum) +__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, + __wsum csum, const struct skb_checksum_ops *ops) { int start = skb_headlen(skb); int i, copy = start - offset; @@ -1957,7 +1948,7 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, if (copy > 0) { if (copy > len) copy = len; - csum = csum_partial(skb->data + offset, copy, csum); + csum = ops->update(skb->data + offset, copy, csum); if ((len -= copy) == 0) return csum; offset += copy; @@ -1978,10 +1969,10 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, if (copy > len) copy = len; vaddr = kmap_atomic(skb_frag_page(frag)); - csum2 = csum_partial(vaddr + frag->page_offset + - offset - start, copy, 0); + csum2 = ops->update(vaddr + frag->page_offset + + offset - start, copy, 0); kunmap_atomic(vaddr); - csum = csum_block_add(csum, csum2, pos); + csum = ops->combine(csum, csum2, pos, copy); if (!(len -= copy)) return csum; offset += copy; @@ -2000,9 +1991,9 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, __wsum csum2; if (copy > len) copy = len; - csum2 = skb_checksum(frag_iter, offset - start, - copy, 0); - csum = csum_block_add(csum, csum2, pos); + csum2 = __skb_checksum(frag_iter, offset - start, + copy, 0, ops); + csum = ops->combine(csum, csum2, pos, copy); if ((len -= copy) == 0) return csum; offset += copy; @@ -2014,6 +2005,18 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, return csum; } +EXPORT_SYMBOL(__skb_checksum); + +__wsum skb_checksum(const struct sk_buff *skb, int offset, + int len, __wsum csum) +{ + const struct skb_checksum_ops ops = { + .update = csum_partial_ext, + .combine = csum_block_add_ext, + }; + + return __skb_checksum(skb, offset, len, csum, &ops); +} EXPORT_SYMBOL(skb_checksum); /* Both of above in one bottle. */ @@ -2095,6 +2098,104 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, } EXPORT_SYMBOL(skb_copy_and_csum_bits); + /** + * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() + * @from: source buffer + * + * Calculates the amount of linear headroom needed in the 'to' skb passed + * into skb_zerocopy(). + */ +unsigned int +skb_zerocopy_headlen(const struct sk_buff *from) +{ + unsigned int hlen = 0; + + if (!from->head_frag || + skb_headlen(from) < L1_CACHE_BYTES || + skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) + hlen = skb_headlen(from); + + if (skb_has_frag_list(from)) + hlen = from->len; + + return hlen; +} +EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); + +/** + * skb_zerocopy - Zero copy skb to skb + * @to: destination buffer + * @from: source buffer + * @len: number of bytes to copy from source buffer + * @hlen: size of linear headroom in destination buffer + * + * Copies up to `len` bytes from `from` to `to` by creating references + * to the frags in the source buffer. + * + * The `hlen` as calculated by skb_zerocopy_headlen() specifies the + * headroom in the `to` buffer. 
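Parameterizing __skb_checksum() with skb_checksum_ops keeps a single walker for linear data, frags and the frag list while letting callers swap the arithmetic (a CRC rather than the Internet checksum, for instance). skb_checksum() itself is now just the ops pairing shown in this hunk; equivalently:

static __wsum example_checksum(const struct sk_buff *skb)
{
        const struct skb_checksum_ops ops = {
                .update  = csum_partial_ext,
                .combine = csum_block_add_ext,
        };

        return __skb_checksum(skb, 0, skb->len, 0, &ops);
}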
+ * + * Return value: + * 0: everything is OK + * -ENOMEM: couldn't orphan frags of @from due to lack of memory + * -EFAULT: skb_copy_bits() found some problem with skb geometry + */ +int +skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) +{ + int i, j = 0; + int plen = 0; /* length of skb->head fragment */ + int ret; + struct page *page; + unsigned int offset; + + BUG_ON(!from->head_frag && !hlen); + + /* dont bother with small payloads */ + if (len <= skb_tailroom(to)) + return skb_copy_bits(from, 0, skb_put(to, len), len); + + if (hlen) { + ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); + if (unlikely(ret)) + return ret; + len -= hlen; + } else { + plen = min_t(int, skb_headlen(from), len); + if (plen) { + page = virt_to_head_page(from->head); + offset = from->data - (unsigned char *)page_address(page); + __skb_fill_page_desc(to, 0, page, offset, plen); + get_page(page); + j = 1; + len -= plen; + } + } + + to->truesize += len + plen; + to->len += len + plen; + to->data_len += len + plen; + + if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { + skb_tx_error(from); + return -ENOMEM; + } + + for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { + if (!len) + break; + skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; + skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len); + len -= skb_shinfo(to)->frags[j].size; + skb_frag_ref(to, j); + j++; + } + skb_shinfo(to)->nr_frags = j; + + return 0; +} +EXPORT_SYMBOL_GPL(skb_zerocopy); + void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) { __wsum csum; @@ -2355,6 +2456,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) { int pos = skb_headlen(skb); + skb_shinfo(skb1)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); else /* Second chunk has no header, nothing to copy. */ @@ -2532,14 +2634,14 @@ EXPORT_SYMBOL(skb_prepare_seq_read); * @data: destination pointer for data to be returned * @st: state variable * - * Reads a block of skb data at &consumed relative to the + * Reads a block of skb data at @consumed relative to the * lower offset specified to skb_prepare_seq_read(). Assigns - * the head of the data block to &data and returns the length + * the head of the data block to @data and returns the length * of the block or 0 if the end of the skb data or the upper * offset has been reached. * * The caller is not required to consume all of the data - * returned, i.e. &consumed is typically set to the number + * returned, i.e. @consumed is typically set to the number * of bytes already consumed and the next call to * skb_seq_read() will return the remaining part of the block. 
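For reference, the canonical skb_seq_read() loop; the early-exit fix in the next hunk drops the atomic kmap when the read window is exhausted inside a fragment, which previously required the caller to remember skb_abort_seq_read():

static void example_walk(struct sk_buff *skb)
{
        struct skb_seq_state st;
        const u8 *data;
        unsigned int consumed = 0, len;

        skb_prepare_seq_read(skb, 0, skb->len, &st);
        while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
                /* process len bytes at data */
                consumed += len;
        }
        skb_abort_seq_read(&st);        /* needed only when stopping early */
}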
* @@ -2557,8 +2659,13 @@ unsigned int skb_seq_read(unsigned int consumed, const u8 **data, unsigned int block_limit, abs_offset = consumed + st->lower_offset; skb_frag_t *frag; - if (unlikely(abs_offset >= st->upper_offset)) + if (unlikely(abs_offset >= st->upper_offset)) { + if (st->frag_data) { + kunmap_atomic(st->frag_data); + st->frag_data = NULL; + } return 0; + } next_skb: block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; @@ -2686,48 +2793,37 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, int len, int odd, struct sk_buff *skb), void *from, int length) { - int frg_cnt = 0; - skb_frag_t *frag = NULL; - struct page *page = NULL; - int copy, left; + int frg_cnt = skb_shinfo(skb)->nr_frags; + int copy; int offset = 0; int ret; + struct page_frag *pfrag = &current->task_frag; do { /* Return error if we don't have space for new frag */ - frg_cnt = skb_shinfo(skb)->nr_frags; if (frg_cnt >= MAX_SKB_FRAGS) - return -EFAULT; - - /* allocate a new page for next frag */ - page = alloc_pages(sk->sk_allocation, 0); + return -EMSGSIZE; - /* If alloc_page fails just return failure and caller will - * free previous allocated pages by doing kfree_skb() - */ - if (page == NULL) + if (!sk_page_frag_refill(sk, pfrag)) return -ENOMEM; - /* initialize the next frag */ - skb_fill_page_desc(skb, frg_cnt, page, 0, 0); - skb->truesize += PAGE_SIZE; - atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); - - /* get the new initialized frag */ - frg_cnt = skb_shinfo(skb)->nr_frags; - frag = &skb_shinfo(skb)->frags[frg_cnt - 1]; - /* copy the user data to page */ - left = PAGE_SIZE - frag->page_offset; - copy = (length > left)? left : length; + copy = min_t(int, length, pfrag->size - pfrag->offset); - ret = getfrag(from, skb_frag_address(frag) + skb_frag_size(frag), - offset, copy, 0, skb); + ret = getfrag(from, page_address(pfrag->page) + pfrag->offset, + offset, copy, 0, skb); if (ret < 0) return -EFAULT; /* copy was successful so update the size parameters */ - skb_frag_size_add(frag, copy); + skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset, + copy); + frg_cnt++; + pfrag->offset += copy; + get_page(pfrag->page); + + skb->truesize += copy; + atomic_add(copy, &sk->sk_wmem_alloc); skb->len += copy; skb->data_len += copy; offset += copy; @@ -2762,59 +2858,96 @@ EXPORT_SYMBOL_GPL(skb_pull_rcsum); /** * skb_segment - Perform protocol segmentation on skb. - * @skb: buffer to segment + * @head_skb: buffer to segment * @features: features for the output path (see dev->features) * * This function performs segmentation on the given skb. It returns * a pointer to the first in a list of new skbs for the segments. * In case of error it returns ERR_PTR(err). 
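skb_segment() still returns a singly linked list of segments, or ERR_PTR() on failure, and with kfree_skb_list() the error path becomes a one-liner. A minimal consumer, in the style of a software GSO fallback (sketch; the actual transmit call is elided):

static int example_xmit_gso(struct sk_buff *skb, netdev_features_t features)
{
        struct sk_buff *segs = skb_segment(skb, features);

        if (IS_ERR(segs))
                return PTR_ERR(segs);

        consume_skb(skb);       /* the original skb is no longer needed */

        while (segs) {
                struct sk_buff *next = segs->next;

                segs->next = NULL;
                /* hand the segment to the driver's transmit path here */
                segs = next;
        }
        return 0;
}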
*/ -struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) +struct sk_buff *skb_segment(struct sk_buff *head_skb, + netdev_features_t features) { struct sk_buff *segs = NULL; struct sk_buff *tail = NULL; - struct sk_buff *fskb = skb_shinfo(skb)->frag_list; - unsigned int mss = skb_shinfo(skb)->gso_size; - unsigned int doffset = skb->data - skb_mac_header(skb); + struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; + skb_frag_t *frag = skb_shinfo(head_skb)->frags; + unsigned int mss = skb_shinfo(head_skb)->gso_size; + unsigned int doffset = head_skb->data - skb_mac_header(head_skb); + struct sk_buff *frag_skb = head_skb; unsigned int offset = doffset; + unsigned int tnl_hlen = skb_tnl_header_len(head_skb); unsigned int headroom; unsigned int len; + __be16 proto; + bool csum; int sg = !!(features & NETIF_F_SG); - int nfrags = skb_shinfo(skb)->nr_frags; + int nfrags = skb_shinfo(head_skb)->nr_frags; int err = -ENOMEM; int i = 0; int pos; + int dummy; + + __skb_push(head_skb, doffset); + proto = skb_network_protocol(head_skb, &dummy); + if (unlikely(!proto)) + return ERR_PTR(-EINVAL); - __skb_push(skb, doffset); - headroom = skb_headroom(skb); - pos = skb_headlen(skb); + csum = !head_skb->encap_hdr_csum && + !!can_checksum_protocol(features, proto); + + headroom = skb_headroom(head_skb); + pos = skb_headlen(head_skb); do { struct sk_buff *nskb; - skb_frag_t *frag; + skb_frag_t *nskb_frag; int hsize; int size; - len = skb->len - offset; + len = head_skb->len - offset; if (len > mss) len = mss; - hsize = skb_headlen(skb) - offset; + hsize = skb_headlen(head_skb) - offset; if (hsize < 0) hsize = 0; if (hsize > len || !sg) hsize = len; - if (!hsize && i >= nfrags) { - BUG_ON(fskb->len != len); + if (!hsize && i >= nfrags && skb_headlen(list_skb) && + (skb_headlen(list_skb) == len || sg)) { + BUG_ON(skb_headlen(list_skb) > len); + + i = 0; + nfrags = skb_shinfo(list_skb)->nr_frags; + frag = skb_shinfo(list_skb)->frags; + frag_skb = list_skb; + pos += skb_headlen(list_skb); + + while (pos < offset + len) { + BUG_ON(i >= nfrags); + + size = skb_frag_size(frag); + if (pos + size > offset + len) + break; + + i++; + pos += size; + frag++; + } - pos += len; - nskb = skb_clone(fskb, GFP_ATOMIC); - fskb = fskb->next; + nskb = skb_clone(list_skb, GFP_ATOMIC); + list_skb = list_skb->next; if (unlikely(!nskb)) goto err; + if (unlikely(pskb_trim(nskb, len))) { + kfree_skb(nskb); + goto err; + } + hsize = skb_end_offset(nskb); if (skb_cow_head(nskb, doffset + headroom)) { kfree_skb(nskb); @@ -2826,7 +2959,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) __skb_push(nskb, doffset); } else { nskb = __alloc_skb(hsize + doffset + headroom, - GFP_ATOMIC, skb_alloc_rx_flag(skb), + GFP_ATOMIC, skb_alloc_rx_flag(head_skb), NUMA_NO_NODE); if (unlikely(!nskb)) @@ -2842,122 +2975,133 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) segs = nskb; tail = nskb; - __copy_skb_header(nskb, skb); - nskb->mac_len = skb->mac_len; + __copy_skb_header(nskb, head_skb); + nskb->mac_len = head_skb->mac_len; - /* nskb and skb might have different headroom */ - if (nskb->ip_summed == CHECKSUM_PARTIAL) - nskb->csum_start += skb_headroom(nskb) - headroom; + skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); - skb_reset_mac_header(nskb); - skb_set_network_header(nskb, skb->mac_len); - nskb->transport_header = (nskb->network_header + - skb_network_header_len(skb)); - skb_copy_from_linear_data(skb, nskb->data, doffset); + 
skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, + nskb->data - tnl_hlen, + doffset + tnl_hlen); - if (fskb != skb_shinfo(skb)->frag_list) - continue; + if (nskb->len == len + doffset) + goto perform_csum_check; if (!sg) { nskb->ip_summed = CHECKSUM_NONE; - nskb->csum = skb_copy_and_csum_bits(skb, offset, + nskb->csum = skb_copy_and_csum_bits(head_skb, offset, skb_put(nskb, len), len, 0); + SKB_GSO_CB(nskb)->csum_start = + skb_headroom(nskb) + doffset; continue; } - frag = skb_shinfo(nskb)->frags; + nskb_frag = skb_shinfo(nskb)->frags; - skb_copy_from_linear_data_offset(skb, offset, + skb_copy_from_linear_data_offset(head_skb, offset, skb_put(nskb, hsize), hsize); - while (pos < offset + len && i < nfrags) { - *frag = skb_shinfo(skb)->frags[i]; - __skb_frag_ref(frag); - size = skb_frag_size(frag); + skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags & + SKBTX_SHARED_FRAG; + + while (pos < offset + len) { + if (i >= nfrags) { + BUG_ON(skb_headlen(list_skb)); + + i = 0; + nfrags = skb_shinfo(list_skb)->nr_frags; + frag = skb_shinfo(list_skb)->frags; + frag_skb = list_skb; + + BUG_ON(!nfrags); + + list_skb = list_skb->next; + } + + if (unlikely(skb_shinfo(nskb)->nr_frags >= + MAX_SKB_FRAGS)) { + net_warn_ratelimited( + "skb_segment: too many frags: %u %u\n", + pos, mss); + goto err; + } + + if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC))) + goto err; + + *nskb_frag = *frag; + __skb_frag_ref(nskb_frag); + size = skb_frag_size(nskb_frag); if (pos < offset) { - frag->page_offset += offset - pos; - skb_frag_size_sub(frag, offset - pos); + nskb_frag->page_offset += offset - pos; + skb_frag_size_sub(nskb_frag, offset - pos); } skb_shinfo(nskb)->nr_frags++; if (pos + size <= offset + len) { i++; + frag++; pos += size; } else { - skb_frag_size_sub(frag, pos + size - (offset + len)); + skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); goto skip_fraglist; } - frag++; - } - - if (pos < offset + len) { - struct sk_buff *fskb2 = fskb; - - BUG_ON(pos + fskb->len != offset + len); - - pos += fskb->len; - fskb = fskb->next; - - if (fskb2->next) { - fskb2 = skb_clone(fskb2, GFP_ATOMIC); - if (!fskb2) - goto err; - } else - skb_get(fskb2); - - SKB_FRAG_ASSERT(nskb); - skb_shinfo(nskb)->frag_list = fskb2; + nskb_frag++; } skip_fraglist: nskb->data_len = len - hsize; nskb->len += nskb->data_len; nskb->truesize += nskb->data_len; - } while ((offset += len) < skb->len); + +perform_csum_check: + if (!csum) { + nskb->csum = skb_checksum(nskb, doffset, + nskb->len - doffset, 0); + nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum_start = + skb_headroom(nskb) + doffset; + } + } while ((offset += len) < head_skb->len); return segs; err: - while ((skb = segs)) { - segs = skb->next; - kfree_skb(skb); - } + kfree_skb_list(segs); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(skb_segment); int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) { - struct sk_buff *p = *head; - struct sk_buff *nskb; - struct skb_shared_info *skbinfo = skb_shinfo(skb); - struct skb_shared_info *pinfo = skb_shinfo(p); - unsigned int headroom; - unsigned int len = skb_gro_len(skb); + struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); unsigned int offset = skb_gro_offset(skb); unsigned int headlen = skb_headlen(skb); + struct sk_buff *nskb, *lp, *p = *head; + unsigned int len = skb_gro_len(skb); unsigned int delta_truesize; + unsigned int headroom; - if (p->len + len >= 65536) + if (unlikely(p->len + len >= 65536)) return -E2BIG; - if (pinfo->frag_list) - goto merge; - else if (headlen <= offset) { 
+ lp = NAPI_GRO_CB(p)->last; + pinfo = skb_shinfo(lp); + + if (headlen <= offset) { skb_frag_t *frag; skb_frag_t *frag2; int i = skbinfo->nr_frags; int nr_frags = pinfo->nr_frags + i; - offset -= headlen; - if (nr_frags > MAX_SKB_FRAGS) - return -E2BIG; + goto merge; + offset -= headlen; pinfo->nr_frags = nr_frags; skbinfo->nr_frags = 0; @@ -2988,7 +3132,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) unsigned int first_offset; if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) - return -E2BIG; + goto merge; first_offset = skb->data - (unsigned char *)page_address(page) + @@ -3006,7 +3150,10 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; goto done; - } else if (skb_gro_len(p) != pinfo->gso_size) + } + if (pinfo->frag_list) + goto merge; + if (skb_gro_len(p) != pinfo->gso_size) return -E2BIG; headroom = skb_headroom(p); @@ -3058,16 +3205,24 @@ merge: __skb_pull(skb, offset); - NAPI_GRO_CB(p)->last->next = skb; + if (NAPI_GRO_CB(p)->last == p) + skb_shinfo(p)->frag_list = skb; + else + NAPI_GRO_CB(p)->last->next = skb; NAPI_GRO_CB(p)->last = skb; skb_header_release(skb); + lp = p; done: NAPI_GRO_CB(p)->count++; p->data_len += len; p->truesize += delta_truesize; p->len += len; - + if (lp != p) { + lp->data_len += len; + lp->truesize += delta_truesize; + lp->len += len; + } NAPI_GRO_CB(skb)->same_flow = 1; return 0; } @@ -3158,6 +3313,32 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) return elt; } +/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only maps the skb to the + * given sglist without marking the sg which contains the last skb data as the + * end. So the caller can manipulate the sg list as it wishes when padding new + * data after the first call without calling sg_unmark_end to extend the sg + * list. + * + * Scenario to use skb_to_sgvec_nomark: + * 1. sg_init_table + * 2. skb_to_sgvec_nomark(payload1) + * 3. skb_to_sgvec_nomark(payload2) + * + * This is equivalent to: + * 1. sg_init_table + * 2. skb_to_sgvec(payload1) + * 3. sg_unmark_end + * 4. skb_to_sgvec(payload2) + * + * When mapping multiple payloads conditionally, skb_to_sgvec_nomark + * is preferable. 
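The scenario from the comment above, written out; without the end marker the caller terminates the table itself once the mapping is final (entry counts and lengths are illustrative):

static void example_map_two_payloads(struct sk_buff *skb,
                                     struct scatterlist *sg, int nents,
                                     int len1, int len2)
{
        int n1, n2;

        sg_init_table(sg, nents);
        n1 = skb_to_sgvec_nomark(skb, sg, 0, len1);
        n2 = skb_to_sgvec_nomark(skb, sg + n1, len1, len2);

        sg_mark_end(&sg[n1 + n2 - 1]);  /* terminate when mapping is done */
}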
+ */ +int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, + int offset, int len) +{ + return __skb_to_sgvec(skb, sg, offset, len); +} +EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); + int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) { int nsg = __skb_to_sgvec(skb, sg, offset, len); @@ -3290,8 +3471,6 @@ static void sock_rmem_free(struct sk_buff *skb) */ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) { - int len = skb->len; - if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned int)sk->sk_rcvbuf) return -ENOMEM; @@ -3306,7 +3485,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) skb_queue_tail(&sk->sk_error_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, len); + sk->sk_data_ready(sk); return 0; } EXPORT_SYMBOL(sock_queue_err_skb); @@ -3322,12 +3501,8 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, if (!sk) return; - skb = skb_clone(orig_skb, GFP_ATOMIC); - if (!skb) - return; - if (hwtstamps) { - *skb_hwtstamps(skb) = + *skb_hwtstamps(orig_skb) = *hwtstamps; } else { /* @@ -3335,9 +3510,13 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, * so keep the shared tx_flags and only * store software time stamp */ - skb->tstamp = ktime_get_real(); + orig_skb->tstamp = ktime_get_real(); } + skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!skb) + return; + serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; @@ -3394,10 +3573,243 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_headroom(skb) + start; skb->csum_offset = off; + skb_set_transport_header(skb, start); return true; } EXPORT_SYMBOL_GPL(skb_partial_csum_set); +static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, + unsigned int max) +{ + if (skb_headlen(skb) >= len) + return 0; + + /* If we need to pullup then pullup to the max, so we + * won't need to do it again. + */ + if (max > skb->len) + max = skb->len; + + if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) + return -ENOMEM; + + if (skb_headlen(skb) < len) + return -EPROTO; + + return 0; +} + +#define MAX_TCP_HDR_LEN (15 * 4) + +static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, + typeof(IPPROTO_IP) proto, + unsigned int off) +{ + switch (proto) { + int err; + + case IPPROTO_TCP: + err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), + off + MAX_TCP_HDR_LEN); + if (!err && !skb_partial_csum_set(skb, off, + offsetof(struct tcphdr, + check))) + err = -EPROTO; + return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; + + case IPPROTO_UDP: + err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), + off + sizeof(struct udphdr)); + if (!err && !skb_partial_csum_set(skb, off, + offsetof(struct udphdr, + check))) + err = -EPROTO; + return err ? ERR_PTR(err) : &udp_hdr(skb)->check; + } + + return ERR_PTR(-EPROTO); +} + +/* This value should be large enough to cover a tagged ethernet header plus + * maximally sized IP and TCP or UDP headers. 
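skb_checksum_setup() is for paths that accept CHECKSUM_PARTIAL packets from an untrusted or out-of-kernel source (virtual NIC backends are the natural consumers) and must rebuild csum_start/csum_offset from the actual headers. A receive-side fixup might look like this; the function is hypothetical:

static int example_guest_rx_fixup(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_PARTIAL)
                return 0;

        /* validate the headers and recompute the pseudo-header sum */
        return skb_checksum_setup(skb, true);
}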
+ */ +#define MAX_IP_HDR_LEN 128 + +static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) +{ + unsigned int off; + bool fragment; + __sum16 *csum; + int err; + + fragment = false; + + err = skb_maybe_pull_tail(skb, + sizeof(struct iphdr), + MAX_IP_HDR_LEN); + if (err < 0) + goto out; + + if (ip_hdr(skb)->frag_off & htons(IP_OFFSET | IP_MF)) + fragment = true; + + off = ip_hdrlen(skb); + + err = -EPROTO; + + if (fragment) + goto out; + + csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); + if (IS_ERR(csum)) + return PTR_ERR(csum); + + if (recalculate) + *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - off, + ip_hdr(skb)->protocol, 0); + err = 0; + +out: + return err; +} + +/* This value should be large enough to cover a tagged ethernet header plus + * an IPv6 header, all options, and a maximal TCP or UDP header. + */ +#define MAX_IPV6_HDR_LEN 256 + +#define OPT_HDR(type, skb, off) \ + (type *)(skb_network_header(skb) + (off)) + +static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) +{ + int err; + u8 nexthdr; + unsigned int off; + unsigned int len; + bool fragment; + bool done; + __sum16 *csum; + + fragment = false; + done = false; + + off = sizeof(struct ipv6hdr); + + err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + nexthdr = ipv6_hdr(skb)->nexthdr; + + len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); + while (off <= len && !done) { + switch (nexthdr) { + case IPPROTO_DSTOPTS: + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: { + struct ipv6_opt_hdr *hp; + + err = skb_maybe_pull_tail(skb, + off + + sizeof(struct ipv6_opt_hdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); + nexthdr = hp->nexthdr; + off += ipv6_optlen(hp); + break; + } + case IPPROTO_AH: { + struct ip_auth_hdr *hp; + + err = skb_maybe_pull_tail(skb, + off + + sizeof(struct ip_auth_hdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + hp = OPT_HDR(struct ip_auth_hdr, skb, off); + nexthdr = hp->nexthdr; + off += ipv6_authlen(hp); + break; + } + case IPPROTO_FRAGMENT: { + struct frag_hdr *hp; + + err = skb_maybe_pull_tail(skb, + off + + sizeof(struct frag_hdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + hp = OPT_HDR(struct frag_hdr, skb, off); + + if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) + fragment = true; + + nexthdr = hp->nexthdr; + off += sizeof(struct frag_hdr); + break; + } + default: + done = true; + break; + } + } + + err = -EPROTO; + + if (!done || fragment) + goto out; + + csum = skb_checksum_setup_ip(skb, nexthdr, off); + if (IS_ERR(csum)) + return PTR_ERR(csum); + + if (recalculate) + *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - off, nexthdr, 0); + err = 0; + +out: + return err; +} + +/** + * skb_checksum_setup - set up partial checksum offset + * @skb: the skb to set up + * @recalculate: if true the pseudo-header checksum will be recalculated + */ +int skb_checksum_setup(struct sk_buff *skb, bool recalculate) +{ + int err; + + switch (skb->protocol) { + case htons(ETH_P_IP): + err = skb_checksum_setup_ipv4(skb, recalculate); + break; + + case htons(ETH_P_IPV6): + err = skb_checksum_setup_ipv6(skb, recalculate); + break; + + default: + err = -EPROTO; + break; + } + + return err; +} +EXPORT_SYMBOL(skb_checksum_setup); + void __skb_warn_lro_forwarding(const struct sk_buff *skb) { net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", @@ 
-3493,3 +3905,57 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, return true; } EXPORT_SYMBOL(skb_try_coalesce); + +/** + * skb_scrub_packet - scrub an skb + * + * @skb: buffer to clean + * @xnet: packet is crossing netns + * + * skb_scrub_packet can be used after encapsulating or decapsulating a packet + * into/from a tunnel. Some information has to be cleared during these + * operations. + * skb_scrub_packet can also be used to clean an skb before injecting it into + * another namespace (@xnet == true). We have to clear all information in the + * skb that could impact namespace isolation. + */ +void skb_scrub_packet(struct sk_buff *skb, bool xnet) +{ + if (xnet) + skb_orphan(skb); + skb->tstamp.tv64 = 0; + skb->pkt_type = PACKET_HOST; + skb->skb_iif = 0; + skb->ignore_df = 0; + skb_dst_drop(skb); + skb->mark = 0; + secpath_reset(skb); + nf_reset(skb); + nf_reset_trace(skb); +} +EXPORT_SYMBOL_GPL(skb_scrub_packet); + +/** + * skb_gso_transport_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_transport_seglen is used to determine the real size of the + * individual segments, including Layer4 headers (TCP/UDP). + * + * The MAC/L2 or network (IP, IPv6) headers are not accounted for. + */ +unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) + return tcp_hdrlen(skb) + shinfo->gso_size; + + /* UFO sets gso_size to the size of the fragmentation + * payload, i.e. the size of the L4 (UDP) header is already + * accounted for. + */ + return shinfo->gso_size; +} +EXPORT_SYMBOL_GPL(skb_gso_transport_seglen); diff --git a/net/core/sock.c b/net/core/sock.c index bc131d41968..026e01f7027 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -93,6 +93,7 @@ #include <linux/capability.h> #include <linux/errno.h> +#include <linux/errqueue.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> @@ -139,9 +140,60 @@ #include <net/tcp.h> #endif +#include <net/busy_poll.h> + static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); +/** + * sk_ns_capable - General socket capability test + * @sk: Socket to use a capability on or through + * @user_ns: The user namespace of the capability to use + * @cap: The capability to use + * + * Test to see if the opener of the socket had the capability @cap in the + * user namespace @user_ns when the socket was created, and whether the + * current process has it as well. + */ +bool sk_ns_capable(const struct sock *sk, + struct user_namespace *user_ns, int cap) +{ + return file_ns_capable(sk->sk_socket->file, user_ns, cap) && + ns_capable(user_ns, cap); +} +EXPORT_SYMBOL(sk_ns_capable); + +/** + * sk_capable - Socket global capability test + * @sk: Socket to use a capability on or through + * @cap: The global capability to use + * + * Test to see if the opener of the socket had the capability @cap in all + * user namespaces when the socket was created, and whether the current + * process has it as well. + */ +bool sk_capable(const struct sock *sk, int cap) +{ + return sk_ns_capable(sk, &init_user_ns, cap); } +EXPORT_SYMBOL(sk_capable); + +/** + * sk_net_capable - Network namespace socket capability test + * @sk: Socket to use a capability on or through + * @cap: The capability to use + * + * Test to see if the opener of the socket had the capability @cap over + * the network namespace the socket is a member of when the socket was + * created, and whether the current process has it as well.
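+ *
+ * A hypothetical caller gating a namespace-affecting request would do:
+ *
+ *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
+ *		return -EPERM;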
+ */ +bool sk_net_capable(const struct sock *sk, int cap) +{ + return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); +} +EXPORT_SYMBOL(sk_net_capable); + + #ifdef CONFIG_MEMCG_KMEM int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { @@ -186,8 +238,10 @@ void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg) static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; +#if defined(CONFIG_MEMCG_KMEM) struct static_key memcg_socket_limit_enabled; EXPORT_SYMBOL(memcg_socket_limit_enabled); +#endif /* * Make lock validator output more readable. (we pre-construct these @@ -208,7 +262,7 @@ static const char *const af_family_key_strings[AF_MAX+1] = { "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" , - "sk_lock-AF_NFC" , "sk_lock-AF_MAX" + "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_MAX" }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , @@ -224,7 +278,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , - "slock-AF_NFC" , "slock-AF_MAX" + "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX" }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , @@ -240,7 +294,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" , - "clock-AF_NFC" , "clock-AF_MAX" + "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_MAX" }; /* @@ -423,7 +477,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) spin_unlock_irqrestore(&list->lock, flags); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb_len); + sk->sk_data_ready(sk); return 0; } EXPORT_SYMBOL(sock_queue_rcv_skb); @@ -470,12 +524,6 @@ discard_and_relse: } EXPORT_SYMBOL(sk_receive_skb); -void sk_reset_txq(struct sock *sk) -{ - sk_tx_queue_clear(sk); -} -EXPORT_SYMBOL(sk_reset_txq); - struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = __sk_dst_get(sk); @@ -569,9 +617,7 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval, int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES struct net *net = sock_net(sk); - struct net_device *dev; char devname[IFNAMSIZ]; - unsigned seq; if (sk->sk_bound_dev_if == 0) { len = 0; @@ -582,20 +628,9 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval, if (len < IFNAMSIZ) goto out; -retry: - seq = read_seqcount_begin(&devnet_rename_seq); - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); - ret = -ENODEV; - if (!dev) { - rcu_read_unlock(); + ret = netdev_get_name(net, devname, sk->sk_bound_dev_if); + if (ret) goto out; - } - - strcpy(devname, dev->name); - rcu_read_unlock(); - if (read_seqcount_retry(&devnet_rename_seq, seq)) - goto retry; len = strlen(devname) + 1; @@ -665,6 +700,9 @@ int sock_setsockopt(struct socket *sock, int level, int optname, case SO_REUSEADDR: sk->sk_reuse = (valbool ? 
SK_CAN_REUSE : SK_NO_REUSE); break; + case SO_REUSEPORT: + sk->sk_reuseport = valbool; + break; case SO_TYPE: case SO_PROTOCOL: case SO_DOMAIN: @@ -746,7 +784,7 @@ set_rcvbuf: break; case SO_NO_CHECK: - sk->sk_no_check = valbool; + sk->sk_no_check_tx = valbool; break; case SO_PRIORITY: @@ -861,6 +899,13 @@ set_rcvbuf: ret = sk_detach_filter(sk); break; + case SO_LOCK_FILTER: + if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) + ret = -EPERM; + else + sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); + break; + case SO_PASSSEC: if (valbool) set_bit(SOCK_PASSSEC, &sock->flags); @@ -886,7 +931,7 @@ set_rcvbuf: case SO_PEEK_OFF: if (sock->ops->set_peek_off) - sock->ops->set_peek_off(sk, val); + ret = sock->ops->set_peek_off(sk, val); else ret = -EOPNOTSUPP; break; @@ -895,6 +940,30 @@ set_rcvbuf: sock_valbool_flag(sk, SOCK_NOFCS, valbool); break; + case SO_SELECT_ERR_QUEUE: + sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); + break; + +#ifdef CONFIG_NET_RX_BUSY_POLL + case SO_BUSY_POLL: + /* allow unprivileged users to decrease the value */ + if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) + ret = -EPERM; + else { + if (val < 0) + ret = -EINVAL; + else + sk->sk_ll_usec = val; + } + break; +#endif + + case SO_MAX_PACING_RATE: + sk->sk_max_pacing_rate = val; + sk->sk_pacing_rate = min(sk->sk_pacing_rate, + sk->sk_max_pacing_rate); + break; + default: ret = -ENOPROTOOPT; break; @@ -905,8 +974,8 @@ set_rcvbuf: EXPORT_SYMBOL(sock_setsockopt); -void cred_to_ucred(struct pid *pid, const struct cred *cred, - struct ucred *ucred) +static void cred_to_ucred(struct pid *pid, const struct cred *cred, + struct ucred *ucred) { ucred->pid = pid_vnr(pid); ucred->uid = ucred->gid = -1; @@ -917,7 +986,6 @@ void cred_to_ucred(struct pid *pid, const struct cred *cred, ucred->gid = from_kgid_munged(current_ns, cred->egid); } } -EXPORT_SYMBOL_GPL(cred_to_ucred); int sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) @@ -965,6 +1033,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_reuse; break; + case SO_REUSEPORT: + v.val = sk->sk_reuseport; + break; + case SO_KEEPALIVE: v.val = sock_flag(sk, SOCK_KEEPOPEN); break; @@ -992,7 +1064,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_NO_CHECK: - v.val = sk->sk_no_check; + v.val = sk->sk_no_check_tx; break; case SO_PRIORITY: @@ -1140,6 +1212,28 @@ int sock_getsockopt(struct socket *sock, int level, int optname, goto lenout; + case SO_LOCK_FILTER: + v.val = sock_flag(sk, SOCK_FILTER_LOCKED); + break; + + case SO_BPF_EXTENSIONS: + v.val = bpf_tell_extensions(); + break; + + case SO_SELECT_ERR_QUEUE: + v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); + break; + +#ifdef CONFIG_NET_RX_BUSY_POLL + case SO_BUSY_POLL: + v.val = sk->sk_ll_usec; + break; +#endif + + case SO_MAX_PACING_RATE: + v.val = sk->sk_max_pacing_rate; + break; + default: return -ENOPROTOOPT; } @@ -1189,18 +1283,6 @@ static void sock_copy(struct sock *nsk, const struct sock *osk) #endif } -/* - * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes - * un-modified. Special care is taken when initializing object to zero. 
- */ -static inline void sk_prot_clear_nulls(struct sock *sk, int size) -{ - if (offsetof(struct sock, sk_node.next) != 0) - memset(sk, 0, offsetof(struct sock, sk_node.next)); - memset(&sk->sk_node.pprev, 0, - size - offsetof(struct sock, sk_node.pprev)); -} - void sk_prot_clear_portaddr_nulls(struct sock *sk, int size) { unsigned long nulls1, nulls2; @@ -1278,30 +1360,16 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) module_put(owner); } -#ifdef CONFIG_CGROUPS -#if IS_ENABLED(CONFIG_NET_CLS_CGROUP) -void sock_update_classid(struct sock *sk, struct task_struct *task) -{ - u32 classid; - - classid = task_cls_classid(task); - if (classid != sk->sk_classid) - sk->sk_classid = classid; -} -EXPORT_SYMBOL(sock_update_classid); -#endif - -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) -void sock_update_netprioidx(struct sock *sk, struct task_struct *task) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) +void sock_update_netprioidx(struct sock *sk) { if (in_interrupt()) return; - sk->sk_cgrp_prioidx = task_netprioidx(task); + sk->sk_cgrp_prioidx = task_netprioidx(current); } EXPORT_SYMBOL_GPL(sock_update_netprioidx); #endif -#endif /** * sk_alloc - All socket objects are allocated here @@ -1327,8 +1395,8 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_net_set(sk, get_net(net)); atomic_set(&sk->sk_wmem_alloc, 1); - sock_update_classid(sk, current); - sock_update_netprioidx(sk, current); + sock_update_classid(sk); + sock_update_netprioidx(sk); } return sk; @@ -1553,6 +1621,25 @@ void sock_wfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_wfree); +void skb_orphan_partial(struct sk_buff *skb) +{ + /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc, + * so we do not completely orphan skb, but transfer all + * accounted bytes but one, to avoid unexpected reorders. + */ + if (skb->destructor == sock_wfree +#ifdef CONFIG_INET + || skb->destructor == tcp_wfree +#endif + ) { + atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc); + skb->truesize = 1; + } else { + skb_orphan(skb); + } +} +EXPORT_SYMBOL(skb_orphan_partial); + /* * Read buffer destructor automatically called from kfree_skb. */ @@ -1619,22 +1706,6 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, EXPORT_SYMBOL(sock_wmalloc); /* - * Allocate a skb from the socket's receive buffer. - */ -struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, - gfp_t priority) -{ - if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { - struct sk_buff *skb = alloc_skb(size, priority); - if (skb) { - skb_set_owner_r(skb, sk); - return skb; - } - } - return NULL; -} - -/* - * Allocate a memory block from the socket's option memory buffer.
*/ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) @@ -1699,24 +1770,23 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, - int *errcode) + int *errcode, int max_page_order) { - struct sk_buff *skb; + struct sk_buff *skb = NULL; + unsigned long chunk; gfp_t gfp_mask; long timeo; int err; int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + struct page *page; + int i; err = -EMSGSIZE; if (npages > MAX_SKB_FRAGS) goto failure; - gfp_mask = sk->sk_allocation; - if (gfp_mask & __GFP_WAIT) - gfp_mask |= __GFP_REPEAT; - timeo = sock_sndtimeo(sk, noblock); - while (1) { + while (!skb) { err = sock_error(sk); if (err != 0) goto failure; @@ -1725,50 +1795,54 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { - skb = alloc_skb(header_len, gfp_mask); - if (skb) { - int i; - - /* No pages, we're done... */ - if (!data_len) - break; - - skb->truesize += data_len; - skb_shinfo(skb)->nr_frags = npages; - for (i = 0; i < npages; i++) { - struct page *page; - - page = alloc_pages(sk->sk_allocation, 0); - if (!page) { - err = -ENOBUFS; - skb_shinfo(skb)->nr_frags = i; - kfree_skb(skb); - goto failure; - } - - __skb_fill_page_desc(skb, i, - page, 0, - (data_len >= PAGE_SIZE ? - PAGE_SIZE : - data_len)); - data_len -= PAGE_SIZE; - } + if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) { + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) + goto failure; + if (signal_pending(current)) + goto interrupted; + timeo = sock_wait_for_wmem(sk, timeo); + continue; + } - /* Full success... 
*/ - break; - } - err = -ENOBUFS; + err = -ENOBUFS; + gfp_mask = sk->sk_allocation; + if (gfp_mask & __GFP_WAIT) + gfp_mask |= __GFP_REPEAT; + + skb = alloc_skb(header_len, gfp_mask); + if (!skb) goto failure; + + skb->truesize += data_len; + + for (i = 0; npages > 0; i++) { + int order = max_page_order; + + while (order) { + if (npages >= 1 << order) { + page = alloc_pages(sk->sk_allocation | + __GFP_COMP | + __GFP_NOWARN | + __GFP_NORETRY, + order); + if (page) + goto fill_page; + } + order--; + } + page = alloc_page(sk->sk_allocation); + if (!page) + goto failure; +fill_page: + chunk = min_t(unsigned long, data_len, + PAGE_SIZE << order); + skb_fill_page_desc(skb, i, page, 0, chunk); + data_len -= chunk; + npages -= 1 << order; } - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - err = -EAGAIN; - if (!timeo) - goto failure; - if (signal_pending(current)) - goto interrupted; - timeo = sock_wait_for_wmem(sk, timeo); } skb_set_owner_w(skb, sk); @@ -1777,6 +1851,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, interrupted: err = sock_intr_errno(timeo); failure: + kfree_skb(skb); *errcode = err; return NULL; } @@ -1785,14 +1860,24 @@ EXPORT_SYMBOL(sock_alloc_send_pskb); struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode) { - return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); + return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); } EXPORT_SYMBOL(sock_alloc_send_skb); /* On 32bit arches, an skb frag is limited to 2^15 */ #define SKB_FRAG_PAGE_ORDER get_order(32768) -bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +/** + * skb_page_frag_refill - check that a page_frag contains enough room + * @sz: minimum size of the fragment we want to get + * @pfrag: pointer to page_frag + * @prio: priority for memory allocation + * + * Note: While this allocator tries to use high order pages, there is + * no guarantee that allocations succeed. Therefore, @sz MUST be + * less than or equal to PAGE_SIZE. + */ +bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio) { int order; @@ -1801,19 +1886,17 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) pfrag->offset = 0; return true; } - if (pfrag->offset < pfrag->size) + if (pfrag->offset + sz <= pfrag->size) return true; put_page(pfrag->page); } - /* We restrict high order allocations to users that can afford to wait */ order = (sk->sk_allocation & __GFP_WAIT) ? 
SKB_FRAG_PAGE_ORDER : 0; - + order = SKB_FRAG_PAGE_ORDER; do { - gfp_t gfp = sk->sk_allocation; + gfp_t gfp = prio; if (order) - gfp |= __GFP_COMP | __GFP_NOWARN; + gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY; pfrag->page = alloc_pages(gfp, order); if (likely(pfrag->page)) { pfrag->offset = 0; @@ -1822,6 +1905,15 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) } } while (--order >= 0); + return false; +} +EXPORT_SYMBOL(skb_page_frag_refill); + +bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +{ + if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) + return true; + sk_enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); return false; @@ -2153,7 +2245,7 @@ static void sock_def_error_report(struct sock *sk) rcu_read_unlock(); } -static void sock_def_readable(struct sock *sk, int len) +static void sock_def_readable(struct sock *sk) { struct socket_wq *wq; @@ -2212,7 +2304,7 @@ EXPORT_SYMBOL(sk_reset_timer); void sk_stop_timer(struct sock *sk, struct timer_list* timer) { - if (timer_pending(timer) && del_timer(timer)) + if (del_timer(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer); @@ -2270,6 +2362,13 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_stamp = ktime_set(-1L, 0); +#ifdef CONFIG_NET_RX_BUSY_POLL + sk->sk_napi_id = 0; + sk->sk_ll_usec = sysctl_net_busy_read; +#endif + + sk->sk_max_pacing_rate = ~0U; + sk->sk_pacing_rate = ~0U; /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) @@ -2307,10 +2406,13 @@ void release_sock(struct sock *sk) if (sk->sk_backlog.tail) __release_sock(sk); + /* Warning : release_cb() might need to release sk ownership, + * ie call sock_release_ownership(sk) before us. + */ if (sk->sk_prot->release_cb) sk->sk_prot->release_cb(sk); - sk->sk_lock.owned = 0; + sock_release_ownership(sk); if (waitqueue_active(&sk->sk_lock.wq)) wake_up(&sk->sk_lock.wq); spin_unlock_bh(&sk->sk_lock.slock); @@ -2398,6 +2500,52 @@ void sock_enable_timestamp(struct sock *sk, int flag) } } +int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, + int level, int type) +{ + struct sock_exterr_skb *serr; + struct sk_buff *skb, *skb2; + int copied, err; + + err = -EAGAIN; + skb = skb_dequeue(&sk->sk_error_queue); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, skb); + + serr = SKB_EXT_ERR(skb); + put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); + + msg->msg_flags |= MSG_ERRQUEUE; + err = copied; + + /* Reset and regenerate socket error */ + spin_lock_bh(&sk->sk_error_queue.lock); + sk->sk_err = 0; + if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { + sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; + spin_unlock_bh(&sk->sk_error_queue.lock); + sk->sk_error_report(sk); + } else + spin_unlock_bh(&sk->sk_error_queue.lock); + +out_free_skb: + kfree_skb(skb); +out: + return err; +} +EXPORT_SYMBOL(sock_recv_errqueue); + /* * Get a socket option on an socket. 
* @@ -2818,7 +2966,7 @@ static const struct file_operations proto_seq_fops = { static __net_init int proto_init_net(struct net *net) { - if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops)) + if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops)) return -ENOMEM; return 0; @@ -2826,7 +2974,7 @@ static __net_init int proto_init_net(struct net *net) static __net_exit void proto_exit_net(struct net *net) { - proc_net_remove(net, "protocols"); + remove_proc_entry("protocols", net->proc_net); } diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 602cd637182..a4216a4c957 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -49,6 +49,41 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) } EXPORT_SYMBOL_GPL(sock_diag_put_meminfo); +int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk, + struct sk_buff *skb, int attrtype) +{ + struct sock_fprog_kern *fprog; + struct sk_filter *filter; + struct nlattr *attr; + unsigned int flen; + int err = 0; + + if (!may_report_filterinfo) { + nla_reserve(skb, attrtype, 0); + return 0; + } + + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); + if (!filter) + goto out; + + fprog = filter->orig_prog; + flen = sk_filter_proglen(fprog); + + attr = nla_reserve(skb, attrtype, flen); + if (attr == NULL) { + err = -EMSGSIZE; + goto out; + } + + memcpy(nla_data(attr), fprog->filter, flen); +out: + rcu_read_unlock(); + return err; +} +EXPORT_SYMBOL(sock_diag_put_filterinfo); + void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) { mutex_lock(&sock_diag_table_mutex); @@ -97,21 +132,6 @@ void sock_diag_unregister(const struct sock_diag_handler *hnld) } EXPORT_SYMBOL_GPL(sock_diag_unregister); -static const inline struct sock_diag_handler *sock_diag_lock_handler(int family) -{ - if (sock_diag_handlers[family] == NULL) - request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, family); - - mutex_lock(&sock_diag_table_mutex); - return sock_diag_handlers[family]; -} - -static inline void sock_diag_unlock_handler(const struct sock_diag_handler *h) -{ - mutex_unlock(&sock_diag_table_mutex); -} - static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { int err; @@ -121,12 +141,20 @@ static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlmsg_len(nlh) < sizeof(*req)) return -EINVAL; - hndl = sock_diag_lock_handler(req->sdiag_family); + if (req->sdiag_family >= AF_MAX) + return -EINVAL; + + if (sock_diag_handlers[req->sdiag_family] == NULL) + request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, req->sdiag_family); + + mutex_lock(&sock_diag_table_mutex); + hndl = sock_diag_handlers[req->sdiag_family]; if (hndl == NULL) err = -ENOENT; else err = hndl->dump(skb, nlh); - sock_diag_unlock_handler(hndl); + mutex_unlock(&sock_diag_table_mutex); return err; } diff --git a/net/core/stream.c b/net/core/stream.c index f5df85dcd20..301c05f2606 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -30,7 +30,7 @@ void sk_stream_write_space(struct sock *sk) struct socket *sock = sk->sk_socket; struct socket_wq *wq; - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) { + if (sk_stream_is_writeable(sk) && sock) { clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); @@ -122,7 +122,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) DEFINE_WAIT(wait); if (sk_stream_memory_free(sk)) - current_timeo = vm_wait = (net_random() % (HZ / 5)) 
+ 2; + current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2; while (1) { set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index d1b08045a9d..cf9cd13509a 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -19,14 +19,20 @@ #include <net/ip.h> #include <net/sock.h> #include <net/net_ratelimit.h> +#include <net/busy_poll.h> +#include <net/pkt_sched.h> + +static int zero = 0; +static int one = 1; +static int ushort_max = USHRT_MAX; #ifdef CONFIG_RPS -static int rps_sock_flow_sysctl(ctl_table *table, int write, +static int rps_sock_flow_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { unsigned int orig_size, size; int ret, i; - ctl_table tmp = { + struct ctl_table tmp = { .data = &size, .maxlen = sizeof(size), .mode = table->mode @@ -85,6 +91,130 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write, } #endif /* CONFIG_RPS */ +#ifdef CONFIG_NET_FLOW_LIMIT +static DEFINE_MUTEX(flow_limit_update_mutex); + +static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct sd_flow_limit *cur; + struct softnet_data *sd; + cpumask_var_t mask; + int i, len, ret = 0; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + if (write) { + ret = cpumask_parse_user(buffer, *lenp, mask); + if (ret) + goto done; + + mutex_lock(&flow_limit_update_mutex); + len = sizeof(*cur) + netdev_flow_limit_table_len; + for_each_possible_cpu(i) { + sd = &per_cpu(softnet_data, i); + cur = rcu_dereference_protected(sd->flow_limit, + lockdep_is_held(&flow_limit_update_mutex)); + if (cur && !cpumask_test_cpu(i, mask)) { + RCU_INIT_POINTER(sd->flow_limit, NULL); + synchronize_rcu(); + kfree(cur); + } else if (!cur && cpumask_test_cpu(i, mask)) { + cur = kzalloc_node(len, GFP_KERNEL, + cpu_to_node(i)); + if (!cur) { + /* not unwinding previous changes */ + ret = -ENOMEM; + goto write_unlock; + } + cur->num_buckets = netdev_flow_limit_table_len; + rcu_assign_pointer(sd->flow_limit, cur); + } + } +write_unlock: + mutex_unlock(&flow_limit_update_mutex); + } else { + char kbuf[128]; + + if (*ppos || !*lenp) { + *lenp = 0; + goto done; + } + + cpumask_clear(mask); + rcu_read_lock(); + for_each_possible_cpu(i) { + sd = &per_cpu(softnet_data, i); + if (rcu_dereference(sd->flow_limit)) + cpumask_set_cpu(i, mask); + } + rcu_read_unlock(); + + len = min(sizeof(kbuf) - 1, *lenp); + len = cpumask_scnprintf(kbuf, len, mask); + if (!len) { + *lenp = 0; + goto done; + } + if (len < *lenp) + kbuf[len++] = '\n'; + if (copy_to_user(buffer, kbuf, len)) { + ret = -EFAULT; + goto done; + } + *lenp = len; + *ppos += len; + } + +done: + free_cpumask_var(mask); + return ret; +} + +static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + unsigned int old, *ptr; + int ret; + + mutex_lock(&flow_limit_update_mutex); + + ptr = table->data; + old = *ptr; + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (!ret && write && !is_power_of_2(*ptr)) { + *ptr = old; + ret = -EINVAL; + } + + mutex_unlock(&flow_limit_update_mutex); + return ret; +} +#endif /* CONFIG_NET_FLOW_LIMIT */ + +#ifdef CONFIG_NET_SCHED +static int set_default_qdisc(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char id[IFNAMSIZ]; + struct ctl_table tbl = { + .data = id, + .maxlen = IFNAMSIZ, + }; + int ret; + + 
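+	/* Seed the local buffer with the current default qdisc name:
+	 * a read simply reports it, and proc_dostring() overwrites it
+	 * on a write before qdisc_set_default() is asked to switch.
+	 */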
qdisc_get_default(id, IFNAMSIZ); + + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) + ret = qdisc_set_default(id); + return ret; +} +#endif + static struct ctl_table net_core_table[] = { #ifdef CONFIG_NET { @@ -92,28 +222,32 @@ static struct ctl_table net_core_table[] = { .data = &sysctl_wmem_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "rmem_max", .data = &sysctl_rmem_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "wmem_default", .data = &sysctl_wmem_default, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "rmem_default", .data = &sysctl_rmem_default, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "dev_weight", @@ -174,6 +308,44 @@ static struct ctl_table net_core_table[] = { .proc_handler = rps_sock_flow_sysctl }, #endif +#ifdef CONFIG_NET_FLOW_LIMIT + { + .procname = "flow_limit_cpu_bitmap", + .mode = 0644, + .proc_handler = flow_limit_cpu_sysctl + }, + { + .procname = "flow_limit_table_len", + .data = &netdev_flow_limit_table_len, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = flow_limit_table_len_sysctl + }, +#endif /* CONFIG_NET_FLOW_LIMIT */ +#ifdef CONFIG_NET_RX_BUSY_POLL + { + .procname = "busy_poll", + .data = &sysctl_net_busy_poll, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "busy_read", + .data = &sysctl_net_busy_read, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif +#ifdef CONFIG_NET_SCHED + { + .procname = "default_qdisc", + .mode = 0644, + .maxlen = IFNAMSIZ, + .proc_handler = set_default_qdisc + }, +#endif #endif /* CONFIG_NET */ { .procname = "netdev_budget", @@ -198,7 +370,9 @@ static struct ctl_table netns_core_table[] = { .data = &init_net.core.sysctl_somaxconn, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .extra1 = &zero, + .extra2 = &ushort_max, + .proc_handler = proc_dointvec_minmax }, { } }; diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 661b5a40ec1..6521dfd8b7c 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -23,16 +23,11 @@ #include <linux/skbuff.h> #include <linux/export.h> -static struct sock_filter ptp_filter[] = { - PTP_FILTER -}; - static unsigned int classify(const struct sk_buff *skb) { - if (likely(skb->dev && - skb->dev->phydev && + if (likely(skb->dev && skb->dev->phydev && skb->dev->phydev->drv)) - return sk_run_filter(skb, ptp_filter); + return ptp_classify_raw(skb); else return PTP_CLASS_NONE; } @@ -60,11 +55,13 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) if (likely(phydev->drv->txtstamp)) { if (!atomic_inc_not_zero(&sk->sk_refcnt)) return; + clone = skb_clone(skb, GFP_ATOMIC); if (!clone) { sock_put(sk); return; } + clone->sk = sk; phydev->drv->txtstamp(phydev, clone, type); } @@ -89,12 +86,15 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, } *skb_hwtstamps(skb) = *hwtstamps; + serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; skb->sk = NULL; + err = sock_queue_err_skb(sk, skb); + sock_put(sk); if (err) kfree_skb(skb); @@ -132,8 +132,3 @@ bool 
skb_defer_rx_timestamp(struct sk_buff *skb) return false; } EXPORT_SYMBOL_GPL(skb_defer_rx_timestamp); - -void __init skb_timestamping_init(void) -{ - BUG_ON(sk_chk_filter(ptp_filter, ARRAY_SIZE(ptp_filter))); -} diff --git a/net/core/tso.c b/net/core/tso.c new file mode 100644 index 00000000000..8c3203c585b --- /dev/null +++ b/net/core/tso.c @@ -0,0 +1,77 @@ +#include <linux/export.h> +#include <net/ip.h> +#include <net/tso.h> + +/* Calculate expected number of TX descriptors */ +int tso_count_descs(struct sk_buff *skb) +{ + /* The Marvell Way */ + return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags; } +EXPORT_SYMBOL(tso_count_descs); + +void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso, + int size, bool is_last) +{ + struct iphdr *iph; + struct tcphdr *tcph; + int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + int mac_hdr_len = skb_network_offset(skb); + + memcpy(hdr, skb->data, hdr_len); + iph = (struct iphdr *)(hdr + mac_hdr_len); + iph->id = htons(tso->ip_id); + iph->tot_len = htons(size + hdr_len - mac_hdr_len); + tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb)); + tcph->seq = htonl(tso->tcp_seq); + tso->ip_id++; + + if (!is_last) { + /* Clear the special flags on all but the last packet */ + tcph->psh = 0; + tcph->fin = 0; + tcph->rst = 0; + } +} +EXPORT_SYMBOL(tso_build_hdr); + +void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size) +{ + tso->tcp_seq += size; + tso->size -= size; + tso->data += size; + + if ((tso->size == 0) && + (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx]; + + /* Move to next segment */ + tso->size = frag->size; + tso->data = page_address(frag->page.p) + frag->page_offset; + tso->next_frag_idx++; + } +} +EXPORT_SYMBOL(tso_build_data); + +void tso_start(struct sk_buff *skb, struct tso_t *tso) +{ + int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + + tso->ip_id = ntohs(ip_hdr(skb)->id); + tso->tcp_seq = ntohl(tcp_hdr(skb)->seq); + tso->next_frag_idx = 0; + + /* Build first data */ + tso->size = skb_headlen(skb) - hdr_len; + tso->data = skb->data + hdr_len; + if ((tso->size == 0) && + (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx]; + + /* Move to next segment */ + tso->size = frag->size; + tso->data = page_address(frag->page.p) + frag->page_offset; + tso->next_frag_idx++; + } +} +EXPORT_SYMBOL(tso_start); diff --git a/net/core/utils.c b/net/core/utils.c index e3487e46193..eed34338736 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -17,6 +17,7 @@ #include <linux/module.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/ctype.h> #include <linux/inet.h> #include <linux/mm.h> #include <linux/net.h> @@ -338,26 +339,51 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, } EXPORT_SYMBOL(inet_proto_csum_replace16); -int mac_pton(const char *s, u8 *mac) +struct __net_random_once_work { + struct work_struct work; + struct static_key *key; +}; + +static void __net_random_once_deferred(struct work_struct *w) { - int i; + struct __net_random_once_work *work = + container_of(w, struct __net_random_once_work, work); + BUG_ON(!static_key_enabled(work->key)); + static_key_slow_dec(work->key); + kfree(work); +} - /* XX:XX:XX:XX:XX:XX */ - if (strlen(s) < 3 * ETH_ALEN - 1) - return 0; - - /* Don't dirty result unless string is valid MAC. 
*/ - for (i = 0; i < ETH_ALEN; i++) { - if (!strchr("0123456789abcdefABCDEF", s[i * 3])) - return 0; - if (!strchr("0123456789abcdefABCDEF", s[i * 3 + 1])) - return 0; - if (i != ETH_ALEN - 1 && s[i * 3 + 2] != ':') - return 0; - } - for (i = 0; i < ETH_ALEN; i++) { - mac[i] = (hex_to_bin(s[i * 3]) << 4) | hex_to_bin(s[i * 3 + 1]); +static void __net_random_once_disable_jump(struct static_key *key) +{ + struct __net_random_once_work *w; + + w = kmalloc(sizeof(*w), GFP_ATOMIC); + if (!w) + return; + + INIT_WORK(&w->work, __net_random_once_deferred); + w->key = key; + schedule_work(&w->work); +} + +bool __net_get_random_once(void *buf, int nbytes, bool *done, + struct static_key *once_key) +{ + static DEFINE_SPINLOCK(lock); + unsigned long flags; + + spin_lock_irqsave(&lock, flags); + if (*done) { + spin_unlock_irqrestore(&lock, flags); + return false; } - return 1; + + get_random_bytes(buf, nbytes); + *done = true; + spin_unlock_irqrestore(&lock, flags); + + __net_random_once_disable_jump(once_key); + + return true; } -EXPORT_SYMBOL(mac_pton); +EXPORT_SYMBOL(__net_get_random_once);
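The tso_count_descs()/tso_start()/tso_build_hdr()/tso_build_data() helpers added in net/core/tso.c above are meant to be driven from a driver's transmit path. A minimal sketch of such a loop follows; hypothetical_xmit_tso(), queue_hdr_desc() and queue_data_desc() are illustrative placeholders for a driver's descriptor-ring primitives, and DMA mapping, ring-space checks (e.g. via tso_count_descs()) and error handling are elided:

static netdev_tx_t hypothetical_xmit_tso(struct sk_buff *skb)
{
	int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
	int data_left = skb->len - hdr_len;
	char hdr[256];		/* scratch for each per-segment header */
	struct tso_t tso;

	tso_start(skb, &tso);
	while (data_left > 0) {
		int size = min_t(int, skb_shinfo(skb)->gso_size, data_left);

		/* Build and queue a fresh IP/TCP header for this segment */
		tso_build_hdr(skb, hdr, &tso, size, size == data_left);
		queue_hdr_desc(hdr, hdr_len);

		/* Queue the payload, which may span several frags */
		data_left -= size;
		while (size > 0) {
			int chunk = min_t(int, size, tso.size);

			queue_data_desc(tso.data, chunk);
			tso_build_data(skb, &tso, chunk);
			size -= chunk;
		}
	}
	return NETDEV_TX_OK;
}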

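Similarly, skb_checksum_setup() above is intended for paths that accept packets carrying checksum-offload metadata from an untrusted producer, such as a guest-facing virtual interface. A minimal sketch of a receive-side caller (the function name is illustrative); a real caller would typically drop the packet when this returns an error:

static int hypothetical_rx_checksum_fixup(struct sk_buff *skb)
{
	/* Re-derive csum_start/csum_offset from the actual headers and,
	 * with recalculate == true, rebuild the pseudo-header checksum
	 * before the skb is handed to the stack.
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		return skb_checksum_setup(skb, true);
	return 0;
}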