diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-10-25 13:25:22 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-10-25 13:25:22 +0200 |
commit | 8a9ea3237e7eb5c25f09e429ad242ae5a3d5ea22 (patch) | |
tree | a0a63398a9983667d52cbbbf4e2405b4f22b1d83 /drivers/net/ethernet/tile | |
parent | 1be025d3cb40cd295123af2c394f7229ef9b30ca (diff) | |
parent | 8b3408f8ee994973869d8ba32c5bf482bc4ddca4 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1745 commits)
dp83640: free packet queues on remove
dp83640: use proper function to free transmit time stamping packets
ipv6: Do not use routes from locally generated RAs
[PATCH net-next] tg3: add tx_dropped counter
be2net: don't create multiple RX/TX rings in multi channel mode
be2net: don't create multiple TXQs in BE2
be2net: refactor VF setup/teardown code into be_vf_setup/clear()
be2net: add vlan/rx-mode/flow-control config to be_setup()
net_sched: cls_flow: use skb_header_pointer()
ipv4: avoid useless call of the function check_peer_pmtu
TCP: remove TCP_DEBUG
net: Fix driver name for mdio-gpio.c
ipv4: tcp: fix TOS value in ACK messages sent from TIME_WAIT
rtnetlink: Add missing manual netlink notification in dev_change_net_namespaces
ipv4: fix ipsec forward performance regression
jme: fix irq storm after suspend/resume
route: fix ICMP redirect validation
net: hold sock reference while processing tx timestamps
tcp: md5: add more const attributes
Add ethtool -g support to virtio_net
...
Fix up conflicts in:
- drivers/net/Kconfig:
The split-up generated a trivial conflict with removal of a
stale reference to Documentation/networking/net-modules.txt.
Remove it from the new location instead.
- fs/sysfs/dir.c:
Fairly nasty conflicts with the sysfs rb-tree usage, conflicting
with Eric Biederman's changes for tagged directories.
Diffstat (limited to 'drivers/net/ethernet/tile')
-rw-r--r-- | drivers/net/ethernet/tile/Kconfig | 15 | ||||
-rw-r--r-- | drivers/net/ethernet/tile/Makefile | 10 | ||||
-rw-r--r-- | drivers/net/ethernet/tile/tilepro.c | 2465 |
3 files changed, 2490 insertions, 0 deletions
diff --git a/drivers/net/ethernet/tile/Kconfig b/drivers/net/ethernet/tile/Kconfig new file mode 100644 index 00000000000..2d9218f86bc --- /dev/null +++ b/drivers/net/ethernet/tile/Kconfig @@ -0,0 +1,15 @@ +# +# Tilera network device configuration +# + +config TILE_NET + tristate "Tilera GBE/XGBE network driver support" + depends on TILE + default y + select CRC32 + ---help--- + This is a standard Linux network device driver for the + on-chip Tilera Gigabit Ethernet and XAUI interfaces. + + To compile this driver as a module, choose M here: the module + will be called tile_net. diff --git a/drivers/net/ethernet/tile/Makefile b/drivers/net/ethernet/tile/Makefile new file mode 100644 index 00000000000..f634f142cab --- /dev/null +++ b/drivers/net/ethernet/tile/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for the TILE on-chip networking support. +# + +obj-$(CONFIG_TILE_NET) += tile_net.o +ifdef CONFIG_TILEGX +tile_net-objs := tilegx.o mpipe.o iorpc_mpipe.o dma_queue.o +else +tile_net-objs := tilepro.o +endif diff --git a/drivers/net/ethernet/tile/tilepro.c b/drivers/net/ethernet/tile/tilepro.c new file mode 100644 index 00000000000..10826d8a2a2 --- /dev/null +++ b/drivers/net/ethernet/tile/tilepro.c @@ -0,0 +1,2465 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. 
+ */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/moduleparam.h> +#include <linux/sched.h> +#include <linux/kernel.h> /* printk() */ +#include <linux/slab.h> /* kmalloc() */ +#include <linux/errno.h> /* error codes */ +#include <linux/types.h> /* size_t */ +#include <linux/interrupt.h> +#include <linux/in.h> +#include <linux/netdevice.h> /* struct device, and other headers */ +#include <linux/etherdevice.h> /* eth_type_trans */ +#include <linux/skbuff.h> +#include <linux/ioctl.h> +#include <linux/cdev.h> +#include <linux/hugetlb.h> +#include <linux/in6.h> +#include <linux/timer.h> +#include <linux/io.h> +#include <asm/checksum.h> +#include <asm/homecache.h> + +#include <hv/drv_xgbe_intf.h> +#include <hv/drv_xgbe_impl.h> +#include <hv/hypervisor.h> +#include <hv/netio_intf.h> + +/* For TSO */ +#include <linux/ip.h> +#include <linux/tcp.h> + + +/* + * First, "tile_net_init_module()" initializes all four "devices" which + * can be used by linux. + * + * Then, "ifconfig DEVICE up" calls "tile_net_open()", which analyzes + * the network cpus, then uses "tile_net_open_aux()" to initialize + * LIPP/LEPP, and then uses "tile_net_open_inner()" to register all + * the tiles, provide buffers to LIPP, allow ingress to start, and + * turn on hypervisor interrupt handling (and NAPI) on all tiles. + * + * If registration fails due to the link being down, then "retry_work" + * is used to keep calling "tile_net_open_inner()" until it succeeds. + * + * If "ifconfig DEVICE down" is called, it uses "tile_net_stop()" to + * stop egress, drain the LIPP buffers, unregister all the tiles, stop + * LIPP/LEPP, and wipe the LEPP queue. + * + * We start out with the ingress interrupt enabled on each CPU. When + * this interrupt fires, we disable it, and call "napi_schedule()". + * This will cause "tile_net_poll()" to be called, which will pull + * packets from the netio queue, filtering them out, or passing them + * to "netif_receive_skb()". 
If our budget is exhausted, we will + * return, knowing we will be called again later. Otherwise, we + * reenable the ingress interrupt, and call "napi_complete()". + * + * HACK: Since disabling the ingress interrupt is not reliable, we + * ignore the interrupt if the global "active" flag is false. + * + * + * NOTE: The use of "native_driver" ensures that EPP exists, and that + * we are using "LIPP" and "LEPP". + * + * NOTE: Failing to free completions for an arbitrarily long time + * (which is defined to be illegal) does in fact cause bizarre + * problems. The "egress_timer" helps prevent this from happening. + */ + + +/* HACK: Allow use of "jumbo" packets. */ +/* This should be 1500 if "jumbo" is not set in LIPP. */ +/* This should be at most 10226 (10240 - 14) if "jumbo" is set in LIPP. */ +/* ISSUE: This has not been thoroughly tested (except at 1500). */ +#define TILE_NET_MTU 1500 + +/* HACK: Define to support GSO. */ +/* ISSUE: This may actually hurt performance of the TCP blaster. */ +/* #define TILE_NET_GSO */ + +/* Define this to collapse "duplicate" acks. */ +/* #define IGNORE_DUP_ACKS */ + +/* HACK: Define this to verify incoming packets. */ +/* #define TILE_NET_VERIFY_INGRESS */ + +/* Use 3000 to enable the Linux Traffic Control (QoS) layer, else 0. */ +#define TILE_NET_TX_QUEUE_LEN 0 + +/* Define to dump packets (prints out the whole packet on tx and rx). */ +/* #define TILE_NET_DUMP_PACKETS */ + +/* Define to enable debug spew (all PDEBUG's are enabled). */ +/* #define TILE_NET_DEBUG */ + + +/* Define to activate paranoia checks. */ +/* #define TILE_NET_PARANOIA */ + +/* Default transmit lockup timeout period, in jiffies. */ +#define TILE_NET_TIMEOUT (5 * HZ) + +/* Default retry interval for bringing up the NetIO interface, in jiffies. */ +#define TILE_NET_RETRY_INTERVAL (5 * HZ) + +/* Number of ports (xgbe0, xgbe1, gbe0, gbe1). */ +#define TILE_NET_DEVS 4 + + + +/* Paranoia. 
*/ +#if NET_IP_ALIGN != LIPP_PACKET_PADDING +#error "NET_IP_ALIGN must match LIPP_PACKET_PADDING." +#endif + + +/* Debug print. */ +#ifdef TILE_NET_DEBUG +#define PDEBUG(fmt, args...) net_printk(fmt, ## args) +#else +#define PDEBUG(fmt, args...) +#endif + + +MODULE_AUTHOR("Tilera"); +MODULE_LICENSE("GPL"); + + +/* + * Queue of incoming packets for a specific cpu and device. + * + * Includes a pointer to the "system" data, and the actual "user" data. + */ +struct tile_netio_queue { + netio_queue_impl_t *__system_part; + netio_queue_user_impl_t __user_part; + +}; + + +/* + * Statistics counters for a specific cpu and device. + */ +struct tile_net_stats_t { + u32 rx_packets; + u32 rx_bytes; + u32 tx_packets; + u32 tx_bytes; +}; + + +/* + * Info for a specific cpu and device. + * + * ISSUE: There is a "dev" pointer in "napi" as well. + */ +struct tile_net_cpu { + /* The NAPI struct. */ + struct napi_struct napi; + /* Packet queue. */ + struct tile_netio_queue queue; + /* Statistics. */ + struct tile_net_stats_t stats; + /* True iff NAPI is enabled. */ + bool napi_enabled; + /* True if this tile has successfully registered with the IPP. */ + bool registered; + /* True if the link was down last time we tried to register. */ + bool link_down; + /* True if "egress_timer" is scheduled. */ + bool egress_timer_scheduled; + /* Number of small sk_buffs which must still be provided. */ + unsigned int num_needed_small_buffers; + /* Number of large sk_buffs which must still be provided. */ + unsigned int num_needed_large_buffers; + /* A timer for handling egress completions. */ + struct timer_list egress_timer; +}; + + +/* + * Info for a specific device. + */ +struct tile_net_priv { + /* Our network device. */ + struct net_device *dev; + /* Pages making up the egress queue. */ + struct page *eq_pages; + /* Address of the actual egress queue. */ + lepp_queue_t *eq; + /* Protects "eq". */ + spinlock_t eq_lock; + /* The hypervisor handle for this interface. 
*/ + int hv_devhdl; + /* The intr bit mask that IDs this device. */ + u32 intr_id; + /* True iff "tile_net_open_aux()" has succeeded. */ + bool partly_opened; + /* True iff the device is "active". */ + bool active; + /* Effective network cpus. */ + struct cpumask network_cpus_map; + /* Number of network cpus. */ + int network_cpus_count; + /* Credits per network cpu. */ + int network_cpus_credits; + /* Network stats. */ + struct net_device_stats stats; + /* For NetIO bringup retries. */ + struct delayed_work retry_work; + /* Quick access to per cpu data. */ + struct tile_net_cpu *cpu[NR_CPUS]; +}; + +/* Log2 of the number of small pages needed for the egress queue. */ +#define EQ_ORDER get_order(sizeof(lepp_queue_t)) +/* Size of the egress queue's pages. */ +#define EQ_SIZE (1 << (PAGE_SHIFT + EQ_ORDER)) + +/* + * The actual devices (xgbe0, xgbe1, gbe0, gbe1). + */ +static struct net_device *tile_net_devs[TILE_NET_DEVS]; + +/* + * The "tile_net_cpu" structures for each device. + */ +static DEFINE_PER_CPU(struct tile_net_cpu, hv_xgbe0); +static DEFINE_PER_CPU(struct tile_net_cpu, hv_xgbe1); +static DEFINE_PER_CPU(struct tile_net_cpu, hv_gbe0); +static DEFINE_PER_CPU(struct tile_net_cpu, hv_gbe1); + + +/* + * True if "network_cpus" was specified. + */ +static bool network_cpus_used; + +/* + * The actual cpus in "network_cpus". + */ +static struct cpumask network_cpus_map; + + + +#ifdef TILE_NET_DEBUG +/* + * printk with extra stuff. + * + * We print the CPU we're running in brackets, followed by the caller's + * message expanded from "fmt" and its arguments. + * + * The text is built in a static buffer and then emitted through a + * literal "%s" format, so any '%' left in the expanded message is + * printed as-is instead of being interpreted a second time. + * + * NOTE: The buffer is static and no lock is taken here, so output from + * concurrent callers may be mixed. + */ +static void net_printk(char *fmt, ...) +{ + int len; + va_list args; + static char buf[256]; + + /* The fixed prefix is short, so it cannot overflow "buf". */ + len = sprintf(buf, "tile_net[%2.2d]: ", smp_processor_id()); + va_start(args, fmt); + /* Bounded by the space left after the prefix. */ + vscnprintf(buf + len, sizeof(buf) - len - 1, fmt, args); + va_end(args); + buf[sizeof(buf) - 1] = '\0'; + /* Never hand a computed string to printk as the format. */ + pr_notice("%s", buf); +} +#endif + + +#ifdef TILE_NET_DUMP_PACKETS +/* + * Dump a packet. 
+ */ +static void dump_packet(unsigned char *data, unsigned long length, char *s) +{ + int my_cpu = smp_processor_id(); + + unsigned long i; + /* Holds one output row: the cpu/offset prefix plus up to 16 " xx" fields. */ char buf[128]; + + /* Running number of dumps; function-static, so shared by all callers. */ static unsigned int count; + + pr_info("dump_packet(data %p, length 0x%lx s %s count 0x%x)\n", + data, length, s, count++); + + pr_info("\n"); + + for (i = 0; i < length; i++) { + /* Every 16th byte starts a fresh row with the cpu and byte offset. */ if ((i & 0xf) == 0) + sprintf(buf, "[%02d] %8.8lx:", my_cpu, i); + sprintf(buf + strlen(buf), " %2.2x", data[i]); + /* Emit the row once it holds 16 bytes or the packet ends. */ if ((i & 0xf) == 0xf || i == length - 1) { + strcat(buf, "\n"); + pr_info("%s", buf); + } + } +} +#endif + + +/* + * Provide support for the __netio_fastio1() swint + * (see <hv/drv_xgbe_intf.h> for how it is used). + * + * The fastio swint2 call may clobber all the caller-saved registers. + * It rarely clobbers memory, but we allow for the possibility in + * the signature just to be on the safe side. + * + * Also, gcc doesn't seem to allow an input operand to be + * clobbered, so we fake it with dummy outputs. + * + * This function can't be static because of the way it is declared + * in the netio header. + */ +inline int __netio_fastio1(u32 fastio_index, u32 arg0) +{ + long result, clobber_r1, clobber_r10; + /* The constraints pin "fastio_index" to "R10", "arg0" to "R01" and the result to "R00"; the two "clobber_*" outputs are the dummy outputs described above. */ asm volatile("swint2" + : "=R00" (result), + "=R01" (clobber_r1), "=R10" (clobber_r10) + : "R10" (fastio_index), "R01" (arg0) + : "memory", "r2", "r3", "r4", + "r5", "r6", "r7", "r8", "r9", + "r11", "r12", "r13", "r14", + "r15", "r16", "r17", "r18", "r19", + "r20", "r21", "r22", "r23", "r24", + "r25", "r26", "r27", "r28", "r29"); + return result; +} + + +/* + * Provide a linux buffer to LIPP. + */ +static void tile_net_provide_linux_buffer(struct tile_net_cpu *info, + void *va, bool small) +{ + struct tile_netio_queue *queue = &info->queue; + + /* Convert "va" and "small" to "linux_buffer_t": the physical address with its low 7 bits dropped, shifted up by one, with "small" in bit 0. */ + unsigned int buffer = ((unsigned int)(__pa(va) >> 7) << 1) + small; + + __netio_fastio_free_buffer(queue->__user_part.__fastio_index, buffer); +} + + +/* + * Provide a linux buffer for LIPP. 
+ * + * Note that the ACTUAL allocation for each buffer is a "struct sk_buff", + * plus a chunk of memory that includes not only the requested bytes, but + * also NET_SKB_PAD bytes of initial padding, and a "struct skb_shared_info". + * + * Note that "struct skb_shared_info" is 88 bytes with 64K pages and + * 268 bytes with 4K pages (since the frags[] array needs 18 entries). + * + * Without jumbo packets, the maximum packet size will be 1536 bytes, + * and we use 2 bytes (NET_IP_ALIGN) of padding. ISSUE: If we told + * the hardware to clip at 1518 bytes instead of 1536 bytes, then we + * could save an entire cache line, but in practice, we don't need it. + * + * Since CPAs are 38 bits, and we can only encode the high 31 bits in + * a "linux_buffer_t", the low 7 bits must be zero, and thus, we must + * align the actual "va" mod 128. + * + * We assume that the underlying "head" will be aligned mod 64. Note + * that in practice, we have seen "head" NOT aligned mod 128 even when + * using 2048 byte allocations, which is surprising. + * + * If "head" WAS always aligned mod 128, we could change LIPP to + * assume that the low SIX bits are zero, and the 7th bit is one, that + * is, align the actual "va" mod 128 plus 64, which would be "free". + * + * For now, the actual "head" pointer points at NET_SKB_PAD bytes of + * padding, plus 28 or 92 bytes of extra padding, plus the sk_buff + * pointer, plus the NET_IP_ALIGN padding, plus 126 or 1536 bytes for + * the actual packet, plus 62 bytes of empty padding, plus some + * padding and the "struct skb_shared_info". + * + * With 64K pages, a large buffer thus needs 32+92+4+2+1536+62+88 + * bytes, or 1816 bytes, which fits comfortably into 2048 bytes. + * + * With 64K pages, a small buffer thus needs 32+92+4+2+126+88 + * bytes, or 344 bytes, which means we are wasting 64+ bytes, and + * could presumably increase the size of small buffers. 
+ * + * With 4K pages, a large buffer thus needs 32+92+4+2+1536+62+268 + * bytes, or 1996 bytes, which fits comfortably into 2048 bytes. + * + * With 4K pages, a small buffer thus needs 32+92+4+2+126+268 + * bytes, or 524 bytes, which is annoyingly wasteful. + * + * Maybe we should increase LIPP_SMALL_PACKET_SIZE to 192? + * + * ISSUE: Maybe we should increase "NET_SKB_PAD" to 64? + */ +static bool tile_net_provide_needed_buffer(struct tile_net_cpu *info, + bool small) +{ +#if TILE_NET_MTU <= 1536 + /* Without "jumbo", 2 + 1536 should be sufficient. */ + unsigned int large_size = NET_IP_ALIGN + 1536; +#else + /* ISSUE: This has not been tested. */ + unsigned int large_size = NET_IP_ALIGN + TILE_NET_MTU + 100; +#endif + + /* Avoid "false sharing" with last cache line. */ + /* ISSUE: This is already done by "dev_alloc_skb()". */ + unsigned int len = + (((small ? LIPP_SMALL_PACKET_SIZE : large_size) + + CHIP_L2_LINE_SIZE() - 1) & -CHIP_L2_LINE_SIZE()); + + /* Slack that lets "data" be advanced to the next 128-byte boundary; the BUG_ON below checks it is enough. */ unsigned int padding = 128 - NET_SKB_PAD; + unsigned int align; + + struct sk_buff *skb; + void *va; + + struct sk_buff **skb_ptr; + + /* Request "padding" extra bytes (96 when NET_SKB_PAD is 32) for alignment purposes. */ + skb = dev_alloc_skb(len + padding); + if (skb == NULL) + return false; + + /* Skip 32 or 96 bytes to align "data" mod 128. */ + align = -(long)skb->data & (128 - 1); + BUG_ON(align > padding); + skb_reserve(skb, align); + + /* This address is given to IPP. */ + va = skb->data; + + /* Buffers must not span a huge page. */ + BUG_ON(((((long)va & ~HPAGE_MASK) + len) & HPAGE_MASK) != 0); + +#ifdef TILE_NET_PARANOIA +#if CHIP_HAS_CBOX_HOME_MAP() + if (hash_default) { + HV_PTE pte = *virt_to_pte(current->mm, (unsigned long)va); + if (hv_pte_get_mode(pte) != HV_PTE_MODE_CACHE_HASH_L3) + panic("Non-HFH ingress buffer! VA=%p Mode=%d PTE=%llx", + va, hv_pte_get_mode(pte), hv_pte_val(pte)); + } +#endif +#endif + + /* Invalidate the packet buffer. 
*/ + if (!hash_default) + __inv_buffer(va, len); + + /* Skip two bytes to satisfy LIPP assumptions. */ + /* Note that this aligns IP on a 16 byte boundary. */ + /* ISSUE: Do this when the packet arrives? */ + skb_reserve(skb, NET_IP_ALIGN); + + /* Save a back-pointer to 'skb' in the bytes immediately below "va"; the receive and discard paths read it back from there. */ + skb_ptr = va - sizeof(*skb_ptr); + *skb_ptr = skb; + + /* Make sure "skb_ptr" has been flushed. */ + __insn_mf(); + + /* Provide the new buffer. */ + tile_net_provide_linux_buffer(info, va, small); + + return true; +} + + +/* + * Provide linux buffers for LIPP. + */ +static void tile_net_provide_needed_buffers(struct tile_net_cpu *info) +{ + /* A counter is only decremented after a buffer was handed over, so a failed allocation leaves the shortfall recorded. */ while (info->num_needed_small_buffers != 0) { + if (!tile_net_provide_needed_buffer(info, true)) + goto oops; + info->num_needed_small_buffers--; + } + + while (info->num_needed_large_buffers != 0) { + if (!tile_net_provide_needed_buffer(info, false)) + goto oops; + info->num_needed_large_buffers--; + } + + return; + +oops: + + /* Add a description to the page allocation failure dump. */ + pr_notice("Could not provide a linux buffer to LIPP.\n"); +} + + +/* + * Grab some LEPP completions, and store them in "comps", of size + * "comps_size", and return the number of completions which were + * stored, so the caller can free them. + */ +static unsigned int tile_net_lepp_grab_comps(lepp_queue_t *eq, + struct sk_buff *comps[], + unsigned int comps_size, + unsigned int min_size) +{ + unsigned int n = 0; + + /* Work on local copies so "eq->comp_head" is only updated once the "min_size" check has passed. */ unsigned int comp_head = eq->comp_head; + unsigned int comp_busy = eq->comp_busy; + + while (comp_head != comp_busy && n < comps_size) { + comps[n++] = eq->comps[comp_head]; + LEPP_QINC(comp_head); + } + + /* Fewer than "min_size" available: report none and leave the queue head where it was. */ if (n < min_size) + return 0; + + eq->comp_head = comp_head; + + return n; +} + + +/* + * Free some comps, and return true iff there are still some pending. 
+ */ +static bool tile_net_lepp_free_comps(struct net_device *dev, bool all) +{ + struct tile_net_priv *priv = netdev_priv(dev); + + lepp_queue_t *eq = priv->eq; + + /* At most 64 completions are reaped per call. */ struct sk_buff *olds[64]; + unsigned int wanted = 64; + unsigned int i, n; + bool pending; + + spin_lock(&priv->eq_lock); + + /* With "all", move "comp_busy" up to "comp_tail" so every queued completion becomes grabbable. */ if (all) + eq->comp_busy = eq->comp_tail; + + n = tile_net_lepp_grab_comps(eq, olds, wanted, 0); + + /* Anything between head and tail is still outstanding after this pass. */ pending = (eq->comp_head != eq->comp_tail); + + spin_unlock(&priv->eq_lock); + + /* The grabbed skbs are freed only after "eq_lock" has been dropped. */ for (i = 0; i < n; i++) + kfree_skb(olds[i]); + + return pending; +} + + +/* + * Make sure the egress timer is scheduled. + * + * Note that we use "schedule if not scheduled" logic instead of the more + * obvious "reschedule" logic, because "reschedule" is fairly expensive. + */ +static void tile_net_schedule_egress_timer(struct tile_net_cpu *info) +{ + if (!info->egress_timer_scheduled) { + /* Expire one jiffy from now. */ mod_timer_pinned(&info->egress_timer, jiffies + 1); + info->egress_timer_scheduled = true; + } +} + + +/* + * The "function" for "info->egress_timer". + * + * This timer will reschedule itself as long as there are any pending + * completions expected (on behalf of any tile). + * + * ISSUE: Realistically, will the timer ever stop scheduling itself? + * + * ISSUE: This timer is almost never actually needed, so just use a global + * timer that can run on any tile. + * + * ISSUE: Maybe instead track number of expected completions, and free + * only that many, resetting to zero if "pending" is ever false. + */ +static void tile_net_handle_egress_timer(unsigned long arg) +{ + /* "arg" carries the "struct tile_net_cpu" pointer for this timer. */ struct tile_net_cpu *info = (struct tile_net_cpu *)arg; + struct net_device *dev = info->napi.dev; + + /* The timer is no longer scheduled. */ + info->egress_timer_scheduled = false; + + /* Free comps, and reschedule timer if more are pending. */ + if (tile_net_lepp_free_comps(dev, false)) + tile_net_schedule_egress_timer(info); +} + + +#ifdef IGNORE_DUP_ACKS + +/* + * Help detect "duplicate" ACKs. 
These are sequential packets (for a + * given flow) which are exactly 66 bytes long, sharing everything but + * ID=2@0x12, Hsum=2@0x18, Ack=4@0x2a, WinSize=2@0x30, Csum=2@0x32, + * Tstamps=10@0x38. The ID's are +1, the Hsum's are -1, the Ack's are + * +N, and the Tstamps are usually identical. + * + * NOTE: Apparently truly duplicate acks (with identical "ack" values), + * should not be collapsed, as they are used for some kind of flow control. + */ +static bool is_dup_ack(char *s1, char *s2, unsigned int len) +{ + int i; + + /* Bit N set means the byte at offset N may differ between the two packets. Note that the timestamp bytes at 0x38 are not masked, so differing timestamps make this return false. */ unsigned long long ignorable = 0; + + /* Identification. */ + ignorable |= (1ULL << 0x12); + ignorable |= (1ULL << 0x13); + + /* Header checksum. */ + ignorable |= (1ULL << 0x18); + ignorable |= (1ULL << 0x19); + + /* ACK. */ + ignorable |= (1ULL << 0x2a); + ignorable |= (1ULL << 0x2b); + ignorable |= (1ULL << 0x2c); + ignorable |= (1ULL << 0x2d); + + /* WinSize. */ + ignorable |= (1ULL << 0x30); + ignorable |= (1ULL << 0x31); + + /* Checksum. */ + ignorable |= (1ULL << 0x32); + ignorable |= (1ULL << 0x33); + + /* The mask is shifted right once per byte, so bit 0 always describes byte "i". */ for (i = 0; i < len; i++, ignorable >>= 1) { + + if ((ignorable & 1) || (s1[i] == s2[i])) + continue; + +#ifdef TILE_NET_DEBUG + /* HACK: Mention non-timestamp diffs. */ + if (i < 0x38 && i != 0x2f && + net_ratelimit()) + pr_info("Diff at 0x%x\n", i); +#endif + + return false; + } + +#ifdef TILE_NET_NO_SUPPRESS_DUP_ACKS + /* HACK: Do not suppress truly duplicate ACKs. */ + /* ISSUE: Is this actually necessary or helpful? 
*/ + if (s1[0x2a] == s2[0x2a] && + s1[0x2b] == s2[0x2b] && + s1[0x2c] == s2[0x2c] && + s1[0x2d] == s2[0x2d]) { + return false; + } +#endif + + return true; +} + +#endif + + + +static void tile_net_discard_aux(struct tile_net_cpu *info, int index) +{ + /* Drop the packet at byte offset "index" of this cpu's receive queue: free its skb and advance the read offset. */ struct tile_netio_queue *queue = &info->queue; + netio_queue_impl_t *qsp = queue->__system_part; + netio_queue_user_impl_t *qup = &queue->__user_part; + + /* Step to the next packet slot, wrapping to offset 0 when "__last_packet_plus_one" is reached. */ int index2_aux = index + sizeof(netio_pkt_t); + int index2 = + ((index2_aux == + qsp->__packet_receive_queue.__last_packet_plus_one) ? + 0 : index2_aux); + + /* Packet descriptors are addressed relative to the memory just past "*qsp". */ netio_pkt_t *pkt = (netio_pkt_t *)((unsigned long) &qsp[1] + index); + + /* Extract the "linux_buffer_t". */ + unsigned int buffer = pkt->__packet.word; + + /* Convert "linux_buffer_t" to "va". */ + void *va = __va((phys_addr_t)(buffer >> 1) << 7); + + /* Acquire the associated "skb". */ + struct sk_buff **skb_ptr = va - sizeof(*skb_ptr); + struct sk_buff *skb = *skb_ptr; + + kfree_skb(skb); + + /* Consume this packet. */ + qup->__packet_receive_read = index2; +} + + +/* + * Like "tile_net_poll()", but just discard packets. + */ +static void tile_net_discard_packets(struct net_device *dev) +{ + struct tile_net_priv *priv = netdev_priv(dev); + int my_cpu = smp_processor_id(); + struct tile_net_cpu *info = priv->cpu[my_cpu]; + struct tile_netio_queue *queue = &info->queue; + netio_queue_impl_t *qsp = queue->__system_part; + netio_queue_user_impl_t *qup = &queue->__user_part; + + /* Drain until the read offset catches up with the write offset. */ while (qup->__packet_receive_read != + qsp->__packet_receive_queue.__packet_write) { + int index = qup->__packet_receive_read; + tile_net_discard_aux(info, index); + } +} + + +/* + * Handle the next packet. Return true if "processed", false if "filtered". 
+ */ +static bool tile_net_poll_aux(struct tile_net_cpu *info, int index) +{ + struct net_device *dev = info->napi.dev; + + struct tile_netio_queue *queue = &info->queue; + netio_queue_impl_t *qsp = queue->__system_part; + netio_queue_user_impl_t *qup = &queue->__user_part; + struct tile_net_stats_t *stats = &info->stats; + + int filter; + + /* Step to the next packet slot, wrapping to offset 0 at the end. */ + int index2_aux = index + sizeof(netio_pkt_t); + int index2 = + ((index2_aux == + qsp->__packet_receive_queue.__last_packet_plus_one) ? + 0 : index2_aux); + + netio_pkt_t *pkt = (netio_pkt_t *)((unsigned long) &qsp[1] + index); + + netio_pkt_metadata_t *metadata = NETIO_PKT_METADATA(pkt); + + /* Extract the packet size. FIXME: Shouldn't the second line */ + /* get subtracted? Mostly moot, since it should be "zero". */ + unsigned long len = + (NETIO_PKT_CUSTOM_LENGTH(pkt) + + NET_IP_ALIGN - NETIO_PACKET_PADDING); + + /* Extract the "linux_buffer_t". */ + unsigned int buffer = pkt->__packet.word; + + /* Extract "small" (vs "large"). */ + bool small = ((buffer & 1) != 0); + + /* Convert "linux_buffer_t" to "va". */ + void *va = __va((phys_addr_t)(buffer >> 1) << 7); + + /* Extract the packet data pointer. */ + /* Compare to "NETIO_PKT_CUSTOM_DATA(pkt)". */ + unsigned char *buf = va + NET_IP_ALIGN; + + /* Invalidate the packet buffer. */ + if (!hash_default) + __inv_buffer(buf, len); + + /* ISSUE: Is this needed? */ + dev->last_rx = jiffies; + +#ifdef TILE_NET_DUMP_PACKETS + dump_packet(buf, len, "rx"); +#endif /* TILE_NET_DUMP_PACKETS */ + +#ifdef TILE_NET_VERIFY_INGRESS + /* NOTE(review): "dump_packet()" is only built under TILE_NET_DUMP_PACKETS, so this section presumably needs that option as well. */ + if (!NETIO_PKT_L4_CSUM_CORRECT_M(metadata, pkt) && + NETIO_PKT_L4_CSUM_CALCULATED_M(metadata, pkt)) { + /* Bug 6624: Includes UDP packets with a "zero" checksum. 
*/ + /* "len" is an unsigned long, so it must be printed with "%lu". */ + pr_warning("Bad L4 checksum on %lu byte packet.\n", len); + } + if (!NETIO_PKT_L3_CSUM_CORRECT_M(metadata, pkt) && + NETIO_PKT_L3_CSUM_CALCULATED_M(metadata, pkt)) { + dump_packet(buf, len, "rx"); + panic("Bad L3 checksum."); + } + switch (NETIO_PKT_STATUS_M(metadata, pkt)) { + case NETIO_PKT_STATUS_OVERSIZE: + if (len >= 64) { + dump_packet(buf, len, "rx"); + panic("Unexpected OVERSIZE."); + } + break; + case NETIO_PKT_STATUS_BAD: + pr_warning("Unexpected BAD %lu byte packet.\n", len); + } +#endif + + filter = 0; + + /* ISSUE: Filter TCP packets with "bad" checksums? */ + + if (!(dev->flags & IFF_UP)) { + /* Filter packets received before we're up. */ + filter = 1; + } else if (NETIO_PKT_STATUS_M(metadata, pkt) == NETIO_PKT_STATUS_BAD) { + /* Filter "truncated" packets. */ + filter = 1; + } else if (!(dev->flags & IFF_PROMISC)) { + /* FIXME: Implement HW multicast filter. */ + if (!is_multicast_ether_addr(buf)) { + /* Filter packets not for our address. */ + const u8 *mine = dev->dev_addr; + filter = compare_ether_addr(mine, buf); + } + } + + if (filter) { + + /* ISSUE: Update "drop" statistics? */ + + /* Hand the untouched buffer straight back to LIPP. */ + tile_net_provide_linux_buffer(info, va, small); + + } else { + + /* Acquire the associated "skb". */ + struct sk_buff **skb_ptr = va - sizeof(*skb_ptr); + struct sk_buff *skb = *skb_ptr; + + /* Paranoia. */ + if (skb->data != buf) + panic("Corrupt linux buffer from LIPP! " + "VA=%p, skb=%p, skb->data=%p\n", + va, skb, skb->data); + + /* Encode the actual packet length. */ + skb_put(skb, len); + + /* NOTE: This call also sets "skb->dev = dev". */ + skb->protocol = eth_type_trans(skb, dev); + + /* Avoid recomputing "good" TCP/UDP checksums. */ + if (NETIO_PKT_L4_CSUM_CORRECT_M(metadata, pkt)) + skb->ip_summed = CHECKSUM_UNNECESSARY; + + netif_receive_skb(skb); + + stats->rx_packets++; + stats->rx_bytes += len; + + /* The skb now belongs to the stack, so a replacement of the same kind is owed to LIPP. */ + if (small) + info->num_needed_small_buffers++; + else + info->num_needed_large_buffers++; + } + + /* Return "__receive_credit_interval" credits each time the remaining-credit countdown reaches zero. 
*/ + if (--qup->__receive_credit_remaining == 0) { + u32 interval = qup->__receive_credit_interval; + qup->__receive_credit_remaining = interval; + __netio_fastio_return_credits(qup->__fastio_index, interval); + } + + /* Consume this packet. */ + qup->__packet_receive_read = index2; + + return !filter; +} + + +/* + * Handle some packets for the given device on the current CPU. + * + * If "tile_net_stop()" is called on some other tile while this + * function is running, we will return, hopefully before that + * other tile asks us to call "napi_disable()". + * + * The "rotting packet" race condition occurs if a packet arrives + * during the extremely narrow window between the queue appearing to + * be empty, and the ingress interrupt being re-enabled. This happens + * a LOT under heavy network load. + */ +static int tile_net_poll(struct napi_struct *napi, int budget) +{ + struct net_device *dev = napi->dev; + struct tile_net_priv *priv = netdev_priv(dev); + int my_cpu = smp_processor_id(); + struct tile_net_cpu *info = priv->cpu[my_cpu]; + struct tile_netio_queue *queue = &info->queue; + netio_queue_impl_t *qsp = queue->__system_part; + netio_queue_user_impl_t *qup = &queue->__user_part; + + unsigned int work = 0; + + /* Only packets that were passed up count against "budget"; filtered ones do not. */ + while (priv->active) { + int index = qup->__packet_receive_read; + if (index == qsp->__packet_receive_queue.__packet_write) + break; + + if (tile_net_poll_aux(info, index)) { + if (++work >= budget) + goto done; + } + } + + napi_complete(&info->napi); + + if (!priv->active) + goto done; + + /* Re-enable the ingress interrupt. */ + enable_percpu_irq(priv->intr_id); + + /* HACK: Avoid the "rotting packet" problem (see above). */ + if (qup->__packet_receive_read != + qsp->__packet_receive_queue.__packet_write) { + /* ISSUE: Sometimes this returns zero, presumably */ + /* because an interrupt was handled for this tile. 
*/ + (void)napi_reschedule(&info->napi); + } + +done: + + if (priv->active) + tile_net_provide_needed_buffers(info); + + return work; +} + + +/* + * Handle an ingress interrupt for the given device on the current cpu. + * + * ISSUE: Sometimes this gets called after "disable_percpu_irq()" has + * been called! This is probably due to "pending hypervisor downcalls". + * + * ISSUE: Is there any race condition between the "napi_schedule()" here + * and the "napi_complete()" call above? + */ +static irqreturn_t tile_net_handle_ingress_interrupt(int irq, void *dev_ptr) +{ + struct net_device *dev = (struct net_device *)dev_ptr; + struct tile_net_priv *priv = netdev_priv(dev); + int my_cpu = smp_processor_id(); + struct tile_net_cpu *info = priv->cpu[my_cpu]; + + /* Disable the ingress interrupt. */ + disable_percpu_irq(priv->intr_id); + + /* Ignore unwanted interrupts. */ + if (!priv->active) + return IRQ_HANDLED; + + /* ISSUE: Sometimes "info->napi_enabled" is false here. */ + + napi_schedule(&info->napi); + + return IRQ_HANDLED; +} + + +/* + * One time initialization per interface. + * + * Returns 0 on success, or -EIO if any hypervisor call fails. + */ +static int tile_net_open_aux(struct net_device *dev) +{ + struct tile_net_priv *priv = netdev_priv(dev); + + int ret; + /* Placeholder payload for the "start shim" write; zeroed so no uninitialized stack is handed to the hypervisor. */ + int dummy = 0; + unsigned int epp_lotar; + + /* + * Find out where EPP memory should be homed. + */ + ret = hv_dev_pread(priv->hv_devhdl, 0, + (HV_VirtAddr)&epp_lotar, sizeof(epp_lotar), + NETIO_EPP_SHM_OFF); + if (ret < 0) { + pr_err("could not read epp_shm_queue lotar.\n"); + return -EIO; + } + + /* + * Home the page on the EPP. + */ + { + int epp_home = hv_lotar_to_cpu(epp_lotar); + homecache_change_page_home(priv->eq_pages, EQ_ORDER, epp_home); + } + + /* + * Register the EPP shared memory queue. 
+ */ + { + netio_ipp_address_t ea = { + .va = 0, + .pa = __pa(priv->eq), + .pte = hv_pte(0), + .size = EQ_SIZE, + }; + ea.pte = hv_pte_set_lotar(ea.pte, epp_lotar); + ea.pte = hv_pte_set_mode(ea.pte, HV_PTE_MODE_CACHE_TILE_L3); + ret = hv_dev_pwrite(priv->hv_devhdl, 0, + (HV_VirtAddr)&ea, + sizeof(ea), + NETIO_EPP_SHM_OFF); + if (ret < 0) { + /* Log this failure like the other failure paths in this function. */ + pr_err("could not register epp_shm_queue.\n"); + return -EIO; + } + } + + /* + * Start LIPP/LEPP. + */ + if (hv_dev_pwrite(priv->hv_devhdl, 0, (HV_VirtAddr)&dummy, + sizeof(dummy), NETIO_IPP_START_SHIM_OFF) < 0) { + pr_warning("Failed to start LIPP/LEPP.\n"); + return -EIO; + } + + return 0; +} + + +/* + * Register with hypervisor on the current CPU. + * + * Strangely, this function does important things even if it "fails", + * which is especially common if the link is not up yet. Hopefully + * these things are all "harmless" if done twice! + */ +static void tile_net_register(void *dev_ptr) +{ + struct net_device *dev = (struct net_device *)dev_ptr; + struct tile_net_priv *priv = netdev_priv(dev); |