diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-03-17 19:34:12 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-03-17 19:34:12 -0700 |
commit | 08351fc6a75731226e1112fc7254542bd3a2912e (patch) | |
tree | 8b25bd168e0663c766f0332c8be082aa7d6ed265 /drivers | |
parent | 0df0914d414a504b975f3cc66ace0c16ef55b7f3 (diff) | |
parent | 0dccb0489f9a5a13a33e828ab965aa49685d12f8 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/cmetcalf/linux-tile
* git://git.kernel.org/pub/scm/linux/kernel/git/cmetcalf/linux-tile: (27 commits)
arch/tile: support newer binutils assembler shift semantics
arch/tile: fix deadlock bugs in rwlock implementation
drivers/edac: provide support for tile architecture
tile on-chip network driver: sync up with latest fixes
arch/tile: support 4KB page size as well as 64KB
arch/tile: add some more VMSPLIT options and use consistent naming
arch/tile: fix some comments and whitespace
arch/tile: export some additional module symbols
arch/tile: enhance existing finv_buffer_remote() routine
arch/tile: fix two bugs in the backtracer code
arch/tile: use extended assembly to inline __mb_incoherent()
arch/tile: use a cleaner technique to enable interrupt for cpu_idle()
arch/tile: sync up with <arch/sim.h> and <arch/sim_def.h> changes
arch/tile: fix reversed test of strict_strtol() return value
arch/tile: avoid a simulator warning during bootup
arch/tile: export <asm/hardwall.h> to userspace
arch/tile: warn and retry if an IPI is not accepted by the target cpu
arch/tile: stop disabling INTCTRL_1 interrupts during hypervisor downcalls
arch/tile: fix __ndelay etc to work better
arch/tile: bug fix: exec'ed task thought it was still single-stepping
...
Fix up trivial conflict in arch/tile/kernel/vmlinux.lds.S (percpu
alignment vs section naming convention fix)
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/edac/Kconfig | 10 | ||||
-rw-r--r-- | drivers/edac/Makefile | 1 | ||||
-rw-r--r-- | drivers/edac/tile_edac.c | 254 | ||||
-rw-r--r-- | drivers/net/tile/tilepro.c | 965 |
4 files changed, 781 insertions, 449 deletions
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index fe70a341bd8..fac1a2002e6 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -7,7 +7,7 @@ menuconfig EDAC bool "EDAC (Error Detection And Correction) reporting" depends on HAS_IOMEM - depends on X86 || PPC + depends on X86 || PPC || TILE help EDAC is designed to report errors in the core system. These are low-level errors that are reported in the CPU or @@ -282,4 +282,12 @@ config EDAC_CPC925 a companion chip to the PowerPC 970 family of processors. +config EDAC_TILE + tristate "Tilera Memory Controller" + depends on EDAC_MM_EDAC && TILE + default y + help + Support for error detection and correction on the + Tilera memory controller. + endif # EDAC diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index ba2898b3639..3e239133e29 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -54,3 +54,4 @@ obj-$(CONFIG_EDAC_PPC4XX) += ppc4xx_edac.o obj-$(CONFIG_EDAC_AMD8111) += amd8111_edac.o obj-$(CONFIG_EDAC_AMD8131) += amd8131_edac.o +obj-$(CONFIG_EDAC_TILE) += tile_edac.o diff --git a/drivers/edac/tile_edac.c b/drivers/edac/tile_edac.c new file mode 100644 index 00000000000..1d5cf06f6c6 --- /dev/null +++ b/drivers/edac/tile_edac.c @@ -0,0 +1,254 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + * Tilera-specific EDAC driver. + * + * This source code is derived from the following driver: + * + * Cell MIC driver for ECC counting + * + * Copyright 2007 Benjamin Herrenschmidt, IBM Corp. + * <benh@kernel.crashing.org> + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/platform_device.h> +#include <linux/io.h> +#include <linux/uaccess.h> +#include <linux/edac.h> +#include <hv/hypervisor.h> +#include <hv/drv_mshim_intf.h> + +#include "edac_core.h" + +#define DRV_NAME "tile-edac" + +/* Number of cs_rows needed per memory controller on TILEPro. */ +#define TILE_EDAC_NR_CSROWS 1 + +/* Number of channels per memory controller on TILEPro. */ +#define TILE_EDAC_NR_CHANS 1 + +/* Granularity of reported error in bytes on TILEPro. */ +#define TILE_EDAC_ERROR_GRAIN 8 + +/* TILE processor has multiple independent memory controllers. */ +struct platform_device *mshim_pdev[TILE_MAX_MSHIMS]; + +struct tile_edac_priv { + int hv_devhdl; /* Hypervisor device handle. */ + int node; /* Memory controller instance #. */ + unsigned int ce_count; /* + * Correctable-error counter + * kept by the driver. + */ +}; + +static void tile_edac_check(struct mem_ctl_info *mci) +{ + struct tile_edac_priv *priv = mci->pvt_info; + struct mshim_mem_error mem_error; + + if (hv_dev_pread(priv->hv_devhdl, 0, (HV_VirtAddr)&mem_error, + sizeof(struct mshim_mem_error), MSHIM_MEM_ERROR_OFF) != + sizeof(struct mshim_mem_error)) { + pr_err(DRV_NAME ": MSHIM_MEM_ERROR_OFF pread failure.\n"); + return; + } + + /* Check if the current error count is different from the saved one. */ + if (mem_error.sbe_count != priv->ce_count) { + dev_dbg(mci->dev, "ECC CE err on node %d\n", priv->node); + priv->ce_count = mem_error.sbe_count; + edac_mc_handle_ce(mci, 0, 0, 0, 0, 0, mci->ctl_name); + } +} + +/* + * Initialize the 'csrows' table within the mci control structure with the + * addressing of memory. + */ +static int __devinit tile_edac_init_csrows(struct mem_ctl_info *mci) +{ + struct csrow_info *csrow = &mci->csrows[0]; + struct tile_edac_priv *priv = mci->pvt_info; + struct mshim_mem_info mem_info; + + if (hv_dev_pread(priv->hv_devhdl, 0, (HV_VirtAddr)&mem_info, + sizeof(struct mshim_mem_info), MSHIM_MEM_INFO_OFF) != + sizeof(struct mshim_mem_info)) { + pr_err(DRV_NAME ": MSHIM_MEM_INFO_OFF pread failure.\n"); + return -1; + } + + if (mem_info.mem_ecc) + csrow->edac_mode = EDAC_SECDED; + else + csrow->edac_mode = EDAC_NONE; + switch (mem_info.mem_type) { + case DDR2: + csrow->mtype = MEM_DDR2; + break; + + case DDR3: + csrow->mtype = MEM_DDR3; + break; + + default: + return -1; + } + + csrow->first_page = 0; + csrow->nr_pages = mem_info.mem_size >> PAGE_SHIFT; + csrow->last_page = csrow->first_page + csrow->nr_pages - 1; + csrow->grain = TILE_EDAC_ERROR_GRAIN; + csrow->dtype = DEV_UNKNOWN; + + return 0; +} + +static int __devinit tile_edac_mc_probe(struct platform_device *pdev) +{ + char hv_file[32]; + int hv_devhdl; + struct mem_ctl_info *mci; + struct tile_edac_priv *priv; + int rc; + + sprintf(hv_file, "mshim/%d", pdev->id); + hv_devhdl = hv_dev_open((HV_VirtAddr)hv_file, 0); + if (hv_devhdl < 0) + return -EINVAL; + + /* A TILE MC has a single channel and one chip-select row. */ + mci = edac_mc_alloc(sizeof(struct tile_edac_priv), + TILE_EDAC_NR_CSROWS, TILE_EDAC_NR_CHANS, pdev->id); + if (mci == NULL) + return -ENOMEM; + priv = mci->pvt_info; + priv->node = pdev->id; + priv->hv_devhdl = hv_devhdl; + + mci->dev = &pdev->dev; + mci->mtype_cap = MEM_FLAG_DDR2; + mci->edac_ctl_cap = EDAC_FLAG_SECDED; + + mci->mod_name = DRV_NAME; + mci->ctl_name = "TILEPro_Memory_Controller"; + mci->dev_name = dev_name(&pdev->dev); + mci->edac_check = tile_edac_check; + + /* + * Initialize the MC control structure 'csrows' table + * with the mapping and control information. + */ + if (tile_edac_init_csrows(mci)) { + /* No csrows found. */ + mci->edac_cap = EDAC_FLAG_NONE; + } else { + mci->edac_cap = EDAC_FLAG_SECDED; + } + + platform_set_drvdata(pdev, mci); + + /* Register with EDAC core */ + rc = edac_mc_add_mc(mci); + if (rc) { + dev_err(&pdev->dev, "failed to register with EDAC core\n"); + edac_mc_free(mci); + return rc; + } + + return 0; +} + +static int __devexit tile_edac_mc_remove(struct platform_device *pdev) +{ + struct mem_ctl_info *mci = platform_get_drvdata(pdev); + + edac_mc_del_mc(&pdev->dev); + if (mci) + edac_mc_free(mci); + return 0; +} + +static struct platform_driver tile_edac_mc_driver = { + .driver = { + .name = DRV_NAME, + .owner = THIS_MODULE, + }, + .probe = tile_edac_mc_probe, + .remove = __devexit_p(tile_edac_mc_remove), +}; + +/* + * Driver init routine. + */ +static int __init tile_edac_init(void) +{ + char hv_file[32]; + struct platform_device *pdev; + int i, err, num = 0; + + /* Only support POLL mode. */ + edac_op_state = EDAC_OPSTATE_POLL; + + err = platform_driver_register(&tile_edac_mc_driver); + if (err) + return err; + + for (i = 0; i < TILE_MAX_MSHIMS; i++) { + /* + * Not all memory controllers are configured such as in the + * case of a simulator. So we register only those mshims + * that are configured by the hypervisor. + */ + sprintf(hv_file, "mshim/%d", i); + if (hv_dev_open((HV_VirtAddr)hv_file, 0) < 0) + continue; + + pdev = platform_device_register_simple(DRV_NAME, i, NULL, 0); + if (IS_ERR(pdev)) + continue; + mshim_pdev[i] = pdev; + num++; + } + + if (num == 0) { + platform_driver_unregister(&tile_edac_mc_driver); + return -ENODEV; + } + return 0; +} + +/* + * Driver cleanup routine. + */ +static void __exit tile_edac_exit(void) +{ + int i; + + for (i = 0; i < TILE_MAX_MSHIMS; i++) { + struct platform_device *pdev = mshim_pdev[i]; + if (!pdev) + continue; + + platform_set_drvdata(pdev, NULL); + platform_device_unregister(pdev); + } + platform_driver_unregister(&tile_edac_mc_driver); +} + +module_init(tile_edac_init); +module_exit(tile_edac_exit); diff --git a/drivers/net/tile/tilepro.c b/drivers/net/tile/tilepro.c index 7cb301da747..0825db6d883 100644 --- a/drivers/net/tile/tilepro.c +++ b/drivers/net/tile/tilepro.c @@ -1,5 +1,5 @@ /* - * Copyright 2010 Tilera Corporation. All Rights Reserved. + * Copyright 2011 Tilera Corporation. All Rights Reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -44,10 +44,6 @@ #include <linux/tcp.h> -/* There is no singlethread_cpu, so schedule work on the current cpu. */ -#define singlethread_cpu -1 - - /* * First, "tile_net_init_module()" initializes all four "devices" which * can be used by linux. @@ -73,15 +69,16 @@ * return, knowing we will be called again later. Otherwise, we * reenable the ingress interrupt, and call "napi_complete()". * + * HACK: Since disabling the ingress interrupt is not reliable, we + * ignore the interrupt if the global "active" flag is false. + * * * NOTE: The use of "native_driver" ensures that EPP exists, and that - * "epp_sendv" is legal, and that "LIPP" is being used. + * we are using "LIPP" and "LEPP". * * NOTE: Failing to free completions for an arbitrarily long time * (which is defined to be illegal) does in fact cause bizarre * problems. The "egress_timer" helps prevent this from happening. - * - * NOTE: The egress code can be interrupted by the interrupt handler. */ @@ -142,6 +139,7 @@ MODULE_AUTHOR("Tilera"); MODULE_LICENSE("GPL"); + /* * Queue of incoming packets for a specific cpu and device. * @@ -177,7 +175,7 @@ struct tile_net_cpu { struct tile_netio_queue queue; /* Statistics. */ struct tile_net_stats_t stats; - /* ISSUE: Is this needed? */ + /* True iff NAPI is enabled. */ bool napi_enabled; /* True if this tile has succcessfully registered with the IPP. */ bool registered; @@ -200,20 +198,20 @@ struct tile_net_cpu { struct tile_net_priv { /* Our network device. */ struct net_device *dev; - /* The actual egress queue. */ - lepp_queue_t *epp_queue; - /* Protects "epp_queue->cmd_tail" and "epp_queue->comp_tail" */ - spinlock_t cmd_lock; - /* Protects "epp_queue->comp_head". */ - spinlock_t comp_lock; + /* Pages making up the egress queue. */ + struct page *eq_pages; + /* Address of the actual egress queue. */ + lepp_queue_t *eq; + /* Protects "eq". */ + spinlock_t eq_lock; /* The hypervisor handle for this interface. */ int hv_devhdl; /* The intr bit mask that IDs this device. */ u32 intr_id; /* True iff "tile_net_open_aux()" has succeeded. */ - int partly_opened; - /* True iff "tile_net_open_inner()" has succeeded. */ - int fully_opened; + bool partly_opened; + /* True iff the device is "active". */ + bool active; /* Effective network cpus. */ struct cpumask network_cpus_map; /* Number of network cpus. */ @@ -228,6 +226,10 @@ struct tile_net_priv { struct tile_net_cpu *cpu[NR_CPUS]; }; +/* Log2 of the number of small pages needed for the egress queue. */ +#define EQ_ORDER get_order(sizeof(lepp_queue_t)) +/* Size of the egress queue's pages. */ +#define EQ_SIZE (1 << (PAGE_SHIFT + EQ_ORDER)) /* * The actual devices (xgbe0, xgbe1, gbe0, gbe1). @@ -284,7 +286,11 @@ static void net_printk(char *fmt, ...) */ static void dump_packet(unsigned char *data, unsigned long length, char *s) { + int my_cpu = smp_processor_id(); + unsigned long i; + char buf[128]; + static unsigned int count; pr_info("dump_packet(data %p, length 0x%lx s %s count 0x%x)\n", @@ -294,10 +300,12 @@ static void dump_packet(unsigned char *data, unsigned long length, char *s) for (i = 0; i < length; i++) { if ((i & 0xf) == 0) - sprintf(buf, "%8.8lx:", i); + sprintf(buf, "[%02d] %8.8lx:", my_cpu, i); sprintf(buf + strlen(buf), " %2.2x", data[i]); - if ((i & 0xf) == 0xf || i == length - 1) - pr_info("%s\n", buf); + if ((i & 0xf) == 0xf || i == length - 1) { + strcat(buf, "\n"); + pr_info("%s", buf); + } } } #endif @@ -351,60 +359,109 @@ static void tile_net_provide_linux_buffer(struct tile_net_cpu *info, /* * Provide a linux buffer for LIPP. + * + * Note that the ACTUAL allocation for each buffer is a "struct sk_buff", + * plus a chunk of memory that includes not only the requested bytes, but + * also NET_SKB_PAD bytes of initial padding, and a "struct skb_shared_info". + * + * Note that "struct skb_shared_info" is 88 bytes with 64K pages and + * 268 bytes with 4K pages (since the frags[] array needs 18 entries). + * + * Without jumbo packets, the maximum packet size will be 1536 bytes, + * and we use 2 bytes (NET_IP_ALIGN) of padding. ISSUE: If we told + * the hardware to clip at 1518 bytes instead of 1536 bytes, then we + * could save an entire cache line, but in practice, we don't need it. + * + * Since CPAs are 38 bits, and we can only encode the high 31 bits in + * a "linux_buffer_t", the low 7 bits must be zero, and thus, we must + * align the actual "va" mod 128. + * + * We assume that the underlying "head" will be aligned mod 64. Note + * that in practice, we have seen "head" NOT aligned mod 128 even when + * using 2048 byte allocations, which is surprising. + * + * If "head" WAS always aligned mod 128, we could change LIPP to + * assume that the low SIX bits are zero, and the 7th bit is one, that + * is, align the actual "va" mod 128 plus 64, which would be "free". + * + * For now, the actual "head" pointer points at NET_SKB_PAD bytes of + * padding, plus 28 or 92 bytes of extra padding, plus the sk_buff + * pointer, plus the NET_IP_ALIGN padding, plus 126 or 1536 bytes for + * the actual packet, plus 62 bytes of empty padding, plus some + * padding and the "struct skb_shared_info". + * + * With 64K pages, a large buffer thus needs 32+92+4+2+1536+62+88 + * bytes, or 1816 bytes, which fits comfortably into 2048 bytes. + * + * With 64K pages, a small buffer thus needs 32+92+4+2+126+88 + * bytes, or 344 bytes, which means we are wasting 64+ bytes, and + * could presumably increase the size of small buffers. + * + * With 4K pages, a large buffer thus needs 32+92+4+2+1536+62+268 + * bytes, or 1996 bytes, which fits comfortably into 2048 bytes. + * + * With 4K pages, a small buffer thus needs 32+92+4+2+126+268 + * bytes, or 524 bytes, which is annoyingly wasteful. + * + * Maybe we should increase LIPP_SMALL_PACKET_SIZE to 192? + * + * ISSUE: Maybe we should increase "NET_SKB_PAD" to 64? */ static bool tile_net_provide_needed_buffer(struct tile_net_cpu *info, bool small) { - /* ISSUE: What should we use here? */ +#if TILE_NET_MTU <= 1536 + /* Without "jumbo", 2 + 1536 should be sufficient. */ + unsigned int large_size = NET_IP_ALIGN + 1536; +#else + /* ISSUE: This has not been tested. */ unsigned int large_size = NET_IP_ALIGN + TILE_NET_MTU + 100; +#endif - /* Round up to ensure to avoid "false sharing" with last cache line. */ - unsigned int buffer_size = + /* Avoid "false sharing" with last cache line. */ + /* ISSUE: This is already done by "dev_alloc_skb()". */ + unsigned int len = (((small ? LIPP_SMALL_PACKET_SIZE : large_size) + CHIP_L2_LINE_SIZE() - 1) & -CHIP_L2_LINE_SIZE()); - /* - * ISSUE: Since CPAs are 38 bits, and we can only encode the - * high 31 bits in a "linux_buffer_t", the low 7 bits must be - * zero, and thus, we must align the actual "va" mod 128. - */ - const unsigned long align = 128; + unsigned int padding = 128 - NET_SKB_PAD; + unsigned int align; struct sk_buff *skb; void *va; struct sk_buff **skb_ptr; - /* Note that "dev_alloc_skb()" adds NET_SKB_PAD more bytes, */ - /* and also "reserves" that many bytes. */ - /* ISSUE: Can we "share" the NET_SKB_PAD bytes with "skb_ptr"? */ - int len = sizeof(*skb_ptr) + align + buffer_size; - - while (1) { - - /* Allocate (or fail). */ - skb = dev_alloc_skb(len); - if (skb == NULL) - return false; - - /* Make room for a back-pointer to 'skb'. */ - skb_reserve(skb, sizeof(*skb_ptr)); + /* Request 96 extra bytes for alignment purposes. */ + skb = dev_alloc_skb(len + padding); + if (skb == NULL) + return false; - /* Make sure we are aligned. */ - skb_reserve(skb, -(long)skb->data & (align - 1)); + /* Skip 32 or 96 bytes to align "data" mod 128. */ + align = -(long)skb->data & (128 - 1); + BUG_ON(align > padding); + skb_reserve(skb, align); - /* This address is given to IPP. */ - va = skb->data; + /* This address is given to IPP. */ + va = skb->data; - if (small) - break; + /* Buffers must not span a huge page. */ + BUG_ON(((((long)va & ~HPAGE_MASK) + len) & HPAGE_MASK) != 0); - /* ISSUE: This has never been observed! */ - /* Large buffers must not span a huge page. */ - if (((((long)va & ~HPAGE_MASK) + 1535) & HPAGE_MASK) == 0) - break; - pr_err("Leaking unaligned linux buffer at %p.\n", va); +#ifdef TILE_NET_PARANOIA +#if CHIP_HAS_CBOX_HOME_MAP() + if (hash_default) { + HV_PTE pte = *virt_to_pte(current->mm, (unsigned long)va); + if (hv_pte_get_mode(pte) != HV_PTE_MODE_CACHE_HASH_L3) + panic("Non-HFH ingress buffer! VA=%p Mode=%d PTE=%llx", + va, hv_pte_get_mode(pte), hv_pte_val(pte)); } +#endif +#endif + + /* Invalidate the packet buffer. */ + if (!hash_default) + __inv_buffer(va, len); /* Skip two bytes to satisfy LIPP assumptions. */ /* Note that this aligns IP on a 16 byte boundary. */ @@ -415,23 +472,9 @@ static bool tile_net_provide_needed_buffer(struct tile_net_cpu *info, skb_ptr = va - sizeof(*skb_ptr); *skb_ptr = skb; - /* Invalidate the packet buffer. */ - if (!hash_default) - __inv_buffer(skb->data, buffer_size); - /* Make sure "skb_ptr" has been flushed. */ __insn_mf(); -#ifdef TILE_NET_PARANOIA -#if CHIP_HAS_CBOX_HOME_MAP() - if (hash_default) { - HV_PTE pte = *virt_to_pte(current->mm, (unsigned long)va); - if (hv_pte_get_mode(pte) != HV_PTE_MODE_CACHE_HASH_L3) - panic("Non-coherent ingress buffer!"); - } -#endif -#endif - /* Provide the new buffer. */ tile_net_provide_linux_buffer(info, va, small); @@ -469,48 +512,64 @@ oops: * Grab some LEPP completions, and store them in "comps", of size * "comps_size", and return the number of completions which were * stored, so the caller can free them. - * - * If "pending" is not NULL, it will be set to true if there might - * still be some pending completions caused by this tile, else false. */ -static unsigned int tile_net_lepp_grab_comps(struct net_device *dev, +static unsigned int tile_net_lepp_grab_comps(lepp_queue_t *eq, struct sk_buff *comps[], unsigned int comps_size, - bool *pending) + unsigned int min_size) { - struct tile_net_priv *priv = netdev_priv(dev); - - lepp_queue_t *eq = priv->epp_queue; - unsigned int n = 0; - unsigned int comp_head; - unsigned int comp_busy; - unsigned int comp_tail; - - spin_lock(&priv->comp_lock); - - comp_head = eq->comp_head; - comp_busy = eq->comp_busy; - comp_tail = eq->comp_tail; + unsigned int comp_head = eq->comp_head; + unsigned int comp_busy = eq->comp_busy; while (comp_head != comp_busy && n < comps_size) { comps[n++] = eq->comps[comp_head]; LEPP_QINC(comp_head); } - if (pending != NULL) - *pending = (comp_head != comp_tail); + if (n < min_size) + return 0; eq->comp_head = comp_head; - spin_unlock(&priv->comp_lock); - return n; } /* + * Free some comps, and return true iff there are still some pending. + */ +static bool tile_net_lepp_free_comps(struct net_device *dev, bool all) +{ + struct tile_net_priv *priv = netdev_priv(dev); + + lepp_queue_t *eq = priv->eq; + + struct sk_buff *olds[64]; + unsigned int wanted = 64; + unsigned int i, n; + bool pending; + + spin_lock(&priv->eq_lock); + + if (all) + eq->comp_busy = eq->comp_tail; + + n = tile_net_lepp_grab_comps(eq, olds, wanted, 0); + + pending = (eq->comp_head != eq->comp_tail); + + spin_unlock(&priv->eq_lock); + + for (i = 0; i < n; i++) + kfree_skb(olds[i]); + + return pending; +} + + +/* * Make sure the egress timer is scheduled. * * Note that we use "schedule if not scheduled" logic instead of the more @@ -544,21 +603,11 @@ static void tile_net_handle_egress_timer(unsigned long arg) struct tile_net_cpu *info = (struct tile_net_cpu *)arg; struct net_device *dev = info->napi.dev; - struct sk_buff *olds[32]; - unsigned int wanted = 32; - unsigned int i, nolds = 0; - bool pending; - /* The timer is no longer scheduled. */ info->egress_timer_scheduled = false; - nolds = tile_net_lepp_grab_comps(dev, olds, wanted, &pending); - - for (i = 0; i < nolds; i++) - kfree_skb(olds[i]); - - /* Reschedule timer if needed. */ - if (pending) + /* Free comps, and reschedule timer if more are pending. */ + if (tile_net_lepp_free_comps(dev, false)) tile_net_schedule_egress_timer(info); } @@ -636,8 +685,39 @@ static bool is_dup_ack(char *s1, char *s2, unsigned int len) +static void tile_net_discard_aux(struct tile_net_cpu *info, int index) +{ + struct tile_netio_queue *queue = &info->queue; + netio_queue_impl_t *qsp = queue->__system_part; + netio_queue_user_impl_t *qup = &queue->__user_part; + + int index2_aux = index + sizeof(netio_pkt_t); + int index2 = + ((index2_aux == + qsp->__packet_receive_queue.__last_packet_plus_one) ? + 0 : index2_aux); + + netio_pkt_t *pkt = (netio_pkt_t *)((unsigned long) &qsp[1] + index); + + /* Extract the "linux_buffer_t". */ + unsigned int buffer = pkt->__packet.word; + + /* Convert "linux_buffer_t" to "va". */ + void *va = __va((phys_addr_t)(buffer >> 1) << 7); + + /* Acquire the associated "skb". */ + struct sk_buff **skb_ptr = va - sizeof(*skb_ptr); + struct sk_buff *skb = *skb_ptr; + + kfree_skb(skb); + + /* Consume this packet. */ + qup->__packet_receive_read = index2; +} + + /* - * Like "tile_net_handle_packets()", but just discard packets. + * Like "tile_net_poll()", but just discard packets. */ static void tile_net_discard_packets(struct net_device *dev) { @@ -650,32 +730,8 @@ static void tile_net_discard_packets(struct net_device *dev) while (qup->__packet_receive_read != qsp->__packet_receive_queue.__packet_write) { - int index = qup->__packet_receive_read; - - int index2_aux = index + sizeof(netio_pkt_t); - int index2 = - ((index2_aux == - qsp->__packet_receive_queue.__last_packet_plus_one) ? - 0 : index2_aux); - - netio_pkt_t *pkt = (netio_pkt_t *) - ((unsigned long) &qsp[1] + index); - - /* Extract the "linux_buffer_t". */ - unsigned int buffer = pkt->__packet.word; - - /* Convert "linux_buffer_t" to "va". */ - void *va = __va((phys_addr_t)(buffer >> 1) << 7); - - /* Acquire the associated "skb". */ - struct sk_buff **skb_ptr = va - sizeof(*skb_ptr); - struct sk_buff *skb = *skb_ptr; - - kfree_skb(skb); - - /* Consume this packet. */ - qup->__packet_receive_read = index2; + tile_net_discard_aux(info, index); } } @@ -704,7 +760,8 @@ static bool tile_net_poll_aux(struct tile_net_cpu *info, int index) netio_pkt_metadata_t *metadata = NETIO_PKT_METADATA(pkt); - /* Extract the packet size. */ + /* Extract the packet size. FIXME: Shouldn't the second line */ + /* get subtracted? Mostly moot, since it should be "zero". */ unsigned long len = (NETIO_PKT_CUSTOM_LENGTH(pkt) + NET_IP_ALIGN - NETIO_PACKET_PADDING); @@ -722,15 +779,6 @@ static bool tile_net_poll_aux(struct tile_net_cpu *info, int index) /* Compare to "NETIO_PKT_CUSTOM_DATA(pkt)". */ unsigned char *buf = va + NET_IP_ALIGN; -#ifdef IGNORE_DUP_ACKS - - static int other; - static int final; - static int keep; - static int skip; - -#endif - /* Invalidate the packet buffer. */ if (!hash_default) __inv_buffer(buf, len); @@ -745,16 +793,8 @@ static bool tile_net_poll_aux(struct tile_net_cpu *info, int index) #ifdef TILE_NET_VERIFY_INGRESS if (!NETIO_PKT_L4_CSUM_CORRECT_M(metadata, pkt) && NETIO_PKT_L4_CSUM_CALCULATED_M(metadata, pkt)) { - /* - * FIXME: This complains about UDP packets - * with a "zero" checksum (bug 6624). - */ -#ifdef TILE_NET_PANIC_ON_BAD - dump_packet(buf, len, "rx"); - panic("Bad L4 checksum."); -#else + /* Bug 6624: Includes UDP packets with a "zero" checksum. */ pr_warning("Bad L4 checksum on %d byte packet.\n", len); -#endif } if (!NETIO_PKT_L3_CSUM_CORRECT_M(metadata, pkt) && NETIO_PKT_L3_CSUM_CALCULATED_M(metadata, pkt)) { @@ -769,90 +809,29 @@ static bool tile_net_poll_aux(struct tile_net_cpu *info, int index) } break; case NETIO_PKT_STATUS_BAD: -#ifdef TILE_NET_PANIC_ON_BAD - dump_packet(buf, len, "rx"); - panic("Unexpected BAD packet."); -#else - pr_warning("Unexpected BAD %d byte packet.\n", len); -#endif + pr_warning("Unexpected BAD %ld byte packet.\n", len); } #endif filter = 0; + /* ISSUE: Filter TCP packets with "bad" checksums? */ + if (!(dev->flags & IFF_UP)) { /* Filter packets received before we're up. */ filter = 1; + } else if (NETIO_PKT_STATUS_M(metadata, pkt) == NETIO_PKT_STATUS_BAD) { + /* Filter "truncated" packets. */ + filter = 1; } else if (!(dev->flags & IFF_PROMISC)) { - /* - * FIXME: Implement HW multicast filter. - */ - if (is_unicast_ether_addr(buf)) { + /* FIXME: Implement HW multicast filter. */ + if (!is_multicast_ether_addr(buf)) { /* Filter packets not for our address. */ const u8 *mine = dev->dev_addr; filter = compare_ether_addr(mine, buf); } } -#ifdef IGNORE_DUP_ACKS - - if (len != 66) { - /* FIXME: Must check "is_tcp_ack(buf, len)" somehow. */ - - other++; - - } else if (index2 == - qsp->__packet_receive_queue.__packet_write) { - - final++; - - } else { - - netio_pkt_t *pkt2 = (netio_pkt_t *) - ((unsigned long) &qsp[1] + index2); - - netio_pkt_metadata_t *metadata2 = - NETIO_PKT_METADATA(pkt2); - - /* Extract the packet size. */ - unsigned long len2 = - (NETIO_PKT_CUSTOM_LENGTH(pkt2) + - NET_IP_ALIGN - NETIO_PACKET_PADDING); - - if (len2 == 66 && - NETIO_PKT_FLOW_HASH_M(metadata, pkt) == - NETIO_PKT_FLOW_HASH_M(metadata2, pkt2)) { - - /* Extract the "linux_buffer_t". */ - unsigned int buffer2 = pkt2->__packet.word; - - /* Convert "linux_buffer_t" to "va". */ - void *va2 = - __va((phys_addr_t)(buffer2 >> 1) << 7); - - /* Extract the packet data pointer. */ - /* Compare to "NETIO_PKT_CUSTOM_DATA(pkt)". */ - unsigned char *buf2 = va2 + NET_IP_ALIGN; - - /* Invalidate the packet buffer. */ - if (!hash_default) - __inv_buffer(buf2, len2); - - if (is_dup_ack(buf, buf2, len)) { - skip++; - filter = 1; - } else { - keep++; - } - } - } - - if (net_ratelimit()) - pr_info("Other %d Final %d Keep %d Skip %d.\n", - other, final, keep, skip); - -#endif - if (filter) { /* ISSUE: Update "drop" statistics? */ @@ -877,10 +856,7 @@ static bool tile_net_poll_aux(struct tile_net_cpu *info, int index) /* NOTE: This call also sets "skb->dev = dev". */ skb->protocol = eth_type_trans(skb, dev); - /* ISSUE: Discard corrupt packets? */ - /* ISSUE: Discard packets with bad checksums? */ - - /* Avoid recomputing TCP/UDP checksums. */ + /* Avoid recomputing "good" TCP/UDP checksums. */ if (NETIO_PKT_L4_CSUM_CORRECT_M(metadata, pkt)) skb->ip_summed = CHECKSUM_UNNECESSARY; @@ -912,9 +888,14 @@ static bool tile_net_poll_aux(struct tile_net_cpu *info, int index) /* * Handle some packets for the given device on the current CPU. * - * ISSUE: The "rotting packet" race condition occurs if a packet - * arrives after the queue appears to be empty, and before the - * hypervisor interrupt is re-enabled. + * If "tile_net_stop()" is called on some other tile while this + * function is running, we will return, hopefully before that + * other tile asks us to call "napi_disable()". + * + * The "rotting packet" race condition occurs if a packet arrives + * during the extremely narrow window between the queue appearing to + * be empty, and the ingress interrupt being re-enabled. This happens + * a LOT under heavy network load. */ static int tile_net_poll(struct napi_struct *napi, int budget) { @@ -928,7 +909,7 @@ static int tile_net_poll(struct napi_struct *napi, int budget) unsigned int work = 0; - while (1) { + while (priv->active) { int index = qup->__packet_receive_read; if (index == qsp->__packet_receive_queue.__packet_write) break; @@ -941,19 +922,24 @@ static int tile_net_poll(struct napi_struct *napi, int budget) napi_complete(&info->napi); - /* Re-enable hypervisor interrupts. */ + if (!priv->active) + goto done; + + /* Re-enable the ingress interrupt. */ enable_percpu_irq(priv->intr_id); - /* HACK: Avoid the "rotting packet" problem. */ + /* HACK: Avoid the "rotting packet" problem (see above). */ if (qup->__packet_receive_read != - qsp->__packet_receive_queue.__packet_write) - napi_schedule(&info->napi); - - /* ISSUE: Handle completions? */ + qsp->__packet_receive_queue.__packet_write) { + /* ISSUE: Sometimes this returns zero, presumably */ + /* because an interrupt was handled for this tile. */ + (void)napi_reschedule(&info->napi); + } done: - tile_net_provide_needed_buffers(info); + if (priv->active) + tile_net_provide_needed_buffers(info); return work; } @@ -961,6 +947,12 @@ done: /* * Handle an ingress interrupt for the given device on the current cpu. + * + * ISSUE: Sometimes this gets called after "disable_percpu_irq()" has + * been called! This is probably due to "pending hypervisor downcalls". + * + * ISSUE: Is there any race condition between the "napi_schedule()" here + * and the "napi_complete()" call above? */ static irqreturn_t tile_net_handle_ingress_interrupt(int irq, void *dev_ptr) { @@ -969,9 +961,15 @@ static irqreturn_t tile_net_handle_ingress_interrupt(int irq, void *dev_ptr) int my_cpu = smp_processor_id(); struct tile_net_cpu *info = priv->cpu[my_cpu]; - /* Disable hypervisor interrupt. */ + /* Disable the ingress interrupt. */ disable_percpu_irq(priv->intr_id); + /* Ignore unwanted interrupts. */ + if (!priv->active) + return IRQ_HANDLED; + + /* ISSUE: Sometimes "info->napi_enabled" is false here. */ + napi_schedule(&info->napi); return IRQ_HANDLED; @@ -1005,8 +1003,7 @@ static int tile_net_open_aux(struct net_device *dev) */ { int epp_home = hv_lotar_to_cpu(epp_lotar); - struct page *page = virt_to_page(priv->epp_queue); - homecache_change_page_home(page, 0, epp_home); + homecache_change_page_home(priv->eq_pages, EQ_ORDER, epp_home); } /* @@ -1015,9 +1012,9 @@ static int tile_net_open_aux(struct net_device *dev) { netio_ipp_address_t ea = { .va = 0, - .pa = __pa(priv->epp_queue), + .pa = __pa(priv->eq), .pte = hv_pte(0), - .size = PAGE_SIZE, + .size = EQ_SIZE, }; ea.pte = hv_pte_set_lotar(ea.pte, epp_lotar); ea.pte = hv_pte_set_mode(ea.pte, HV_PTE_MODE_CACHE_TILE_L3); @@ -1043,7 +1040,7 @@ static int tile_net_open_aux(struct net_device *dev) /* - * Register with hypervisor on each CPU. + * Register with hypervisor on the current CPU. * * Strangely, this function does important things even if it "fails", * which is especially common if the link is not up yet. Hopefully @@ -1092,7 +1089,8 @@ static void tile_net_register(void *dev_ptr) priv->cpu[my_cpu] = info; /* - * Register ourselves with the IPP. + * Register ourselves with LIPP. This does a lot of stuff, + * including invoking the LIPP registration code. */ ret = hv_dev_pwrite(priv->hv_devhdl, 0, (HV_VirtAddr)&config, @@ -1101,8 +1099,11 @@ static void tile_net_register(void *dev_ptr) PDEBUG("hv_dev_pwrite(NETIO_IPP_INPUT_REGISTER_OFF) returned %d\n", ret); if (ret < 0) { - printk(KERN_DEBUG "hv_dev_pwrite NETIO_IPP_INPUT_REGISTER_OFF" - " failu |