Diffstat (limited to 'lib/raid6')
-rw-r--r--  lib/raid6/.gitignore    |   1
-rw-r--r--  lib/raid6/Makefile      |  53
-rw-r--r--  lib/raid6/algos.c       | 147
-rw-r--r--  lib/raid6/altivec.uc    |   5
-rw-r--r--  lib/raid6/avx2.c        | 251
-rw-r--r--  lib/raid6/int.uc        |   2
-rw-r--r--  lib/raid6/mktables.c    |  26
-rw-r--r--  lib/raid6/mmx.c         |   2
-rw-r--r--  lib/raid6/neon.c        |  58
-rw-r--r--  lib/raid6/neon.uc       |  80
-rw-r--r--  lib/raid6/recov.c       |  19
-rw-r--r--  lib/raid6/recov_avx2.c  | 323
-rw-r--r--  lib/raid6/recov_ssse3.c | 332
-rw-r--r--  lib/raid6/sse1.c        |   2
-rw-r--r--  lib/raid6/sse2.c        |   8
-rw-r--r--  lib/raid6/test/Makefile |  62
-rw-r--r--  lib/raid6/test/test.c   |  32
-rw-r--r--  lib/raid6/tilegx.uc     |  86
-rw-r--r--  lib/raid6/x86.h         |  19
19 files changed, 1429 insertions, 79 deletions
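
All of the new gen_syndrome() implementations added below (neon.uc, tilegx.uc, avx2.c) vectorize the same per-byte recurrence over GF(2^8): P is the plain XOR of the data bytes, and Q is built by repeatedly multiplying the running value by 2 (shift left, then XOR 0x1d wherever the top bit was set; this is the MASK()/SHLBYTE() pair in the .uc files) and XORing in the next data byte. A minimal portable C sketch of that step, for orientation only (it is not part of the patch):

#include <stddef.h>
#include <stdint.h>

/* Multiply by 2 in GF(2^8) with the RAID-6 polynomial 0x11d. */
static inline uint8_t gf256_mul2(uint8_t v)
{
	uint8_t mask = (v & 0x80) ? 0x1d : 0x00;   /* MASK(v) & x1d     */
	return (uint8_t)(v << 1) ^ mask;           /* SHLBYTE(v) ^ mask */
}

/* Scalar reference: dptr[0..disks-3] are data, dptr[disks-2] is P, dptr[disks-1] is Q. */
static void gen_syndrome_scalar(int disks, size_t bytes, uint8_t **dptr)
{
	uint8_t *p = dptr[disks - 2];   /* XOR parity  */
	uint8_t *q = dptr[disks - 1];   /* RS syndrome */
	size_t d;
	int z;

	for (d = 0; d < bytes; d++) {
		uint8_t wp = 0, wq = 0;

		for (z = disks - 3; z >= 0; z--) {
			uint8_t wd = dptr[z][d];

			wp ^= wd;                    /* P accumulates XOR        */
			wq  = gf256_mul2(wq) ^ wd;   /* Q accumulates 2*Q ^ data */
		}
		p[d] = wp;
		q[d] = wq;
	}
}

The NEON, TILE-Gx and AVX2 code in the diff below performs exactly this recurrence, but on 8 to 128 bytes per loop iteration instead of one.
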
diff --git a/lib/raid6/.gitignore b/lib/raid6/.gitignore index 162becacf97..0a7e494b2bc 100644 --- a/lib/raid6/.gitignore +++ b/lib/raid6/.gitignore @@ -2,3 +2,4 @@ mktables  altivec*.c  int*.c  tables.c +neon?.c diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 8a38102770f..c7dab064555 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -1,8 +1,13 @@  obj-$(CONFIG_RAID6_PQ)	+= raid6_pq.o  raid6_pq-y	+= algos.o recov.o tables.o int1.o int2.o int4.o \ -		   int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \ -		   altivec8.o mmx.o sse1.o sse2.o +		   int8.o int16.o int32.o + +raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o +raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o +raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o +raid6_pq-$(CONFIG_TILEGX) += tilegx8.o +  hostprogs-y	+= mktables  quiet_cmd_unroll = UNROLL  $@ @@ -13,6 +18,21 @@ ifeq ($(CONFIG_ALTIVEC),y)  altivec_flags := -maltivec -mabi=altivec  endif +# The GCC option -ffreestanding is required in order to compile code containing +# ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel) +ifeq ($(CONFIG_KERNEL_MODE_NEON),y) +NEON_FLAGS := -ffreestanding +ifeq ($(ARCH),arm) +NEON_FLAGS += -mfloat-abi=softfp -mfpu=neon +endif +ifeq ($(ARCH),arm64) +CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only +CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only +CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only +CFLAGS_REMOVE_neon8.o += -mgeneral-regs-only +endif +endif +  targets += int1.c  $(obj)/int1.c:   UNROLL := 1  $(obj)/int1.c:   $(src)/int.uc $(src)/unroll.awk FORCE @@ -67,6 +87,35 @@ $(obj)/altivec8.c:   UNROLL := 8  $(obj)/altivec8.c:   $(src)/altivec.uc $(src)/unroll.awk FORCE  	$(call if_changed,unroll) +CFLAGS_neon1.o += $(NEON_FLAGS) +targets += neon1.c +$(obj)/neon1.c:   UNROLL := 1 +$(obj)/neon1.c:   $(src)/neon.uc $(src)/unroll.awk FORCE +	$(call if_changed,unroll) + +CFLAGS_neon2.o += $(NEON_FLAGS) +targets += neon2.c +$(obj)/neon2.c:   UNROLL := 2 +$(obj)/neon2.c:   $(src)/neon.uc $(src)/unroll.awk FORCE +	$(call if_changed,unroll) + +CFLAGS_neon4.o += $(NEON_FLAGS) +targets += neon4.c +$(obj)/neon4.c:   UNROLL := 4 +$(obj)/neon4.c:   $(src)/neon.uc $(src)/unroll.awk FORCE +	$(call if_changed,unroll) + +CFLAGS_neon8.o += $(NEON_FLAGS) +targets += neon8.c +$(obj)/neon8.c:   UNROLL := 8 +$(obj)/neon8.c:   $(src)/neon.uc $(src)/unroll.awk FORCE +	$(call if_changed,unroll) + +targets += tilegx8.c +$(obj)/tilegx8.c:   UNROLL := 8 +$(obj)/tilegx8.c:   $(src)/tilegx.uc $(src)/unroll.awk FORCE +	$(call if_changed,unroll) +  quiet_cmd_mktable = TABLE   $@        cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index b595f560bee..f0b1aa3586d 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -21,6 +21,7 @@  #include <sys/mman.h>  #include <stdio.h>  #else +#include <linux/module.h>  #include <linux/gfp.h>  #if !RAID6_USE_EMPTY_ZERO_PAGE  /* In .bss so it's zeroed */ @@ -33,10 +34,6 @@ struct raid6_calls raid6_call;  EXPORT_SYMBOL_GPL(raid6_call);  const struct raid6_calls * const raid6_algos[] = { -	&raid6_intx1, -	&raid6_intx2, -	&raid6_intx4, -	&raid6_intx8,  #if defined(__ia64__)  	&raid6_intx16,  	&raid6_intx32, @@ -48,11 +45,20 @@ const struct raid6_calls * const raid6_algos[] = {  	&raid6_sse1x2,  	&raid6_sse2x1,  	&raid6_sse2x2, +#ifdef CONFIG_AS_AVX2 +	&raid6_avx2x1, +	&raid6_avx2x2, +#endif  #endif  #if defined(__x86_64__) && !defined(__arch_um__) 
 	&raid6_sse2x1,  	&raid6_sse2x2,  	&raid6_sse2x4, +#ifdef CONFIG_AS_AVX2 +	&raid6_avx2x1, +	&raid6_avx2x2, +	&raid6_avx2x4, +#endif  #endif  #ifdef CONFIG_ALTIVEC  	&raid6_altivec1, @@ -60,6 +66,36 @@ const struct raid6_calls * const raid6_algos[] = {  	&raid6_altivec4,  	&raid6_altivec8,  #endif +#if defined(CONFIG_TILEGX) +	&raid6_tilegx8, +#endif +	&raid6_intx1, +	&raid6_intx2, +	&raid6_intx4, +	&raid6_intx8, +#ifdef CONFIG_KERNEL_MODE_NEON +	&raid6_neonx1, +	&raid6_neonx2, +	&raid6_neonx4, +	&raid6_neonx8, +#endif +	NULL +}; + +void (*raid6_2data_recov)(int, size_t, int, int, void **); +EXPORT_SYMBOL_GPL(raid6_2data_recov); + +void (*raid6_datap_recov)(int, size_t, int, void **); +EXPORT_SYMBOL_GPL(raid6_datap_recov); + +const struct raid6_recov_calls *const raid6_recov_algos[] = { +#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) +#ifdef CONFIG_AS_AVX2 +	&raid6_recov_avx2, +#endif +	&raid6_recov_ssse3, +#endif +	&raid6_recov_intx1,  	NULL  }; @@ -71,59 +107,55 @@ const struct raid6_calls * const raid6_algos[] = {  #define time_before(x, y) ((x) < (y))  #endif -/* Try to pick the best algorithm */ -/* This code uses the gfmul table as convenient data set to abuse */ - -int __init raid6_select_algo(void) +static inline const struct raid6_recov_calls *raid6_choose_recov(void)  { -	const struct raid6_calls * const * algo; -	const struct raid6_calls * best; -	char *syndromes; -	void *dptrs[(65536/PAGE_SIZE)+2]; -	int i, disks; -	unsigned long perf, bestperf; -	int bestprefer; -	unsigned long j0, j1; +	const struct raid6_recov_calls *const *algo; +	const struct raid6_recov_calls *best; -	disks = (65536/PAGE_SIZE)+2; -	for ( i = 0 ; i < disks-2 ; i++ ) { -		dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; -	} +	for (best = NULL, algo = raid6_recov_algos; *algo; algo++) +		if (!best || (*algo)->priority > best->priority) +			if (!(*algo)->valid || (*algo)->valid()) +				best = *algo; -	/* Normal code - use a 2-page allocation to avoid D$ conflict */ -	syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); +	if (best) { +		raid6_2data_recov = best->data2; +		raid6_datap_recov = best->datap; -	if ( !syndromes ) { -		printk("raid6: Yikes!  No memory available.\n"); -		return -ENOMEM; -	} +		printk("raid6: using %s recovery algorithm\n", best->name); +	} else +		printk("raid6: Yikes! 
No recovery algorithm found!\n"); -	dptrs[disks-2] = syndromes; -	dptrs[disks-1] = syndromes + PAGE_SIZE; +	return best; +} -	bestperf = 0;  bestprefer = 0;  best = NULL; +static inline const struct raid6_calls *raid6_choose_gen( +	void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks) +{ +	unsigned long perf, bestperf, j0, j1; +	const struct raid6_calls *const *algo; +	const struct raid6_calls *best; + +	for (bestperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) { +		if (!best || (*algo)->prefer >= best->prefer) { +			if ((*algo)->valid && !(*algo)->valid()) +				continue; -	for ( algo = raid6_algos ; *algo ; algo++ ) { -		if ( !(*algo)->valid || (*algo)->valid() ) {  			perf = 0;  			preempt_disable();  			j0 = jiffies; -			while ( (j1 = jiffies) == j0 ) +			while ((j1 = jiffies) == j0)  				cpu_relax();  			while (time_before(jiffies,  					    j1 + (1<<RAID6_TIME_JIFFIES_LG2))) { -				(*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs); +				(*algo)->gen_syndrome(disks, PAGE_SIZE, *dptrs);  				perf++;  			}  			preempt_enable(); -			if ( (*algo)->prefer > bestprefer || -			     ((*algo)->prefer == bestprefer && -			      perf > bestperf) ) { -				best = *algo; -				bestprefer = best->prefer; +			if (perf > bestperf) {  				bestperf = perf; +				best = *algo;  			}  			printk("raid6: %-8s %5ld MB/s\n", (*algo)->name,  			       (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); @@ -138,9 +170,46 @@ int __init raid6_select_algo(void)  	} else  		printk("raid6: Yikes!  No algorithm found!\n"); +	return best; +} + + +/* Try to pick the best algorithm */ +/* This code uses the gfmul table as convenient data set to abuse */ + +int __init raid6_select_algo(void) +{ +	const int disks = (65536/PAGE_SIZE)+2; + +	const struct raid6_calls *gen_best; +	const struct raid6_recov_calls *rec_best; +	char *syndromes; +	void *dptrs[(65536/PAGE_SIZE)+2]; +	int i; + +	for (i = 0; i < disks-2; i++) +		dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; + +	/* Normal code - use a 2-page allocation to avoid D$ conflict */ +	syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); + +	if (!syndromes) { +		printk("raid6: Yikes!  No memory available.\n"); +		return -ENOMEM; +	} + +	dptrs[disks-2] = syndromes; +	dptrs[disks-1] = syndromes + PAGE_SIZE; + +	/* select raid gen_syndrome function */ +	gen_best = raid6_choose_gen(&dptrs, disks); + +	/* select raid recover functions */ +	rec_best = raid6_choose_recov(); +  	free_pages((unsigned long)syndromes, 1); -	return best ? 0 : -EINVAL; +	return gen_best && rec_best ? 0 : -EINVAL;  }  static void raid6_exit(void) diff --git a/lib/raid6/altivec.uc b/lib/raid6/altivec.uc index 2654d5c854b..7cc12b532e9 100644 --- a/lib/raid6/altivec.uc +++ b/lib/raid6/altivec.uc @@ -24,13 +24,10 @@  #include <linux/raid/pq.h> -#ifdef CONFIG_ALTIVEC -  #include <altivec.h>  #ifdef __KERNEL__ -# include <asm/system.h>  # include <asm/cputable.h> -#endif +# include <asm/switch_to.h>  /*   * This is the C data type to use.  We use a vector of diff --git a/lib/raid6/avx2.c b/lib/raid6/avx2.c new file mode 100644 index 00000000000..bc3b1dd436e --- /dev/null +++ b/lib/raid6/avx2.c @@ -0,0 +1,251 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + *   Copyright (C) 2012 Intel Corporation + *   Author: Yuanhan Liu <yuanhan.liu@linux.intel.com> + * + *   Based on sse2.c: Copyright 2002 H. 
Peter Anvin - All Rights Reserved + * + * + *   This program is free software; you can redistribute it and/or modify + *   it under the terms of the GNU General Public License as published by + *   the Free Software Foundation, Inc., 53 Temple Place Ste 330, + *   Boston MA 02111-1307, USA; either version 2 of the License, or + *   (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * AVX2 implementation of RAID-6 syndrome functions + * + */ + +#ifdef CONFIG_AS_AVX2 + +#include <linux/raid/pq.h> +#include "x86.h" + +static const struct raid6_avx2_constants { +	u64 x1d[4]; +} raid6_avx2_constants __aligned(32) = { +	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, +	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,}, +}; + +static int raid6_have_avx2(void) +{ +	return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX); +} + +/* + * Plain AVX2 implementation + */ +static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ +	u8 **dptr = (u8 **)ptrs; +	u8 *p, *q; +	int d, z, z0; + +	z0 = disks - 3;		/* Highest data disk */ +	p = dptr[z0+1];		/* XOR parity */ +	q = dptr[z0+2];		/* RS syndrome */ + +	kernel_fpu_begin(); + +	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); +	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* Zero temp */ + +	for (d = 0; d < bytes; d += 32) { +		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); +		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */ +		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); +		asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */ +		asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d])); +		for (z = z0-2; z >= 0; z--) { +			asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); +			asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5"); +			asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); +			asm volatile("vpand %ymm0,%ymm5,%ymm5"); +			asm volatile("vpxor %ymm5,%ymm4,%ymm4"); +			asm volatile("vpxor %ymm6,%ymm2,%ymm2"); +			asm volatile("vpxor %ymm6,%ymm4,%ymm4"); +			asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d])); +		} +		asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5"); +		asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); +		asm volatile("vpand %ymm0,%ymm5,%ymm5"); +		asm volatile("vpxor %ymm5,%ymm4,%ymm4"); +		asm volatile("vpxor %ymm6,%ymm2,%ymm2"); +		asm volatile("vpxor %ymm6,%ymm4,%ymm4"); + +		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); +		asm volatile("vpxor %ymm2,%ymm2,%ymm2"); +		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); +		asm volatile("vpxor %ymm4,%ymm4,%ymm4"); +	} + +	asm volatile("sfence" : : : "memory"); +	kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx2x1 = { +	raid6_avx21_gen_syndrome, +	raid6_have_avx2, +	"avx2x1", +	1			/* Has cache hints */ +}; + +/* + * Unrolled-by-2 AVX2 implementation + */ +static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ +	u8 **dptr = (u8 **)ptrs; +	u8 *p, *q; +	int d, z, z0; + +	z0 = disks - 3;		/* Highest data disk */ +	p = dptr[z0+1];		/* XOR parity */ +	q = dptr[z0+2];		/* RS syndrome */ + +	kernel_fpu_begin(); + +	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); +	asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */ + +	/* We uniformly assume a single prefetch covers at least 32 bytes */ +	for (d = 0; d < bytes; d += 64) { +		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); +		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32])); +		asm volatile("vmovdqa 
%0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */ +		asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */ +		asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */ +		asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */ +		for (z = z0-1; z >= 0; z--) { +			asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); +			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32])); +			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5"); +			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7"); +			asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); +			asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); +			asm volatile("vpand %ymm0,%ymm5,%ymm5"); +			asm volatile("vpand %ymm0,%ymm7,%ymm7"); +			asm volatile("vpxor %ymm5,%ymm4,%ymm4"); +			asm volatile("vpxor %ymm7,%ymm6,%ymm6"); +			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d])); +			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32])); +			asm volatile("vpxor %ymm5,%ymm2,%ymm2"); +			asm volatile("vpxor %ymm7,%ymm3,%ymm3"); +			asm volatile("vpxor %ymm5,%ymm4,%ymm4"); +			asm volatile("vpxor %ymm7,%ymm6,%ymm6"); +		} +		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); +		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32])); +		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); +		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32])); +	} + +	asm volatile("sfence" : : : "memory"); +	kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx2x2 = { +	raid6_avx22_gen_syndrome, +	raid6_have_avx2, +	"avx2x2", +	1			/* Has cache hints */ +}; + +#ifdef CONFIG_X86_64 + +/* + * Unrolled-by-4 AVX2 implementation + */ +static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ +	u8 **dptr = (u8 **)ptrs; +	u8 *p, *q; +	int d, z, z0; + +	z0 = disks - 3;		/* Highest data disk */ +	p = dptr[z0+1];		/* XOR parity */ +	q = dptr[z0+2];		/* RS syndrome */ + +	kernel_fpu_begin(); + +	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); +	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */ +	asm volatile("vpxor %ymm2,%ymm2,%ymm2");	/* P[0] */ +	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* P[1] */ +	asm volatile("vpxor %ymm4,%ymm4,%ymm4");	/* Q[0] */ +	asm volatile("vpxor %ymm6,%ymm6,%ymm6");	/* Q[1] */ +	asm volatile("vpxor %ymm10,%ymm10,%ymm10");	/* P[2] */ +	asm volatile("vpxor %ymm11,%ymm11,%ymm11");	/* P[3] */ +	asm volatile("vpxor %ymm12,%ymm12,%ymm12");	/* Q[2] */ +	asm volatile("vpxor %ymm14,%ymm14,%ymm14");	/* Q[3] */ + +	for (d = 0; d < bytes; d += 128) { +		for (z = z0; z >= 0; z--) { +			asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); +			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32])); +			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64])); +			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96])); +			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5"); +			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7"); +			asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13"); +			asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15"); +			asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); +			asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); +			asm volatile("vpaddb %ymm12,%ymm12,%ymm12"); +			asm volatile("vpaddb %ymm14,%ymm14,%ymm14"); +			asm volatile("vpand %ymm0,%ymm5,%ymm5"); +			asm volatile("vpand %ymm0,%ymm7,%ymm7"); +			asm volatile("vpand %ymm0,%ymm13,%ymm13"); +			asm volatile("vpand %ymm0,%ymm15,%ymm15"); +			asm volatile("vpxor %ymm5,%ymm4,%ymm4"); +			asm volatile("vpxor %ymm7,%ymm6,%ymm6"); +			asm volatile("vpxor %ymm13,%ymm12,%ymm12"); +			asm volatile("vpxor %ymm15,%ymm14,%ymm14"); +			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d])); +			asm volatile("vmovdqa 
%0,%%ymm7" : : "m" (dptr[z][d+32])); +			asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64])); +			asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96])); +			asm volatile("vpxor %ymm5,%ymm2,%ymm2"); +			asm volatile("vpxor %ymm7,%ymm3,%ymm3"); +			asm volatile("vpxor %ymm13,%ymm10,%ymm10"); +			asm volatile("vpxor %ymm15,%ymm11,%ymm11"); +			asm volatile("vpxor %ymm5,%ymm4,%ymm4"); +			asm volatile("vpxor %ymm7,%ymm6,%ymm6"); +			asm volatile("vpxor %ymm13,%ymm12,%ymm12"); +			asm volatile("vpxor %ymm15,%ymm14,%ymm14"); +		} +		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); +		asm volatile("vpxor %ymm2,%ymm2,%ymm2"); +		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32])); +		asm volatile("vpxor %ymm3,%ymm3,%ymm3"); +		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64])); +		asm volatile("vpxor %ymm10,%ymm10,%ymm10"); +		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96])); +		asm volatile("vpxor %ymm11,%ymm11,%ymm11"); +		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); +		asm volatile("vpxor %ymm4,%ymm4,%ymm4"); +		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32])); +		asm volatile("vpxor %ymm6,%ymm6,%ymm6"); +		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64])); +		asm volatile("vpxor %ymm12,%ymm12,%ymm12"); +		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96])); +		asm volatile("vpxor %ymm14,%ymm14,%ymm14"); +	} + +	asm volatile("sfence" : : : "memory"); +	kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx2x4 = { +	raid6_avx24_gen_syndrome, +	raid6_have_avx2, +	"avx2x4", +	1			/* Has cache hints */ +}; +#endif + +#endif /* CONFIG_AS_AVX2 */ diff --git a/lib/raid6/int.uc b/lib/raid6/int.uc index d1e276a14fa..5b50f8dfc5d 100644 --- a/lib/raid6/int.uc +++ b/lib/raid6/int.uc @@ -11,7 +11,7 @@   * ----------------------------------------------------------------------- */  /* - * raid6int$#.c + * int$#.c   *   * $#-way unrolled portable integer math RAID-6 instruction set   * diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c index 3b1500843bb..39787db588b 100644 --- a/lib/raid6/mktables.c +++ b/lib/raid6/mktables.c @@ -60,6 +60,7 @@ int main(int argc, char *argv[])  	uint8_t exptbl[256], invtbl[256];  	printf("#include <linux/raid/pq.h>\n"); +	printf("#include <linux/export.h>\n");  	/* Compute multiplication table */  	printf("\nconst u8  __attribute__((aligned(256)))\n" @@ -80,6 +81,31 @@ int main(int argc, char *argv[])  	printf("EXPORT_SYMBOL(raid6_gfmul);\n");  	printf("#endif\n"); +	/* Compute vector multiplication table */ +	printf("\nconst u8  __attribute__((aligned(256)))\n" +		"raid6_vgfmul[256][32] =\n" +		"{\n"); +	for (i = 0; i < 256; i++) { +		printf("\t{\n"); +		for (j = 0; j < 16; j += 8) { +			printf("\t\t"); +			for (k = 0; k < 8; k++) +				printf("0x%02x,%c", gfmul(i, j + k), +				       (k == 7) ? '\n' : ' '); +		} +		for (j = 0; j < 16; j += 8) { +			printf("\t\t"); +			for (k = 0; k < 8; k++) +				printf("0x%02x,%c", gfmul(i, (j + k) << 4), +				       (k == 7) ? 
'\n' : ' '); +		} +		printf("\t},\n"); +	} +	printf("};\n"); +	printf("#ifdef __KERNEL__\n"); +	printf("EXPORT_SYMBOL(raid6_vgfmul);\n"); +	printf("#endif\n"); +  	/* Compute power-of-2 table (exponent) */  	v = 1;  	printf("\nconst u8 __attribute__((aligned(256)))\n" diff --git a/lib/raid6/mmx.c b/lib/raid6/mmx.c index 279347f2309..590c71c9e20 100644 --- a/lib/raid6/mmx.c +++ b/lib/raid6/mmx.c @@ -16,7 +16,7 @@   * MMX implementation of RAID-6 syndrome functions   */ -#if defined(__i386__) && !defined(__arch_um__) +#ifdef CONFIG_X86_32  #include <linux/raid/pq.h>  #include "x86.h" diff --git a/lib/raid6/neon.c b/lib/raid6/neon.c new file mode 100644 index 00000000000..36ad4705df1 --- /dev/null +++ b/lib/raid6/neon.c @@ -0,0 +1,58 @@ +/* + * linux/lib/raid6/neon.c - RAID6 syndrome calculation using ARM NEON intrinsics + * + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/raid/pq.h> + +#ifdef __KERNEL__ +#include <asm/neon.h> +#else +#define kernel_neon_begin() +#define kernel_neon_end() +#define cpu_has_neon()		(1) +#endif + +/* + * There are 2 reasons these wrappers are kept in a separate compilation unit + * from the actual implementations in neonN.c (generated from neon.uc by + * unroll.awk): + * - the actual implementations use NEON intrinsics, and the GCC support header + *   (arm_neon.h) is not fully compatible (type wise) with the kernel; + * - the neonN.c files are compiled with -mfpu=neon and optimization enabled, + *   and we have to make sure that we never use *any* NEON/VFP instructions + *   outside a kernel_neon_begin()/kernel_neon_end() pair. + */ + +#define RAID6_NEON_WRAPPER(_n)						\ +	static void raid6_neon ## _n ## _gen_syndrome(int disks,	\ +					size_t bytes, void **ptrs)	\ +	{								\ +		void raid6_neon ## _n  ## _gen_syndrome_real(int,	\ +						unsigned long, void**);	\ +		kernel_neon_begin();					\ +		raid6_neon ## _n ## _gen_syndrome_real(disks,		\ +					(unsigned long)bytes, ptrs);	\ +		kernel_neon_end();					\ +	}								\ +	struct raid6_calls const raid6_neonx ## _n = {			\ +		raid6_neon ## _n ## _gen_syndrome,			\ +		raid6_have_neon,					\ +		"neonx" #_n,						\ +		0							\ +	} + +static int raid6_have_neon(void) +{ +	return cpu_has_neon(); +} + +RAID6_NEON_WRAPPER(1); +RAID6_NEON_WRAPPER(2); +RAID6_NEON_WRAPPER(4); +RAID6_NEON_WRAPPER(8); diff --git a/lib/raid6/neon.uc b/lib/raid6/neon.uc new file mode 100644 index 00000000000..1b9ed793342 --- /dev/null +++ b/lib/raid6/neon.uc @@ -0,0 +1,80 @@ +/* ----------------------------------------------------------------------- + * + *   neon.uc - RAID-6 syndrome calculation using ARM NEON instructions + * + *   Copyright (C) 2012 Rob Herring + * + *   Based on altivec.uc: + *     Copyright 2002-2004 H. Peter Anvin - All Rights Reserved + * + *   This program is free software; you can redistribute it and/or modify + *   it under the terms of the GNU General Public License as published by + *   the Free Software Foundation, Inc., 53 Temple Place Ste 330, + *   Boston MA 02111-1307, USA; either version 2 of the License, or + *   (at your option) any later version; incorporated herein by reference. 
+ * + * ----------------------------------------------------------------------- */ + +/* + * neon$#.c + * + * $#-way unrolled NEON intrinsics math RAID-6 instruction set + * + * This file is postprocessed using unroll.awk + */ + +#include <arm_neon.h> + +typedef uint8x16_t unative_t; + +#define NBYTES(x) ((unative_t){x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x}) +#define NSIZE	sizeof(unative_t) + +/* + * The SHLBYTE() operation shifts each byte left by 1, *not* + * rolling over into the next byte + */ +static inline unative_t SHLBYTE(unative_t v) +{ +	return vshlq_n_u8(v, 1); +} + +/* + * The MASK() operation returns 0xFF in any byte for which the high + * bit is 1, 0x00 for any byte for which the high bit is 0. + */ +static inline unative_t MASK(unative_t v) +{ +	const uint8x16_t temp = NBYTES(0); +	return (unative_t)vcltq_s8((int8x16_t)v, (int8x16_t)temp); +} + +void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) +{ +	uint8_t **dptr = (uint8_t **)ptrs; +	uint8_t *p, *q; +	int d, z, z0; + +	register unative_t wd$$, wq$$, wp$$, w1$$, w2$$; +	const unative_t x1d = NBYTES(0x1d); + +	z0 = disks - 3;		/* Highest data disk */ +	p = dptr[z0+1];		/* XOR parity */ +	q = dptr[z0+2];		/* RS syndrome */ + +	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { +		wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]); +		for ( z = z0-1 ; z >= 0 ; z-- ) { +			wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]); +			wp$$ = veorq_u8(wp$$, wd$$); +			w2$$ = MASK(wq$$); +			w1$$ = SHLBYTE(wq$$); + +			w2$$ = vandq_u8(w2$$, x1d); +			w1$$ = veorq_u8(w1$$, w2$$); +			wq$$ = veorq_u8(w1$$, wd$$); +		} +		vst1q_u8(&p[d+NSIZE*$$], wp$$); +		vst1q_u8(&q[d+NSIZE*$$], wq$$); +	} +} diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c index 8590d19cf52..a95bccb8497 100644 --- a/lib/raid6/recov.c +++ b/lib/raid6/recov.c @@ -18,11 +18,12 @@   * the syndrome.)   */ +#include <linux/export.h>  #include <linux/raid/pq.h>  /* Recover two failed data blocks. */ -void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, -		       void **ptrs) +static void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, +		int failb, void **ptrs)  {  	u8 *p, *q, *dp, *dq;  	u8 px, qx, db; @@ -63,10 +64,10 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,  		p++; q++;  	}  } -EXPORT_SYMBOL_GPL(raid6_2data_recov);  /* Recover failure of one data block plus the P block */ -void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) +static void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, +		void **ptrs)  {  	u8 *p, *q, *dq;  	const u8 *qmul;		/* Q multiplier table */ @@ -95,7 +96,15 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)  		q++; dq++;  	}  } -EXPORT_SYMBOL_GPL(raid6_datap_recov); + + +const struct raid6_recov_calls raid6_recov_intx1 = { +	.data2 = raid6_2data_recov_intx1, +	.datap = raid6_datap_recov_intx1, +	.valid = NULL, +	.name = "intx1", +	.priority = 0, +};  #ifndef __KERNEL__  /* Testing only */ diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c new file mode 100644 index 00000000000..e1eea433a49 --- /dev/null +++ b/lib/raid6/recov_avx2.c @@ -0,0 +1,323 @@ +/* + * Copyright (C) 2012 Intel Corporation + * Author: Jim Kukunas <james.t.kukunas@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */ + +#if CONFIG_AS_AVX2 + +#include <linux/raid/pq.h> +#include "x86.h" + +static int raid6_has_avx2(void) +{ +	return boot_cpu_has(X86_FEATURE_AVX2) && +		boot_cpu_has(X86_FEATURE_AVX); +} + +static void raid6_2data_recov_avx2(int disks, size_t bytes, int faila, +		int failb, void **ptrs) +{ +	u8 *p, *q, *dp, *dq; +	const u8 *pbmul;	/* P multiplier table for B data */ +	const u8 *qmul;		/* Q multiplier table (for both) */ +	const u8 x0f = 0x0f; + +	p = (u8 *)ptrs[disks-2]; +	q = (u8 *)ptrs[disks-1]; + +	/* Compute syndrome with zero for the missing data pages +	   Use the dead data pages as temporary storage for +	   delta p and delta q */ +	dp = (u8 *)ptrs[faila]; +	ptrs[faila] = (void *)raid6_empty_zero_page; +	ptrs[disks-2] = dp; +	dq = (u8 *)ptrs[failb]; +	ptrs[failb] = (void *)raid6_empty_zero_page; +	ptrs[disks-1] = dq; + +	raid6_call.gen_syndrome(disks, bytes, ptrs); + +	/* Restore pointer table */ +	ptrs[faila]   = dp; +	ptrs[failb]   = dq; +	ptrs[disks-2] = p; +	ptrs[disks-1] = q; + +	/* Now, pick the proper data tables */ +	pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; +	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ +		raid6_gfexp[failb]]]; + +	kernel_fpu_begin(); + +	/* ymm0 = x0f[16] */ +	asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f)); + +	while (bytes) { +#ifdef CONFIG_X86_64 +		asm volatile("vmovdqa %0, %%ymm1" : : "m" (q[0])); +		asm volatile("vmovdqa %0, %%ymm9" : : "m" (q[32])); +		asm volatile("vmovdqa %0, %%ymm0" : : "m" (p[0])); +		asm volatile("vmovdqa %0, %%ymm8" : : "m" (p[32])); +		asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (dq[0])); +		asm volatile("vpxor %0, %%ymm9, %%ymm9" : : "m" (dq[32])); +		asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (dp[0])); +		asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (dp[32])); + +		/* +		 * 1 = dq[0]  ^ q[0] +		 * 9 = dq[32] ^ q[32] +		 * 0 = dp[0]  ^ p[0] +		 * 8 = dp[32] ^ p[32] +		 */ + +		asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0])); +		asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16])); + +		asm volatile("vpsraw $4, %ymm1, %ymm3"); +		asm volatile("vpsraw $4, %ymm9, %ymm12"); +		asm volatile("vpand %ymm7, %ymm1, %ymm1"); +		asm volatile("vpand %ymm7, %ymm9, %ymm9"); +		asm volatile("vpand %ymm7, %ymm3, %ymm3"); +		asm volatile("vpand %ymm7, %ymm12, %ymm12"); +		asm volatile("vpshufb %ymm9, %ymm4, %ymm14"); +		asm volatile("vpshufb %ymm1, %ymm4, %ymm4"); +		asm volatile("vpshufb %ymm12, %ymm5, %ymm15"); +		asm volatile("vpshufb %ymm3, %ymm5, %ymm5"); +		asm volatile("vpxor %ymm14, %ymm15, %ymm15"); +		asm volatile("vpxor %ymm4, %ymm5, %ymm5"); + +		/* +		 * 5 = qx[0] +		 * 15 = qx[32] +		 */ + +		asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0])); +		asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16])); +		asm volatile("vpsraw $4, %ymm0, %ymm2"); +		asm volatile("vpsraw $4, %ymm8, %ymm6"); +		asm volatile("vpand %ymm7, %ymm0, %ymm3"); +		asm volatile("vpand %ymm7, %ymm8, %ymm14"); +		asm volatile("vpand %ymm7, %ymm2, %ymm2"); +		asm volatile("vpand %ymm7, %ymm6, %ymm6"); +		asm volatile("vpshufb %ymm14, %ymm4, %ymm12"); +		asm volatile("vpshufb %ymm3, %ymm4, %ymm4"); +		asm volatile("vpshufb %ymm6, %ymm1, %ymm13"); +		asm volatile("vpshufb %ymm2, %ymm1, %ymm1"); +		asm volatile("vpxor %ymm4, %ymm1, %ymm1"); +		asm volatile("vpxor %ymm12, %ymm13, %ymm13"); + +		/* +		 * 1  = pbmul[px[0]] +		 * 13 = pbmul[px[32]] +		 */ +		asm volatile("vpxor %ymm5, %ymm1, %ymm1"); +		asm volatile("vpxor %ymm15, %ymm13, %ymm13"); + +		/* +		 * 1 = db = DQ +		 * 13 = db[32] 
= DQ[32] +		 */ +		asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); +		asm volatile("vmovdqa %%ymm13,%0" : "=m" (dq[32])); +		asm volatile("vpxor %ymm1, %ymm0, %ymm0"); +		asm volatile("vpxor %ymm13, %ymm8, %ymm8"); + +		asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0])); +		asm volatile("vmovdqa %%ymm8, %0" : "=m" (dp[32])); + +		bytes -= 64; +		p += 64; +		q += 64; +		dp += 64; +		dq += 64; +#else +		asm volatile("vmovdqa %0, %%ymm1" : : "m" (*q)); +		asm volatile("vmovdqa %0, %%ymm0" : : "m" (*p)); +		asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (*dq)); +		asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (*dp)); + +		/* 1 = dq ^ q;  0 = dp ^ p */ + +		asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0])); +		asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16])); + +		/* +		 * 1 = dq ^ q +		 * 3 = dq ^ p >> 4 +		 */ +		asm volatile("vpsraw $4, %ymm1, %ymm3"); +		asm volatile("vpand %ymm7, %ymm1, %ymm1"); +		asm volatile("vpand %ymm7, %ymm3, %ymm3"); +		asm volatile("vpshufb %ymm1, %ymm4, %ymm4"); +		asm volatile("vpshufb %ymm3, %ymm5, %ymm5"); +		asm volatile("vpxor %ymm4, %ymm5, %ymm5"); + +		/* 5 = qx */ + +		asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0])); +		asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16])); + +		asm volatile("vpsraw $4, %ymm0, %ymm2"); +		asm volatile("vpand %ymm7, %ymm0, %ymm3"); +		asm volatile("vpand %ymm7, %ymm2, %ymm2"); +		asm volatile("vpshufb %ymm3, %ymm4, %ymm4"); +		asm volatile("vpshufb %ymm2, %ymm1, %ymm1"); +		asm volatile("vpxor %ymm4, %ymm1, %ymm1"); + +		/* 1 = pbmul[px] */ +		asm volatile("vpxor %ymm5, %ymm1, %ymm1"); +		/* 1 = db = DQ */ +		asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); + +		asm volatile("vpxor %ymm1, %ymm0, %ymm0"); +		asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0])); + +		bytes -= 32; +		p += 32; +		q += 32; +		dp += 32; +		dq += 32; +#endif +	} + +	kernel_fpu_end(); +} + +static void raid6_datap_recov_avx2(int disks, size_t bytes, int faila, +		void **ptrs) +{ +	u8 *p, *q, *dq; +	const u8 *qmul;		/* Q multiplier table */ +	const u8 x0f = 0x0f; + +	p = (u8 *)ptrs[disks-2]; +	q = (u8 *)ptrs[disks-1]; + +	/* Compute syndrome with zero for the missing data page +	   Use the dead data page as temporary storage for delta q */ +	dq = (u8 *)ptrs[faila]; +	ptrs[faila] = (void *)raid6_empty_zero_page; +	ptrs[disks-1] = dq; + +	raid6_call.gen_syndrome(disks, bytes, ptrs); + +	/* Restore pointer table */ +	ptrs[faila]   = dq; +	ptrs[disks-1] = q; + +	/* Now, pick the proper data tables */ +	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + +	kernel_fpu_begin(); + +	asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f)); + +	while (bytes) { +#ifdef CONFIG_X86_64 +		asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0])); +		asm volatile("vmovdqa %0, %%ymm8" : : "m" (dq[32])); +		asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0])); +		asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (q[32])); + +		/* +		 * 3 = q[0] ^ dq[0] +		 * 8 = q[32] ^ dq[32] +		 */ +		asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0])); +		asm volatile("vmovapd %ymm0, %ymm13"); +		asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16])); +		asm volatile("vmovapd %ymm1, %ymm14"); + +		asm volatile("vpsraw $4, %ymm3, %ymm6"); +		asm volatile("vpsraw $4, %ymm8, %ymm12"); +		asm volatile("vpand %ymm7, %ymm3, %ymm3"); +		asm volatile("vpand %ymm7, %ymm8, %ymm8"); +		asm volatile("vpand %ymm7, %ymm6, %ymm6"); +		asm volatile("vpand %ymm7, %ymm12, %ymm12"); +		asm volatile("vpshufb %ymm3, %ymm0, %ymm0"); +		asm 
volatile("vpshufb %ymm8, %ymm13, %ymm13"); +		asm volatile("vpshufb %ymm6, %ymm1, %ymm1"); +		asm volatile("vpshufb %ymm12, %ymm14, %ymm14"); +		asm volatile("vpxor %ymm0, %ymm1, %ymm1"); +		asm volatile("vpxor %ymm13, %ymm14, %ymm14"); + +		/* +		 * 1  = qmul[q[0]  ^ dq[0]] +		 * 14 = qmul[q[32] ^ dq[32]] +		 */ +		asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0])); +		asm volatile("vmovdqa %0, %%ymm12" : : "m" (p[32])); +		asm volatile("vpxor %ymm1, %ymm2, %ymm2"); +		asm volatile("vpxor %ymm14, %ymm12, %ymm12"); + +		/* +		 * 2  = p[0]  ^ qmul[q[0]  ^ dq[0]] +		 * 12 = p[32] ^ qmul[q[32] ^ dq[32]] +		 */ + +		asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); +		asm volatile("vmovdqa %%ymm14, %0" : "=m" (dq[32])); +		asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0])); +		asm volatile("vmovdqa %%ymm12,%0" : "=m" (p[32])); + +		bytes -= 64; +		p += 64; +		q += 64; +		dq += 64; +#else +		asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0])); +		asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0])); + +		/* 3 = q ^ dq */ + +		asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0])); +		asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16])); + +		asm volatile("vpsraw $4, %ymm3, %ymm6"); +		asm volatile("vpand %ymm7, %ymm3, %ymm3"); +		asm volatile("vpand %ymm7, %ymm6, %ymm6"); +		asm volatile("vpshufb %ymm3, %ymm0, %ymm0"); +		asm volatile("vpshufb %ymm6, %ymm1, %ymm1"); +		asm volatile("vpxor %ymm0, %ymm1, %ymm1"); + +		/* 1 = qmul[q ^ dq] */ + +		asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0])); +		asm volatile("vpxor %ymm1, %ymm2, %ymm2"); + +		/* 2 = p ^ qmul[q ^ dq] */ + +		asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); +		asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0])); + +		bytes -= 32; +		p += 32; +		q += 32; +		dq += 32; +#endif +	} + +	kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_avx2 = { +	.data2 = raid6_2data_recov_avx2, +	.datap = raid6_datap_recov_avx2, +	.valid = raid6_has_avx2, +#ifdef CONFIG_X86_64 +	.name = "avx2x2", +#else +	.name = "avx2x1", +#endif +	.priority = 2, +}; + +#else +#warning "your version of binutils lacks AVX2 support" +#endif diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c new file mode 100644 index 00000000000..a9168328f03 --- /dev/null +++ b/lib/raid6/recov_ssse3.c @@ -0,0 +1,332 @@ +/* + * Copyright (C) 2012 Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */ + +#include <linux/raid/pq.h> +#include "x86.h" + +static int raid6_has_ssse3(void) +{ +	return boot_cpu_has(X86_FEATURE_XMM) && +		boot_cpu_has(X86_FEATURE_XMM2) && +		boot_cpu_has(X86_FEATURE_SSSE3); +} + +static void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, +		int failb, void **ptrs) +{ +	u8 *p, *q, *dp, *dq; +	const u8 *pbmul;	/* P multiplier table for B data */ +	const u8 *qmul;		/* Q multiplier table (for both) */ +	static const u8 __aligned(16) x0f[16] = { +		 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, +		 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f}; + +	p = (u8 *)ptrs[disks-2]; +	q = (u8 *)ptrs[disks-1]; + +	/* Compute syndrome with zero for the missing data pages +	   Use the dead data pages as temporary storage for +	   delta p and delta q */ +	dp = (u8 *)ptrs[faila]; +	ptrs[faila] = (void *)raid6_empty_zero_page; +	ptrs[disks-2] = dp; +	dq = (u8 *)ptrs[failb]; +	ptrs[failb] = (void *)raid6_empty_zero_page; +	ptrs[disks-1] = dq; + +	raid6_call.gen_syndrome(disks, bytes, ptrs); + +	/* Restore pointer table */ +	ptrs[faila]   = dp; +	ptrs[failb]   = dq; +	ptrs[disks-2] = p; +	ptrs[disks-1] = q; + +	/* Now, pick the proper data tables */ +	pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; +	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ +		raid6_gfexp[failb]]]; + +	kernel_fpu_begin(); + +	asm volatile("movdqa %0,%%xmm7" : : "m" (x0f[0])); + +#ifdef CONFIG_X86_64 +	asm volatile("movdqa %0,%%xmm6" : : "m" (qmul[0])); +	asm volatile("movdqa %0,%%xmm14" : : "m" (pbmul[0])); +	asm volatile("movdqa %0,%%xmm15" : : "m" (pbmul[16])); +#endif + +	/* Now do it... */ +	while (bytes) { +#ifdef CONFIG_X86_64 +		/* xmm6, xmm14, xmm15 */ + +		asm volatile("movdqa %0,%%xmm1" : : "m" (q[0])); +		asm volatile("movdqa %0,%%xmm9" : : "m" (q[16])); +		asm volatile("movdqa %0,%%xmm0" : : "m" (p[0])); +		asm volatile("movdqa %0,%%xmm8" : : "m" (p[16])); +		asm volatile("pxor   %0,%%xmm1" : : "m" (dq[0])); +		asm volatile("pxor   %0,%%xmm9" : : "m" (dq[16])); +		asm volatile("pxor   %0,%%xmm0" : : "m" (dp[0])); +		asm volatile("pxor   %0,%%xmm8" : : "m" (dp[16])); + +		/* xmm0/8 = px */ + +		asm volatile("movdqa %xmm6,%xmm4"); +		asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16])); +		asm volatile("movdqa %xmm6,%xmm12"); +		asm volatile("movdqa %xmm5,%xmm13"); +		asm volatile("movdqa %xmm1,%xmm3"); +		asm volatile("movdqa %xmm9,%xmm11"); +		asm volatile("movdqa %xmm0,%xmm2"); /* xmm2/10 = px */ +		asm volatile("movdqa %xmm8,%xmm10"); +		asm volatile("psraw  $4,%xmm1"); +		asm volatile("psraw  $4,%xmm9"); +		asm volatile("pand   %xmm7,%xmm3"); +		asm volatile("pand   %xmm7,%xmm11"); +		asm volatile("pand   %xmm7,%xmm1"); +		asm volatile("pand   %xmm7,%xmm9"); +		asm volatile("pshufb %xmm3,%xmm4"); +		asm volatile("pshufb %xmm11,%xmm12"); +		asm volatile("pshufb %xmm1,%xmm5"); +		asm volatile("pshufb %xmm9,%xmm13"); +		asm volatile("pxor   %xmm4,%xmm5"); +		asm volatile("pxor   %xmm12,%xmm13"); + +		/* xmm5/13 = qx */ + +		asm volatile("movdqa %xmm14,%xmm4"); +		asm volatile("movdqa %xmm15,%xmm1"); +		asm volatile("movdqa %xmm14,%xmm12"); +		asm volatile("movdqa %xmm15,%xmm9"); +		asm volatile("movdqa %xmm2,%xmm3"); +		asm volatile("movdqa %xmm10,%xmm11"); +		asm volatile("psraw  $4,%xmm2"); +		asm volatile("psraw  $4,%xmm10"); +		asm volatile("pand   %xmm7,%xmm3"); +		asm volatile("pand   %xmm7,%xmm11"); +		asm volatile("pand   %xmm7,%xmm2"); +		asm volatile("pand   %xmm7,%xmm10"); +		asm volatile("pshufb %xmm3,%xmm4"); +		asm volatile("pshufb %xmm11,%xmm12"); +		asm 
volatile("pshufb %xmm2,%xmm1"); +		asm volatile("pshufb %xmm10,%xmm9"); +		asm volatile("pxor   %xmm4,%xmm1"); +		asm volatile("pxor   %xmm12,%xmm9"); + +		/* xmm1/9 = pbmul[px] */ +		asm volatile("pxor   %xmm5,%xmm1"); +		asm volatile("pxor   %xmm13,%xmm9"); +		/* xmm1/9 = db = DQ */ +		asm volatile("movdqa %%xmm1,%0" : "=m" (dq[0])); +		asm volatile("movdqa %%xmm9,%0" : "=m" (dq[16])); + +		asm volatile("pxor   %xmm1,%xmm0"); +		asm volatile("pxor   %xmm9,%xmm8"); +		asm volatile("movdqa %%xmm0,%0" : "=m" (dp[0])); +		asm volatile("movdqa %%xmm8,%0" : "=m" (dp[16])); + +		bytes -= 32; +		p += 32; +		q += 32; +		dp += 32; +		dq += 32; +#else +		asm volatile("movdqa %0,%%xmm1" : : "m" (*q)); +		asm volatile("movdqa %0,%%xmm0" : : "m" (*p)); +		asm volatile("pxor   %0,%%xmm1" : : "m" (*dq)); +		asm volatile("pxor   %0,%%xmm0" : : "m" (*dp)); + +		/* 1 = dq ^ q +		 * 0 = dp ^ p +		 */ +		asm volatile("movdqa %0,%%xmm4" : : "m" (qmul[0])); +		asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16])); + +		asm volatile("movdqa %xmm1,%xmm3"); +		asm volatile("psraw  $4,%xmm1"); +		asm volatile("pand   %xmm7,%xmm3"); +		asm volatile("pand   %xmm7,%xmm1"); +		asm volatile("pshufb %xmm3,%xmm4"); +		asm volatile("pshufb %xmm1,%xmm5"); +		asm volatile("pxor   %xmm4,%xmm5"); + +		asm volatile("movdqa %xmm0,%xmm2"); /* xmm2 = px */ + +		/* xmm5 = qx */ + +		asm volatile("movdqa %0,%%xmm4" : : "m" (pbmul[0])); +		asm volatile("movdqa %0,%%xmm1" : : "m" (pbmul[16])); +		asm volatile("movdqa %xmm2,%xmm3"); +		asm volatile("psraw  $4,%xmm2"); +		asm volatile("pand   %xmm7,%xmm3"); +		asm volatile("pand   %xmm7,%xmm2"); +		asm volatile("pshufb %xmm3,%xmm4"); +		asm volatile("pshufb %xmm2,%xmm1"); +		asm volatile("pxor   %xmm4,%xmm1"); + +		/* xmm1 = pbmul[px] */ +		asm volatile("pxor   %xmm5,%xmm1"); +		/* xmm1 = db = DQ */ +		asm volatile("movdqa %%xmm1,%0" : "=m" (*dq)); + +		asm volatile("pxor   %xmm1,%xmm0"); +		asm volatile("movdqa %%xmm0,%0" : "=m" (*dp)); + +		bytes -= 16; +		p += 16; +		q += 16; +		dp += 16; +		dq += 16; +#endif +	} + +	kernel_fpu_end(); +} + + +static void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, +		void **ptrs) +{ +	u8 *p, *q, *dq; +	const u8 *qmul;		/* Q multiplier table */ +	static const u8 __aligned(16) x0f[16] = { +		 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, +		 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f}; + +	p = (u8 *)ptrs[disks-2]; +	q = (u8 *)ptrs[disks-1]; + +	/* Compute syndrome with zero for the missing data page +	   Use the dead data page as temporary storage for delta q */ +	dq = (u8 *)ptrs[faila]; +	ptrs[faila] = (void *)raid6_empty_zero_page; +	ptrs[disks-1] = dq; + +	raid6_call.gen_syndrome(disks, bytes, ptrs); + +	/* Restore pointer table */ +	ptrs[faila]   = dq; +	ptrs[disks-1] = q; + +	/* Now, pick the proper data tables */ +	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + +	kernel_fpu_begin(); + +	asm volatile("movdqa %0, %%xmm7" : : "m" (x0f[0])); + +	while (bytes) { +#ifdef CONFIG_X86_64 +		asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0])); +		asm volatile("movdqa %0, %%xmm4" : : "m" (dq[16])); +		asm volatile("pxor %0, %%xmm3" : : "m" (q[0])); +		asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0])); + +		/* xmm3 = q[0] ^ dq[0] */ + +		asm volatile("pxor %0, %%xmm4" : : "m" (q[16])); +		asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16])); + +		/* xmm4 = q[16] ^ dq[16] */ + +		asm volatile("movdqa %xmm3, %xmm6"); +		asm volatile("movdqa %xmm4, %xmm8"); + +		/* xmm4 = xmm8 = q[16] ^ dq[16] */ + +		asm volatile("psraw $4, 
%xmm3"); +		asm volatile("pand %xmm7, %xmm6"); +		asm volatile("pand %xmm7, %xmm3"); +		asm volatile("pshufb %xmm6, %xmm0"); +		asm volatile("pshufb %xmm3, %xmm1"); +		asm volatile("movdqa %0, %%xmm10" : : "m" (qmul[0])); +		asm volatile("pxor %xmm0, %xmm1"); +		asm volatile("movdqa %0, %%xmm11" : : "m" (qmul[16])); + +		/* xmm1 = qmul[q[0] ^ dq[0]] */ + +		asm volatile("psraw $4, %xmm4"); +		asm volatile("pand %xmm7, %xmm8"); +		asm volatile("pand %xmm7, %xmm4"); +		asm volatile("pshufb %xmm8, %xmm10"); +		asm volatile("pshufb %xmm4, %xmm11"); +		asm volatile("movdqa %0, %%xmm2" : : "m" (p[0])); +		asm volatile("pxor %xmm10, %xmm11"); +		asm volatile("movdqa %0, %%xmm12" : : "m" (p[16])); + +		/* xmm11 = qmul[q[16] ^ dq[16]] */ + +		asm volatile("pxor %xmm1, %xmm2"); + +		/* xmm2 = p[0] ^ qmul[q[0] ^ dq[0]] */ + +		asm volatile("pxor %xmm11, %xmm12"); + +		/* xmm12 = p[16] ^ qmul[q[16] ^ dq[16]] */ + +		asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0])); +		asm volatile("movdqa %%xmm11, %0" : "=m" (dq[16])); + +		asm volatile("movdqa %%xmm2, %0" : "=m" (p[0])); +		asm volatile("movdqa %%xmm12, %0" : "=m" (p[16])); + +		bytes -= 32; +		p += 32; +		q += 32; +		dq += 32; + +#else +		asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0])); +		asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0])); +		asm volatile("pxor %0, %%xmm3" : : "m" (q[0])); +		asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16])); + +		/* xmm3 = *q ^ *dq */ + +		asm volatile("movdqa %xmm3, %xmm6"); +		asm volatile("movdqa %0, %%xmm2" : : "m" (p[0])); +		asm volatile("psraw $4, %xmm3"); +		asm volatile("pand %xmm7, %xmm6"); +		asm volatile("pand %xmm7, %xmm3"); +		asm volatile("pshufb %xmm6, %xmm0"); +		asm volatile("pshufb %xmm3, %xmm1"); +		asm volatile("pxor %xmm0, %xmm1"); + +		/* xmm1 = qmul[*q ^ *dq */ + +		asm volatile("pxor %xmm1, %xmm2"); + +		/* xmm2 = *p ^ qmul[*q ^ *dq] */ + +		asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0])); +		asm volatile("movdqa %%xmm2, %0" : "=m" (p[0])); + +		bytes -= 16; +		p += 16; +		q += 16; +		dq += 16; +#endif +	} + +	kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_ssse3 = { +	.data2 = raid6_2data_recov_ssse3, +	.datap = raid6_datap_recov_ssse3, +	.valid = raid6_has_ssse3, +#ifdef CONFIG_X86_64 +	.name = "ssse3x2", +#else +	.name = "ssse3x1", +#endif +	.priority = 1, +}; diff --git a/lib/raid6/sse1.c b/lib/raid6/sse1.c index 10dd91948c0..f7629713944 100644 --- a/lib/raid6/sse1.c +++ b/lib/raid6/sse1.c @@ -21,7 +21,7 @@   * worthwhile as a separate implementation.   
*/ -#if defined(__i386__) && !defined(__arch_um__) +#ifdef CONFIG_X86_32  #include <linux/raid/pq.h>  #include "x86.h" diff --git a/lib/raid6/sse2.c b/lib/raid6/sse2.c index bc2d57daa58..85b82c85f28 100644 --- a/lib/raid6/sse2.c +++ b/lib/raid6/sse2.c @@ -17,8 +17,6 @@   *   */ -#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) -  #include <linux/raid/pq.h>  #include "x86.h" @@ -159,9 +157,7 @@ const struct raid6_calls raid6_sse2x2 = {  	1			/* Has cache hints */  }; -#endif - -#if defined(__x86_64__) && !defined(__arch_um__) +#ifdef CONFIG_X86_64  /*   * Unrolled-by-4 SSE2 implementation @@ -259,4 +255,4 @@ const struct raid6_calls raid6_sse2x4 = {  	1			/* Has cache hints */  }; -#endif +#endif /* CONFIG_X86_64 */ diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index aa651697b6d..29090f3db67 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -10,6 +10,46 @@ LD	 = ld  AWK	 = awk -f  AR	 = ar  RANLIB	 = ranlib +OBJS	 = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o + +ARCH := $(shell uname -m 2>/dev/null | sed -e /s/i.86/i386/) +ifeq ($(ARCH),i386) +        CFLAGS += -DCONFIG_X86_32 +        IS_X86 = yes +endif +ifeq ($(ARCH),x86_64) +        CFLAGS += -DCONFIG_X86_64 +        IS_X86 = yes +endif + +ifeq ($(ARCH),arm) +        CFLAGS += -I../../../arch/arm/include -mfpu=neon +        HAS_NEON = yes +endif +ifeq ($(ARCH),arm64) +        CFLAGS += -I../../../arch/arm64/include +        HAS_NEON = yes +endif + +ifeq ($(IS_X86),yes) +        OBJS   += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o +        CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" |	\ +                    gcc -c -x assembler - >&/dev/null &&	\ +                    rm ./-.o && echo -DCONFIG_AS_AVX2=1) +else ifeq ($(HAS_NEON),yes) +        OBJS   += neon.o neon1.o neon2.o neon4.o neon8.o +        CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 +else +        HAS_ALTIVEC := $(shell printf '\#include <altivec.h>\nvector int a;\n' |\ +                         gcc -c -x c - >&/dev/null && \ +                         rm ./-.o && echo yes) +        ifeq ($(HAS_ALTIVEC),yes) +                OBJS += altivec1.o altivec2.o altivec4.o altivec8.o +        endif +endif +ifeq ($(ARCH),tilegx) +OBJS += tilegx8.o +endif  .c.o:  	$(CC) $(CFLAGS) -c -o $@ $< @@ -22,9 +62,7 @@ RANLIB	 = ranlib  all:	raid6.a raid6test -raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \ -	 altivec1.o altivec2.o altivec4.o altivec8.o recov.o algos.o \ -	 tables.o +raid6.a: $(OBJS)  	 rm -f $@  	 $(AR) cq $@ $^  	 $(RANLIB) $@ @@ -32,6 +70,18 @@ raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \  raid6test: test.c raid6.a  	$(CC) $(CFLAGS) -o raid6test $^ +neon1.c: neon.uc ../unroll.awk +	$(AWK) ../unroll.awk -vN=1 < neon.uc > $@ + +neon2.c: neon.uc ../unroll.awk +	$(AWK) ../unroll.awk -vN=2 < neon.uc > $@ + +neon4.c: neon.uc ../unroll.awk +	$(AWK) ../unroll.awk -vN=4 < neon.uc > $@ + +neon8.c: neon.uc ../unroll.awk +	$(AWK) ../unroll.awk -vN=8 < neon.uc > $@ +  altivec1.c: altivec.uc ../unroll.awk  	$(AWK) ../unroll.awk -vN=1 < altivec.uc > $@ @@ -62,11 +112,15 @@ int16.c: int.uc ../unroll.awk  int32.c: int.uc ../unroll.awk  	$(AWK) ../unroll.awk -vN=32 < int.uc > $@ +tilegx8.c: tilegx.uc ../unroll.awk +	$(AWK) ../unroll.awk -vN=8 < tilegx.uc > $@ +  tables.c: mktables  	./mktables > tables.c  clean: -	rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c tables.c raid6test +	rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c 
neon*.c tables.c raid6test +	rm -f tilegx*.c  spotless: clean  	rm -f *~ diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c index 7a930318b17..5a485b7a7d3 100644 --- a/lib/raid6/test/test.c +++ b/lib/raid6/test/test.c @@ -90,25 +90,35 @@ static int test_disks(int i, int j)  int main(int argc, char *argv[])  {  	const struct raid6_calls *const *algo; +	const struct raid6_recov_calls *const *ra;  	int i, j;  	int err = 0;  	makedata(); -	for (algo = raid6_algos; *algo; algo++) { -		if (!(*algo)->valid || (*algo)->valid()) { -			raid6_call = **algo; +	for (ra = raid6_recov_algos; *ra; ra++) { +		if ((*ra)->valid  && !(*ra)->valid()) +			continue; +		raid6_2data_recov = (*ra)->data2; +		raid6_datap_recov = (*ra)->datap; -			/* Nuke syndromes */ -			memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); +		printf("using recovery %s\n", (*ra)->name); -			/* Generate assumed good syndrome */ -			raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, -						(void **)&dataptrs); +		for (algo = raid6_algos; *algo; algo++) { +			if (!(*algo)->valid || (*algo)->valid()) { +				raid6_call = **algo; -			for (i = 0; i < NDISKS-1; i++) -				for (j = i+1; j < NDISKS; j++) -					err += test_disks(i, j); +				/* Nuke syndromes */ +				memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); + +				/* Generate assumed good syndrome */ +				raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, +							(void **)&dataptrs); + +				for (i = 0; i < NDISKS-1; i++) +					for (j = i+1; j < NDISKS; j++) +						err += test_disks(i, j); +			}  		}  		printf("\n");  	} diff --git a/lib/raid6/tilegx.uc b/lib/raid6/tilegx.uc new file mode 100644 index 00000000000..e7c29459cbc --- /dev/null +++ b/lib/raid6/tilegx.uc @@ -0,0 +1,86 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + *   Copyright 2002 H. Peter Anvin - All Rights Reserved + *   Copyright 2012 Tilera Corporation - All Rights Reserved + * + *   This program is free software; you can redistribute it and/or modify + *   it under the terms of the GNU General Public License as published by + *   the Free Software Foundation, Inc., 53 Temple Place Ste 330, + *   Boston MA 02111-1307, USA; either version 2 of the License, or + *   (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * tilegx$#.c + * + * $#-way unrolled TILE-Gx SIMD for RAID-6 math. + * + * This file is postprocessed using unroll.awk. + * + */ + +#include <linux/raid/pq.h> + +/* Create 8 byte copies of constant byte */ +# define NBYTES(x) (__insn_v1addi(0, x)) +# define NSIZE  8 + +/* + * The SHLBYTE() operation shifts each byte left by 1, *not* + * rolling over into the next byte + */ +static inline __attribute_const__ u64 SHLBYTE(u64 v) +{ +	/* Vector One Byte Shift Left Immediate. */ +	return __insn_v1shli(v, 1); +} + +/* + * The MASK() operation returns 0xFF in any byte for which the high + * bit is 1, 0x00 for any byte for which the high bit is 0. + */ +static inline __attribute_const__ u64 MASK(u64 v) +{ +	/* Vector One Byte Shift Right Signed Immediate. 
*/ +	return __insn_v1shrsi(v, 7); +} + + +void raid6_tilegx$#_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ +	u8 **dptr = (u8 **)ptrs; +	u64 *p, *q; +	int d, z, z0; + +	u64 wd$$, wq$$, wp$$, w1$$, w2$$; +	u64 x1d = NBYTES(0x1d); +	u64 * z0ptr; + +	z0 = disks - 3;			/* Highest data disk */ +	p = (u64 *)dptr[z0+1];	/* XOR parity */ +	q = (u64 *)dptr[z0+2];	/* RS syndrome */ + +	z0ptr = (u64 *)&dptr[z0][0]; +	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { +		wq$$ = wp$$ = *z0ptr++; +		for ( z = z0-1 ; z >= 0 ; z-- ) { +			wd$$ = *(u64 *)&dptr[z][d+$$*NSIZE]; +			wp$$ = wp$$ ^ wd$$; +			w2$$ = MASK(wq$$); +			w1$$ = SHLBYTE(wq$$); +			w2$$ = w2$$ & x1d; +			w1$$ = w1$$ ^ w2$$; +			wq$$ = w1$$ ^ wd$$; +		} +		*p++ = wp$$; +		*q++ = wq$$; +	} +} + +const struct raid6_calls raid6_tilegx$# = { +	raid6_tilegx$#_gen_syndrome, +	NULL, +	"tilegx$#", +	0 +}; diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h index cb2a8c91c88..b7595484a81 100644 --- a/lib/raid6/x86.h +++ b/lib/raid6/x86.h @@ -35,24 +35,33 @@ static inline void kernel_fpu_end(void)  {  } +#define __aligned(x) __attribute__((aligned(x))) +  #define X86_FEATURE_MMX		(0*32+23) /* Multimedia Extensions */  #define X86_FEATURE_FXSR	(0*32+24) /* FXSAVE and FXRSTOR instructions  					   * (fast save and restore) */  #define X86_FEATURE_XMM		(0*32+25) /* Streaming SIMD Extensions */  #define X86_FEATURE_XMM2	(0*32+26) /* Streaming SIMD Extensions-2 */ +#define X86_FEATURE_XMM3	(4*32+ 0) /* "pni" SSE-3 */ +#define X86_FEATURE_SSSE3	(4*32+ 9) /* Supplemental SSE-3 */ +#define X86_FEATURE_AVX	(4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_AVX2        (9*32+ 5) /* AVX2 instructions */  #define X86_FEATURE_MMXEXT	(1*32+22) /* AMD MMX extensions */  /* Should work well enough on modern CPUs for testing */  static inline int boot_cpu_has(int flag)  { -	u32 eax = (flag >> 5) ? 0x80000001 : 1; -	u32 edx; +	u32 eax, ebx, ecx, edx; + +	eax = (flag & 0x100) ? 7 : +		(flag & 0x20) ? 0x80000001 : 1; +	ecx = 0;  	asm volatile("cpuid" -		     : "+a" (eax), "=d" (edx) -		     : : "ecx", "ebx"); +		     : "+a" (eax), "=b" (ebx), "=d" (edx), "+c" (ecx)); -	return (edx >> (flag & 31)) & 1; +	return ((flag & 0x100 ? ebx : +		(flag & 0x80) ? ecx : edx) >> (flag & 31)) & 1;  }  #endif /* ndef __KERNEL__ */  | 
