diff options
Diffstat (limited to 'arch/sparc')
83 files changed, 7839 insertions, 719 deletions
diff --git a/arch/sparc/Kbuild b/arch/sparc/Kbuild index 5cd01161fd0..675afa285dd 100644 --- a/arch/sparc/Kbuild +++ b/arch/sparc/Kbuild @@ -6,3 +6,4 @@ obj-y += kernel/ obj-y += mm/ obj-y += math-emu/ obj-y += net/ +obj-y += crypto/ diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 67f1f6f5f4e..91c780c973b 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -18,6 +18,7 @@ config SPARC select HAVE_OPROFILE select HAVE_ARCH_KGDB if !SMP || SPARC64 select HAVE_ARCH_TRACEHOOK + select SYSCTL_EXCEPTION_TRACE select ARCH_WANT_OPTIONAL_GPIOLIB select RTC_CLASS select RTC_DRV_M48T59 @@ -32,6 +33,7 @@ config SPARC select GENERIC_PCI_IOMAP select HAVE_NMI_WATCHDOG if SPARC64 select HAVE_BPF_JIT + select HAVE_DEBUG_BUGVERBOSE select GENERIC_SMP_IDLE_THREAD select GENERIC_CMOS_UPDATE select GENERIC_CLOCKEVENTS @@ -42,6 +44,7 @@ config SPARC32 def_bool !64BIT select GENERIC_ATOMIC64 select CLZ_TAB + select HAVE_UID16 config SPARC64 def_bool 64BIT @@ -59,6 +62,7 @@ config SPARC64 select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_SYSCALL_TRACEPOINTS + select HAVE_DEBUG_KMEMLEAK select RTC_DRV_CMOS select RTC_DRV_BQ4802 select RTC_DRV_SUN4V @@ -226,25 +230,6 @@ config EARLYFB help Say Y here to enable a faster early framebuffer boot console. -choice - prompt "Kernel page size" if SPARC64 - default SPARC64_PAGE_SIZE_8KB - -config SPARC64_PAGE_SIZE_8KB - bool "8KB" - help - This lets you select the page size of the kernel. - - 8KB and 64KB work quite well, since SPARC ELF sections - provide for up to 64KB alignment. - - If you don't know what to do, choose 8KB. - -config SPARC64_PAGE_SIZE_64KB - bool "64KB" - -endchoice - config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" depends on SPARC64 && PROC_FS @@ -316,23 +301,6 @@ config GENERIC_LOCKBREAK default y depends on SPARC64 && SMP && PREEMPT -choice - prompt "SPARC64 Huge TLB Page Size" - depends on SPARC64 && HUGETLB_PAGE - default HUGETLB_PAGE_SIZE_4MB - -config HUGETLB_PAGE_SIZE_4MB - bool "4MB" - -config HUGETLB_PAGE_SIZE_512K - bool "512K" - -config HUGETLB_PAGE_SIZE_64K - depends on !SPARC64_PAGE_SIZE_64KB - bool "64K" - -endchoice - config NUMA bool "NUMA support" depends on SPARC64 && SMP @@ -571,6 +539,7 @@ config COMPAT depends on SPARC64 default y select COMPAT_BINFMT_ELF + select HAVE_UID16 select ARCH_WANT_OLD_COMPAT_IPC config SYSVIPC_COMPAT diff --git a/arch/sparc/crypto/Makefile b/arch/sparc/crypto/Makefile new file mode 100644 index 00000000000..6ae1ad5e502 --- /dev/null +++ b/arch/sparc/crypto/Makefile @@ -0,0 +1,25 @@ +# +# Arch-specific CryptoAPI modules. +# + +obj-$(CONFIG_CRYPTO_SHA1_SPARC64) += sha1-sparc64.o +obj-$(CONFIG_CRYPTO_SHA256_SPARC64) += sha256-sparc64.o +obj-$(CONFIG_CRYPTO_SHA512_SPARC64) += sha512-sparc64.o +obj-$(CONFIG_CRYPTO_MD5_SPARC64) += md5-sparc64.o + +obj-$(CONFIG_CRYPTO_AES_SPARC64) += aes-sparc64.o +obj-$(CONFIG_CRYPTO_DES_SPARC64) += des-sparc64.o +obj-$(CONFIG_CRYPTO_DES_SPARC64) += camellia-sparc64.o + +obj-$(CONFIG_CRYPTO_CRC32C_SPARC64) += crc32c-sparc64.o + +sha1-sparc64-y := sha1_asm.o sha1_glue.o crop_devid.o +sha256-sparc64-y := sha256_asm.o sha256_glue.o crop_devid.o +sha512-sparc64-y := sha512_asm.o sha512_glue.o crop_devid.o +md5-sparc64-y := md5_asm.o md5_glue.o crop_devid.o + +aes-sparc64-y := aes_asm.o aes_glue.o crop_devid.o +des-sparc64-y := des_asm.o des_glue.o crop_devid.o +camellia-sparc64-y := camellia_asm.o camellia_glue.o crop_devid.o + +crc32c-sparc64-y := crc32c_asm.o crc32c_glue.o crop_devid.o diff --git a/arch/sparc/crypto/aes_asm.S b/arch/sparc/crypto/aes_asm.S new file mode 100644 index 00000000000..23f6cbb910d --- /dev/null +++ b/arch/sparc/crypto/aes_asm.S @@ -0,0 +1,1535 @@ +#include <linux/linkage.h> +#include <asm/visasm.h> + +#include "opcodes.h" + +#define ENCRYPT_TWO_ROUNDS(KEY_BASE, I0, I1, T0, T1) \ + AES_EROUND01(KEY_BASE + 0, I0, I1, T0) \ + AES_EROUND23(KEY_BASE + 2, I0, I1, T1) \ + AES_EROUND01(KEY_BASE + 4, T0, T1, I0) \ + AES_EROUND23(KEY_BASE + 6, T0, T1, I1) + +#define ENCRYPT_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \ + AES_EROUND01(KEY_BASE + 0, I0, I1, T0) \ + AES_EROUND23(KEY_BASE + 2, I0, I1, T1) \ + AES_EROUND01(KEY_BASE + 0, I2, I3, T2) \ + AES_EROUND23(KEY_BASE + 2, I2, I3, T3) \ + AES_EROUND01(KEY_BASE + 4, T0, T1, I0) \ + AES_EROUND23(KEY_BASE + 6, T0, T1, I1) \ + AES_EROUND01(KEY_BASE + 4, T2, T3, I2) \ + AES_EROUND23(KEY_BASE + 6, T2, T3, I3) + +#define ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE, I0, I1, T0, T1) \ + AES_EROUND01(KEY_BASE + 0, I0, I1, T0) \ + AES_EROUND23(KEY_BASE + 2, I0, I1, T1) \ + AES_EROUND01_L(KEY_BASE + 4, T0, T1, I0) \ + AES_EROUND23_L(KEY_BASE + 6, T0, T1, I1) + +#define ENCRYPT_TWO_ROUNDS_LAST_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \ + AES_EROUND01(KEY_BASE + 0, I0, I1, T0) \ + AES_EROUND23(KEY_BASE + 2, I0, I1, T1) \ + AES_EROUND01(KEY_BASE + 0, I2, I3, T2) \ + AES_EROUND23(KEY_BASE + 2, I2, I3, T3) \ + AES_EROUND01_L(KEY_BASE + 4, T0, T1, I0) \ + AES_EROUND23_L(KEY_BASE + 6, T0, T1, I1) \ + AES_EROUND01_L(KEY_BASE + 4, T2, T3, I2) \ + AES_EROUND23_L(KEY_BASE + 6, T2, T3, I3) + + /* 10 rounds */ +#define ENCRYPT_128(KEY_BASE, I0, I1, T0, T1) \ + ENCRYPT_TWO_ROUNDS(KEY_BASE + 0, I0, I1, T0, T1) \ + ENCRYPT_TWO_ROUNDS(KEY_BASE + 8, I0, I1, T0, T1) \ + ENCRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \ + ENCRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \ + ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE + 32, I0, I1, T0, T1) + +#define ENCRYPT_128_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \ + ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 0, I0, I1, I2, I3, T0, T1, T2, T3) \ + ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 8, I0, I1, I2, I3, T0, T1, T2, T3) \ + ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, T0, T1, T2, T3) \ + ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, T0, T1, T2, T3) \ + ENCRYPT_TWO_ROUNDS_LAST_2(KEY_BASE + 32, I0, I1, I2, I3, T0, T1, T2, T3) + + /* 12 rounds */ +#define ENCRYPT_192(KEY_BASE, I0, I1, T0, T1) \ + ENCRYPT_TWO_ROUNDS(KEY_BASE + 0, I0, I1, T0, T1) \ + ENCRYPT_TWO_ROUNDS(KEY_BASE + 8, I0, I1, T0, T1) \ + ENCRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \ + ENCRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \ + ENCRYPT_TWO_ROUNDS(KEY_BASE + 32, I0, I1, T0, T1) \ + ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE + 40, I0, I1, T0, T1) + +#define ENCRYPT_192_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \ + ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 0, I0, I1, I2, I3, T0, T1, T2, T3) \ + ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 8, I0, I1, I2, I3, T0, T1, T2, T3) \ + ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, T0, T1, T2, T3) \ + ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, T0, T1, T2, T3) \ + ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 32, I0, I1, I2, I3, T0, T1, T2, T3) \ + ENCRYPT_TWO_ROUNDS_LAST_2(KEY_BASE + 40, I0, I1, I2, I3, T0, T1, T2, T3) + + /* 14 rounds */ +#define ENCRYPT_256(KEY_BASE, I0, I1, T0, T1) \ + ENCRYPT_TWO_ROUNDS(KEY_BASE + 0, I0, I1, T0, T1) \ + ENCRYPT_TWO_ROUNDS(KEY_BASE + 8, I0, I1, T0, T1) \ + ENCRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \ + ENCRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \ + ENCRYPT_TWO_ROUNDS(KEY_BASE + 32, I0, I1, T0, T1) \ + ENCRYPT_TWO_ROUNDS(KEY_BASE + 40, I0, I1, T0, T1) \ + ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE + 48, I0, I1, T0, T1) + +#define ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, TMP_BASE) \ + ENCRYPT_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, \ + TMP_BASE + 0, TMP_BASE + 2, TMP_BASE + 4, TMP_BASE + 6) + +#define ENCRYPT_256_2(KEY_BASE, I0, I1, I2, I3) \ + ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 0, I0, I1, I2, I3, KEY_BASE + 48) \ + ldd [%o0 + 0xd0], %f56; \ + ldd [%o0 + 0xd8], %f58; \ + ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 8, I0, I1, I2, I3, KEY_BASE + 0) \ + ldd [%o0 + 0xe0], %f60; \ + ldd [%o0 + 0xe8], %f62; \ + ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, KEY_BASE + 0) \ + ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, KEY_BASE + 0) \ + ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 32, I0, I1, I2, I3, KEY_BASE + 0) \ + ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 40, I0, I1, I2, I3, KEY_BASE + 0) \ + AES_EROUND01(KEY_BASE + 48, I0, I1, KEY_BASE + 0) \ + AES_EROUND23(KEY_BASE + 50, I0, I1, KEY_BASE + 2) \ + AES_EROUND01(KEY_BASE + 48, I2, I3, KEY_BASE + 4) \ + AES_EROUND23(KEY_BASE + 50, I2, I3, KEY_BASE + 6) \ + AES_EROUND01_L(KEY_BASE + 52, KEY_BASE + 0, KEY_BASE + 2, I0) \ + AES_EROUND23_L(KEY_BASE + 54, KEY_BASE + 0, KEY_BASE + 2, I1) \ + ldd [%o0 + 0x10], %f8; \ + ldd [%o0 + 0x18], %f10; \ + AES_EROUND01_L(KEY_BASE + 52, KEY_BASE + 4, KEY_BASE + 6, I2) \ + AES_EROUND23_L(KEY_BASE + 54, KEY_BASE + 4, KEY_BASE + 6, I3) \ + ldd [%o0 + 0x20], %f12; \ + ldd [%o0 + 0x28], %f14; + +#define DECRYPT_TWO_ROUNDS(KEY_BASE, I0, I1, T0, T1) \ + AES_DROUND23(KEY_BASE + 0, I0, I1, T1) \ + AES_DROUND01(KEY_BASE + 2, I0, I1, T0) \ + AES_DROUND23(KEY_BASE + 4, T0, T1, I1) \ + AES_DROUND01(KEY_BASE + 6, T0, T1, I0) + +#define DECRYPT_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \ + AES_DROUND23(KEY_BASE + 0, I0, I1, T1) \ + AES_DROUND01(KEY_BASE + 2, I0, I1, T0) \ + AES_DROUND23(KEY_BASE + 0, I2, I3, T3) \ + AES_DROUND01(KEY_BASE + 2, I2, I3, T2) \ + AES_DROUND23(KEY_BASE + 4, T0, T1, I1) \ + AES_DROUND01(KEY_BASE + 6, T0, T1, I0) \ + AES_DROUND23(KEY_BASE + 4, T2, T3, I3) \ + AES_DROUND01(KEY_BASE + 6, T2, T3, I2) + +#define DECRYPT_TWO_ROUNDS_LAST(KEY_BASE, I0, I1, T0, T1) \ + AES_DROUND23(KEY_BASE + 0, I0, I1, T1) \ + AES_DROUND01(KEY_BASE + 2, I0, I1, T0) \ + AES_DROUND23_L(KEY_BASE + 4, T0, T1, I1) \ + AES_DROUND01_L(KEY_BASE + 6, T0, T1, I0) + +#define DECRYPT_TWO_ROUNDS_LAST_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \ + AES_DROUND23(KEY_BASE + 0, I0, I1, T1) \ + AES_DROUND01(KEY_BASE + 2, I0, I1, T0) \ + AES_DROUND23(KEY_BASE + 0, I2, I3, T3) \ + AES_DROUND01(KEY_BASE + 2, I2, I3, T2) \ + AES_DROUND23_L(KEY_BASE + 4, T0, T1, I1) \ + AES_DROUND01_L(KEY_BASE + 6, T0, T1, I0) \ + AES_DROUND23_L(KEY_BASE + 4, T2, T3, I3) \ + AES_DROUND01_L(KEY_BASE + 6, T2, T3, I2) + + /* 10 rounds */ +#define DECRYPT_128(KEY_BASE, I0, I1, T0, T1) \ + DECRYPT_TWO_ROUNDS(KEY_BASE + 0, I0, I1, T0, T1) \ + DECRYPT_TWO_ROUNDS(KEY_BASE + 8, I0, I1, T0, T1) \ + DECRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \ + DECRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \ + DECRYPT_TWO_ROUNDS_LAST(KEY_BASE + 32, I0, I1, T0, T1) + +#define DECRYPT_128_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \ + DECRYPT_TWO_ROUNDS_2(KEY_BASE + 0, I0, I1, I2, I3, T0, T1, T2, T3) \ + DECRYPT_TWO_ROUNDS_2(KEY_BASE + 8, I0, I1, I2, I3, T0, T1, T2, T3) \ + DECRYPT_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, T0, T1, T2, T3) \ + DECRYPT_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, T0, T1, T2, T3) \ + DECRYPT_TWO_ROUNDS_LAST_2(KEY_BASE + 32, I0, I1, I2, I3, T0, T1, T2, T3) + + /* 12 rounds */ +#define DECRYPT_192(KEY_BASE, I0, I1, T0, T1) \ + DECRYPT_TWO_ROUNDS(KEY_BASE + 0, I0, I1, T0, T1) \ + DECRYPT_TWO_ROUNDS(KEY_BASE + 8, I0, I1, T0, T1) \ + DECRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \ + DECRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \ + DECRYPT_TWO_ROUNDS(KEY_BASE + 32, I0, I1, T0, T1) \ + DECRYPT_TWO_ROUNDS_LAST(KEY_BASE + 40, I0, I1, T0, T1) + +#define DECRYPT_192_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \ + DECRYPT_TWO_ROUNDS_2(KEY_BASE + 0, I0, I1, I2, I3, T0, T1, T2, T3) \ + DECRYPT_TWO_ROUNDS_2(KEY_BASE + 8, I0, I1, I2, I3, T0, T1, T2, T3) \ + DECRYPT_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, T0, T1, T2, T3) \ + DECRYPT_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, T0, T1, T2, T3) \ + DECRYPT_TWO_ROUNDS_2(KEY_BASE + 32, I0, I1, I2, I3, T0, T1, T2, T3) \ + DECRYPT_TWO_ROUNDS_LAST_2(KEY_BASE + 40, I0, I1, I2, I3, T0, T1, T2, T3) + + /* 14 rounds */ +#define DECRYPT_256(KEY_BASE, I0, I1, T0, T1) \ + DECRYPT_TWO_ROUNDS(KEY_BASE + 0, I0, I1, T0, T1) \ + DECRYPT_TWO_ROUNDS(KEY_BASE + 8, I0, I1, T0, T1) \ + DECRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \ + DECRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \ + DECRYPT_TWO_ROUNDS(KEY_BASE + 32, I0, I1, T0, T1) \ + DECRYPT_TWO_ROUNDS(KEY_BASE + 40, I0, I1, T0, T1) \ + DECRYPT_TWO_ROUNDS_LAST(KEY_BASE + 48, I0, I1, T0, T1) + +#define DECRYPT_256_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, TMP_BASE) \ + DECRYPT_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, \ + TMP_BASE + 0, TMP_BASE + 2, TMP_BASE + 4, TMP_BASE + 6) + +#define DECRYPT_256_2(KEY_BASE, I0, I1, I2, I3) \ + DECRYPT_256_TWO_ROUNDS_2(KEY_BASE + 0, I0, I1, I2, I3, KEY_BASE + 48) \ + ldd [%o0 + 0x18], %f56; \ + ldd [%o0 + 0x10], %f58; \ + DECRYPT_256_TWO_ROUNDS_2(KEY_BASE + 8, I0, I1, I2, I3, KEY_BASE + 0) \ + ldd [%o0 + 0x08], %f60; \ + ldd [%o0 + 0x00], %f62; \ + DECRYPT_256_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, KEY_BASE + 0) \ + DECRYPT_256_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, KEY_BASE + 0) \ + DECRYPT_256_TWO_ROUNDS_2(KEY_BASE + 32, I0, I1, I2, I3, KEY_BASE + 0) \ + DECRYPT_256_TWO_ROUNDS_2(KEY_BASE + 40, I0, I1, I2, I3, KEY_BASE + 0) \ + AES_DROUND23(KEY_BASE + 48, I0, I1, KEY_BASE + 2) \ + AES_DROUND01(KEY_BASE + 50, I0, I1, KEY_BASE + 0) \ + AES_DROUND23(KEY_BASE + 48, I2, I3, KEY_BASE + 6) \ + AES_DROUND01(KEY_BASE + 50, I2, I3, KEY_BASE + 4) \ + AES_DROUND23_L(KEY_BASE + 52, KEY_BASE + 0, KEY_BASE + 2, I1) \ + AES_DROUND01_L(KEY_BASE + 54, KEY_BASE + 0, KEY_BASE + 2, I0) \ + ldd [%o0 + 0xd8], %f8; \ + ldd [%o0 + 0xd0], %f10; \ + AES_DROUND23_L(KEY_BASE + 52, KEY_BASE + 4, KEY_BASE + 6, I3) \ + AES_DROUND01_L(KEY_BASE + 54, KEY_BASE + 4, KEY_BASE + 6, I2) \ + ldd [%o0 + 0xc8], %f12; \ + ldd [%o0 + 0xc0], %f14; + + .align 32 +ENTRY(aes_sparc64_key_expand) + /* %o0=input_key, %o1=output_key, %o2=key_len */ + VISEntry + ld [%o0 + 0x00], %f0 + ld [%o0 + 0x04], %f1 + ld [%o0 + 0x08], %f2 + ld [%o0 + 0x0c], %f3 + + std %f0, [%o1 + 0x00] + std %f2, [%o1 + 0x08] + add %o1, 0x10, %o1 + + cmp %o2, 24 + bl 2f + nop + + be 1f + nop + + /* 256-bit key expansion */ + ld [%o0 + 0x10], %f4 + ld [%o0 + 0x14], %f5 + ld [%o0 + 0x18], %f6 + ld [%o0 + 0x1c], %f7 + + std %f4, [%o1 + 0x00] + std %f6, [%o1 + 0x08] + add %o1, 0x10, %o1 + + AES_KEXPAND1(0, 6, 0x0, 8) + AES_KEXPAND2(2, 8, 10) + AES_KEXPAND0(4, 10, 12) + AES_KEXPAND2(6, 12, 14) + AES_KEXPAND1(8, 14, 0x1, 16) + AES_KEXPAND2(10, 16, 18) + AES_KEXPAND0(12, 18, 20) + AES_KEXPAND2(14, 20, 22) + AES_KEXPAND1(16, 22, 0x2, 24) + AES_KEXPAND2(18, 24, 26) + AES_KEXPAND0(20, 26, 28) + AES_KEXPAND2(22, 28, 30) + AES_KEXPAND1(24, 30, 0x3, 32) + AES_KEXPAND2(26, 32, 34) + AES_KEXPAND0(28, 34, 36) + AES_KEXPAND2(30, 36, 38) + AES_KEXPAND1(32, 38, 0x4, 40) + AES_KEXPAND2(34, 40, 42) + AES_KEXPAND0(36, 42, 44) + AES_KEXPAND2(38, 44, 46) + AES_KEXPAND1(40, 46, 0x5, 48) + AES_KEXPAND2(42, 48, 50) + AES_KEXPAND0(44, 50, 52) + AES_KEXPAND2(46, 52, 54) + AES_KEXPAND1(48, 54, 0x6, 56) + AES_KEXPAND2(50, 56, 58) + + std %f8, [%o1 + 0x00] + std %f10, [%o1 + 0x08] + std %f12, [%o1 + 0x10] + std %f14, [%o1 + 0x18] + std %f16, [%o1 + 0x20] + std %f18, [%o1 + 0x28] + std %f20, [%o1 + 0x30] + std %f22, [%o1 + 0x38] + std %f24, [%o1 + 0x40] + std %f26, [%o1 + 0x48] + std %f28, [%o1 + 0x50] + std %f30, [%o1 + 0x58] + std %f32, [%o1 + 0x60] + std %f34, [%o1 + 0x68] + std %f36, [%o1 + 0x70] + std %f38, [%o1 + 0x78] + std %f40, [%o1 + 0x80] + std %f42, [%o1 + 0x88] + std %f44, [%o1 + 0x90] + std %f46, [%o1 + 0x98] + std %f48, [%o1 + 0xa0] + std %f50, [%o1 + 0xa8] + std %f52, [%o1 + 0xb0] + std %f54, [%o1 + 0xb8] + std %f56, [%o1 + 0xc0] + ba,pt %xcc, 80f + std %f58, [%o1 + 0xc8] + +1: + /* 192-bit key expansion */ + ld [%o0 + 0x10], %f4 + ld [%o0 + 0x14], %f5 + + std %f4, [%o1 + 0x00] + add %o1, 0x08, %o1 + + AES_KEXPAND1(0, 4, 0x0, 6) + AES_KEXPAND2(2, 6, 8) + AES_KEXPAND2(4, 8, 10) + AES_KEXPAND1(6, 10, 0x1, 12) + AES_KEXPAND2(8, 12, 14) + AES_KEXPAND2(10, 14, 16) + AES_KEXPAND1(12, 16, 0x2, 18) + AES_KEXPAND2(14, 18, 20) + AES_KEXPAND2(16, 20, 22) + AES_KEXPAND1(18, 22, 0x3, 24) + AES_KEXPAND2(20, 24, 26) + AES_KEXPAND2(22, 26, 28) + AES_KEXPAND1(24, 28, 0x4, 30) + AES_KEXPAND2(26, 30, 32) + AES_KEXPAND2(28, 32, 34) + AES_KEXPAND1(30, 34, 0x5, 36) + AES_KEXPAND2(32, 36, 38) + AES_KEXPAND2(34, 38, 40) + AES_KEXPAND1(36, 40, 0x6, 42) + AES_KEXPAND2(38, 42, 44) + AES_KEXPAND2(40, 44, 46) + AES_KEXPAND1(42, 46, 0x7, 48) + AES_KEXPAND2(44, 48, 50) + + std %f6, [%o1 + 0x00] + std %f8, [%o1 + 0x08] + std %f10, [%o1 + 0x10] + std %f12, [%o1 + 0x18] + std %f14, [%o1 + 0x20] + std %f16, [%o1 + 0x28] + std %f18, [%o1 + 0x30] + std %f20, [%o1 + 0x38] + std %f22, [%o1 + 0x40] + std %f24, [%o1 + 0x48] + std %f26, [%o1 + 0x50] + std %f28, [%o1 + 0x58] + std %f30, [%o1 + 0x60] + std %f32, [%o1 + 0x68] + std %f34, [%o1 + 0x70] + std %f36, [%o1 + 0x78] + std %f38, [%o1 + 0x80] + std %f40, [%o1 + 0x88] + std %f42, [%o1 + 0x90] + std %f44, [%o1 + 0x98] + std %f46, [%o1 + 0xa0] + std %f48, [%o1 + 0xa8] + ba,pt %xcc, 80f + std %f50, [%o1 + 0xb0] + +2: + /* 128-bit key expansion */ + AES_KEXPAND1(0, 2, 0x0, 4) + AES_KEXPAND2(2, 4, 6) + AES_KEXPAND1(4, 6, 0x1, 8) + AES_KEXPAND2(6, 8, 10) + AES_KEXPAND1(8, 10, 0x2, 12) + AES_KEXPAND2(10, 12, 14) + AES_KEXPAND1(12, 14, 0x3, 16) + AES_KEXPAND2(14, 16, 18) + AES_KEXPAND1(16, 18, 0x4, 20) + AES_KEXPAND2(18, 20, 22) + AES_KEXPAND1(20, 22, 0x5, 24) + AES_KEXPAND2(22, 24, 26) + AES_KEXPAND1(24, 26, 0x6, 28) + AES_KEXPAND2(26, 28, 30) + AES_KEXPAND1(28, 30, 0x7, 32) + AES_KEXPAND2(30, 32, 34) + AES_KEXPAND1(32, 34, 0x8, 36) + AES_KEXPAND2(34, 36, 38) + AES_KEXPAND1(36, 38, 0x9, 40) + AES_KEXPAND2(38, 40, 42) + + std %f4, [%o1 + 0x00] + std %f6, [%o1 + 0x08] + std %f8, [%o1 + 0x10] + std %f10, [%o1 + 0x18] + std %f12, [%o1 + 0x20] + std %f14, [%o1 + 0x28] + std %f16, [%o1 + 0x30] + std %f18, [%o1 + 0x38] + std %f20, [%o1 + 0x40] + std %f22, [%o1 + 0x48] + std %f24, [%o1 + 0x50] + std %f26, [%o1 + 0x58] + std %f28, [%o1 + 0x60] + std %f30, [%o1 + 0x68] + std %f32, [%o1 + 0x70] + std %f34, [%o1 + 0x78] + std %f36, [%o1 + 0x80] + std %f38, [%o1 + 0x88] + std %f40, [%o1 + 0x90] + std %f42, [%o1 + 0x98] +80: + retl + VISExit +ENDPROC(aes_sparc64_key_expand) + + .align 32 +ENTRY(aes_sparc64_encrypt_128) + /* %o0=key, %o1=input, %o2=output */ + VISEntry + ld [%o1 + 0x00], %f4 + ld [%o1 + 0x04], %f5 + ld [%o1 + 0x08], %f6 + ld [%o1 + 0x0c], %f7 + ldd [%o0 + 0x00], %f8 + ldd [%o0 + 0x08], %f10 + ldd [%o0 + 0x10], %f12 + ldd [%o0 + 0x18], %f14 + ldd [%o0 + 0x20], %f16 + ldd [%o0 + 0x28], %f18 + ldd [%o0 + 0x30], %f20 + ldd [%o0 + 0x38], %f22 + ldd [%o0 + 0x40], %f24 + ldd [%o0 + 0x48], %f26 + ldd [%o0 + 0x50], %f28 + ldd [%o0 + 0x58], %f30 + ldd [%o0 + 0x60], %f32 + ldd [%o0 + 0x68], %f34 + ldd [%o0 + 0x70], %f36 + ldd [%o0 + 0x78], %f38 + ldd [%o0 + 0x80], %f40 + ldd [%o0 + 0x88], %f42 + ldd [%o0 + 0x90], %f44 + ldd [%o0 + 0x98], %f46 + ldd [%o0 + 0xa0], %f48 + ldd [%o0 + 0xa8], %f50 + fxor %f8, %f4, %f4 + fxor %f10, %f6, %f6 + ENCRYPT_128(12, 4, 6, 0, 2) + st %f4, [%o2 + 0x00] + st %f5, [%o2 + 0x04] + st %f6, [%o2 + 0x08] + st %f7, [%o2 + 0x0c] + retl + VISExit +ENDPROC(aes_sparc64_encrypt_128) + + .align 32 +ENTRY(aes_sparc64_encrypt_192) + /* %o0=key, %o1=input, %o2=output */ + VISEntry + ld [%o1 + 0x00], %f4 + ld [%o1 + 0x04], %f5 + ld [%o1 + 0x08], %f6 + ld [%o1 + 0x0c], %f7 + + ldd [%o0 + 0x00], %f8 + ldd [%o0 + 0x08], %f10 + + fxor %f8, %f4, %f4 + fxor %f10, %f6, %f6 + + ldd [%o0 + 0x10], %f8 + ldd [%o0 + 0x18], %f10 + ldd [%o0 + 0x20], %f12 + ldd [%o0 + 0x28], %f14 + add %o0, 0x20, %o0 + + ENCRYPT_TWO_ROUNDS(8, 4, 6, 0, 2) + + ldd [%o0 + 0x10], %f12 + ldd [%o0 + 0x18], %f14 + ldd [%o0 + 0x20], %f16 + ldd [%o0 + 0x28], %f18 + ldd [%o0 + 0x30], %f20 + ldd [%o0 + 0x38], %f22 + ldd [%o0 + 0x40], %f24 + ldd [%o0 + 0x48], %f26 + ldd [%o0 + 0x50], %f28 + ldd [%o0 + 0x58], %f30 + ldd [%o0 + 0x60], %f32 + ldd [%o0 + 0x68], %f34 + ldd [%o0 + 0x70], %f36 + ldd [%o0 + 0x78], %f38 + ldd [%o0 + 0x80], %f40 + ldd [%o0 + 0x88], %f42 + ldd [%o0 + 0x90], %f44 + ldd [%o0 + 0x98], %f46 + ldd [%o0 + 0xa0], %f48 + ldd [%o0 + 0xa8], %f50 + + + ENCRYPT_128(12, 4, 6, 0, 2) + + st %f4, [%o2 + 0x00] + st %f5, [%o2 + 0x04] + st %f6, [%o2 + 0x08] + st %f7, [%o2 + 0x0c] + + retl + VISExit +ENDPROC(aes_sparc64_encrypt_192) + + .align 32 +ENTRY(aes_sparc64_encrypt_256) + /* %o0=key, %o1=input, %o2=output */ + VISEntry + ld [%o1 + 0x00], %f4 + ld [%o1 + 0x04], %f5 + ld [%o1 + 0x08], %f6 + ld [%o1 + 0x0c], %f7 + + ldd [%o0 + 0x00], %f8 + ldd [%o0 + 0x08], %f10 + + fxor %f8, %f4, %f4 + fxor %f10, %f6, %f6 + + ldd [%o0 + 0x10], %f8 + + ldd [%o0 + 0x18], %f10 + ldd [%o0 + 0x20], %f12 + ldd [%o0 + 0x28], %f14 + add %o0, 0x20, %o0 + + ENCRYPT_TWO_ROUNDS(8, 4, 6, 0, 2) + + ldd [%o0 + 0x10], %f8 + + ldd [%o0 + 0x18], %f10 + ldd [%o0 + 0x20], %f12 + ldd [%o0 + 0x28], %f14 + add %o0, 0x20, %o0 + + ENCRYPT_TWO_ROUNDS(8, 4, 6, 0, 2) + + ldd [%o0 + 0x10], %f12 + ldd [%o0 + 0x18], %f14 + ldd [%o0 + 0x20], %f16 + ldd [%o0 + 0x28], %f18 + ldd [%o0 + 0x30], %f20 + ldd [%o0 + 0x38], %f22 + ldd [%o0 + 0x40], %f24 + ldd [%o0 + 0x48], %f26 + ldd [%o0 + 0x50], %f28 + ldd [%o0 + 0x58], %f30 + ldd [%o0 + 0x60], %f32 + ldd [%o0 + 0x68], %f34 + ldd [%o0 + 0x70], %f36 + ldd [%o0 + 0x78], %f38 + ldd [%o0 + 0x80], %f40 + ldd [%o0 + 0x88], %f42 + ldd [%o0 + 0x90], %f44 + ldd [%o0 + 0x98], %f46 + ldd [%o0 + 0xa0], %f48 + ldd [%o0 + 0xa8], %f50 + + ENCRYPT_128(12, 4, 6, 0, 2) + + st %f4, [%o2 + 0x00] + st %f5, [%o2 + 0x04] + st %f6, [%o2 + 0x08] + st %f7, [%o2 + 0x0c] + + retl + VISExit +ENDPROC(aes_sparc64_encrypt_256) + + .align 32 +ENTRY(aes_sparc64_decrypt_128) + /* %o0=key, %o1=input, %o2=output */ + VISEntry + ld [%o1 + 0x00], %f4 + ld [%o1 + 0x04], %f5 + ld [%o1 + 0x08], %f6 + ld [%o1 + 0x0c], %f7 + ldd [%o0 + 0xa0], %f8 + ldd [%o0 + 0xa8], %f10 + ldd [%o0 + 0x98], %f12 + ldd [%o0 + 0x90], %f14 + ldd [%o0 + 0x88], %f16 + ldd [%o0 + 0x80], %f18 + ldd [%o0 + 0x78], %f20 + ldd [%o0 + 0x70], %f22 + ldd [%o0 + 0x68], %f24 + ldd [%o0 + 0x60], %f26 + ldd [%o0 + 0x58], %f28 + ldd [%o0 + 0x50], %f30 + ldd [%o0 + 0x48], %f32 + ldd [%o0 + 0x40], %f34 + ldd [%o0 + 0x38], %f36 + ldd [%o0 + 0x30], %f38 + ldd [%o0 + 0x28], %f40 + ldd [%o0 + 0x20], %f42 + ldd [%o0 + 0x18], %f44 + ldd [%o0 + 0x10], %f46 + ldd [%o0 + 0x08], %f48 + ldd [%o0 + 0x00], %f50 + fxor %f8, %f4, %f4 + fxor %f10, %f6, %f6 + DECRYPT_128(12, 4, 6, 0, 2) + st %f4, [%o2 + 0x00] + st %f5, [%o2 + 0x04] + st %f6, [%o2 + 0x08] + st %f7, [%o2 + 0x0c] + retl + VISExit +ENDPROC(aes_sparc64_decrypt_128) + + .align 32 +ENTRY(aes_sparc64_decrypt_192) + /* %o0=key, %o1=input, %o2=output */ + VISEntry + ld [%o1 + 0x00], %f4 + ld [%o1 + 0x04], %f5 + ld [%o1 + 0x08], %f6 + ld [%o1 + 0x0c], %f7 + ldd [%o0 + 0xc0], %f8 + ldd [%o0 + 0xc8], %f10 + ldd [%o0 + 0xb8], %f12 + ldd [%o0 + 0xb0], %f14 + ldd [%o0 + 0xa8], %f16 + ldd [%o0 + 0xa0], %f18 + fxor %f8, %f4, %f4 + fxor %f10, %f6, %f6 + ldd [%o0 + 0x98], %f20 + ldd [%o0 + 0x90], %f22 + ldd [%o0 + 0x88], %f24 + ldd [%o0 + 0x80], %f26 + DECRYPT_TWO_ROUNDS(12, 4, 6, 0, 2) + ldd [%o0 + 0x78], %f28 + ldd [%o0 + 0x70], %f30 + ldd [%o0 + 0x68], %f32 + ldd [%o0 + 0x60], %f34 + ldd [%o0 + 0x58], %f36 + ldd [%o0 + 0x50], %f38 + ldd [%o0 + 0x48], %f40 + ldd [%o0 + 0x40], %f42 + ldd [%o0 + 0x38], %f44 + ldd [%o0 + 0x30], %f46 + ldd [%o0 + 0x28], %f48 + ldd [%o0 + 0x20], %f50 + ldd [%o0 + 0x18], %f52 + ldd [%o0 + 0x10], %f54 + ldd [%o0 + 0x08], %f56 + ldd [%o0 + 0x00], %f58 + DECRYPT_128(20, 4, 6, 0, 2) + st %f4, [%o2 + 0x00] + st %f5, [%o2 + 0x04] + st %f6, [%o2 + 0x08] + st %f7, [%o2 + 0x0c] + retl + VISExit +ENDPROC(aes_sparc64_decrypt_192) + + .align 32 +ENTRY(aes_sparc64_decrypt_256) + /* %o0=key, %o1=input, %o2=output */ + VISEntry + ld [%o1 + 0x00], %f4 + ld [%o1 + 0x04], %f5 + ld [%o1 + 0x08], %f6 + ld [%o1 + 0x0c], %f7 + ldd [%o0 + 0xe0], %f8 + ldd [%o0 + 0xe8], %f10 + ldd [%o0 + 0xd8], %f12 + ldd [%o0 + 0xd0], %f14 + ldd [%o0 + 0xc8], %f16 + fxor %f8, %f4, %f4 + ldd [%o0 + 0xc0], %f18 + fxor %f10, %f6, %f6 + ldd [%o0 + 0xb8], %f20 + AES_DROUND23(12, 4, 6, 2) + ldd [%o0 + 0xb0], %f22 + AES_DROUND01(14, 4, 6, 0) + ldd [%o0 + 0xa8], %f24 + AES_DROUND23(16, 0, 2, 6) + ldd [%o0 + 0xa0], %f26 + AES_DROUND01(18, 0, 2, 4) + ldd [%o0 + 0x98], %f12 + AES_DROUND23(20, 4, 6, 2) + ldd [%o0 + 0x90], %f14 + AES_DROUND01(22, 4, 6, 0) + ldd [%o0 + 0x88], %f16 + AES_DROUND23(24, 0, 2, 6) + ldd [%o0 + 0x80], %f18 + AES_DROUND01(26, 0, 2, 4) + ldd [%o0 + 0x78], %f20 + AES_DROUND23(12, 4, 6, 2) + ldd [%o0 + 0x70], %f22 + AES_DROUND01(14, 4, 6, 0) + ldd [%o0 + 0x68], %f24 + AES_DROUND23(16, 0, 2, 6) + ldd [%o0 + 0x60], %f26 + AES_DROUND01(18, 0, 2, 4) + ldd [%o0 + 0x58], %f28 + AES_DROUND23(20, 4, 6, 2) + ldd [%o0 + 0x50], %f30 + AES_DROUND01(22, 4, 6, 0) + ldd [%o0 + 0x48], %f32 + AES_DROUND23(24, 0, 2, 6) + ldd [%o0 + 0x40], %f34 + AES_DROUND01(26, 0, 2, 4) + ldd [%o0 + 0x38], %f36 + AES_DROUND23(28, 4, 6, 2) + ldd [%o0 + 0x30], %f38 + AES_DROUND01(30, 4, 6, 0) + ldd [%o0 + 0x28], %f40 + AES_DROUND23(32, 0, 2, 6) + ldd [%o0 + 0x20], %f42 + AES_DROUND01(34, 0, 2, 4) + ldd [%o0 + 0x18], %f44 + AES_DROUND23(36, 4, 6, 2) + ldd [%o0 + 0x10], %f46 + AES_DROUND01(38, 4, 6, 0) + ldd [%o0 + 0x08], %f48 + AES_DROUND23(40, 0, 2, 6) + ldd [%o0 + 0x00], %f50 + AES_DROUND01(42, 0, 2, 4) + AES_DROUND23(44, 4, 6, 2) + AES_DROUND01(46, 4, 6, 0) + AES_DROUND23_L(48, 0, 2, 6) + AES_DROUND01_L(50, 0, 2, 4) + st %f4, [%o2 + 0x00] + st %f5, [%o2 + 0x04] + st %f6, [%o2 + 0x08] + st %f7, [%o2 + 0x0c] + retl + VISExit +ENDPROC(aes_sparc64_decrypt_256) + + .align 32 +ENTRY(aes_sparc64_load_encrypt_keys_128) + /* %o0=key */ + VISEntry + ldd [%o0 + 0x10], %f8 + ldd [%o0 + 0x18], %f10 + ldd [%o0 + 0x20], %f12 + ldd [%o0 + 0x28], %f14 + ldd [%o0 + 0x30], %f16 + ldd [%o0 + 0x38], %f18 + ldd [%o0 + 0x40], %f20 + ldd [%o0 + 0x48], %f22 + ldd [%o0 + 0x50], %f24 + ldd [%o0 + 0x58], %f26 + ldd [%o0 + 0x60], %f28 + ldd [%o0 + 0x68], %f30 + ldd [%o0 + 0x70], %f32 + ldd [%o0 + 0x78], %f34 + ldd [%o0 + 0x80], %f36 + ldd [%o0 + 0x88], %f38 + ldd [%o0 + 0x90], %f40 + ldd [%o0 + 0x98], %f42 + ldd [%o0 + 0xa0], %f44 + retl + ldd [%o0 + 0xa8], %f46 +ENDPROC(aes_sparc64_load_encrypt_keys_128) + + .align 32 +ENTRY(aes_sparc64_load_encrypt_keys_192) + /* %o0=key */ + VISEntry + ldd [%o0 + 0x10], %f8 + ldd [%o0 + 0x18], %f10 + ldd [%o0 + 0x20], %f12 + ldd [%o0 + 0x28], %f14 + ldd [%o0 + 0x30], %f16 + ldd [%o0 + 0x38], %f18 + ldd [%o0 + 0x40], %f20 + ldd [%o0 + 0x48], %f22 + ldd [%o0 + 0x50], %f24 + ldd [%o0 + 0x58], %f26 + ldd [%o0 + 0x60], %f28 + ldd [%o0 + 0x68], %f30 + ldd [%o0 + 0x70], %f32 + ldd [%o0 + 0x78], %f34 + ldd [%o0 + 0x80], %f36 + ldd [%o0 + 0x88], %f38 + ldd [%o0 + 0x90], %f40 + ldd [%o0 + 0x98], %f42 + ldd [%o0 + 0xa0], %f44 + ldd [%o0 + 0xa8], %f46 + ldd [%o0 + 0xb0], %f48 + ldd [%o0 + 0xb8], %f50 + ldd [%o0 + 0xc0], %f52 + retl + ldd [%o0 + 0xc8], %f54 +ENDPROC(aes_sparc64_load_encrypt_keys_192) + + .align 32 +ENTRY(aes_sparc64_load_encrypt_keys_256) + /* %o0=key */ + VISEntry + ldd [%o0 + 0x10], %f8 + ldd [%o0 + 0x18], %f10 + ldd [%o0 + 0x20], %f12 + ldd [%o0 + 0x28], %f14 + ldd [%o0 + 0x30], %f16 + ldd [%o0 + 0x38], %f18 + ldd [%o0 + 0x40], %f20 + ldd [%o0 + 0x48], %f22 + ldd [%o0 + 0x50], %f24 + ldd [%o0 + 0x58], %f26 + ldd [%o0 + 0x60], %f28 + ldd [%o0 + 0x68], %f30 + ldd [%o0 + 0x70], %f32 + ldd [%o0 + 0x78], %f34 + ldd [%o0 + 0x80], %f36 + ldd [%o0 + 0x88], %f38 + ldd [%o0 + 0x90], %f40 + ldd [%o0 + 0x98], %f42 + ldd [%o0 + 0xa0], %f44 + ldd [%o0 + 0xa8], %f46 + ldd [%o0 + 0xb0], %f48 + ldd [%o0 + 0xb8], %f50 + ldd [%o0 + 0xc0], %f52 + ldd [%o0 + 0xc8], %f54 + ldd [%o0 + 0xd0], %f56 + ldd [%o0 + 0xd8], %f58 + ldd [%o0 + 0xe0], %f60 + retl + ldd [%o0 + 0xe8], %f62 +ENDPROC(aes_sparc64_load_encrypt_keys_256) + + .align 32 +ENTRY(aes_sparc64_load_decrypt_keys_128) + /* %o0=key */ + VISEntry + ldd [%o0 + 0x98], %f8 + ldd [%o0 + 0x90], %f10 + ldd [%o0 + 0x88], %f12 + ldd [%o0 + 0x80], %f14 + ldd [%o0 + 0x78], %f16 + ldd [%o0 + 0x70], %f18 + ldd [%o0 + 0x68], %f20 + ldd [%o0 + 0x60], %f22 + ldd [%o0 + 0x58], %f24 + ldd [%o0 + 0x50], %f26 + ldd [%o0 + 0x48], %f28 + ldd [%o0 + 0x40], %f30 + ldd [%o0 + 0x38], %f32 + ldd [%o0 + 0x30], %f34 + ldd [%o0 + 0x28], %f36 + ldd [%o0 + 0x20], %f38 + ldd [%o0 + 0x18], %f40 + ldd [%o0 + 0x10], %f42 + ldd [%o0 + 0x08], %f44 + retl + ldd [%o0 + 0x00], %f46 +ENDPROC(aes_sparc64_load_decrypt_keys_128) + + .align 32 +ENTRY(aes_sparc64_load_decrypt_keys_192) + /* %o0=key */ + VISEntry + ldd [%o0 + 0xb8], %f8 + ldd [%o0 + 0xb0], %f10 + ldd [%o0 + 0xa8], %f12 + ldd [%o0 + 0xa0], %f14 + ldd [%o0 + 0x98], %f16 + ldd [%o0 + 0x90], %f18 + ldd [%o0 + 0x88], %f20 + ldd [%o0 + 0x80], %f22 + ldd [%o0 + 0x78], %f24 + ldd [%o0 + 0x70], %f26 + ldd [%o0 + 0x68], %f28 + ldd [%o0 + 0x60], %f30 + ldd [%o0 + 0x58], %f32 + ldd [%o0 + 0x50], %f34 + ldd [%o0 + 0x48], %f36 + ldd [%o0 + 0x40], %f38 + ldd [%o0 + 0x38], %f40 + ldd [%o0 + 0x30], %f42 + ldd [%o0 + 0x28], %f44 + ldd [%o0 + 0x20], %f46 + ldd [%o0 + 0x18], %f48 + ldd [%o0 + 0x10], %f50 + ldd [%o0 + 0x08], %f52 + retl + ldd [%o0 + 0x00], %f54 +ENDPROC(aes_sparc64_load_decrypt_keys_192) + + .align 32 +ENTRY(aes_sparc64_load_decrypt_keys_256) + /* %o0=key */ + VISEntry + ldd [%o0 + 0xd8], %f8 + ldd [%o0 + 0xd0], %f10 + ldd [%o0 + 0xc8], %f12 + ldd [%o0 + 0xc0], %f14 + ldd [%o0 + 0xb8], %f16 + ldd [%o0 + 0xb0], %f18 + ldd [%o0 + 0xa8], %f20 + ldd [%o0 + 0xa0], %f22 + ldd [%o0 + 0x98], %f24 + ldd [%o0 + 0x90], %f26 + ldd [%o0 + 0x88], %f28 + ldd [%o0 + 0x80], %f30 + ldd [%o0 + 0x78], %f32 + ldd [%o0 + 0x70], %f34 + ldd [%o0 + 0x68], %f36 + ldd [%o0 + 0x60], %f38 + ldd [%o0 + 0x58], %f40 + ldd [%o0 + 0x50], %f42 + ldd [%o0 + 0x48], %f44 + ldd [%o0 + 0x40], %f46 + ldd [%o0 + 0x38], %f48 + ldd [%o0 + 0x30], %f50 + ldd [%o0 + 0x28], %f52 + ldd [%o0 + 0x20], %f54 + ldd [%o0 + 0x18], %f56 + ldd [%o0 + 0x10], %f58 + ldd [%o0 + 0x08], %f60 + retl + ldd [%o0 + 0x00], %f62 +ENDPROC(aes_sparc64_load_decrypt_keys_256) + + .align 32 +ENTRY(aes_sparc64_ecb_encrypt_128) + /* %o0=key, %o1=input, %o2=output, %o3=len */ + ldx [%o0 + 0x00], %g1 + subcc %o3, 0x10, %o3 + be 10f + ldx [%o0 + 0x08], %g2 +1: ldx [%o1 + 0x00], %g3 + ldx [%o1 + 0x08], %g7 + ldx [%o1 + 0x10], %o4 + ldx [%o1 + 0x18], %o5 + xor %g1, %g3, %g3 + xor %g2, %g7, %g7 + MOVXTOD_G3_F4 + MOVXTOD_G7_F6 + xor %g1, %o4, %g3 + xor %g2, %o5, %g7 + MOVXTOD_G3_F60 + MOVXTOD_G7_F62 + ENCRYPT_128_2(8, 4, 6, 60, 62, 0, 2, 56, 58) + std %f4, [%o2 + 0x00] + std %f6, [%o2 + 0x08] + std %f60, [%o2 + 0x10] + std %f62, [%o2 + 0x18] + sub %o3, 0x20, %o3 + add %o1, 0x20, %o1 + brgz %o3, 1b + add %o2, 0x20, %o2 + brlz,pt %o3, 11f + nop +10: ldx [%o1 + 0x00], %g3 + ldx [%o1 + 0x08], %g7 + xor %g1, %g3, %g3 + xor %g2, %g7, %g7 + MOVXTOD_G3_F4 + MOVXTOD_G7_F6 + ENCRYPT_128(8, 4, 6, 0, 2) + std %f4, [%o2 + 0x00] + std %f6, [%o2 + 0x08] +11: retl + nop +ENDPROC(aes_sparc64_ecb_encrypt_128) + + .align 32 +ENTRY(aes_sparc64_ecb_encrypt_192) + /* %o0=key, %o1=input, %o2=output, %o3=len */ + ldx [%o0 + 0x00], %g1 + subcc %o3, 0x10, %o3 + be 10f + ldx [%o0 + 0x08], %g2 +1: ldx [%o1 + 0x00], %g3 + ldx [%o1 + 0x08], %g7 + ldx [%o1 + 0x10], %o4 + ldx [%o1 + 0x18], %o5 + xor %g1, %g3, %g3 + xor %g2, %g7, %g7 + MOVXTOD_G3_F4 + MOVXTOD_G7_F6 + xor %g1, %o4, %g3 + xor %g2, %o5, %g7 + MOVXTOD_G3_F60 + MOVXTOD_G7_F62 + ENCRYPT_192_2(8, 4, 6, 60, 62, 0, 2, 56, 58) + std %f4, [%o2 + 0x00] + std %f6, [%o2 + 0x08] + std %f60, [%o2 + 0x10] + std %f62, [%o2 + 0x18] + sub %o3, 0x20, %o3 + add %o1, 0x20, %o1 + brgz %o3, 1b + add %o2, 0x20, %o2 + brlz,pt %o3, 11f + nop +10: ldx [%o1 + 0x00], %g3 + ldx [%o1 + 0x08], %g7 + xor %g1, %g3, %g3 + xor %g2, %g7, %g7 + MOVXTOD_G3_F4 + MOVXTOD_G7_F6 + ENCRYPT_192(8, 4, 6, 0, 2) + std %f4, [%o2 + 0x00] + std %f6, [%o2 + 0x08] +11: retl + nop +ENDPROC(aes_sparc64_ecb_encrypt_192) + + .align 32 +ENTRY(aes_sparc64_ecb_encrypt_256) + /* %o0=key, %o1=input, %o2=output, %o3=len */ + ldx [%o0 + 0x00], %g1 + subcc %o3, 0x10, %o3 + be 10f + ldx [%o0 + 0x08], %g2 +1: ldx [%o1 + 0x00], %g3 + ldx [%o1 + 0x08], %g7 + ldx [%o1 + 0x10], %o4 + ldx [%o1 + 0x18], %o5 + xor %g1, %g3, %g3 + xor %g2, %g7, %g7 + MOVXTOD_G3_F4 + MOVXTOD_G7_F6 + xor %g1, %o4, %g3 + xor %g2, %o5, %g7 + MOVXTOD_G3_F0 + MOVXTOD_G7_F2 + ENCRYPT_256_2(8, 4, 6, 0, 2) + std %f4, [%o2 + 0x00] + std %f6, [%o2 + 0x08] + std %f0, [%o2 + 0x10] + std %f2, [%o2 + 0x18] + sub %o3, 0x20, %o3 + add %o1, 0x20, %o1 + brgz %o3, 1b + add %o2, 0x20, %o2 + brlz,pt %o3, 11f + nop +10: ldx [%o1 + 0x00], %g3 + ldx [%o1 + 0x08], %g7 + xor %g1, %g3, %g3 + xor %g2, %g7, %g7 + MOVXTOD_G3_F4 + MOVXTOD_G7_F6 + ENCRYPT_256(8, 4, 6, 0, 2) + std %f4, [%o2 + 0x00] + std %f6, [%o2 + 0x08] +11: retl + nop +ENDPROC(aes_sparc64_ecb_encrypt_256) + + .align 32 +ENTRY(aes_sparc64_ecb_decrypt_128) + /* %o0=&key[key_len], %o1=input, %o2=output, %o3=len */ + ldx [%o0 - 0x10], %g1 + subcc %o3, 0x10, %o3 + be 10f + ldx [%o0 - 0x08], %g2 +1: ldx [%o1 + 0x00], %g3 + ldx [%o1 + 0x08], %g7 + ldx [%o1 + 0x10], %o4 + ldx [%o1 + 0x18], %o5 + xor %g1, %g3, %g3 + xor %g2, %g7, %g7 + MOVXTOD_G3_F4 + MOVXTOD_G7_F6 + xor %g1, %o4, %g3 + xor %g2, %o5, %g7 + MOVXTOD_G3_F60 + MOVXTOD_G7_F62 + DECRYPT_128_2(8, 4, 6, 60, 62, 0, 2, 56, 58) + std %f4, [%o2 + 0x00] + std %f6, [%o2 + 0x08] + std %f60, [%o2 + 0x10] + std %f62, [%o2 + 0x18] + sub %o3, 0x20, %o3 + add %o1, 0x20, %o1 + brgz,pt %o3, 1b + add %o2, 0x20, %o2 + brlz,pt %o3, 11f + nop +10: ldx [%o1 + 0x00], %g3 + ldx [%o1 + 0x08], %g7 + xor %g1, %g3, %g3 + xor %g2, %g7, %g7 + MOVXTOD_G3_F4 + MOVXTOD_G7_F6 + DECRYPT_128(8, 4, 6, 0, 2) + std %f4, [%o2 + 0x00] + std %f6, [%o2 + 0x08] +11: retl + nop +ENDPROC(aes_sparc64_ecb_decrypt_128) + + .align 32 +ENTRY(aes_sparc64_ecb_decrypt_192) + /* %o0=&key[key_len], %o1=input, %o2=output, %o3=len */ + ldx [%o0 - 0x10], %g1 + subcc %o3, 0x10, %o3 + be 10f + ldx [%o0 - 0x08], %g2 +1: ldx [%o1 + 0x00], %g3 + ldx [%o1 + 0x08], %g7 + ldx [%o1 + 0x10], %o4 + ldx [%o1 + 0x18], %o5 + xor %g1, %g3, %g3 + xor %g2, %g7, %g7 + MOVXTOD_G3_F4 + MOVXTOD_G7_F6 + xor %g1, %o4, %g3 + xor %g2, %o5, %g7 + MOVXTOD_G3_F60 + MOVXTOD_G7_F62 + DECRYPT_192_2(8, 4, 6, 60, 62, 0, 2, 56, 58) + std %f4, [%o2 + 0x00] + std %f6, [%o2 + 0x08] + std %f60, [%o2 + 0x10] + std %f62, [%o2 + 0x18] + sub %o3, 0x20, %o3 + add %o1, 0x20, %o1 + brgz,pt %o3, 1b + add %o2, 0x20, %o2 + brlz,pt %o3, 11f + nop +10: ldx [%o1 + 0x00], %g3 + ldx [%o1 + 0x08], %g7 + xor %g1, %g3, %g3 + xor %g2, %g7, %g7 + MOVXTOD_G3_F4 + MOVXTOD_G7_F6 + DECRYPT_192(8, 4, 6, 0, 2) + std %f4, [%o2 + 0x00] + std %f6, [%o2 + 0x08] +11: retl + nop +ENDPROC(aes_sparc64_ecb_decrypt_192) + + .align 32 +ENTRY(aes_sparc64_ecb_decrypt_256) + /* %o0=&key[key_len], %o1=input, %o2=output, %o3=len */ + ldx [%o0 - 0x10], %g1 + subcc %o3, 0x10, %o3 + be 10f + ldx [%o0 - 0x08], %g2 + sub %o0, 0xf0, %o0 +1: ldx [%o1 + 0x00], %g3 + ldx [%o1 + 0x08], %g7 + ldx [%o1 + 0x10], %o4 + ldx [%o1 + 0x18], %o5 + xor %g1, %g3, %g3 + xor %g2, %g7, %g7 + MOVXTOD_G3_F4 + MOVXTOD_G7_F6 + xor %g1, %o4, %g3 + xor %g2, %o5, %g7 + MOVXTOD_G3_F0 + MOVXTOD_G7_F2 + DECRYPT_256_2(8, 4, 6, 0, 2) + std %f4, [%o2 + 0x00] + std %f6, [%o2 + 0x08] + std %f0, [%o2 + 0x10] + std %f2, [%o2 + 0x18] + sub %o3, 0x20, %o3 + add %o1, 0x20, %o1 + brgz,pt %o3, 1b + add %o2, 0x20, %o2 + brlz,pt %o3, 11f + nop +10: ldx [%o1 + 0x00], %g3 + ldx [%o1 + 0x08], %g7 + xor %g1, %g3, %g3 + xor %g2, %g7, %g7 + MOVXTOD_G3_F4 + MOVXTOD_G7_F6 + DECRYPT_256(8, 4, 6, 0, 2) + std %f4, [%o2 + 0x00] + std %f6, [%o2 + 0x08] +11: retl + nop +ENDPROC(aes_sparc64_ecb_decrypt_256) + + .align 32 +ENTRY(aes_sparc64_cbc_encrypt_128) + /* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */ + ldd [%o4 + 0x00], %f4 + ldd [%o4 + 0x08], %f6 + ldx [%o0 + 0x00], %g1 + ldx [%o0 + 0x08], %g2 +1: ldx [%o1 + 0x00], %g3 + ldx [%o1 + 0x08], %g7 + add %o1, 0x10, %o1 + xor %g1, %g3, %g3 + xor %g2, %g7, %g7 + MOVXTOD_G3_F0 + MOVXTOD_G7_F2 + fxor %f4, %f0, %f4 + fxor %f6, %f2, %f6 + ENCRYPT_128(8, 4, 6, 0, 2) + std %f4, [%o2 + 0x00] + std %f6, [%o2 + 0x08] + subcc %o3, 0x10, %o3 + bne,pt %xcc, 1b + add %o2, 0x10, %o2 + std %f4, [%o4 + 0x00] + std %f6, [%o4 + 0x08] + retl + nop +ENDPROC(aes_sparc64_cbc_encrypt_128) + + .align 32 +ENTRY(aes_sparc64_cbc_encrypt_192) + /* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */ + ldd [%o4 + 0x00], %f4 + ldd [%o4 + 0x08], %f6 + ldx [%o0 + 0x00], %g1 + ldx [%o0 + 0x08], %g2 +1: ldx [%o1 + 0x00], %g3 + ldx [%o1 + 0x08], %g7 + add %o1, 0x10, %o1 + xor %g1, %g3, %g3 + xor %g2, %g7, %g7 + MOVXTOD_G3_F0 + MOVXTOD_G7_F2 + fxor %f4, %f0, %f4 + fxor %f6, %f2, %f6 + ENCRYPT_192(8, 4, 6, 0, 2) + std %f4, [%o2 + 0x00] + std %f6, [%o2 + 0x08] + subcc %o3, 0x10, %o3 + bne,pt %xcc, 1b + add %o2, 0x10, %o2 + std %f4, [%o4 + 0x00] + std %f6, [%o4 + 0x08] + retl + nop +ENDPROC(aes_sparc64_cbc_encrypt_192) + + .align 32 +ENTRY(aes_sparc64_cbc_encrypt_256) + /* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */ + ldd [%o4 + 0x00], %f4 + ldd [%o4 + 0x08], %f6 + ldx [%o0 + 0x00], %g1 + ldx [%o0 + 0x08], %g2 +1: ldx [%o1 + 0x00], %g3 + ldx [%o1 + 0x08], %g7 + add %o1, 0x10, %o1 + xor %g1, %g3, %g3 + xor %g2, %g7, %g7 + MOVXTOD_G3_F0 + MOVXTOD_G7_F2 + fxor %f4, %f0, %f4 + fxor %f6, %f2, %f6 + ENCRYPT_256(8, 4, 6, 0, 2) + std %f4, [%o2 + 0x00] + std %f6, [%o2 + 0x08] + subcc %o3, 0x10, %o3 + bne,pt %xcc, 1b + add %o2, 0x10, %o2 + std %f4, [%o4 + 0x00] + std %f6, [%o4 + 0x08] + retl + nop +ENDPROC(aes_sparc64_cbc_encrypt_256) + + .align 32 +ENTRY(aes_sparc64_cbc_decrypt_128) + /* %o0=&key[key_len], %o1=input, %o2=output, %o3=len, %o4=iv */ + ldx [%o0 - 0x10], %g1 + ldx [%o0 - 0x08], %g2 + ldx [%o4 + 0x00], %o0 + ldx [%o4 + 0x08], %o5 +1: ldx [%o1 + 0x00], %g3 + ldx [%o1 + 0x08], %g7 + add %o1, 0x10, %o1 + xor %g1, %g3, %g3 + xor %g2, %g7, %g7 + MOVXTOD_G3_F4 + MOVXTOD_G7_F6 + DECRYPT_128(8, 4, 6, 0, 2) + MOVXTOD_O0_F0 + MOVXTOD_O5_F2 + xor %g1, %g3, %o0 + xor %g2, %g7, %o5 + fxor %f4, %f0, %f4 + fxor %f6, %f2, %f6 + std %f4, [%o2 + 0x00] + std %f6, [%o2 + 0x08] + subcc %o3, 0x10, %o3 + bne,pt %xcc, 1b + add %o2, 0x10, %o2 + stx %o0, [%o4 + 0x00] + stx %o5, [%o4 + 0x08] + retl + nop +ENDPROC(aes_sparc64_cbc_decrypt_128) + + .align 32 +ENTRY(aes_sparc64_cbc_decrypt_192) + /* %o0=&key[key_len], %o1=input, %o2=output, %o3=len, %o4=iv */ + ldx [%o0 - 0x10], %g1 + ldx [%o0 - 0x08], %g2 + ldx [%o4 + 0x00], %o0 + ldx [%o4 + 0x08], %o5 +1: ldx [%o1 + 0x00], %g3 + ldx [%o1 + 0x08], %g7 + add %o1, 0x10, %o1 + xor %g1, %g3, %g3 + xor %g2, %g7, %g7 + MOVXTOD_G3_F4 + MOVXTOD_G7_F6 + DECRYPT_192(8, 4, 6, 0, 2) + MOVXTOD_O0_F0 + MOVXTOD_O5_F2 + xor %g1, %g3, %o0 + xor %g2, %g7, %o5 + fxor %f4, %f0, %f4 + fxor %f6, %f2, %f6 + std %f4, [%o2 + 0x00] + std %f6, [%o2 + 0x08] + subcc %o3, 0x10, %o3 + bne,pt %xcc, 1b + add %o2, 0x10, %o2 + stx %o0, [%o4 + 0x00] + stx %o5, [%o4 + 0x08] + retl + nop +ENDPROC(aes_sparc64_cbc_decrypt_192) + + .align 32 +ENTRY(aes_sparc64_cbc_decrypt_256) + /* %o0=&key[key_len], %o1=input, %o2=output, %o3=len, %o4=iv */ + ldx [%o0 - 0x10], %g1 + ldx [%o0 - 0x08], %g2 + ldx [%o4 + 0x00], %o0 + ldx [%o4 + 0x08], %o5 +1: ldx [%o1 + 0x00], %g3 + ldx [%o1 + 0x08], %g7 + add %o1, 0x10, %o1 + xor %g1, %g3, %g3 + xor %g2, %g7, %g7 + MOVXTOD_G3_F4 + MOVXTOD_G7_F6 + DECRYPT_256(8, 4, 6, 0, 2) + MOVXTOD_O0_F0 + MOVXTOD_O5_F2 + xor %g1, %g3, %o0 + xor %g2, %g7, %o5 + fxor %f4, %f0, %f4 + fxor %f6, %f2, %f6 + std %f4, [%o2 + 0x00] + std %f6, [%o2 + 0x08] + subcc %o3, 0x10, %o3 + bne,pt %xcc, 1b + add %o2, 0x10, %o2 + stx %o0, [%o4 + 0x00] + stx %o5, [%o4 + 0x08] + retl + nop +ENDPROC(aes_sparc64_cbc_decrypt_256) + + .align 32 +ENTRY(aes_sparc64_ctr_crypt_128) + /* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */ + ldx [%o4 + 0x00], %g3 + ldx [%o4 + 0x08], %g7 + subcc %o3, 0x10, %o3 + ldx [%o0 + 0x00], %g1 + be 10f + ldx [%o0 + 0x08], %g2 +1: xor %g1, %g3, %o5 + MOVXTOD_O5_F0 + xor %g2, %g7, %o5 + MOVXTOD_O5_F2 + add %g7, 1, %g7 + add %g3, 1, %o5 + movrz %g7, %o5, %g3 + xor %g1, %g3, %o5 + MOVXTOD_O5_F4 + xor %g2, %g7, %o5 + MOVXTOD_O5_F6 + add %g7, 1, %g7 + add %g3, 1, %o5 + movrz %g7, %o5, %g3 + ENCRYPT_128_2(8, 0, 2, 4, 6, 56, 58, 60, 62) + ldd [%o1 + 0x00], %f56 + ldd [%o1 + 0x08], %f58 + ldd [%o1 + 0x10], %f60 + ldd [%o1 + 0x18], %f62 + fxor %f56, %f0, %f56 + fxor %f58, %f2, %f58 + fxor %f60, %f4, %f60 + fxor %f62, %f6, %f62 + std %f56, [%o2 + 0x00] + std %f58, [%o2 + 0x08] + std %f60, [%o2 + 0x10] + std %f62, [%o2 + 0x18] + subcc %o3, 0x20, %o3 + add %o1, 0x20, %o1 + brgz %o3, 1b + add %o2, 0x20, %o2 + brlz,pt %o3, 11f + nop +10: xor %g1, %g3, %o5 + MOVXTOD_O5_F0 + xor %g2, %g7, %o5 + MOVXTOD_O5_F2 + add %g7, 1, %g7 + add %g3, 1, %o5 + movrz %g7, %o5, %g3 + ENCRYPT_128(8, 0, 2, 4, 6) + ldd [%o1 + 0x00], %f4 + ldd [%o1 + 0x08], %f6 + fxor %f4, %f0, %f4 + fxor %f6, %f2, %f6 + std %f4, [%o2 + 0x00] + std %f6, [%o2 + 0x08] +11: stx %g3, [%o4 + 0x00] + retl + stx %g7, [%o4 + 0x08] +ENDPROC(aes_sparc64_ctr_crypt_128) + + .align 32 +ENTRY(aes_sparc64_ctr_crypt_192) + /* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */ + ldx [%o4 + 0x00], %g3 + ldx [%o4 + 0x08], %g7 + subcc %o3, 0x10, %o3 + ldx [%o0 + 0x00], %g1 + be 10f + ldx [%o0 + 0x08], %g2 +1: xor %g1, %g3, %o5 + MOVXTOD_O5_F0 + xor %g2, %g7, %o5 + MOVXTOD_O5_F2 + add %g7, 1, %g7 + add %g3, 1, %o5 + movrz %g7, %o5, %g3 + xor %g1, %g3, %o5 + MOVXTOD_O5_F4 + xor %g2, %g7, %o5 + MOVXTOD_O5_F6 + add %g7, 1, %g7 + add %g3, 1, %o5 + movrz %g7, %o5, %g3 + ENCRYPT_192_2(8, 0, 2, 4, 6, 56, 58, 60, 62) + ldd [%o1 + 0x00], %f56 + ldd [%o1 + 0x08], %f58 + ldd [%o1 + 0x10], %f60 + ldd [%o1 + 0x18], %f62 + fxor %f56, %f0, %f56 + fxor %f58, %f2, %f58 + fxor %f60, %f4, %f60 + fxor %f62, %f6, %f62 + std %f56, [%o2 + 0x00] + std %f58, [%o2 + 0x08] + std %f60, [%o2 + 0x10] + std %f62, [%o2 + 0x18] + subcc %o3, 0x20, %o3 + add %o1, 0x20, %o1 + brgz %o3, 1b + add %o2, 0x20, %o2 + brlz,pt %o3, 11f + nop +10: xor %g1, %g3, %o5 + MOVXTOD_O5_F0 + xor %g2, %g7, %o5 + MOVXTOD_O5_F2 + add %g7, 1, %g7 + add %g3, 1, %o5 + movrz %g7, %o5, %g3 + ENCRYPT_192(8, 0, 2, 4, 6) + ldd [%o1 + 0x00], %f4 + ldd [%o1 + 0x08], %f6 + fxor %f4, %f0, %f4 + fxor %f6, %f2, %f6 + std %f4, [%o2 + 0x00] + std %f6, [%o2 + 0x08] +11: stx %g3, [%o4 + 0x00] + retl + stx %g7, [%o4 + 0x08] +ENDPROC(aes_sparc64_ctr_crypt_192) + + .align 32 +ENTRY(aes_sparc64_ctr_crypt_256) + /* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */ + ldx [%o4 + 0x00], %g3 + ldx [%o4 + 0x08], %g7 + subcc %o3, 0x10, %o3 + ldx [%o0 + 0x00], %g1 + be 10f + ldx [%o0 + 0x08], %g2 +1: xor %g1, %g3, %o5 + MOVXTOD_O5_F0 + xor %g2, %g7, %o5 + MOVXTOD_O5_F2 + add %g7, 1, %g7 + add %g3, 1, %o5 + movrz %g7, %o5, %g3 + xor %g1, %g3, %o5 + MOVXTOD_O5_F4 + xor %g2, %g7, %o5 + MOVXTOD_O5_F6 + add %g7, 1, %g7 + add %g3, 1, %o5 + movrz %g7, %o5, %g3 + ENCRYPT_256_2(8, 0, 2, 4, 6) + ldd [%o1 + 0x00], %f56 + ldd [%o1 + 0x08], %f58 + ldd [%o1 + 0x10], %f60 + ldd [%o1 + 0x18], %f62 + fxor %f56, %f0, %f56 + fxor %f58, %f2, %f58 + fxor %f60, %f4, %f60 + fxor %f62, %f6, %f62 + std %f56, [%o2 + 0x00] + std %f58, [%o2 + 0x08] + std %f60, [%o2 + 0x10] + std %f62, [%o2 + 0x18] + subcc %o3, 0x20, %o3 + add %o1, 0x20, %o1 + brgz %o3, 1b + add %o2, 0x20, %o2 + brlz,pt %o3, 11f + nop + ldd [%o0 + 0xd0], %f56 + ldd [%o0 + 0xd8], %f58 + ldd [%o0 + 0xe0], %f60 + ldd [%o0 + 0xe8], %f62 +10: xor %g1, %g3, %o5 + MOVXTOD_O5_F0 + xor %g2, %g7, %o5 + MOVXTOD_O5_F2 + add %g7, 1, %g7 + add %g3, 1, %o5 + movrz %g7, %o5, %g3 + ENCRYPT_256(8, 0, 2, 4, 6) + ldd [%o1 + 0x00], %f4 + ldd [%o1 + 0x08], %f6 + fxor %f4, %f0, %f4 + fxor %f6, %f2, %f6 + std %f4, [%o2 + 0x00] + std %f6, [%o2 + 0x08] +11: stx %g3, [%o4 + 0x00] + retl + stx %g7, [%o4 + 0x08] +ENDPROC(aes_sparc64_ctr_crypt_256) diff --git a/arch/sparc/crypto/aes_glue.c b/arch/sparc/crypto/aes_glue.c new file mode 100644 index 00000000000..8f1c9980f63 --- /dev/null +++ b/arch/sparc/crypto/aes_glue.c @@ -0,0 +1,477 @@ +/* Glue code for AES encryption optimized for sparc64 crypto opcodes. + * + * This is based largely upon arch/x86/crypto/aesni-intel_glue.c + * + * Copyright (C) 2008, Intel Corp. + * Author: Huang Ying <ying.huang@intel.com> + * + * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD + * interface for 64-bit kernels. + * Authors: Adrian Hoban <adrian.hoban@intel.com> + * Gabriele Paoloni <gabriele.paoloni@intel.com> + * Tadeusz Struk (tadeusz.struk@intel.com) + * Aidan O'Mahony (aidan.o.mahony@intel.com) + * Copyright (c) 2010, Intel Corporation. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/crypto.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/types.h> +#include <crypto/algapi.h> +#include <crypto/aes.h> + +#include <asm/fpumacro.h> +#include <asm/pstate.h> +#include <asm/elf.h> + +#include "opcodes.h" + +struct aes_ops { + void (*encrypt)(const u64 *key, const u32 *input, u32 *output); + void (*decrypt)(const u64 *key, const u32 *input, u32 *output); + void (*load_encrypt_keys)(const u64 *key); + void (*load_decrypt_keys)(const u64 *key); + void (*ecb_encrypt)(const u64 *key, const u64 *input, u64 *output, + unsigned int len); + void (*ecb_decrypt)(const u64 *key, const u64 *input, u64 *output, + unsigned int len); + void (*cbc_encrypt)(const u64 *key, const u64 *input, u64 *output, + unsigned int len, u64 *iv); + void (*cbc_decrypt)(const u64 *key, const u64 *input, u64 *output, + unsigned int len, u64 *iv); + void (*ctr_crypt)(const u64 *key, const u64 *input, u64 *output, + unsigned int len, u64 *iv); +}; + +struct crypto_sparc64_aes_ctx { + struct aes_ops *ops; + u64 key[AES_MAX_KEYLENGTH / sizeof(u64)]; + u32 key_length; + u32 expanded_key_length; +}; + +extern void aes_sparc64_encrypt_128(const u64 *key, const u32 *input, + u32 *output); +extern void aes_sparc64_encrypt_192(const u64 *key, const u32 *input, + u32 *output); +extern void aes_sparc64_encrypt_256(const u64 *key, const u32 *input, + u32 *output); + +extern void aes_sparc64_decrypt_128(const u64 *key, const u32 *input, + u32 *output); +extern void aes_sparc64_decrypt_192(const u64 *key, const u32 *input, + u32 *output); +extern void aes_sparc64_decrypt_256(const u64 *key, const u32 *input, + u32 *output); + +extern void aes_sparc64_load_encrypt_keys_128(const u64 *key); +extern void aes_sparc64_load_encrypt_keys_192(const u64 *key); +extern void aes_sparc64_load_encrypt_keys_256(const u64 *key); + +extern void aes_sparc64_load_decrypt_keys_128(const u64 *key); +extern void aes_sparc64_load_decrypt_keys_192(const u64 *key); +extern void aes_sparc64_load_decrypt_keys_256(const u64 *key); + +extern void aes_sparc64_ecb_encrypt_128(const u64 *key, const u64 *input, + u64 *output, unsigned int len); +extern void aes_sparc64_ecb_encrypt_192(const u64 *key, const u64 *input, + u64 *output, unsigned int len); +extern void aes_sparc64_ecb_encrypt_256(const u64 *key, const u64 *input, + u64 *output, unsigned int len); + +extern void aes_sparc64_ecb_decrypt_128(const u64 *key, const u64 *input, + u64 *output, unsigned int len); +extern void aes_sparc64_ecb_decrypt_192(const u64 *key, const u64 *input, + u64 *output, unsigned int len); +extern void aes_sparc64_ecb_decrypt_256(const u64 *key, const u64 *input, + u64 *output, unsigned int len); + +extern void aes_sparc64_cbc_encrypt_128(const u64 *key, const u64 *input, + u64 *output, unsigned int len, + u64 *iv); + +extern void aes_sparc64_cbc_encrypt_192(const u64 *key, const u64 *input, + u64 *output, unsigned int len, + u64 *iv); + +extern void aes_sparc64_cbc_encrypt_256(const u64 *key, const u64 *input, + u64 *output, unsigned int len, + u64 *iv); + +extern void aes_sparc64_cbc_decrypt_128(const u64 *key, const u64 *input, + u64 *output, unsigned int len, + u64 *iv); + +extern void aes_sparc64_cbc_decrypt_192(const u64 *key, const u64 *input, + u64 *output, unsigned int len, + u64 *iv); + +extern void aes_sparc64_cbc_decrypt_256(const u64 *key, const u64 *input, + u64 *output, unsigned int len, + u64 *iv); + +extern void aes_sparc64_ctr_crypt_128(const u64 *key, const u64 *input, + u64 *output, unsigned int len, + u64 *iv); +extern void aes_sparc64_ctr_crypt_192(const u64 *key, const u64 *input, + u64 *output, unsigned int len, + u64 *iv); +extern void aes_sparc64_ctr_crypt_256(const u64 *key, const u64 *input, + u64 *output, unsigned int len, + u64 *iv); + +struct aes_ops aes128_ops = { + .encrypt = aes_sparc64_encrypt_128, + .decrypt = aes_sparc64_decrypt_128, + .load_encrypt_keys = aes_sparc64_load_encrypt_keys_128, + .load_decrypt_keys = aes_sparc64_load_decrypt_keys_128, + .ecb_encrypt = aes_sparc64_ecb_encrypt_128, + .ecb_decrypt = aes_sparc64_ecb_decrypt_128, + .cbc_encrypt = aes_sparc64_cbc_encrypt_128, + .cbc_decrypt = aes_sparc64_cbc_decrypt_128, + .ctr_crypt = aes_sparc64_ctr_crypt_128, +}; + +struct aes_ops aes192_ops = { + .encrypt = aes_sparc64_encrypt_192, + .decrypt = aes_sparc64_decrypt_192, + .load_encrypt_keys = aes_sparc64_load_encrypt_keys_192, + .load_decrypt_keys = aes_sparc64_load_decrypt_keys_192, + .ecb_encrypt = aes_sparc64_ecb_encrypt_192, + .ecb_decrypt = aes_sparc64_ecb_decrypt_192, + .cbc_encrypt = aes_sparc64_cbc_encrypt_192, + .cbc_decrypt = aes_sparc64_cbc_decrypt_192, + .ctr_crypt = aes_sparc64_ctr_crypt_192, +}; + +struct aes_ops aes256_ops = { + .encrypt = aes_sparc64_encrypt_256, + .decrypt = aes_sparc64_decrypt_256, + .load_encrypt_keys = aes_sparc64_load_encrypt_keys_256, + .load_decrypt_keys = aes_sparc64_load_decrypt_keys_256, + .ecb_encrypt = aes_sparc64_ecb_encrypt_256, + .ecb_decrypt = aes_sparc64_ecb_decrypt_256, + .cbc_encrypt = aes_sparc64_cbc_encrypt_256, + .cbc_decrypt = aes_sparc64_cbc_decrypt_256, + .ctr_crypt = aes_sparc64_ctr_crypt_256, +}; + +extern void aes_sparc64_key_expand(const u32 *in_key, u64 *output_key, + unsigned int key_len); + +static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct crypto_sparc64_aes_ctx *ctx = crypto_tfm_ctx(tfm); + u32 *flags = &tfm->crt_flags; + + switch (key_len) { + case AES_KEYSIZE_128: + ctx->expanded_key_length = 0xb0; + ctx->ops = &aes128_ops; + break; + + case AES_KEYSIZE_192: + ctx->expanded_key_length = 0xd0; + ctx->ops = &aes192_ops; + break; + + case AES_KEYSIZE_256: + ctx->expanded_key_length = 0xf0; + ctx->ops = &aes256_ops; + break; + + default: + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + aes_sparc64_key_expand((const u32 *)in_key, &ctx->key[0], key_len); + ctx->key_length = key_len; + + return 0; +} + +static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + struct crypto_sparc64_aes_ctx *ctx = crypto_tfm_ctx(tfm); + + ctx->ops->encrypt(&ctx->key[0], (const u32 *) src, (u32 *) dst); +} + +static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + struct crypto_sparc64_aes_ctx *ctx = crypto_tfm_ctx(tfm); + + ctx->ops->decrypt(&ctx->key[0], (const u32 *) src, (u32 *) dst); +} + +#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1)) + +static int ecb_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct crypto_sparc64_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + ctx->ops->load_encrypt_keys(&ctx->key[0]); + while ((nbytes = walk.nbytes)) { + unsigned int block_len = nbytes & AES_BLOCK_MASK; + + if (likely(block_len)) { + ctx->ops->ecb_encrypt(&ctx->key[0], + (const u64 *)walk.src.virt.addr, + (u64 *) walk.dst.virt.addr, + block_len); + } + nbytes &= AES_BLOCK_SIZE - 1; + err = blkcipher_walk_done(desc, &walk, nbytes); + } + fprs_write(0); + return err; +} + +static int ecb_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct crypto_sparc64_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + u64 *key_end; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + ctx->ops->load_decrypt_keys(&ctx->key[0]); + key_end = &ctx->key[ctx->expanded_key_length / sizeof(u64)]; + while ((nbytes = walk.nbytes)) { + unsigned int block_len = nbytes & AES_BLOCK_MASK; + + if (likely(block_len)) { + ctx->ops->ecb_decrypt(key_end, + (const u64 *) walk.src.virt.addr, + (u64 *) walk.dst.virt.addr, block_len); + } + nbytes &= AES_BLOCK_SIZE - 1; + err = blkcipher_walk_done(desc, &walk, nbytes); + } + fprs_write(0); + + return err; +} + +static int cbc_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct crypto_sparc64_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + ctx->ops->load_encrypt_keys(&ctx->key[0]); + while ((nbytes = walk.nbytes)) { + unsigned int block_len = nbytes & AES_BLOCK_MASK; + + if (likely(block_len)) { + ctx->ops->cbc_encrypt(&ctx->key[0], + (const u64 *)walk.src.virt.addr, + (u64 *) walk.dst.virt.addr, + block_len, (u64 *) walk.iv); + } + nbytes &= AES_BLOCK_SIZE - 1; + err = blkcipher_walk_done(desc, &walk, nbytes); + } + fprs_write(0); + return err; +} + +static int cbc_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct crypto_sparc64_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + u64 *key_end; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + ctx->ops->load_decrypt_keys(&ctx->key[0]); + key_end = &ctx->key[ctx->expanded_key_length / sizeof(u64)]; + while ((nbytes = walk.nbytes)) { + unsigned int block_len = nbytes & AES_BLOCK_MASK; + + if (likely(block_len)) { + ctx->ops->cbc_decrypt(key_end, + (const u64 *) walk.src.virt.addr, + (u64 *) walk.dst.virt.addr, + block_len, (u64 *) walk.iv); + } + nbytes &= AES_BLOCK_SIZE - 1; + err = blkcipher_walk_done(desc, &walk, nbytes); + } + fprs_write(0); + + return err; +} + +static int ctr_crypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct crypto_sparc64_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + ctx->ops->load_encrypt_keys(&ctx->key[0]); + while ((nbytes = walk.nbytes)) { + unsigned int block_len = nbytes & AES_BLOCK_MASK; + + if (likely(block_len)) { + ctx->ops->ctr_crypt(&ctx->key[0], + (const u64 *)walk.src.virt.addr, + (u64 *) walk.dst.virt.addr, + block_len, (u64 *) walk.iv); + } + nbytes &= AES_BLOCK_SIZE - 1; + err = blkcipher_walk_done(desc, &walk, nbytes); + } + fprs_write(0); + return err; +} + +static struct crypto_alg algs[] = { { + .cra_name = "aes", + .cra_driver_name = "aes-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_sparc64_aes_ctx), + .cra_alignmask = 3, + .cra_module = THIS_MODULE, + .cra_u = { + .cipher = { + .cia_min_keysize = AES_MIN_KEY_SIZE, + .cia_max_keysize = AES_MAX_KEY_SIZE, + .cia_setkey = aes_set_key, + .cia_encrypt = aes_encrypt, + .cia_decrypt = aes_decrypt + } + } +}, { + .cra_name = "ecb(aes)", + .cra_driver_name = "ecb-aes-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_sparc64_aes_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .setkey = aes_set_key, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}, { + .cra_name = "cbc(aes)", + .cra_driver_name = "cbc-aes-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_sparc64_aes_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .setkey = aes_set_key, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}, { + .cra_name = "ctr(aes)", + .cra_driver_name = "ctr-aes-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_sparc64_aes_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .setkey = aes_set_key, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +} }; + +static bool __init sparc64_has_aes_opcode(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) + return false; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_AES)) + return false; + + return true; +} + +static int __init aes_sparc64_mod_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(algs); i++) + INIT_LIST_HEAD(&algs[i].cra_list); + + if (sparc64_has_aes_opcode()) { + pr_info("Using sparc64 aes opcodes optimized AES implementation\n"); + return crypto_register_algs(algs, ARRAY_SIZE(algs)); + } + pr_info("sparc64 aes opcodes not available.\n"); + return -ENODEV; +} + +static void __exit aes_sparc64_mod_fini(void) +{ + crypto_unregister_algs(algs, ARRAY_SIZE(algs)); +} + +module_init(aes_sparc64_mod_init); +module_exit(aes_sparc64_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("AES Secure Hash Algorithm, sparc64 aes opcode accelerated"); + +MODULE_ALIAS("aes"); diff --git a/arch/sparc/crypto/camellia_asm.S b/arch/sparc/crypto/camellia_asm.S new file mode 100644 index 00000000000..cc39553a4e4 --- /dev/null +++ b/arch/sparc/crypto/camellia_asm.S @@ -0,0 +1,563 @@ +#include <linux/linkage.h> +#include <asm/visasm.h> + +#include "opcodes.h" + +#define CAMELLIA_6ROUNDS(KEY_BASE, I0, I1) \ + CAMELLIA_F(KEY_BASE + 0, I1, I0, I1) \ + CAMELLIA_F(KEY_BASE + 2, I0, I1, I0) \ + CAMELLIA_F(KEY_BASE + 4, I1, I0, I1) \ + CAMELLIA_F(KEY_BASE + 6, I0, I1, I0) \ + CAMELLIA_F(KEY_BASE + 8, I1, I0, I1) \ + CAMELLIA_F(KEY_BASE + 10, I0, I1, I0) + +#define CAMELLIA_6ROUNDS_FL_FLI(KEY_BASE, I0, I1) \ + CAMELLIA_6ROUNDS(KEY_BASE, I0, I1) \ + CAMELLIA_FL(KEY_BASE + 12, I0, I0) \ + CAMELLIA_FLI(KEY_BASE + 14, I1, I1) + + .data + + .align 8 +SIGMA: .xword 0xA09E667F3BCC908B + .xword 0xB67AE8584CAA73B2 + .xword 0xC6EF372FE94F82BE + .xword 0x54FF53A5F1D36F1C + .xword 0x10E527FADE682D1D + .xword 0xB05688C2B3E6C1FD + + .text + + .align 32 +ENTRY(camellia_sparc64_key_expand) + /* %o0=in_key, %o1=encrypt_key, %o2=key_len, %o3=decrypt_key */ + VISEntry + ld [%o0 + 0x00], %f0 ! i0, k[0] + ld [%o0 + 0x04], %f1 ! i1, k[1] + ld [%o0 + 0x08], %f2 ! i2, k[2] + ld [%o0 + 0x0c], %f3 ! i3, k[3] + std %f0, [%o1 + 0x00] ! k[0, 1] + fsrc2 %f0, %f28 + std %f2, [%o1 + 0x08] ! k[2, 3] + cmp %o2, 16 + be 10f + fsrc2 %f2, %f30 + + ld [%o0 + 0x10], %f0 + ld [%o0 + 0x14], %f1 + std %f0, [%o1 + 0x20] ! k[8, 9] + cmp %o2, 24 + fone %f10 + be,a 1f + fxor %f10, %f0, %f2 + ld [%o0 + 0x18], %f2 + ld [%o0 + 0x1c], %f3 +1: + std %f2, [%o1 + 0x28] ! k[10, 11] + fxor %f28, %f0, %f0 + fxor %f30, %f2, %f2 + +10: + sethi %hi(SIGMA), %g3 + or %g3, %lo(SIGMA), %g3 + ldd [%g3 + 0x00], %f16 + ldd [%g3 + 0x08], %f18 + ldd [%g3 + 0x10], %f20 + ldd [%g3 + 0x18], %f22 + ldd [%g3 + 0x20], %f24 + ldd [%g3 + 0x28], %f26 + CAMELLIA_F(16, 2, 0, 2) + CAMELLIA_F(18, 0, 2, 0) + fxor %f28, %f0, %f0 + fxor %f30, %f2, %f2 + CAMELLIA_F(20, 2, 0, 2) + CAMELLIA_F(22, 0, 2, 0) + +#define ROTL128(S01, S23, TMP1, TMP2, N) \ + srlx S01, (64 - N), TMP1; \ + sllx S01, N, S01; \ + srlx S23, (64 - N), TMP2; \ + sllx S23, N, S23; \ + or S01, TMP2, S01; \ + or S23, TMP1, S23 + + cmp %o2, 16 + bne 1f + nop + /* 128-bit key */ + std %f0, [%o1 + 0x10] ! k[ 4, 5] + std %f2, [%o1 + 0x18] ! k[ 6, 7] + MOVDTOX_F0_O4 + MOVDTOX_F2_O5 + ROTL128(%o4, %o5, %g2, %g3, 15) + stx %o4, [%o1 + 0x30] ! k[12, 13] + stx %o5, [%o1 + 0x38] ! k[14, 15] + ROTL128(%o4, %o5, %g2, %g3, 15) + stx %o4, [%o1 + 0x40] ! k[16, 17] + stx %o5, [%o1 + 0x48] ! k[18, 19] + ROTL128(%o4, %o5, %g2, %g3, 15) + stx %o4, [%o1 + 0x60] ! k[24, 25] + ROTL128(%o4, %o5, %g2, %g3, 15) + stx %o4, [%o1 + 0x70] ! k[28, 29] + stx %o5, [%o1 + 0x78] ! k[30, 31] + ROTL128(%o4, %o5, %g2, %g3, 34) + stx %o4, [%o1 + 0xa0] ! k[40, 41] + stx %o5, [%o1 + 0xa8] ! k[42, 43] + ROTL128(%o4, %o5, %g2, %g3, 17) + stx %o4, [%o1 + 0xc0] ! k[48, 49] + stx %o5, [%o1 + 0xc8] ! k[50, 51] + + ldx [%o1 + 0x00], %o4 ! k[ 0, 1] + ldx [%o1 + 0x08], %o5 ! k[ 2, 3] + ROTL128(%o4, %o5, %g2, %g3, 15) + stx %o4, [%o1 + 0x20] ! k[ 8, 9] + stx %o5, [%o1 + 0x28] ! k[10, 11] + ROTL128(%o4, %o5, %g2, %g3, 30) + stx %o4, [%o1 + 0x50] ! k[20, 21] + stx %o5, [%o1 + 0x58] ! k[22, 23] + ROTL128(%o4, %o5, %g2, %g3, 15) + stx %o5, [%o1 + 0x68] ! k[26, 27] + ROTL128(%o4, %o5, %g2, %g3, 17) + stx %o4, [%o1 + 0x80] ! k[32, 33] + stx %o5, [%o1 + 0x88] ! k[34, 35] + ROTL128(%o4, %o5, %g2, %g3, 17) + stx %o4, [%o1 + 0x90] ! k[36, 37] + stx %o5, [%o1 + 0x98] ! k[38, 39] + ROTL128(%o4, %o5, %g2, %g3, 17) + stx %o4, [%o1 + 0xb0] ! k[44, 45] + stx %o5, [%o1 + 0xb8] ! k[46, 47] + + ba,pt %xcc, 2f + mov (3 * 16 * 4), %o0 + +1: + /* 192-bit or 256-bit key */ + std %f0, [%o1 + 0x30] ! k[12, 13] + std %f2, [%o1 + 0x38] ! k[14, 15] + ldd [%o1 + 0x20], %f4 ! k[ 8, 9] + ldd [%o1 + 0x28], %f6 ! k[10, 11] + fxor %f0, %f4, %f0 + fxor %f2, %f6, %f2 + CAMELLIA_F(24, 2, 0, 2) + CAMELLIA_F(26, 0, 2, 0) + std %f0, [%o1 + 0x10] ! k[ 4, 5] + std %f2, [%o1 + 0x18] ! k[ 6, 7] + MOVDTOX_F0_O4 + MOVDTOX_F2_O5 + ROTL128(%o4, %o5, %g2, %g3, 30) + stx %o4, [%o1 + 0x50] ! k[20, 21] + stx %o5, [%o1 + 0x58] ! k[22, 23] + ROTL128(%o4, %o5, %g2, %g3, 30) + stx %o4, [%o1 + 0xa0] ! k[40, 41] + stx %o5, [%o1 + 0xa8] ! k[42, 43] + ROTL128(%o4, %o5, %g2, %g3, 51) + stx %o4, [%o1 + 0x100] ! k[64, 65] + stx %o5, [%o1 + 0x108] ! k[66, 67] + ldx [%o1 + 0x20], %o4 ! k[ 8, 9] + ldx [%o1 + 0x28], %o5 ! k[10, 11] + ROTL128(%o4, %o5, %g2, %g3, 15) + stx %o4, [%o1 + 0x20] ! k[ 8, 9] + stx %o5, [%o1 + 0x28] ! k[10, 11] + ROTL128(%o4, %o5, %g2, %g3, 15) + stx %o4, [%o1 + 0x40] ! k[16, 17] + stx %o5, [%o1 + 0x48] ! k[18, 19] + ROTL128(%o4, %o5, %g2, %g3, 30) + stx %o4, [%o1 + 0x90] ! k[36, 37] + stx %o5, [%o1 + 0x98] ! k[38, 39] + ROTL128(%o4, %o5, %g2, %g3, 34) + stx %o4, [%o1 + 0xd0] ! k[52, 53] + stx %o5, [%o1 + 0xd8] ! k[54, 55] + ldx [%o1 + 0x30], %o4 ! k[12, 13] + ldx [%o1 + 0x38], %o5 ! k[14, 15] + ROTL128(%o4, %o5, %g2, %g3, 15) + stx %o4, [%o1 + 0x30] ! k[12, 13] + stx %o5, [%o1 + 0x38] ! k[14, 15] + ROTL128(%o4, %o5, %g2, %g3, 30) + stx %o4, [%o1 + 0x70] ! k[28, 29] + stx %o5, [%o1 + 0x78] ! k[30, 31] + srlx %o4, 32, %g2 + srlx %o5, 32, %g3 + stw %o4, [%o1 + 0xc0] ! k[48] + stw %g3, [%o1 + 0xc4] ! k[49] + stw %o5, [%o1 + 0xc8] ! k[50] + stw %g2, [%o1 + 0xcc] ! k[51] + ROTL128(%o4, %o5, %g2, %g3, 49) + stx %o4, [%o1 + 0xe0] ! k[56, 57] + stx %o5, [%o1 + 0xe8] ! k[58, 59] + ldx [%o1 + 0x00], %o4 ! k[ 0, 1] + ldx [%o1 + 0x08], %o5 ! k[ 2, 3] + ROTL128(%o4, %o5, %g2, %g3, 45) + stx %o4, [%o1 + 0x60] ! k[24, 25] + stx %o5, [%o1 + 0x68] ! k[26, 27] + ROTL128(%o4, %o5, %g2, %g3, 15) + stx %o4, [%o1 + 0x80] ! k[32, 33] + stx %o5, [%o1 + 0x88] ! k[34, 35] + ROTL128(%o4, %o5, %g2, %g3, 17) + stx %o4, [%o1 + 0xb0] ! k[44, 45] + stx %o5, [%o1 + 0xb8] ! k[46, 47] + ROTL128(%o4, %o5, %g2, %g3, 34) + stx %o4, [%o1 + 0xf0] ! k[60, 61] + stx %o5, [%o1 + 0xf8] ! k[62, 63] + mov (4 * 16 * 4), %o0 +2: + add %o1, %o0, %o1 + ldd [%o1 + 0x00], %f0 + ldd [%o1 + 0x08], %f2 + std %f0, [%o3 + 0x00] + std %f2, [%o3 + 0x08] + add %o3, 0x10, %o3 +1: + sub %o1, (16 * 4), %o1 + ldd [%o1 + 0x38], %f0 + ldd [%o1 + 0x30], %f2 + ldd [%o1 + 0x28], %f4 + ldd [%o1 + 0x20], %f6 + ldd [%o1 + 0x18], %f8 + ldd [%o1 + 0x10], %f10 + std %f0, [%o3 + 0x00] + std %f2, [%o3 + 0x08] + std %f4, [%o3 + 0x10] + std %f6, [%o3 + 0x18] + std %f8, [%o3 + 0x20] + std %f10, [%o3 + 0x28] + + ldd [%o1 + 0x08], %f0 + ldd [%o1 + 0x00], %f2 + std %f0, [%o3 + 0x30] + std %f2, [%o3 + 0x38] + subcc %o0, (16 * 4), %o0 + bne,pt %icc, 1b + add %o3, (16 * 4), %o3 + + std %f2, [%o3 - 0x10] + std %f0, [%o3 - 0x08] + + retl + VISExit +ENDPROC(camellia_sparc64_key_expand) + + .align 32 +ENTRY(camellia_sparc64_crypt) + /* %o0=key, %o1=input, %o2=output, %o3=key_len */ + VISEntry + + ld [%o1 + 0x00], %f0 + ld [%o1 + 0x04], %f1 + ld [%o1 + 0x08], %f2 + ld [%o1 + 0x0c], %f3 + + ldd [%o0 + 0x00], %f4 + ldd [%o0 + 0x08], %f6 + + cmp %o3, 16 + fxor %f4, %f0, %f0 + be 1f + fxor %f6, %f2, %f2 + + ldd [%o0 + 0x10], %f8 + ldd [%o0 + 0x18], %f10 + ldd [%o0 + 0x20], %f12 + ldd [%o0 + 0x28], %f14 + ldd [%o0 + 0x30], %f16 + ldd [%o0 + 0x38], %f18 + ldd [%o0 + 0x40], %f20 + ldd [%o0 + 0x48], %f22 + add %o0, 0x40, %o0 + + CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2) + +1: + ldd [%o0 + 0x10], %f8 + ldd [%o0 + 0x18], %f10 + ldd [%o0 + 0x20], %f12 + ldd [%o0 + 0x28], %f14 + ldd [%o0 + 0x30], %f16 + ldd [%o0 + 0x38], %f18 + ldd [%o0 + 0x40], %f20 + ldd [%o0 + 0x48], %f22 + ldd [%o0 + 0x50], %f24 + ldd [%o0 + 0x58], %f26 + ldd [%o0 + 0x60], %f28 + ldd [%o0 + 0x68], %f30 + ldd [%o0 + 0x70], %f32 + ldd [%o0 + 0x78], %f34 + ldd [%o0 + 0x80], %f36 + ldd [%o0 + 0x88], %f38 + ldd [%o0 + 0x90], %f40 + ldd [%o0 + 0x98], %f42 + ldd [%o0 + 0xa0], %f44 + ldd [%o0 + 0xa8], %f46 + ldd [%o0 + 0xb0], %f48 + ldd [%o0 + 0xb8], %f50 + ldd [%o0 + 0xc0], %f52 + ldd [%o0 + 0xc8], %f54 + + CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2) + CAMELLIA_6ROUNDS_FL_FLI(24, 0, 2) + CAMELLIA_6ROUNDS(40, 0, 2) + fxor %f52, %f2, %f2 + fxor %f54, %f0, %f0 + + st %f2, [%o2 + 0x00] + st %f3, [%o2 + 0x04] + st %f0, [%o2 + 0x08] + st %f1, [%o2 + 0x0c] + + retl + VISExit +ENDPROC(camellia_sparc64_crypt) + + .align 32 +ENTRY(camellia_sparc64_load_keys) + /* %o0=key, %o1=key_len */ + VISEntry + ldd [%o0 + 0x00], %f4 + ldd [%o0 + 0x08], %f6 + ldd [%o0 + 0x10], %f8 + ldd [%o0 + 0x18], %f10 + ldd [%o0 + 0x20], %f12 + ldd [%o0 + 0x28], %f14 + ldd [%o0 + 0x30], %f16 + ldd [%o0 + 0x38], %f18 + ldd [%o0 + 0x40], %f20 + ldd [%o0 + 0x48], %f22 + ldd [%o0 + 0x50], %f24 + ldd [%o0 + 0x58], %f26 + ldd [%o0 + 0x60], %f28 + ldd [%o0 + 0x68], %f30 + ldd [%o0 + 0x70], %f32 + ldd [%o0 + 0x78], %f34 + ldd [%o0 + 0x80], %f36 + ldd [%o0 + 0x88], %f38 + ldd [%o0 + 0x90], %f40 + ldd [%o0 + 0x98], %f42 + ldd [%o0 + 0xa0], %f44 + ldd [%o0 + 0xa8], %f46 + ldd [%o0 + 0xb0], %f48 + ldd [%o0 + 0xb8], %f50 + ldd [%o0 + 0xc0], %f52 + retl + ldd [%o0 + 0xc8], %f54 +ENDPROC(camellia_sparc64_load_keys) + + .align 32 +ENTRY(camellia_sparc64_ecb_crypt_3_grand_rounds) + /* %o0=input, %o1=output, %o2=len, %o3=key */ +1: ldd [%o0 + 0x00], %f0 + ldd [%o0 + 0x08], %f2 + add %o0, 0x10, %o0 + fxor %f4, %f0, %f0 + fxor %f6, %f2, %f2 + CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2) + CAMELLIA_6ROUNDS_FL_FLI(24, 0, 2) + CAMELLIA_6ROUNDS(40, 0, 2) + fxor %f52, %f2, %f2 + fxor %f54, %f0, %f0 + std %f2, [%o1 + 0x00] + std %f0, [%o1 + 0x08] + subcc %o2, 0x10, %o2 + bne,pt %icc, 1b + add %o1, 0x10, %o1 + retl + nop +ENDPROC(camellia_sparc64_ecb_crypt_3_grand_rounds) + + .align 32 +ENTRY(camellia_sparc64_ecb_crypt_4_grand_rounds) + /* %o0=input, %o1=output, %o2=len, %o3=key */ +1: ldd [%o0 + 0x00], %f0 + ldd [%o0 + 0x08], %f2 + add %o0, 0x10, %o0 + fxor %f4, %f0, %f0 + fxor %f6, %f2, %f2 + CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2) + ldd [%o3 + 0xd0], %f8 + ldd [%o3 + 0xd8], %f10 + ldd [%o3 + 0xe0], %f12 + ldd [%o3 + 0xe8], %f14 + ldd [%o3 + 0xf0], %f16 + ldd [%o3 + 0xf8], %f18 + ldd [%o3 + 0x100], %f20 + ldd [%o3 + 0x108], %f22 + CAMELLIA_6ROUNDS_FL_FLI(24, 0, 2) + CAMELLIA_6ROUNDS_FL_FLI(40, 0, 2) + CAMELLIA_F(8, 2, 0, 2) + CAMELLIA_F(10, 0, 2, 0) + ldd [%o3 + 0x10], %f8 + ldd [%o3 + 0x18], %f10 + CAMELLIA_F(12, 2, 0, 2) + CAMELLIA_F(14, 0, 2, 0) + ldd [%o3 + 0x20], %f12 + ldd [%o3 + 0x28], %f14 + CAMELLIA_F(16, 2, 0, 2) + CAMELLIA_F(18, 0, 2, 0) + ldd [%o3 + 0x30], %f16 + ldd [%o3 + 0x38], %f18 + fxor %f20, %f2, %f2 + fxor %f22, %f0, %f0 + ldd [%o3 + 0x40], %f20 + ldd [%o3 + 0x48], %f22 + std %f2, [%o1 + 0x00] + std %f0, [%o1 + 0x08] + subcc %o2, 0x10, %o2 + bne,pt %icc, 1b + add %o1, 0x10, %o1 + retl + nop +ENDPROC(camellia_sparc64_ecb_crypt_4_grand_rounds) + + .align 32 +ENTRY(camellia_sparc64_cbc_encrypt_3_grand_rounds) + /* %o0=input, %o1=output, %o2=len, %o3=key, %o4=IV */ + ldd [%o4 + 0x00], %f60 + ldd [%o4 + 0x08], %f62 +1: ldd [%o0 + 0x00], %f0 + ldd [%o0 + 0x08], %f2 + add %o0, 0x10, %o0 + fxor %f60, %f0, %f0 + fxor %f62, %f2, %f2 + fxor %f4, %f0, %f0 + fxor %f6, %f2, %f2 + CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2) + CAMELLIA_6ROUNDS_FL_FLI(24, 0, 2) + CAMELLIA_6ROUNDS(40, 0, 2) + fxor %f52, %f2, %f60 + fxor %f54, %f0, %f62 + std %f60, [%o1 + 0x00] + std %f62, [%o1 + 0x08] + subcc %o2, 0x10, %o2 + bne,pt %icc, 1b + add %o1, 0x10, %o1 + std %f60, [%o4 + 0x00] + retl + std %f62, [%o4 + 0x08] +ENDPROC(camellia_sparc64_cbc_encrypt_3_grand_rounds) + + .align 32 +ENTRY(camellia_sparc64_cbc_encrypt_4_grand_rounds) + /* %o0=input, %o1=output, %o2=len, %o3=key, %o4=IV */ + ldd [%o4 + 0x00], %f60 + ldd [%o4 + 0x08], %f62 +1: ldd [%o0 + 0x00], %f0 + ldd [%o0 + 0x08], %f2 + add %o0, 0x10, %o0 + fxor %f60, %f0, %f0 + fxor %f62, %f2, %f2 + fxor %f4, %f0, %f0 + fxor %f6, %f2, %f2 + CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2) + ldd [%o3 + 0xd0], %f8 + ldd [%o3 + 0xd8], %f10 + ldd [%o3 + 0xe0], %f12 + ldd [%o3 + 0xe8], %f14 + ldd [%o3 + 0xf0], %f16 + ldd [%o3 + 0xf8], %f18 + ldd [%o3 + 0x100], %f20 + ldd [%o3 + 0x108], %f22 + CAMELLIA_6ROUNDS_FL_FLI(24, 0, 2) + CAMELLIA_6ROUNDS_FL_FLI(40, 0, 2) + CAMELLIA_F(8, 2, 0, 2) + CAMELLIA_F(10, 0, 2, 0) + ldd [%o3 + 0x10], %f8 + ldd [%o3 + 0x18], %f10 + CAMELLIA_F(12, 2, 0, 2) + CAMELLIA_F(14, 0, 2, 0) + ldd [%o3 + 0x20], %f12 + ldd [%o3 + 0x28], %f14 + CAMELLIA_F(16, 2, 0, 2) + CAMELLIA_F(18, 0, 2, 0) + ldd [%o3 + 0x30], %f16 + ldd [%o3 + 0x38], %f18 + fxor %f20, %f2, %f60 + fxor %f22, %f0, %f62 + ldd [%o3 + 0x40], %f20 + ldd [%o3 + 0x48], %f22 + std %f60, [%o1 + 0x00] + std %f62, [%o1 + 0x08] + subcc %o2, 0x10, %o2 + bne,pt %icc, 1b + add %o1, 0x10, %o1 + std %f60, [%o4 + 0x00] + retl + std %f62, [%o4 + 0x08] +ENDPROC(camellia_sparc64_cbc_encrypt_4_grand_rounds) + + .align 32 +ENTRY(camellia_sparc64_cbc_decrypt_3_grand_rounds) + /* %o0=input, %o1=output, %o2=len, %o3=key, %o4=IV */ + ldd [%o4 + 0x00], %f60 + ldd [%o4 + 0x08], %f62 +1: ldd [%o0 + 0x00], %f56 + ldd [%o0 + 0x08], %f58 + add %o0, 0x10, %o0 + fxor %f4, %f56, %f0 + fxor %f6, %f58, %f2 + CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2) + CAMELLIA_6ROUNDS_FL_FLI(24, 0, 2) + CAMELLIA_6ROUNDS(40, 0, 2) + fxor %f52, %f2, %f2 + fxor %f54, %f0, %f0 + fxor %f60, %f2, %f2 + fxor %f62, %f0, %f0 + fsrc2 %f56, %f60 + fsrc2 %f58, %f62 + std %f2, [%o1 + 0x00] + std %f0, [%o1 + 0x08] + subcc %o2, 0x10, %o2 + bne,pt %icc, 1b + add %o1, 0x10, %o1 + std %f60, [%o4 + 0x00] + retl + std %f62, [%o4 + 0x08] +ENDPROC(camellia_sparc64_cbc_decrypt_3_grand_rounds) + + .align 32 +ENTRY(camellia_sparc64_cbc_decrypt_4_grand_rounds) + /* %o0=input, %o1=output, %o2=len, %o3=key, %o4=IV */ + ldd [%o4 + 0x00], %f60 + ldd [%o4 + 0x08], %f62 +1: ldd [%o0 + 0x00], %f56 + ldd [%o0 + 0x08], %f58 + add %o0, 0x10, %o0 + fxor %f4, %f56, %f0 + fxor %f6, %f58, %f2 + CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2) + ldd [%o3 + 0xd0], %f8 + ldd [%o3 + 0xd8], %f10 + ldd [%o3 + 0xe0], %f12 + ldd [%o3 + 0xe8], %f14 + ldd [%o3 + 0xf0], %f16 + ldd [%o3 + 0xf8], %f18 + ldd [%o3 + 0x100], %f20 + ldd [%o3 + 0x108], %f22 + CAMELLIA_6ROUNDS_FL_FLI(24, 0, 2) + CAMELLIA_6ROUNDS_FL_FLI(40, 0, 2) + CAMELLIA_F(8, 2, 0, 2) + CAMELLIA_F(10, 0, 2, 0) + ldd [%o3 + 0x10], %f8 + ldd [%o3 + 0x18], %f10 + CAMELLIA_F(12, 2, 0, 2) + CAMELLIA_F(14, 0, 2, 0) + ldd [%o3 + 0x20], %f12 + ldd [%o3 + 0x28], %f14 + CAMELLIA_F(16, 2, 0, 2) + CAMELLIA_F(18, 0, 2, 0) + ldd [%o3 + 0x30], %f16 + ldd [%o3 + 0x38], %f18 + fxor %f20, %f2, %f2 + fxor %f22, %f0, %f0 + ldd [%o3 + 0x40], %f20 + ldd [%o3 + 0x48], %f22 + fxor %f60, %f2, %f2 + fxor %f62, %f0, %f0 + fsrc2 %f56, %f60 + fsrc2 %f58, %f62 + std %f2, [%o1 + 0x00] + std %f0, [%o1 + 0x08] + subcc %o2, 0x10, %o2 + bne,pt %icc, 1b + add %o1, 0x10, %o1 + std %f60, [%o4 + 0x00] + retl + std %f62, [%o4 + 0x08] +ENDPROC(camellia_sparc64_cbc_decrypt_4_grand_rounds) diff --git a/arch/sparc/crypto/camellia_glue.c b/arch/sparc/crypto/camellia_glue.c new file mode 100644 index 00000000000..42905c08429 --- /dev/null +++ b/arch/sparc/crypto/camellia_glue.c @@ -0,0 +1,322 @@ +/* Glue code for CAMELLIA encryption optimized for sparc64 crypto opcodes. + * + * Copyright (C) 2012 David S. Miller <davem@davemloft.net> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/crypto.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/types.h> +#include <crypto/algapi.h> + +#include <asm/fpumacro.h> +#include <asm/pstate.h> +#include <asm/elf.h> + +#include "opcodes.h" + +#define CAMELLIA_MIN_KEY_SIZE 16 +#define CAMELLIA_MAX_KEY_SIZE 32 +#define CAMELLIA_BLOCK_SIZE 16 +#define CAMELLIA_TABLE_BYTE_LEN 272 + +struct camellia_sparc64_ctx { + u64 encrypt_key[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)]; + u64 decrypt_key[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)]; + int key_len; +}; + +extern void camellia_sparc64_key_expand(const u32 *in_key, u64 *encrypt_key, + unsigned int key_len, u64 *decrypt_key); + +static int camellia_set_key(struct crypto_tfm *tfm, const u8 *_in_key, + unsigned int key_len) +{ + struct camellia_sparc64_ctx *ctx = crypto_tfm_ctx(tfm); + const u32 *in_key = (const u32 *) _in_key; + u32 *flags = &tfm->crt_flags; + + if (key_len != 16 && key_len != 24 && key_len != 32) { + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + ctx->key_len = key_len; + + camellia_sparc64_key_expand(in_key, &ctx->encrypt_key[0], + key_len, &ctx->decrypt_key[0]); + return 0; +} + +extern void camellia_sparc64_crypt(const u64 *key, const u32 *input, + u32 *output, unsigned int key_len); + +static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + struct camellia_sparc64_ctx *ctx = crypto_tfm_ctx(tfm); + + camellia_sparc64_crypt(&ctx->encrypt_key[0], + (const u32 *) src, + (u32 *) dst, ctx->key_len); +} + +static void camellia_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + struct camellia_sparc64_ctx *ctx = crypto_tfm_ctx(tfm); + + camellia_sparc64_crypt(&ctx->decrypt_key[0], + (const u32 *) src, + (u32 *) dst, ctx->key_len); +} + +extern void camellia_sparc64_load_keys(const u64 *key, unsigned int key_len); + +typedef void ecb_crypt_op(const u64 *input, u64 *output, unsigned int len, + const u64 *key); + +extern ecb_crypt_op camellia_sparc64_ecb_crypt_3_grand_rounds; +extern ecb_crypt_op camellia_sparc64_ecb_crypt_4_grand_rounds; + +#define CAMELLIA_BLOCK_MASK (~(CAMELLIA_BLOCK_SIZE - 1)) + +static int __ecb_crypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes, bool encrypt) +{ + struct camellia_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + ecb_crypt_op *op; + const u64 *key; + int err; + + op = camellia_sparc64_ecb_crypt_3_grand_rounds; + if (ctx->key_len != 16) + op = camellia_sparc64_ecb_crypt_4_grand_rounds; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + if (encrypt) + key = &ctx->encrypt_key[0]; + else + key = &ctx->decrypt_key[0]; + camellia_sparc64_load_keys(key, ctx->key_len); + while ((nbytes = walk.nbytes)) { + unsigned int block_len = nbytes & CAMELLIA_BLOCK_MASK; + + if (likely(block_len)) { + const u64 *src64; + u64 *dst64; + + src64 = (const u64 *)walk.src.virt.addr; + dst64 = (u64 *) walk.dst.virt.addr; + op(src64, dst64, block_len, key); + } + nbytes &= CAMELLIA_BLOCK_SIZE - 1; + err = blkcipher_walk_done(desc, &walk, nbytes); + } + fprs_write(0); + return err; +} + +static int ecb_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + return __ecb_crypt(desc, dst, src, nbytes, true); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + return __ecb_crypt(desc, dst, src, nbytes, false); +} + +typedef void cbc_crypt_op(const u64 *input, u64 *output, unsigned int len, + const u64 *key, u64 *iv); + +extern cbc_crypt_op camellia_sparc64_cbc_encrypt_3_grand_rounds; +extern cbc_crypt_op camellia_sparc64_cbc_encrypt_4_grand_rounds; +extern cbc_crypt_op camellia_sparc64_cbc_decrypt_3_grand_rounds; +extern cbc_crypt_op camellia_sparc64_cbc_decrypt_4_grand_rounds; + +static int cbc_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct camellia_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + cbc_crypt_op *op; + const u64 *key; + int err; + + op = camellia_sparc64_cbc_encrypt_3_grand_rounds; + if (ctx->key_len != 16) + op = camellia_sparc64_cbc_encrypt_4_grand_rounds; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + key = &ctx->encrypt_key[0]; + camellia_sparc64_load_keys(key, ctx->key_len); + while ((nbytes = walk.nbytes)) { + unsigned int block_len = nbytes & CAMELLIA_BLOCK_MASK; + + if (likely(block_len)) { + const u64 *src64; + u64 *dst64; + + src64 = (const u64 *)walk.src.virt.addr; + dst64 = (u64 *) walk.dst.virt.addr; + op(src64, dst64, block_len, key, + (u64 *) walk.iv); + } + nbytes &= CAMELLIA_BLOCK_SIZE - 1; + err = blkcipher_walk_done(desc, &walk, nbytes); + } + fprs_write(0); + return err; +} + +static int cbc_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct camellia_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + cbc_crypt_op *op; + const u64 *key; + int err; + + op = camellia_sparc64_cbc_decrypt_3_grand_rounds; + if (ctx->key_len != 16) + op = camellia_sparc64_cbc_decrypt_4_grand_rounds; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + key = &ctx->decrypt_key[0]; + camellia_sparc64_load_keys(key, ctx->key_len); + while ((nbytes = walk.nbytes)) { + unsigned int block_len = nbytes & CAMELLIA_BLOCK_MASK; + + if (likely(block_len)) { + const u64 *src64; + u64 *dst64; + + src64 = (const u64 *)walk.src.virt.addr; + dst64 = (u64 *) walk.dst.virt.addr; + op(src64, dst64, block_len, key, + (u64 *) walk.iv); + } + nbytes &= CAMELLIA_BLOCK_SIZE - 1; + err = blkcipher_walk_done(desc, &walk, nbytes); + } + fprs_write(0); + return err; +} + +static struct crypto_alg algs[] = { { + .cra_name = "camellia", + .cra_driver_name = "camellia-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct camellia_sparc64_ctx), + .cra_alignmask = 3, + .cra_module = THIS_MODULE, + .cra_u = { + .cipher = { + .cia_min_keysize = CAMELLIA_MIN_KEY_SIZE, + .cia_max_keysize = CAMELLIA_MAX_KEY_SIZE, + .cia_setkey = camellia_set_key, + .cia_encrypt = camellia_encrypt, + .cia_decrypt = camellia_decrypt + } + } +}, { + .cra_name = "ecb(camellia)", + .cra_driver_name = "ecb-camellia-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct camellia_sparc64_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .setkey = camellia_set_key, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}, { + .cra_name = "cbc(camellia)", + .cra_driver_name = "cbc-camellia-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct camellia_sparc64_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .setkey = camellia_set_key, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +} +}; + +static bool __init sparc64_has_camellia_opcode(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) + return false; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_CAMELLIA)) + return false; + + return true; +} + +static int __init camellia_sparc64_mod_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(algs); i++) + INIT_LIST_HEAD(&algs[i].cra_list); + + if (sparc64_has_camellia_opcode()) { + pr_info("Using sparc64 camellia opcodes optimized CAMELLIA implementation\n"); + return crypto_register_algs(algs, ARRAY_SIZE(algs)); + } + pr_info("sparc64 camellia opcodes not available.\n"); + return -ENODEV; +} + +static void __exit camellia_sparc64_mod_fini(void) +{ + crypto_unregister_algs(algs, ARRAY_SIZE(algs)); +} + +module_init(camellia_sparc64_mod_init); +module_exit(camellia_sparc64_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Camellia Cipher Algorithm, sparc64 camellia opcode accelerated"); + +MODULE_ALIAS("aes"); diff --git a/arch/sparc/crypto/crc32c_asm.S b/arch/sparc/crypto/crc32c_asm.S new file mode 100644 index 00000000000..2b1976e765b --- /dev/null +++ b/arch/sparc/crypto/crc32c_asm.S @@ -0,0 +1,20 @@ +#include <linux/linkage.h> +#include <asm/visasm.h> +#include <asm/asi.h> + +#include "opcodes.h" + +ENTRY(crc32c_sparc64) + /* %o0=crc32p, %o1=data_ptr, %o2=len */ + VISEntryHalf + lda [%o0] ASI_PL, %f1 +1: ldd [%o1], %f2 + CRC32C(0,2,0) + subcc %o2, 8, %o2 + bne,pt %icc, 1b + add %o1, 0x8, %o1 + sta %f1, [%o0] ASI_PL + VISExitHalf +2: retl + nop +ENDPROC(crc32c_sparc64) diff --git a/arch/sparc/crypto/crc32c_glue.c b/arch/sparc/crypto/crc32c_glue.c new file mode 100644 index 00000000000..0bd89cea8d8 --- /dev/null +++ b/arch/sparc/crypto/crc32c_glue.c @@ -0,0 +1,179 @@ +/* Glue code for CRC32C optimized for sparc64 crypto opcodes. + * + * This is based largely upon arch/x86/crypto/crc32c-intel.c + * + * Copyright (C) 2008 Intel Corporation + * Authors: Austin Zhang <austin_zhang@linux.intel.com> + * Kent Liu <kent.liu@intel.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/crc32.h> + +#include <crypto/internal/hash.h> + +#include <asm/pstate.h> +#include <asm/elf.h> + +#include "opcodes.h" + +/* + * Setting the seed allows arbitrary accumulators and flexible XOR policy + * If your algorithm starts with ~0, then XOR with ~0 before you set + * the seed. + */ +static int crc32c_sparc64_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) { + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + *(__le32 *)mctx = le32_to_cpup((__le32 *)key); + return 0; +} + +static int crc32c_sparc64_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *crcp = shash_desc_ctx(desc); + + *crcp = *mctx; + + return 0; +} + +extern void crc32c_sparc64(u32 *crcp, const u64 *data, unsigned int len); + +static void crc32c_compute(u32 *crcp, const u64 *data, unsigned int len) +{ + unsigned int asm_len; + + asm_len = len & ~7U; + if (asm_len) { + crc32c_sparc64(crcp, data, asm_len); + data += asm_len / 8; + len -= asm_len; + } + if (len) + *crcp = __crc32c_le(*crcp, (const unsigned char *) data, len); +} + +static int crc32c_sparc64_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *crcp = shash_desc_ctx(desc); + + crc32c_compute(crcp, (const u64 *) data, len); + + return 0; +} + +static int __crc32c_sparc64_finup(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + u32 tmp = *crcp; + + crc32c_compute(&tmp, (const u64 *) data, len); + + *(__le32 *) out = ~cpu_to_le32(tmp); + return 0; +} + +static int crc32c_sparc64_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32c_sparc64_finup(shash_desc_ctx(desc), data, len, out); +} + +static int crc32c_sparc64_final(struct shash_desc *desc, u8 *out) +{ + u32 *crcp = shash_desc_ctx(desc); + + *(__le32 *) out = ~cpu_to_le32p(crcp); + return 0; +} + +static int crc32c_sparc64_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32c_sparc64_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} + +static int crc32c_sparc64_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = ~0; + + return 0; +} + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +static struct shash_alg alg = { + .setkey = crc32c_sparc64_setkey, + .init = crc32c_sparc64_init, + .update = crc32c_sparc64_update, + .final = crc32c_sparc64_final, + .finup = crc32c_sparc64_finup, + .digest = crc32c_sparc64_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "crc32c", + .cra_driver_name = "crc32c-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_alignmask = 7, + .cra_module = THIS_MODULE, + .cra_init = crc32c_sparc64_cra_init, + } +}; + +static bool __init sparc64_has_crc32c_opcode(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) + return false; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_CRC32C)) + return false; + + return true; +} + +static int __init crc32c_sparc64_mod_init(void) +{ + if (sparc64_has_crc32c_opcode()) { + pr_info("Using sparc64 crc32c opcode optimized CRC32C implementation\n"); + return crypto_register_shash(&alg); + } + pr_info("sparc64 crc32c opcode not available.\n"); + return -ENODEV; +} + +static void __exit crc32c_sparc64_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(crc32c_sparc64_mod_init); +module_exit(crc32c_sparc64_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("CRC32c (Castagnoli), sparc64 crc32c opcode accelerated"); + +MODULE_ALIAS("crc32c"); diff --git a/arch/sparc/crypto/crop_devid.c b/arch/sparc/crypto/crop_devid.c new file mode 100644 index 00000000000..5f5724a0ae2 --- /dev/null +++ b/arch/sparc/crypto/crop_devid.c @@ -0,0 +1,14 @@ +#include <linux/module.h> +#include <linux/of_device.h> + +/* This is a dummy device table linked into all of the crypto + * opcode drivers. It serves to trigger the module autoloading + * mechanisms in userspace which scan the OF device tree and + * load any modules which have device table entries that + * match OF device nodes. + */ +static const struct of_device_id crypto_opcode_match[] = { + { .name = "cpu", .compatible = "sun4v", }, + {}, +}; +MODULE_DEVICE_TABLE(of, crypto_opcode_match); diff --git a/arch/sparc/crypto/des_asm.S b/arch/sparc/crypto/des_asm.S new file mode 100644 index 00000000000..30b6e90b28b --- /dev/null +++ b/arch/sparc/crypto/des_asm.S @@ -0,0 +1,418 @@ +#include <linux/linkage.h> +#include <asm/visasm.h> + +#include "opcodes.h" + + .align 32 +ENTRY(des_sparc64_key_expand) + /* %o0=input_key, %o1=output_key */ + VISEntryHalf + ld [%o0 + 0x00], %f0 + ld [%o0 + 0x04], %f1 + DES_KEXPAND(0, 0, 0) + DES_KEXPAND(0, 1, 2) + DES_KEXPAND(2, 3, 6) + DES_KEXPAND(2, 2, 4) + DES_KEXPAND(6, 3, 10) + DES_KEXPAND(6, 2, 8) + DES_KEXPAND(10, 3, 14) + DES_KEXPAND(10, 2, 12) + DES_KEXPAND(14, 1, 16) + DES_KEXPAND(16, 3, 20) + DES_KEXPAND(16, 2, 18) + DES_KEXPAND(20, 3, 24) + DES_KEXPAND(20, 2, 22) + DES_KEXPAND(24, 3, 28) + DES_KEXPAND(24, 2, 26) + DES_KEXPAND(28, 1, 30) + std %f0, [%o1 + 0x00] + std %f2, [%o1 + 0x08] + std %f4, [%o1 + 0x10] + std %f6, [%o1 + 0x18] + std %f8, [%o1 + 0x20] + std %f10, [%o1 + 0x28] + std %f12, [%o1 + 0x30] + std %f14, [%o1 + 0x38] + std %f16, [%o1 + 0x40] + std %f18, [%o1 + 0x48] + std %f20, [%o1 + 0x50] + std %f22, [%o1 + 0x58] + std %f24, [%o1 + 0x60] + std %f26, [%o1 + 0x68] + std %f28, [%o1 + 0x70] + std %f30, [%o1 + 0x78] + retl + VISExitHalf +ENDPROC(des_sparc64_key_expand) + + .align 32 +ENTRY(des_sparc64_crypt) + /* %o0=key, %o1=input, %o2=output */ + VISEntry + ldd [%o1 + 0x00], %f32 + ldd [%o0 + 0x00], %f0 + ldd [%o0 + 0x08], %f2 + ldd [%o0 + 0x10], %f4 + ldd [%o0 + 0x18], %f6 + ldd [%o0 + 0x20], %f8 + ldd [%o0 + 0x28], %f10 + ldd [%o0 + 0x30], %f12 + ldd [%o0 + 0x38], %f14 + ldd [%o0 + 0x40], %f16 + ldd [%o0 + 0x48], %f18 + ldd [%o0 + 0x50], %f20 + ldd [%o0 + 0x58], %f22 + ldd [%o0 + 0x60], %f24 + ldd [%o0 + 0x68], %f26 + ldd [%o0 + 0x70], %f28 + ldd [%o0 + 0x78], %f30 + DES_IP(32, 32) + DES_ROUND(0, 2, 32, 32) + DES_ROUND(4, 6, 32, 32) + DES_ROUND(8, 10, 32, 32) + DES_ROUND(12, 14, 32, 32) + DES_ROUND(16, 18, 32, 32) + DES_ROUND(20, 22, 32, 32) + DES_ROUND(24, 26, 32, 32) + DES_ROUND(28, 30, 32, 32) + DES_IIP(32, 32) + std %f32, [%o2 + 0x00] + retl + VISExit +ENDPROC(des_sparc64_crypt) + + .align 32 +ENTRY(des_sparc64_load_keys) + /* %o0=key */ + VISEntry + ldd [%o0 + 0x00], %f0 + ldd [%o0 + 0x08], %f2 + ldd [%o0 + 0x10], %f4 + ldd [%o0 + 0x18], %f6 + ldd [%o0 + 0x20], %f8 + ldd [%o0 + 0x28], %f10 + ldd [%o0 + 0x30], %f12 + ldd [%o0 + 0x38], %f14 + ldd [%o0 + 0x40], %f16 + ldd [%o0 + 0x48], %f18 + ldd [%o0 + 0x50], %f20 + ldd [%o0 + 0x58], %f22 + ldd [%o0 + 0x60], %f24 + ldd [%o0 + 0x68], %f26 + ldd [%o0 + 0x70], %f28 + retl + ldd [%o0 + 0x78], %f30 +ENDPROC(des_sparc64_load_keys) + + .align 32 +ENTRY(des_sparc64_ecb_crypt) + /* %o0=input, %o1=output, %o2=len */ +1: ldd [%o0 + 0x00], %f32 + add %o0, 0x08, %o0 + DES_IP(32, 32) + DES_ROUND(0, 2, 32, 32) + DES_ROUND(4, 6, 32, 32) + DES_ROUND(8, 10, 32, 32) + DES_ROUND(12, 14, 32, 32) + DES_ROUND(16, 18, 32, 32) + DES_ROUND(20, 22, 32, 32) + DES_ROUND(24, 26, 32, 32) + DES_ROUND(28, 30, 32, 32) + DES_IIP(32, 32) + std %f32, [%o1 + 0x00] + subcc %o2, 0x08, %o2 + bne,pt %icc, 1b + add %o1, 0x08, %o1 + retl + nop +ENDPROC(des_sparc64_ecb_crypt) + + .align 32 +ENTRY(des_sparc64_cbc_encrypt) + /* %o0=input, %o1=output, %o2=len, %o3=IV */ + ldd [%o3 + 0x00], %f32 +1: ldd [%o0 + 0x00], %f34 + fxor %f32, %f34, %f32 + DES_IP(32, 32) + DES_ROUND(0, 2, 32, 32) + DES_ROUND(4, 6, 32, 32) + DES_ROUND(8, 10, 32, 32) + DES_ROUND(12, 14, 32, 32) + DES_ROUND(16, 18, 32, 32) + DES_ROUND(20, 22, 32, 32) + DES_ROUND(24, 26, 32, 32) + DES_ROUND(28, 30, 32, 32) + DES_IIP(32, 32) + std %f32, [%o1 + 0x00] + add %o0, 0x08, %o0 + subcc %o2, 0x08, %o2 + bne,pt %icc, 1b + add %o1, 0x08, %o1 + retl + std %f32, [%o3 + 0x00] +ENDPROC(des_sparc64_cbc_encrypt) + + .align 32 +ENTRY(des_sparc64_cbc_decrypt) + /* %o0=input, %o1=output, %o2=len, %o3=IV */ + ldd [%o3 + 0x00], %f34 +1: ldd [%o0 + 0x00], %f36 + DES_IP(36, 32) + DES_ROUND(0, 2, 32, 32) + DES_ROUND(4, 6, 32, 32) + DES_ROUND(8, 10, 32, 32) + DES_ROUND(12, 14, 32, 32) + DES_ROUND(16, 18, 32, 32) + DES_ROUND(20, 22, 32, 32) + DES_ROUND(24, 26, 32, 32) + DES_ROUND(28, 30, 32, 32) + DES_IIP(32, 32) + fxor %f32, %f34, %f32 + fsrc2 %f36, %f34 + std %f32, [%o1 + 0x00] + add %o0, 0x08, %o0 + subcc %o2, 0x08, %o2 + bne,pt %icc, 1b + add %o1, 0x08, %o1 + retl + std %f36, [%o3 + 0x00] +ENDPROC(des_sparc64_cbc_decrypt) + + .align 32 +ENTRY(des3_ede_sparc64_crypt) + /* %o0=key, %o1=input, %o2=output */ + VISEntry + ldd [%o1 + 0x00], %f32 + ldd [%o0 + 0x00], %f0 + ldd [%o0 + 0x08], %f2 + ldd [%o0 + 0x10], %f4 + ldd [%o0 + 0x18], %f6 + ldd [%o0 + 0x20], %f8 + ldd [%o0 + 0x28], %f10 + ldd [%o0 + 0x30], %f12 + ldd [%o0 + 0x38], %f14 + ldd [%o0 + 0x40], %f16 + ldd [%o0 + 0x48], %f18 + ldd [%o0 + 0x50], %f20 + ldd [%o0 + 0x58], %f22 + ldd [%o0 + 0x60], %f24 + ldd [%o0 + 0x68], %f26 + ldd [%o0 + 0x70], %f28 + ldd [%o0 + 0x78], %f30 + DES_IP(32, 32) + DES_ROUND(0, 2, 32, 32) + ldd [%o0 + 0x80], %f0 + ldd [%o0 + 0x88], %f2 + DES_ROUND(4, 6, 32, 32) + ldd [%o0 + 0x90], %f4 + ldd [%o0 + 0x98], %f6 + DES_ROUND(8, 10, 32, 32) + ldd [%o0 + 0xa0], %f8 + ldd [%o0 + 0xa8], %f10 + DES_ROUND(12, 14, 32, 32) + ldd [%o0 + 0xb0], %f12 + ldd [%o0 + 0xb8], %f14 + DES_ROUND(16, 18, 32, 32) + ldd [%o0 + 0xc0], %f16 + ldd [%o0 + 0xc8], %f18 + DES_ROUND(20, 22, 32, 32) + ldd [%o0 + 0xd0], %f20 + ldd [%o0 + 0xd8], %f22 + DES_ROUND(24, 26, 32, 32) + ldd [%o0 + 0xe0], %f24 + ldd [%o0 + 0xe8], %f26 + DES_ROUND(28, 30, 32, 32) + ldd [%o0 + 0xf0], %f28 + ldd [%o0 + 0xf8], %f30 + DES_IIP(32, 32) + DES_IP(32, 32) + DES_ROUND(0, 2, 32, 32) + ldd [%o0 + 0x100], %f0 + ldd [%o0 + 0x108], %f2 + DES_ROUND(4, 6, 32, 32) + ldd [%o0 + 0x110], %f4 + ldd [%o0 + 0x118], %f6 + DES_ROUND(8, 10, 32, 32) + ldd [%o0 + 0x120], %f8 + ldd [%o0 + 0x128], %f10 + DES_ROUND(12, 14, 32, 32) + ldd [%o0 + 0x130], %f12 + ldd [%o0 + 0x138], %f14 + DES_ROUND(16, 18, 32, 32) + ldd [%o0 + 0x140], %f16 + ldd [%o0 + 0x148], %f18 + DES_ROUND(20, 22, 32, 32) + ldd [%o0 + 0x150], %f20 + ldd [%o0 + 0x158], %f22 + DES_ROUND(24, 26, 32, 32) + ldd [%o0 + 0x160], %f24 + ldd [%o0 + 0x168], %f26 + DES_ROUND(28, 30, 32, 32) + ldd [%o0 + 0x170], %f28 + ldd [%o0 + 0x178], %f30 + DES_IIP(32, 32) + DES_IP(32, 32) + DES_ROUND(0, 2, 32, 32) + DES_ROUND(4, 6, 32, 32) + DES_ROUND(8, 10, 32, 32) + DES_ROUND(12, 14, 32, 32) + DES_ROUND(16, 18, 32, 32) + DES_ROUND(20, 22, 32, 32) + DES_ROUND(24, 26, 32, 32) + DES_ROUND(28, 30, 32, 32) + DES_IIP(32, 32) + + std %f32, [%o2 + 0x00] + retl + VISExit +ENDPROC(des3_ede_sparc64_crypt) + + .align 32 +ENTRY(des3_ede_sparc64_load_keys) + /* %o0=key */ + VISEntry + ldd [%o0 + 0x00], %f0 + ldd [%o0 + 0x08], %f2 + ldd [%o0 + 0x10], %f4 + ldd [%o0 + 0x18], %f6 + ldd [%o0 + 0x20], %f8 + ldd [%o0 + 0x28], %f10 + ldd [%o0 + 0x30], %f12 + ldd [%o0 + 0x38], %f14 + ldd [%o0 + 0x40], %f16 + ldd [%o0 + 0x48], %f18 + ldd [%o0 + 0x50], %f20 + ldd [%o0 + 0x58], %f22 + ldd [%o0 + 0x60], %f24 + ldd [%o0 + 0x68], %f26 + ldd [%o0 + 0x70], %f28 + ldd [%o0 + 0x78], %f30 + ldd [%o0 + 0x80], %f32 + ldd [%o0 + 0x88], %f34 + ldd [%o0 + 0x90], %f36 + ldd [%o0 + 0x98], %f38 + ldd [%o0 + 0xa0], %f40 + ldd [%o0 + 0xa8], %f42 + ldd [%o0 + 0xb0], %f44 + ldd [%o0 + 0xb8], %f46 + ldd [%o0 + 0xc0], %f48 + ldd [%o0 + 0xc8], %f50 + ldd [%o0 + 0xd0], %f52 + ldd [%o0 + 0xd8], %f54 + ldd [%o0 + 0xe0], %f56 + retl + ldd [%o0 + 0xe8], %f58 +ENDPROC(des3_ede_sparc64_load_keys) + +#define DES3_LOOP_BODY(X) \ + DES_IP(X, X) \ + DES_ROUND(0, 2, X, X) \ + DES_ROUND(4, 6, X, X) \ + DES_ROUND(8, 10, X, X) \ + DES_ROUND(12, 14, X, X) \ + DES_ROUND(16, 18, X, X) \ + ldd [%o0 + 0xf0], %f16; \ + ldd [%o0 + 0xf8], %f18; \ + DES_ROUND(20, 22, X, X) \ + ldd [%o0 + 0x100], %f20; \ + ldd [%o0 + 0x108], %f22; \ + DES_ROUND(24, 26, X, X) \ + ldd [%o0 + 0x110], %f24; \ + ldd [%o0 + 0x118], %f26; \ + DES_ROUND(28, 30, X, X) \ + ldd [%o0 + 0x120], %f28; \ + ldd [%o0 + 0x128], %f30; \ + DES_IIP(X, X) \ + DES_IP(X, X) \ + DES_ROUND(32, 34, X, X) \ + ldd [%o0 + 0x130], %f0; \ + ldd [%o0 + 0x138], %f2; \ + DES_ROUND(36, 38, X, X) \ + ldd [%o0 + 0x140], %f4; \ + ldd [%o0 + 0x148], %f6; \ + DES_ROUND(40, 42, X, X) \ + ldd [%o0 + 0x150], %f8; \ + ldd [%o0 + 0x158], %f10; \ + DES_ROUND(44, 46, X, X) \ + ldd [%o0 + 0x160], %f12; \ + ldd [%o0 + 0x168], %f14; \ + DES_ROUND(48, 50, X, X) \ + DES_ROUND(52, 54, X, X) \ + DES_ROUND(56, 58, X, X) \ + DES_ROUND(16, 18, X, X) \ + ldd [%o0 + 0x170], %f16; \ + ldd [%o0 + 0x178], %f18; \ + DES_IIP(X, X) \ + DES_IP(X, X) \ + DES_ROUND(20, 22, X, X) \ + ldd [%o0 + 0x50], %f20; \ + ldd [%o0 + 0x58], %f22; \ + DES_ROUND(24, 26, X, X) \ + ldd [%o0 + 0x60], %f24; \ + ldd [%o0 + 0x68], %f26; \ + DES_ROUND(28, 30, X, X) \ + ldd [%o0 + 0x70], %f28; \ + ldd [%o0 + 0x78], %f30; \ + DES_ROUND(0, 2, X, X) \ + ldd [%o0 + 0x00], %f0; \ + ldd [%o0 + 0x08], %f2; \ + DES_ROUND(4, 6, X, X) \ + ldd [%o0 + 0x10], %f4; \ + ldd [%o0 + 0x18], %f6; \ + DES_ROUND(8, 10, X, X) \ + ldd [%o0 + 0x20], %f8; \ + ldd [%o0 + 0x28], %f10; \ + DES_ROUND(12, 14, X, X) \ + ldd [%o0 + 0x30], %f12; \ + ldd [%o0 + 0x38], %f14; \ + DES_ROUND(16, 18, X, X) \ + ldd [%o0 + 0x40], %f16; \ + ldd [%o0 + 0x48], %f18; \ + DES_IIP(X, X) + + .align 32 +ENTRY(des3_ede_sparc64_ecb_crypt) + /* %o0=key, %o1=input, %o2=output, %o3=len */ +1: ldd [%o1 + 0x00], %f60 + DES3_LOOP_BODY(60) + std %f60, [%o2 + 0x00] + subcc %o3, 0x08, %o3 + bne,pt %icc, 1b + add %o2, 0x08, %o2 + retl + nop +ENDPROC(des3_ede_sparc64_ecb_crypt) + + .align 32 +ENTRY(des3_ede_sparc64_cbc_encrypt) + /* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */ + ldd [%o4 + 0x00], %f60 +1: ldd [%o1 + 0x00], %f62 + fxor %f60, %f62, %f60 + DES3_LOOP_BODY(60) + std %f60, [%o2 + 0x00] + add %o1, 0x08, %o1 + subcc %o3, 0x08, %o3 + bne,pt %icc, 1b + add %o2, 0x08, %o2 + retl + std %f60, [%o4 + 0x00] +ENDPROC(des3_ede_sparc64_cbc_encrypt) + + .align 32 +ENTRY(des3_ede_sparc64_cbc_decrypt) + /* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */ + ldd [%o4 + 0x00], %f62 +1: ldx [%o1 + 0x00], %g1 + MOVXTOD_G1_F60 + DES3_LOOP_BODY(60) + fxor %f62, %f60, %f60 + MOVXTOD_G1_F62 + std %f60, [%o2 + 0x00] + add %o1, 0x08, %o1 + subcc %o3, 0x08, %o3 + bne,pt %icc, 1b + add %o2, 0x08, %o2 + retl + stx %g1, [%o4 + 0x00] +ENDPROC(des3_ede_sparc64_cbc_decrypt) diff --git a/arch/sparc/crypto/des_glue.c b/arch/sparc/crypto/des_glue.c new file mode 100644 index 00000000000..c4940c2d307 --- /dev/null +++ b/arch/sparc/crypto/des_glue.c @@ -0,0 +1,529 @@ +/* Glue code for DES encryption optimized for sparc64 crypto opcodes. + * + * Copyright (C) 2012 David S. Miller <davem@davemloft.net> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/crypto.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/types.h> +#include <crypto/algapi.h> +#include <crypto/des.h> + +#include <asm/fpumacro.h> +#include <asm/pstate.h> +#include <asm/elf.h> + +#include "opcodes.h" + +struct des_sparc64_ctx { + u64 encrypt_expkey[DES_EXPKEY_WORDS / 2]; + u64 decrypt_expkey[DES_EXPKEY_WORDS / 2]; +}; + +struct des3_ede_sparc64_ctx { + u64 encrypt_expkey[DES3_EDE_EXPKEY_WORDS / 2]; + u64 decrypt_expkey[DES3_EDE_EXPKEY_WORDS / 2]; +}; + +static void encrypt_to_decrypt(u64 *d, const u64 *e) +{ + const u64 *s = e + (DES_EXPKEY_WORDS / 2) - 1; + int i; + + for (i = 0; i < DES_EXPKEY_WORDS / 2; i++) + *d++ = *s--; +} + +extern void des_sparc64_key_expand(const u32 *input_key, u64 *key); + +static int des_set_key(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct des_sparc64_ctx *dctx = crypto_tfm_ctx(tfm); + u32 *flags = &tfm->crt_flags; + u32 tmp[DES_EXPKEY_WORDS]; + int ret; + + /* Even though we have special instructions for key expansion, + * we call des_ekey() so that we don't have to write our own + * weak key detection code. + */ + ret = des_ekey(tmp, key); + if (unlikely(ret == 0) && (*flags & CRYPTO_TFM_REQ_WEAK_KEY)) { + *flags |= CRYPTO_TFM_RES_WEAK_KEY; + return -EINVAL; + } + + des_sparc64_key_expand((const u32 *) key, &dctx->encrypt_expkey[0]); + encrypt_to_decrypt(&dctx->decrypt_expkey[0], &dctx->encrypt_expkey[0]); + + return 0; +} + +extern void des_sparc64_crypt(const u64 *key, const u64 *input, + u64 *output); + +static void des_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + struct des_sparc64_ctx *ctx = crypto_tfm_ctx(tfm); + const u64 *K = ctx->encrypt_expkey; + + des_sparc64_crypt(K, (const u64 *) src, (u64 *) dst); +} + +static void des_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + struct des_sparc64_ctx *ctx = crypto_tfm_ctx(tfm); + const u64 *K = ctx->decrypt_expkey; + + des_sparc64_crypt(K, (const u64 *) src, (u64 *) dst); +} + +extern void des_sparc64_load_keys(const u64 *key); + +extern void des_sparc64_ecb_crypt(const u64 *input, u64 *output, + unsigned int len); + +#define DES_BLOCK_MASK (~(DES_BLOCK_SIZE - 1)) + +static int __ecb_crypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes, bool encrypt) +{ + struct des_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + if (encrypt) + des_sparc64_load_keys(&ctx->encrypt_expkey[0]); + else + des_sparc64_load_keys(&ctx->decrypt_expkey[0]); + while ((nbytes = walk.nbytes)) { + unsigned int block_len = nbytes & DES_BLOCK_MASK; + + if (likely(block_len)) { + des_sparc64_ecb_crypt((const u64 *)walk.src.virt.addr, + (u64 *) walk.dst.virt.addr, + block_len); + } + nbytes &= DES_BLOCK_SIZE - 1; + err = blkcipher_walk_done(desc, &walk, nbytes); + } + fprs_write(0); + return err; +} + +static int ecb_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + return __ecb_crypt(desc, dst, src, nbytes, true); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + return __ecb_crypt(desc, dst, src, nbytes, false); +} + +extern void des_sparc64_cbc_encrypt(const u64 *input, u64 *output, + unsigned int len, u64 *iv); + +static int cbc_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct des_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + des_sparc64_load_keys(&ctx->encrypt_expkey[0]); + while ((nbytes = walk.nbytes)) { + unsigned int block_len = nbytes & DES_BLOCK_MASK; + + if (likely(block_len)) { + des_sparc64_cbc_encrypt((const u64 *)walk.src.virt.addr, + (u64 *) walk.dst.virt.addr, + block_len, (u64 *) walk.iv); + } + nbytes &= DES_BLOCK_SIZE - 1; + err = blkcipher_walk_done(desc, &walk, nbytes); + } + fprs_write(0); + return err; +} + +extern void des_sparc64_cbc_decrypt(const u64 *input, u64 *output, + unsigned int len, u64 *iv); + +static int cbc_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct des_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + des_sparc64_load_keys(&ctx->decrypt_expkey[0]); + while ((nbytes = walk.nbytes)) { + unsigned int block_len = nbytes & DES_BLOCK_MASK; + + if (likely(block_len)) { + des_sparc64_cbc_decrypt((const u64 *)walk.src.virt.addr, + (u64 *) walk.dst.virt.addr, + block_len, (u64 *) walk.iv); + } + nbytes &= DES_BLOCK_SIZE - 1; + err = blkcipher_walk_done(desc, &walk, nbytes); + } + fprs_write(0); + return err; +} + +static int des3_ede_set_key(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct des3_ede_sparc64_ctx *dctx = crypto_tfm_ctx(tfm); + const u32 *K = (const u32 *)key; + u32 *flags = &tfm->crt_flags; + u64 k1[DES_EXPKEY_WORDS / 2]; + u64 k2[DES_EXPKEY_WORDS / 2]; + u64 k3[DES_EXPKEY_WORDS / 2]; + + if (unlikely(!((K[0] ^ K[2]) | (K[1] ^ K[3])) || + !((K[2] ^ K[4]) | (K[3] ^ K[5]))) && + (*flags & CRYPTO_TFM_REQ_WEAK_KEY)) { + *flags |= CRYPTO_TFM_RES_WEAK_KEY; + return -EINVAL; + } + + des_sparc64_key_expand((const u32 *)key, k1); + key += DES_KEY_SIZE; + des_sparc64_key_expand((const u32 *)key, k2); + key += DES_KEY_SIZE; + des_sparc64_key_expand((const u32 *)key, k3); + + memcpy(&dctx->encrypt_expkey[0], &k1[0], sizeof(k1)); + encrypt_to_decrypt(&dctx->encrypt_expkey[DES_EXPKEY_WORDS / 2], &k2[0]); + memcpy(&dctx->encrypt_expkey[(DES_EXPKEY_WORDS / 2) * 2], + &k3[0], sizeof(k3)); + + encrypt_to_decrypt(&dctx->decrypt_expkey[0], &k3[0]); + memcpy(&dctx->decrypt_expkey[DES_EXPKEY_WORDS / 2], + &k2[0], sizeof(k2)); + encrypt_to_decrypt(&dctx->decrypt_expkey[(DES_EXPKEY_WORDS / 2) * 2], + &k1[0]); + + return 0; +} + +extern void des3_ede_sparc64_crypt(const u64 *key, const u64 *input, + u64 *output); + +static void des3_ede_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + struct des3_ede_sparc64_ctx *ctx = crypto_tfm_ctx(tfm); + const u64 *K = ctx->encrypt_expkey; + + des3_ede_sparc64_crypt(K, (const u64 *) src, (u64 *) dst); +} + +static void des3_ede_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + struct des3_ede_sparc64_ctx *ctx = crypto_tfm_ctx(tfm); + const u64 *K = ctx->decrypt_expkey; + + des3_ede_sparc64_crypt(K, (const u64 *) src, (u64 *) dst); +} + +extern void des3_ede_sparc64_load_keys(const u64 *key); + +extern void des3_ede_sparc64_ecb_crypt(const u64 *expkey, const u64 *input, + u64 *output, unsigned int len); + +static int __ecb3_crypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes, bool encrypt) +{ + struct des3_ede_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + const u64 *K; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + if (encrypt) + K = &ctx->encrypt_expkey[0]; + else + K = &ctx->decrypt_expkey[0]; + des3_ede_sparc64_load_keys(K); + while ((nbytes = walk.nbytes)) { + unsigned int block_len = nbytes & DES_BLOCK_MASK; + + if (likely(block_len)) { + const u64 *src64 = (const u64 *)walk.src.virt.addr; + des3_ede_sparc64_ecb_crypt(K, src64, + (u64 *) walk.dst.virt.addr, + block_len); + } + nbytes &= DES_BLOCK_SIZE - 1; + err = blkcipher_walk_done(desc, &walk, nbytes); + } + fprs_write(0); + return err; +} + +static int ecb3_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + return __ecb3_crypt(desc, dst, src, nbytes, true); +} + +static int ecb3_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + return __ecb3_crypt(desc, dst, src, nbytes, false); +} + +extern void des3_ede_sparc64_cbc_encrypt(const u64 *expkey, const u64 *input, + u64 *output, unsigned int len, + u64 *iv); + +static int cbc3_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct des3_ede_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + const u64 *K; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + K = &ctx->encrypt_expkey[0]; + des3_ede_sparc64_load_keys(K); + while ((nbytes = walk.nbytes)) { + unsigned int block_len = nbytes & DES_BLOCK_MASK; + + if (likely(block_len)) { + const u64 *src64 = (const u64 *)walk.src.virt.addr; + des3_ede_sparc64_cbc_encrypt(K, src64, + (u64 *) walk.dst.virt.addr, + block_len, + (u64 *) walk.iv); + } + nbytes &= DES_BLOCK_SIZE - 1; + err = blkcipher_walk_done(desc, &walk, nbytes); + } + fprs_write(0); + return err; +} + +extern void des3_ede_sparc64_cbc_decrypt(const u64 *expkey, const u64 *input, + u64 *output, unsigned int len, + u64 *iv); + +static int cbc3_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct des3_ede_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + const u64 *K; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + K = &ctx->decrypt_expkey[0]; + des3_ede_sparc64_load_keys(K); + while ((nbytes = walk.nbytes)) { + unsigned int block_len = nbytes & DES_BLOCK_MASK; + + if (likely(block_len)) { + const u64 *src64 = (const u64 *)walk.src.virt.addr; + des3_ede_sparc64_cbc_decrypt(K, src64, + (u64 *) walk.dst.virt.addr, + block_len, + (u64 *) walk.iv); + } + nbytes &= DES_BLOCK_SIZE - 1; + err = blkcipher_walk_done(desc, &walk, nbytes); + } + fprs_write(0); + return err; +} + +static struct crypto_alg algs[] = { { + .cra_name = "des", + .cra_driver_name = "des-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = DES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct des_sparc64_ctx), + .cra_alignmask = 7, + .cra_module = THIS_MODULE, + .cra_u = { + .cipher = { + .cia_min_keysize = DES_KEY_SIZE, + .cia_max_keysize = DES_KEY_SIZE, + .cia_setkey = des_set_key, + .cia_encrypt = des_encrypt, + .cia_decrypt = des_decrypt + } + } +}, { + .cra_name = "ecb(des)", + .cra_driver_name = "ecb-des-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = DES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct des_sparc64_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = DES_KEY_SIZE, + .max_keysize = DES_KEY_SIZE, + .setkey = des_set_key, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}, { + .cra_name = "cbc(des)", + .cra_driver_name = "cbc-des-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = DES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct des_sparc64_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = DES_KEY_SIZE, + .max_keysize = DES_KEY_SIZE, + .setkey = des_set_key, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}, { + .cra_name = "des3_ede", + .cra_driver_name = "des3_ede-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = DES3_EDE_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct des3_ede_sparc64_ctx), + .cra_alignmask = 7, + .cra_module = THIS_MODULE, + .cra_u = { + .cipher = { + .cia_min_keysize = DES3_EDE_KEY_SIZE, + .cia_max_keysize = DES3_EDE_KEY_SIZE, + .cia_setkey = des3_ede_set_key, + .cia_encrypt = des3_ede_encrypt, + .cia_decrypt = des3_ede_decrypt + } + } +}, { + .cra_name = "ecb(des3_ede)", + .cra_driver_name = "ecb-des3_ede-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = DES3_EDE_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct des3_ede_sparc64_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = DES3_EDE_KEY_SIZE, + .max_keysize = DES3_EDE_KEY_SIZE, + .setkey = des3_ede_set_key, + .encrypt = ecb3_encrypt, + .decrypt = ecb3_decrypt, + }, + }, +}, { + .cra_name = "cbc(des3_ede)", + .cra_driver_name = "cbc-des3_ede-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = DES3_EDE_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct des3_ede_sparc64_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = DES3_EDE_KEY_SIZE, + .max_keysize = DES3_EDE_KEY_SIZE, + .setkey = des3_ede_set_key, + .encrypt = cbc3_encrypt, + .decrypt = cbc3_decrypt, + }, + }, +} }; + +static bool __init sparc64_has_des_opcode(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) + return false; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_DES)) + return false; + + return true; +} + +static int __init des_sparc64_mod_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(algs); i++) + INIT_LIST_HEAD(&algs[i].cra_list); + + if (sparc64_has_des_opcode()) { + pr_info("Using sparc64 des opcodes optimized DES implementation\n"); + return crypto_register_algs(algs, ARRAY_SIZE(algs)); + } + pr_info("sparc64 des opcodes not available.\n"); + return -ENODEV; +} + +static void __exit des_sparc64_mod_fini(void) +{ + crypto_unregister_algs(algs, ARRAY_SIZE(algs)); +} + +module_init(des_sparc64_mod_init); +module_exit(des_sparc64_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("DES & Triple DES EDE Cipher Algorithms, sparc64 des opcode accelerated"); + +MODULE_ALIAS("des"); diff --git a/arch/sparc/crypto/md5_asm.S b/arch/sparc/crypto/md5_asm.S new file mode 100644 index 00000000000..3150404e602 --- /dev/null +++ b/arch/sparc/crypto/md5_asm.S @@ -0,0 +1,70 @@ +#include <linux/linkage.h> +#include <asm/visasm.h> + +#include "opcodes.h" + +ENTRY(md5_sparc64_transform) + /* %o0 = digest, %o1 = data, %o2 = rounds */ + VISEntryHalf + ld [%o0 + 0x00], %f0 + ld [%o0 + 0x04], %f1 + andcc %o1, 0x7, %g0 + ld [%o0 + 0x08], %f2 + bne,pn %xcc, 10f + ld [%o0 + 0x0c], %f3 + +1: + ldd [%o1 + 0x00], %f8 + ldd [%o1 + 0x08], %f10 + ldd [%o1 + 0x10], %f12 + ldd [%o1 + 0x18], %f14 + ldd [%o1 + 0x20], %f16 + ldd [%o1 + 0x28], %f18 + ldd [%o1 + 0x30], %f20 + ldd [%o1 + 0x38], %f22 + + MD5 + + subcc %o2, 1, %o2 + bne,pt %xcc, 1b + add %o1, 0x40, %o1 + +5: + st %f0, [%o0 + 0x00] + st %f1, [%o0 + 0x04] + st %f2, [%o0 + 0x08] + st %f3, [%o0 + 0x0c] + retl + VISExitHalf +10: + alignaddr %o1, %g0, %o1 + + ldd [%o1 + 0x00], %f10 +1: + ldd [%o1 + 0x08], %f12 + ldd [%o1 + 0x10], %f14 + ldd [%o1 + 0x18], %f16 + ldd [%o1 + 0x20], %f18 + ldd [%o1 + 0x28], %f20 + ldd [%o1 + 0x30], %f22 + ldd [%o1 + 0x38], %f24 + ldd [%o1 + 0x40], %f26 + + faligndata %f10, %f12, %f8 + faligndata %f12, %f14, %f10 + faligndata %f14, %f16, %f12 + faligndata %f16, %f18, %f14 + faligndata %f18, %f20, %f16 + faligndata %f20, %f22, %f18 + faligndata %f22, %f24, %f20 + faligndata %f24, %f26, %f22 + + MD5 + + subcc %o2, 1, %o2 + fsrc2 %f26, %f10 + bne,pt %xcc, 1b + add %o1, 0x40, %o1 + + ba,a,pt %xcc, 5b +ENDPROC(md5_sparc64_transform) diff --git a/arch/sparc/crypto/md5_glue.c b/arch/sparc/crypto/md5_glue.c new file mode 100644 index 00000000000..603d723038c --- /dev/null +++ b/arch/sparc/crypto/md5_glue.c @@ -0,0 +1,188 @@ +/* Glue code for MD5 hashing optimized for sparc64 crypto opcodes. + * + * This is based largely upon arch/x86/crypto/sha1_ssse3_glue.c + * and crypto/md5.c which are: + * + * Copyright (c) Alan Smithee. + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> + * Copyright (c) Mathias Krause <minipli@googlemail.com> + * Copyright (c) Cryptoapi developers. + * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/cryptohash.h> +#include <linux/types.h> +#include <crypto/md5.h> + +#include <asm/pstate.h> +#include <asm/elf.h> + +#include "opcodes.h" + +asmlinkage void md5_sparc64_transform(u32 *digest, const char *data, + unsigned int rounds); + +static int md5_sparc64_init(struct shash_desc *desc) +{ + struct md5_state *mctx = shash_desc_ctx(desc); + + mctx->hash[0] = cpu_to_le32(0x67452301); + mctx->hash[1] = cpu_to_le32(0xefcdab89); + mctx->hash[2] = cpu_to_le32(0x98badcfe); + mctx->hash[3] = cpu_to_le32(0x10325476); + mctx->byte_count = 0; + + return 0; +} + +static void __md5_sparc64_update(struct md5_state *sctx, const u8 *data, + unsigned int len, unsigned int partial) +{ + unsigned int done = 0; + + sctx->byte_count += len; + if (partial) { + done = MD5_HMAC_BLOCK_SIZE - partial; + memcpy((u8 *)sctx->block + partial, data, done); + md5_sparc64_transform(sctx->hash, (u8 *)sctx->block, 1); + } + if (len - done >= MD5_HMAC_BLOCK_SIZE) { + const unsigned int rounds = (len - done) / MD5_HMAC_BLOCK_SIZE; + + md5_sparc64_transform(sctx->hash, data + done, rounds); + done += rounds * MD5_HMAC_BLOCK_SIZE; + } + + memcpy(sctx->block, data + done, len - done); +} + +static int md5_sparc64_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct md5_state *sctx = shash_desc_ctx(desc); + unsigned int partial = sctx->byte_count % MD5_HMAC_BLOCK_SIZE; + + /* Handle the fast case right here */ + if (partial + len < MD5_HMAC_BLOCK_SIZE) { + sctx->byte_count += len; + memcpy((u8 *)sctx->block + partial, data, len); + } else + __md5_sparc64_update(sctx, data, len, partial); + + return 0; +} + +/* Add padding and return the message digest. */ +static int md5_sparc64_final(struct shash_desc *desc, u8 *out) +{ + struct md5_state *sctx = shash_desc_ctx(desc); + unsigned int i, index, padlen; + u32 *dst = (u32 *)out; + __le64 bits; + static const u8 padding[MD5_HMAC_BLOCK_SIZE] = { 0x80, }; + + bits = cpu_to_le64(sctx->byte_count << 3); + + /* Pad out to 56 mod 64 and append length */ + index = sctx->byte_count % MD5_HMAC_BLOCK_SIZE; + padlen = (index < 56) ? (56 - index) : ((MD5_HMAC_BLOCK_SIZE+56) - index); + + /* We need to fill a whole block for __md5_sparc64_update() */ + if (padlen <= 56) { + sctx->byte_count += padlen; + memcpy((u8 *)sctx->block + index, padding, padlen); + } else { + __md5_sparc64_update(sctx, padding, padlen, index); + } + __md5_sparc64_update(sctx, (const u8 *)&bits, sizeof(bits), 56); + + /* Store state in digest */ + for (i = 0; i < MD5_HASH_WORDS; i++) + dst[i] = sctx->hash[i]; + + /* Wipe context */ + memset(sctx, 0, sizeof(*sctx)); + + return 0; +} + +static int md5_sparc64_export(struct shash_desc *desc, void *out) +{ + struct md5_state *sctx = shash_desc_ctx(desc); + + memcpy(out, sctx, sizeof(*sctx)); + + return 0; +} + +static int md5_sparc64_import(struct shash_desc *desc, const void *in) +{ + struct md5_state *sctx = shash_desc_ctx(desc); + + memcpy(sctx, in, sizeof(*sctx)); + + return 0; +} + +static struct shash_alg alg = { + .digestsize = MD5_DIGEST_SIZE, + .init = md5_sparc64_init, + .update = md5_sparc64_update, + .final = md5_sparc64_final, + .export = md5_sparc64_export, + .import = md5_sparc64_import, + .descsize = sizeof(struct md5_state), + .statesize = sizeof(struct md5_state), + .base = { + .cra_name = "md5", + .cra_driver_name= "md5-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = MD5_HMAC_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static bool __init sparc64_has_md5_opcode(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) + return false; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_MD5)) + return false; + + return true; +} + +static int __init md5_sparc64_mod_init(void) +{ + if (sparc64_has_md5_opcode()) { + pr_info("Using sparc64 md5 opcode optimized MD5 implementation\n"); + return crypto_register_shash(&alg); + } + pr_info("sparc64 md5 opcode not available.\n"); + return -ENODEV; +} + +static void __exit md5_sparc64_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(md5_sparc64_mod_init); +module_exit(md5_sparc64_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MD5 Secure Hash Algorithm, sparc64 md5 opcode accelerated"); + +MODULE_ALIAS("md5"); diff --git a/arch/sparc/crypto/opcodes.h b/arch/sparc/crypto/opcodes.h new file mode 100644 index 00000000000..19cbaea6976 --- /dev/null +++ b/arch/sparc/crypto/opcodes.h @@ -0,0 +1,99 @@ +#ifndef _OPCODES_H +#define _OPCODES_H + +#define SPARC_CR_OPCODE_PRIORITY 300 + +#define F3F(x,y,z) (((x)<<30)|((y)<<19)|((z)<<5)) + +#define FPD_ENCODE(x) (((x) >> 5) | ((x) & ~(0x20))) + +#define RS1(x) (FPD_ENCODE(x) << 14) +#define RS2(x) (FPD_ENCODE(x) << 0) +#define RS3(x) (FPD_ENCODE(x) << 9) +#define RD(x) (FPD_ENCODE(x) << 25) +#define IMM5_0(x) ((x) << 0) +#define IMM5_9(x) ((x) << 9) + +#define CRC32C(a,b,c) \ + .word (F3F(2,0x36,0x147)|RS1(a)|RS2(b)|RD(c)); + +#define MD5 \ + .word 0x81b02800; +#define SHA1 \ + .word 0x81b02820; +#define SHA256 \ + .word 0x81b02840; +#define SHA512 \ + .word 0x81b02860; + +#define AES_EROUND01(a,b,c,d) \ + .word (F3F(2, 0x19, 0)|RS1(a)|RS2(b)|RS3(c)|RD(d)); +#define AES_EROUND23(a,b,c,d) \ + .word (F3F(2, 0x19, 1)|RS1(a)|RS2(b)|RS3(c)|RD(d)); +#define AES_DROUND01(a,b,c,d) \ + .word (F3F(2, 0x19, 2)|RS1(a)|RS2(b)|RS3(c)|RD(d)); +#define AES_DROUND23(a,b,c,d) \ + .word (F3F(2, 0x19, 3)|RS1(a)|RS2(b)|RS3(c)|RD(d)); +#define AES_EROUND01_L(a,b,c,d) \ + .word (F3F(2, 0x19, 4)|RS1(a)|RS2(b)|RS3(c)|RD(d)); +#define AES_EROUND23_L(a,b,c,d) \ + .word (F3F(2, 0x19, 5)|RS1(a)|RS2(b)|RS3(c)|RD(d)); +#define AES_DROUND01_L(a,b,c,d) \ + .word (F3F(2, 0x19, 6)|RS1(a)|RS2(b)|RS3(c)|RD(d)); +#define AES_DROUND23_L(a,b,c,d) \ + .word (F3F(2, 0x19, 7)|RS1(a)|RS2(b)|RS3(c)|RD(d)); +#define AES_KEXPAND1(a,b,c,d) \ + .word (F3F(2, 0x19, 8)|RS1(a)|RS2(b)|IMM5_9(c)|RD(d)); +#define AES_KEXPAND0(a,b,c) \ + .word (F3F(2, 0x36, 0x130)|RS1(a)|RS2(b)|RD(c)); +#define AES_KEXPAND2(a,b,c) \ + .word (F3F(2, 0x36, 0x131)|RS1(a)|RS2(b)|RD(c)); + +#define DES_IP(a,b) \ + .word (F3F(2, 0x36, 0x134)|RS1(a)|RD(b)); +#define DES_IIP(a,b) \ + .word (F3F(2, 0x36, 0x135)|RS1(a)|RD(b)); +#define DES_KEXPAND(a,b,c) \ + .word (F3F(2, 0x36, 0x136)|RS1(a)|IMM5_0(b)|RD(c)); +#define DES_ROUND(a,b,c,d) \ + .word (F3F(2, 0x19, 0x009)|RS1(a)|RS2(b)|RS3(c)|RD(d)); + +#define CAMELLIA_F(a,b,c,d) \ + .word (F3F(2, 0x19, 0x00c)|RS1(a)|RS2(b)|RS3(c)|RD(d)); +#define CAMELLIA_FL(a,b,c) \ + .word (F3F(2, 0x36, 0x13c)|RS1(a)|RS2(b)|RD(c)); +#define CAMELLIA_FLI(a,b,c) \ + .word (F3F(2, 0x36, 0x13d)|RS1(a)|RS2(b)|RD(c)); + +#define MOVDTOX_F0_O4 \ + .word 0x99b02200 +#define MOVDTOX_F2_O5 \ + .word 0x9bb02202 +#define MOVXTOD_G1_F60 \ + .word 0xbbb02301 +#define MOVXTOD_G1_F62 \ + .word 0xbfb02301 +#define MOVXTOD_G3_F4 \ + .word 0x89b02303; +#define MOVXTOD_G7_F6 \ + .word 0x8db02307; +#define MOVXTOD_G3_F0 \ + .word 0x81b02303; +#define MOVXTOD_G7_F2 \ + .word 0x85b02307; +#define MOVXTOD_O0_F0 \ + .word 0x81b02308; +#define MOVXTOD_O5_F0 \ + .word 0x81b0230d; +#define MOVXTOD_O5_F2 \ + .word 0x85b0230d; +#define MOVXTOD_O5_F4 \ + .word 0x89b0230d; +#define MOVXTOD_O5_F6 \ + .word 0x8db0230d; +#define MOVXTOD_G3_F60 \ + .word 0xbbb02303; +#define MOVXTOD_G7_F62 \ + .word 0xbfb02307; + +#endif /* _OPCODES_H */ diff --git a/arch/sparc/crypto/sha1_asm.S b/arch/sparc/crypto/sha1_asm.S new file mode 100644 index 00000000000..219d10c5ae0 --- /dev/null +++ b/arch/sparc/crypto/sha1_asm.S @@ -0,0 +1,72 @@ +#include <linux/linkage.h> +#include <asm/visasm.h> + +#include "opcodes.h" + +ENTRY(sha1_sparc64_transform) + /* %o0 = digest, %o1 = data, %o2 = rounds */ + VISEntryHalf + ld [%o0 + 0x00], %f0 + ld [%o0 + 0x04], %f1 + ld [%o0 + 0x08], %f2 + andcc %o1, 0x7, %g0 + ld [%o0 + 0x0c], %f3 + bne,pn %xcc, 10f + ld [%o0 + 0x10], %f4 + +1: + ldd [%o1 + 0x00], %f8 + ldd [%o1 + 0x08], %f10 + ldd [%o1 + 0x10], %f12 + ldd [%o1 + 0x18], %f14 + ldd [%o1 + 0x20], %f16 + ldd [%o1 + 0x28], %f18 + ldd [%o1 + 0x30], %f20 + ldd [%o1 + 0x38], %f22 + + SHA1 + + subcc %o2, 1, %o2 + bne,pt %xcc, 1b + add %o1, 0x40, %o1 + +5: + st %f0, [%o0 + 0x00] + st %f1, [%o0 + 0x04] + st %f2, [%o0 + 0x08] + st %f3, [%o0 + 0x0c] + st %f4, [%o0 + 0x10] + retl + VISExitHalf +10: + alignaddr %o1, %g0, %o1 + + ldd [%o1 + 0x00], %f10 +1: + ldd [%o1 + 0x08], %f12 + ldd [%o1 + 0x10], %f14 + ldd [%o1 + 0x18], %f16 + ldd [%o1 + 0x20], %f18 + ldd [%o1 + 0x28], %f20 + ldd [%o1 + 0x30], %f22 + ldd [%o1 + 0x38], %f24 + ldd [%o1 + 0x40], %f26 + + faligndata %f10, %f12, %f8 + faligndata %f12, %f14, %f10 + faligndata %f14, %f16, %f12 + faligndata %f16, %f18, %f14 + faligndata %f18, %f20, %f16 + faligndata %f20, %f22, %f18 + faligndata %f22, %f24, %f20 + faligndata %f24, %f26, %f22 + + SHA1 + + subcc %o2, 1, %o2 + fsrc2 %f26, %f10 + bne,pt %xcc, 1b + add %o1, 0x40, %o1 + + ba,a,pt %xcc, 5b +ENDPROC(sha1_sparc64_transform) diff --git a/arch/sparc/crypto/sha1_glue.c b/arch/sparc/crypto/sha1_glue.c new file mode 100644 index 00000000000..2bbb20bee9f --- /dev/null +++ b/arch/sparc/crypto/sha1_glue.c @@ -0,0 +1,183 @@ +/* Glue code for SHA1 hashing optimized for sparc64 crypto opcodes. + * + * This is based largely upon arch/x86/crypto/sha1_ssse3_glue.c + * + * Copyright (c) Alan Smithee. + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> + * Copyright (c) Mathias Krause <minipli@googlemail.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/cryptohash.h> +#include <linux/types.h> +#include <crypto/sha.h> + +#include <asm/pstate.h> +#include <asm/elf.h> + +#include "opcodes.h" + +asmlinkage void sha1_sparc64_transform(u32 *digest, const char *data, + unsigned int rounds); + +static int sha1_sparc64_init(struct shash_desc *desc) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + + *sctx = (struct sha1_state){ + .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, + }; + + return 0; +} + +static void __sha1_sparc64_update(struct sha1_state *sctx, const u8 *data, + unsigned int len, unsigned int partial) +{ + unsigned int done = 0; + + sctx->count += len; + if (partial) { + done = SHA1_BLOCK_SIZE - partial; + memcpy(sctx->buffer + partial, data, done); + sha1_sparc64_transform(sctx->state, sctx->buffer, 1); + } + if (len - done >= SHA1_BLOCK_SIZE) { + const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE; + + sha1_sparc64_transform(sctx->state, data + done, rounds); + done += rounds * SHA1_BLOCK_SIZE; + } + + memcpy(sctx->buffer, data + done, len - done); +} + +static int sha1_sparc64_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; + + /* Handle the fast case right here */ + if (partial + len < SHA1_BLOCK_SIZE) { + sctx->count += len; + memcpy(sctx->buffer + partial, data, len); + } else + __sha1_sparc64_update(sctx, data, len, partial); + + return 0; +} + +/* Add padding and return the message digest. */ +static int sha1_sparc64_final(struct shash_desc *desc, u8 *out) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + unsigned int i, index, padlen; + __be32 *dst = (__be32 *)out; + __be64 bits; + static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, }; + + bits = cpu_to_be64(sctx->count << 3); + + /* Pad out to 56 mod 64 and append length */ + index = sctx->count % SHA1_BLOCK_SIZE; + padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index); + + /* We need to fill a whole block for __sha1_sparc64_update() */ + if (padlen <= 56) { + sctx->count += padlen; + memcpy(sctx->buffer + index, padding, padlen); + } else { + __sha1_sparc64_update(sctx, padding, padlen, index); + } + __sha1_sparc64_update(sctx, (const u8 *)&bits, sizeof(bits), 56); + + /* Store state in digest */ + for (i = 0; i < 5; i++) + dst[i] = cpu_to_be32(sctx->state[i]); + + /* Wipe context */ + memset(sctx, 0, sizeof(*sctx)); + + return 0; +} + +static int sha1_sparc64_export(struct shash_desc *desc, void *out) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + + memcpy(out, sctx, sizeof(*sctx)); + + return 0; +} + +static int sha1_sparc64_import(struct shash_desc *desc, const void *in) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + + memcpy(sctx, in, sizeof(*sctx)); + + return 0; +} + +static struct shash_alg alg = { + .digestsize = SHA1_DIGEST_SIZE, + .init = sha1_sparc64_init, + .update = sha1_sparc64_update, + .final = sha1_sparc64_final, + .export = sha1_sparc64_export, + .import = sha1_sparc64_import, + .descsize = sizeof(struct sha1_state), + .statesize = sizeof(struct sha1_state), + .base = { + .cra_name = "sha1", + .cra_driver_name= "sha1-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static bool __init sparc64_has_sha1_opcode(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) + return false; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_SHA1)) + return false; + + return true; +} + +static int __init sha1_sparc64_mod_init(void) +{ + if (sparc64_has_sha1_opcode()) { + pr_info("Using sparc64 sha1 opcode optimized SHA-1 implementation\n"); + return crypto_register_shash(&alg); + } + pr_info("sparc64 sha1 opcode not available.\n"); + return -ENODEV; +} + +static void __exit sha1_sparc64_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(sha1_sparc64_mod_init); +module_exit(sha1_sparc64_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, sparc64 sha1 opcode accelerated"); + +MODULE_ALIAS("sha1"); diff --git a/arch/sparc/crypto/sha256_asm.S b/arch/sparc/crypto/sha256_asm.S new file mode 100644 index 00000000000..b5f3d5826eb --- /dev/null +++ b/arch/sparc/crypto/sha256_asm.S @@ -0,0 +1,78 @@ +#include <linux/linkage.h> +#include <asm/visasm.h> + +#include "opcodes.h" + +ENTRY(sha256_sparc64_transform) + /* %o0 = digest, %o1 = data, %o2 = rounds */ + VISEntryHalf + ld [%o0 + 0x00], %f0 + ld [%o0 + 0x04], %f1 + ld [%o0 + 0x08], %f2 + ld [%o0 + 0x0c], %f3 + ld [%o0 + 0x10], %f4 + ld [%o0 + 0x14], %f5 + andcc %o1, 0x7, %g0 + ld [%o0 + 0x18], %f6 + bne,pn %xcc, 10f + ld [%o0 + 0x1c], %f7 + +1: + ldd [%o1 + 0x00], %f8 + ldd [%o1 + 0x08], %f10 + ldd [%o1 + 0x10], %f12 + ldd [%o1 + 0x18], %f14 + ldd [%o1 + 0x20], %f16 + ldd [%o1 + 0x28], %f18 + ldd [%o1 + 0x30], %f20 + ldd [%o1 + 0x38], %f22 + + SHA256 + + subcc %o2, 1, %o2 + bne,pt %xcc, 1b + add %o1, 0x40, %o1 + +5: + st %f0, [%o0 + 0x00] + st %f1, [%o0 + 0x04] + st %f2, [%o0 + 0x08] + st %f3, [%o0 + 0x0c] + st %f4, [%o0 + 0x10] + st %f5, [%o0 + 0x14] + st %f6, [%o0 + 0x18] + st %f7, [%o0 + 0x1c] + retl + VISExitHalf +10: + alignaddr %o1, %g0, %o1 + + ldd [%o1 + 0x00], %f10 +1: + ldd [%o1 + 0x08], %f12 + ldd [%o1 + 0x10], %f14 + ldd [%o1 + 0x18], %f16 + ldd [%o1 + 0x20], %f18 + ldd [%o1 + 0x28], %f20 + ldd [%o1 + 0x30], %f22 + ldd [%o1 + 0x38], %f24 + ldd [%o1 + 0x40], %f26 + + faligndata %f10, %f12, %f8 + faligndata %f12, %f14, %f10 + faligndata %f14, %f16, %f12 + faligndata %f16, %f18, %f14 + faligndata %f18, %f20, %f16 + faligndata %f20, %f22, %f18 + faligndata %f22, %f24, %f20 + faligndata %f24, %f26, %f22 + + SHA256 + + subcc %o2, 1, %o2 + fsrc2 %f26, %f10 + bne,pt %xcc, 1b + add %o1, 0x40, %o1 + + ba,a,pt %xcc, 5b +ENDPROC(sha256_sparc64_transform) diff --git a/arch/sparc/crypto/sha256_glue.c b/arch/sparc/crypto/sha256_glue.c new file mode 100644 index 00000000000..591e656bd89 --- /dev/null +++ b/arch/sparc/crypto/sha256_glue.c @@ -0,0 +1,241 @@ +/* Glue code for SHA256 hashing optimized for sparc64 crypto opcodes. + * + * This is based largely upon crypto/sha256_generic.c + * + * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> + * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/cryptohash.h> +#include <linux/types.h> +#include <crypto/sha.h> + +#include <asm/pstate.h> +#include <asm/elf.h> + +#include "opcodes.h" + +asmlinkage void sha256_sparc64_transform(u32 *digest, const char *data, + unsigned int rounds); + +static int sha224_sparc64_init(struct shash_desc *desc) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + sctx->state[0] = SHA224_H0; + sctx->state[1] = SHA224_H1; + sctx->state[2] = SHA224_H2; + sctx->state[3] = SHA224_H3; + sctx->state[4] = SHA224_H4; + sctx->state[5] = SHA224_H5; + sctx->state[6] = SHA224_H6; + sctx->state[7] = SHA224_H7; + sctx->count = 0; + + return 0; +} + +static int sha256_sparc64_init(struct shash_desc *desc) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + sctx->state[0] = SHA256_H0; + sctx->state[1] = SHA256_H1; + sctx->state[2] = SHA256_H2; + sctx->state[3] = SHA256_H3; + sctx->state[4] = SHA256_H4; + sctx->state[5] = SHA256_H5; + sctx->state[6] = SHA256_H6; + sctx->state[7] = SHA256_H7; + sctx->count = 0; + + return 0; +} + +static void __sha256_sparc64_update(struct sha256_state *sctx, const u8 *data, + unsigned int len, unsigned int partial) +{ + unsigned int done = 0; + + sctx->count += len; + if (partial) { + done = SHA256_BLOCK_SIZE - partial; + memcpy(sctx->buf + partial, data, done); + sha256_sparc64_transform(sctx->state, sctx->buf, 1); + } + if (len - done >= SHA256_BLOCK_SIZE) { + const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE; + + sha256_sparc64_transform(sctx->state, data + done, rounds); + done += rounds * SHA256_BLOCK_SIZE; + } + + memcpy(sctx->buf, data + done, len - done); +} + +static int sha256_sparc64_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; + + /* Handle the fast case right here */ + if (partial + len < SHA256_BLOCK_SIZE) { + sctx->count += len; + memcpy(sctx->buf + partial, data, len); + } else + __sha256_sparc64_update(sctx, data, len, partial); + + return 0; +} + +static int sha256_sparc64_final(struct shash_desc *desc, u8 *out) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + unsigned int i, index, padlen; + __be32 *dst = (__be32 *)out; + __be64 bits; + static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, }; + + bits = cpu_to_be64(sctx->count << 3); + + /* Pad out to 56 mod 64 and append length */ + index = sctx->count % SHA256_BLOCK_SIZE; + padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56) - index); + + /* We need to fill a whole block for __sha256_sparc64_update() */ + if (padlen <= 56) { + sctx->count += padlen; + memcpy(sctx->buf + index, padding, padlen); + } else { + __sha256_sparc64_update(sctx, padding, padlen, index); + } + __sha256_sparc64_update(sctx, (const u8 *)&bits, sizeof(bits), 56); + + /* Store state in digest */ + for (i = 0; i < 8; i++) + dst[i] = cpu_to_be32(sctx->state[i]); + + /* Wipe context */ + memset(sctx, 0, sizeof(*sctx)); + + return 0; +} + +static int sha224_sparc64_final(struct shash_desc *desc, u8 *hash) +{ + u8 D[SHA256_DIGEST_SIZE]; + + sha256_sparc64_final(desc, D); + + memcpy(hash, D, SHA224_DIGEST_SIZE); + memset(D, 0, SHA256_DIGEST_SIZE); + + return 0; +} + +static int sha256_sparc64_export(struct shash_desc *desc, void *out) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + memcpy(out, sctx, sizeof(*sctx)); + return 0; +} + +static int sha256_sparc64_import(struct shash_desc *desc, const void *in) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + memcpy(sctx, in, sizeof(*sctx)); + return 0; +} + +static struct shash_alg sha256 = { + .digestsize = SHA256_DIGEST_SIZE, + .init = sha256_sparc64_init, + .update = sha256_sparc64_update, + .final = sha256_sparc64_final, + .export = sha256_sparc64_export, + .import = sha256_sparc64_import, + .descsize = sizeof(struct sha256_state), + .statesize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha256", + .cra_driver_name= "sha256-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static struct shash_alg sha224 = { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_sparc64_init, + .update = sha256_sparc64_update, + .final = sha224_sparc64_final, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name= "sha224-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static bool __init sparc64_has_sha256_opcode(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) + return false; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_SHA256)) + return false; + + return true; +} + +static int __init sha256_sparc64_mod_init(void) +{ + if (sparc64_has_sha256_opcode()) { + int ret = crypto_register_shash(&sha224); + if (ret < 0) + return ret; + + ret = crypto_register_shash(&sha256); + if (ret < 0) { + crypto_unregister_shash(&sha224); + return ret; + } + + pr_info("Using sparc64 sha256 opcode optimized SHA-256/SHA-224 implementation\n"); + return 0; + } + pr_info("sparc64 sha256 opcode not available.\n"); + return -ENODEV; +} + +static void __exit sha256_sparc64_mod_fini(void) +{ + crypto_unregister_shash(&sha224); + crypto_unregister_shash(&sha256); +} + +module_init(sha256_sparc64_mod_init); +module_exit(sha256_sparc64_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA-224 and SHA-256 Secure Hash Algorithm, sparc64 sha256 opcode accelerated"); + +MODULE_ALIAS("sha224"); +MODULE_ALIAS("sha256"); diff --git a/arch/sparc/crypto/sha512_asm.S b/arch/sparc/crypto/sha512_asm.S new file mode 100644 index 00000000000..54bfba713c0 --- /dev/null +++ b/arch/sparc/crypto/sha512_asm.S @@ -0,0 +1,102 @@ +#include <linux/linkage.h> +#include <asm/visasm.h> + +#include "opcodes.h" + +ENTRY(sha512_sparc64_transform) + /* %o0 = digest, %o1 = data, %o2 = rounds */ + VISEntry + ldd [%o0 + 0x00], %f0 + ldd [%o0 + 0x08], %f2 + ldd [%o0 + 0x10], %f4 + ldd [%o0 + 0x18], %f6 + ldd [%o0 + 0x20], %f8 + ldd [%o0 + 0x28], %f10 + andcc %o1, 0x7, %g0 + ldd [%o0 + 0x30], %f12 + bne,pn %xcc, 10f + ldd [%o0 + 0x38], %f14 + +1: + ldd [%o1 + 0x00], %f16 + ldd [%o1 + 0x08], %f18 + ldd [%o1 + 0x10], %f20 + ldd [%o1 + 0x18], %f22 + ldd [%o1 + 0x20], %f24 + ldd [%o1 + 0x28], %f26 + ldd [%o1 + 0x30], %f28 + ldd [%o1 + 0x38], %f30 + ldd [%o1 + 0x40], %f32 + ldd [%o1 + 0x48], %f34 + ldd [%o1 + 0x50], %f36 + ldd [%o1 + 0x58], %f38 + ldd [%o1 + 0x60], %f40 + ldd [%o1 + 0x68], %f42 + ldd [%o1 + 0x70], %f44 + ldd [%o1 + 0x78], %f46 + + SHA512 + + subcc %o2, 1, %o2 + bne,pt %xcc, 1b + add %o1, 0x80, %o1 + +5: + std %f0, [%o0 + 0x00] + std %f2, [%o0 + 0x08] + std %f4, [%o0 + 0x10] + std %f6, [%o0 + 0x18] + std %f8, [%o0 + 0x20] + std %f10, [%o0 + 0x28] + std %f12, [%o0 + 0x30] + std %f14, [%o0 + 0x38] + retl + VISExit +10: + alignaddr %o1, %g0, %o1 + + ldd [%o1 + 0x00], %f18 +1: + ldd [%o1 + 0x08], %f20 + ldd [%o1 + 0x10], %f22 + ldd [%o1 + 0x18], %f24 + ldd [%o1 + 0x20], %f26 + ldd [%o1 + 0x28], %f28 + ldd [%o1 + 0x30], %f30 + ldd [%o1 + 0x38], %f32 + ldd [%o1 + 0x40], %f34 + ldd [%o1 + 0x48], %f36 + ldd [%o1 + 0x50], %f38 + ldd [%o1 + 0x58], %f40 + ldd [%o1 + 0x60], %f42 + ldd [%o1 + 0x68], %f44 + ldd [%o1 + 0x70], %f46 + ldd [%o1 + 0x78], %f48 + ldd [%o1 + 0x80], %f50 + + faligndata %f18, %f20, %f16 + faligndata %f20, %f22, %f18 + faligndata %f22, %f24, %f20 + faligndata %f24, %f26, %f22 + faligndata %f26, %f28, %f24 + faligndata %f28, %f30, %f26 + faligndata %f30, %f32, %f28 + faligndata %f32, %f34, %f30 + faligndata %f34, %f36, %f32 + faligndata %f36, %f38, %f34 + faligndata %f38, %f40, %f36 + faligndata %f40, %f42, %f38 + faligndata %f42, %f44, %f40 + faligndata %f44, %f46, %f42 + faligndata %f46, %f48, %f44 + faligndata %f48, %f50, %f46 + + SHA512 + + subcc %o2, 1, %o2 + fsrc2 %f50, %f18 + bne,pt %xcc, 1b + add %o1, 0x80, %o1 + + ba,a,pt %xcc, 5b +ENDPROC(sha512_sparc64_transform) diff --git a/arch/sparc/crypto/sha512_glue.c b/arch/sparc/crypto/sha512_glue.c new file mode 100644 index 00000000000..486f0a2b700 --- /dev/null +++ b/arch/sparc/crypto/sha512_glue.c @@ -0,0 +1,226 @@ +/* Glue code for SHA512 hashing optimized for sparc64 crypto opcodes. + * + * This is based largely upon crypto/sha512_generic.c + * + * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) 2003 Kyle McMartin <kyle@debian.org> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/cryptohash.h> +#include <linux/types.h> +#include <crypto/sha.h> + +#include <asm/pstate.h> +#include <asm/elf.h> + +#include "opcodes.h" + +asmlinkage void sha512_sparc64_transform(u64 *digest, const char *data, + unsigned int rounds); + +static int sha512_sparc64_init(struct shash_desc *desc) +{ + struct sha512_state *sctx = shash_desc_ctx(desc); + sctx->state[0] = SHA512_H0; + sctx->state[1] = SHA512_H1; + sctx->state[2] = SHA512_H2; + sctx->state[3] = SHA512_H3; + sctx->state[4] = SHA512_H4; + sctx->state[5] = SHA512_H5; + sctx->state[6] = SHA512_H6; + sctx->state[7] = SHA512_H7; + sctx->count[0] = sctx->count[1] = 0; + + return 0; +} + +static int sha384_sparc64_init(struct shash_desc *desc) +{ + struct sha512_state *sctx = shash_desc_ctx(desc); + sctx->state[0] = SHA384_H0; + sctx->state[1] = SHA384_H1; + sctx->state[2] = SHA384_H2; + sctx->state[3] = SHA384_H3; + sctx->state[4] = SHA384_H4; + sctx->state[5] = SHA384_H5; + sctx->state[6] = SHA384_H6; + sctx->state[7] = SHA384_H7; + sctx->count[0] = sctx->count[1] = 0; + + return 0; +} + +static void __sha512_sparc64_update(struct sha512_state *sctx, const u8 *data, + unsigned int len, unsigned int partial) +{ + unsigned int done = 0; + + if ((sctx->count[0] += len) < len) + sctx->count[1]++; + if (partial) { + done = SHA512_BLOCK_SIZE - partial; + memcpy(sctx->buf + partial, data, done); + sha512_sparc64_transform(sctx->state, sctx->buf, 1); + } + if (len - done >= SHA512_BLOCK_SIZE) { + const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE; + + sha512_sparc64_transform(sctx->state, data + done, rounds); + done += rounds * SHA512_BLOCK_SIZE; + } + + memcpy(sctx->buf, data + done, len - done); +} + +static int sha512_sparc64_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha512_state *sctx = shash_desc_ctx(desc); + unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE; + + /* Handle the fast case right here */ + if (partial + len < SHA512_BLOCK_SIZE) { + if ((sctx->count[0] += len) < len) + sctx->count[1]++; + memcpy(sctx->buf + partial, data, len); + } else + __sha512_sparc64_update(sctx, data, len, partial); + + return 0; +} + +static int sha512_sparc64_final(struct shash_desc *desc, u8 *out) +{ + struct sha512_state *sctx = shash_desc_ctx(desc); + unsigned int i, index, padlen; + __be64 *dst = (__be64 *)out; + __be64 bits[2]; + static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, }; + + /* Save number of bits */ + bits[1] = cpu_to_be64(sctx->count[0] << 3); + bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61); + + /* Pad out to 112 mod 128 and append length */ + index = sctx->count[0] % SHA512_BLOCK_SIZE; + padlen = (index < 112) ? (112 - index) : ((SHA512_BLOCK_SIZE+112) - index); + + /* We need to fill a whole block for __sha512_sparc64_update() */ + if (padlen <= 112) { + if ((sctx->count[0] += padlen) < padlen) + sctx->count[1]++; + memcpy(sctx->buf + index, padding, padlen); + } else { + __sha512_sparc64_update(sctx, padding, padlen, index); + } + __sha512_sparc64_update(sctx, (const u8 *)&bits, sizeof(bits), 112); + + /* Store state in digest */ + for (i = 0; i < 8; i++) + dst[i] = cpu_to_be64(sctx->state[i]); + + /* Wipe context */ + memset(sctx, 0, sizeof(*sctx)); + + return 0; +} + +static int sha384_sparc64_final(struct shash_desc *desc, u8 *hash) +{ + u8 D[64]; + + sha512_sparc64_final(desc, D); + + memcpy(hash, D, 48); + memset(D, 0, 64); + + return 0; +} + +static struct shash_alg sha512 = { + .digestsize = SHA512_DIGEST_SIZE, + .init = sha512_sparc64_init, + .update = sha512_sparc64_update, + .final = sha512_sparc64_final, + .descsize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha512", + .cra_driver_name= "sha512-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA512_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static struct shash_alg sha384 = { + .digestsize = SHA384_DIGEST_SIZE, + .init = sha384_sparc64_init, + .update = sha512_sparc64_update, + .final = sha384_sparc64_final, + .descsize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha384", + .cra_driver_name= "sha384-sparc64", + .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA384_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static bool __init sparc64_has_sha512_opcode(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) + return false; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_SHA512)) + return false; + + return true; +} + +static int __init sha512_sparc64_mod_init(void) +{ + if (sparc64_has_sha512_opcode()) { + int ret = crypto_register_shash(&sha384); + if (ret < 0) + return ret; + + ret = crypto_register_shash(&sha512); + if (ret < 0) { + crypto_unregister_shash(&sha384); + return ret; + } + + pr_info("Using sparc64 sha512 opcode optimized SHA-512/SHA-384 implementation\n"); + return 0; + } + pr_info("sparc64 sha512 opcode not available.\n"); + return -ENODEV; +} + +static void __exit sha512_sparc64_mod_fini(void) +{ + crypto_unregister_shash(&sha384); + crypto_unregister_shash(&sha512); +} + +module_init(sha512_sparc64_mod_init); +module_exit(sha512_sparc64_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA-384 and SHA-512 Secure Hash Algorithm, sparc64 sha512 opcode accelerated"); + +MODULE_ALIAS("sha384"); +MODULE_ALIAS("sha512"); diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index 67f83e0a0d6..f80ff93f6f7 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -17,6 +17,7 @@ header-y += uctx.h header-y += utrap.h header-y += watchdog.h +generic-y += clkdev.h generic-y += div64.h generic-y += local64.h generic-y += irq_regs.h diff --git a/arch/sparc/include/asm/asi.h b/arch/sparc/include/asm/asi.h index 61ebe7411ce..aace6f31371 100644 --- a/arch/sparc/include/asm/asi.h +++ b/arch/sparc/include/asm/asi.h @@ -141,7 +141,8 @@ /* SpitFire and later extended ASIs. The "(III)" marker designates * UltraSparc-III and later specific ASIs. The "(CMT)" marker designates * Chip Multi Threading specific ASIs. "(NG)" designates Niagara specific - * ASIs, "(4V)" designates SUN4V specific ASIs. + * ASIs, "(4V)" designates SUN4V specific ASIs. "(NG4)" designates SPARC-T4 + * and later ASIs. */ #define ASI_PHYS_USE_EC 0x14 /* PADDR, E-cachable */ #define ASI_PHYS_BYPASS_EC_E 0x15 /* PADDR, E-bit */ @@ -243,6 +244,7 @@ #define ASI_UDBL_CONTROL_R 0x7f /* External UDB control regs rd low*/ #define ASI_INTR_R 0x7f /* IRQ vector dispatch read */ #define ASI_INTR_DATAN_R 0x7f /* (III) In irq vector data reg N */ +#define ASI_PIC 0xb0 /* (NG4) PIC registers */ #define ASI_PST8_P 0xc0 /* Primary, 8 8-bit, partial */ #define ASI_PST8_S 0xc1 /* Secondary, 8 8-bit, partial */ #define ASI_PST16_P 0xc2 /* Primary, 4 16-bit, partial */ @@ -268,9 +270,28 @@ #define ASI_BLK_INIT_QUAD_LDD_P 0xe2 /* (NG) init-store, twin load, * primary, implicit */ +#define ASI_BLK_INIT_QUAD_LDD_S 0xe3 /* (NG) init-store, twin load, + * secondary, implicit + */ #define ASI_BLK_P 0xf0 /* Primary, blk ld/st */ #define ASI_BLK_S 0xf1 /* Secondary, blk ld/st */ +#define ASI_ST_BLKINIT_MRU_P 0xf2 /* (NG4) init-store, twin load, + * Most-Recently-Used, primary, + * implicit + */ +#define ASI_ST_BLKINIT_MRU_S 0xf2 /* (NG4) init-store, twin load, + * Most-Recently-Used, secondary, + * implicit + */ #define ASI_BLK_PL 0xf8 /* Primary, blk ld/st, little */ #define ASI_BLK_SL 0xf9 /* Secondary, blk ld/st, little */ +#define ASI_ST_BLKINIT_MRU_PL 0xfa /* (NG4) init-store, twin load, + * Most-Recently-Used, primary, + * implicit, little-endian + */ +#define ASI_ST_BLKINIT_MRU_SL 0xfb /* (NG4) init-store, twin load, + * Most-Recently-Used, secondary, + * implicit, little-endian + */ #endif /* _SPARC_ASI_H */ diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index b8be20d42a0..cef99fbc0a2 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -36,6 +36,7 @@ typedef s64 compat_s64; typedef u32 compat_uint_t; typedef u32 compat_ulong_t; typedef u64 compat_u64; +typedef u32 compat_uptr_t; struct compat_timespec { compat_time_t tv_sec; @@ -147,6 +148,65 @@ typedef u32 compat_old_sigset_t; typedef u32 compat_sigset_word; +typedef union compat_sigval { + compat_int_t sival_int; + compat_uptr_t sival_ptr; +} compat_sigval_t; + +#define SI_PAD_SIZE32 (128/sizeof(int) - 3) + +typedef struct compat_siginfo { + int si_signo; + int si_errno; + int si_code; + + union { + int _pad[SI_PAD_SIZE32]; + + /* kill() */ + struct { + compat_pid_t _pid; /* sender's pid */ + unsigned int _uid; /* sender's uid */ + } _kill; + + /* POSIX.1b timers */ + struct { + compat_timer_t _tid; /* timer id */ + int _overrun; /* overrun count */ + compat_sigval_t _sigval; /* same as below */ + int _sys_private; /* not to be passed to user */ + } _timer; + + /* POSIX.1b signals */ + struct { + compat_pid_t _pid; /* sender's pid */ + unsigned int _uid; /* sender's uid */ + compat_sigval_t _sigval; + } _rt; + + /* SIGCHLD */ + struct { + compat_pid_t _pid; /* which child */ + unsigned int _uid; /* sender's uid */ + int _status; /* exit code */ + compat_clock_t _utime; + compat_clock_t _stime; + } _sigchld; + + /* SIGILL, SIGFPE, SIGSEGV, SIGBUS, SIGEMT */ + struct { + u32 _addr; /* faulting insn/memory ref. */ + int _trapno; + } _sigfault; + + /* SIGPOLL */ + struct { + int _band; /* POLL_IN, POLL_OUT, POLL_MSG */ + int _fd; + } _sigpoll; + } _sifields; +} compat_siginfo_t; + #define COMPAT_OFF_T_MAX 0x7fffffff #define COMPAT_LOFF_T_MAX 0x7fffffffffffffffL @@ -156,7 +216,6 @@ typedef u32 compat_sigset_word; * as pointers because the syscall entry code will have * appropriately converted them already. */ -typedef u32 compat_uptr_t; static inline void __user *compat_ptr(compat_uptr_t uptr) { diff --git a/arch/sparc/include/asm/elf_32.h b/arch/sparc/include/asm/elf_32.h index 2d4d755cba9..ac74a2c98e6 100644 --- a/arch/sparc/include/asm/elf_32.h +++ b/arch/sparc/include/asm/elf_32.h @@ -128,6 +128,7 @@ typedef struct { #define ELF_PLATFORM (NULL) -#define SET_PERSONALITY(ex) set_personality(PER_LINUX) +#define SET_PERSONALITY(ex) \ + set_personality(PER_LINUX | (current->personality & (~PER_MASK))) #endif /* !(__ASMSPARC_ELF_H) */ diff --git a/arch/sparc/include/asm/elf_64.h b/arch/sparc/include/asm/elf_64.h index 7df8b7f544d..370ca1e71ff 100644 --- a/arch/sparc/include/asm/elf_64.h +++ b/arch/sparc/include/asm/elf_64.h @@ -86,6 +86,15 @@ #define AV_SPARC_IMA 0x00400000 /* integer multiply-add */ #define AV_SPARC_ASI_CACHE_SPARING \ 0x00800000 /* cache sparing ASIs available */ +#define AV_SPARC_PAUSE 0x01000000 /* PAUSE available */ +#define AV_SPARC_CBCOND 0x02000000 /* CBCOND insns available */ + +/* Solaris decided to enumerate every single crypto instruction type + * in the AT_HWCAP bits. This is wasteful, since if crypto is present, + * you still need to look in the CFR register to see if the opcode is + * really available. So we simply advertise only "crypto" support. + */ +#define HWCAP_SPARC_CRYPTO 0x04000000 /* CRYPTO insns available */ #define CORE_DUMP_USE_REGSET diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 177061064ee..8c5eed6d267 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -10,7 +10,10 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void hugetlb_prefault_arch_hook(struct mm_struct *mm); +static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) +{ + hugetlb_setup(mm); +} static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, @@ -82,4 +85,8 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ +} + #endif /* _ASM_SPARC64_HUGETLB_H */ diff --git a/arch/sparc/include/asm/hypervisor.h b/arch/sparc/include/asm/hypervisor.h index 015a761eaa3..ca121f0fa3e 100644 --- a/arch/sparc/include/asm/hypervisor.h +++ b/arch/sparc/include/asm/hypervisor.h @@ -2934,6 +2934,16 @@ extern unsigned long sun4v_reboot_data_set(unsigned long ra, unsigned long len); #endif +#define HV_FAST_VT_GET_PERFREG 0x184 +#define HV_FAST_VT_SET_PERFREG 0x185 + +#ifndef __ASSEMBLY__ +extern unsigned long sun4v_vt_get_perfreg(unsigned long reg_num, + unsigned long *reg_val); +extern unsigned long sun4v_vt_set_perfreg(unsigned long reg_num, + unsigned long reg_val); +#endif + /* Function numbers for HV_CORE_TRAP. */ #define HV_CORE_SET_VER 0x00 #define HV_CORE_PUTCHAR 0x01 @@ -2964,6 +2974,7 @@ extern unsigned long sun4v_reboot_data_set(unsigned long ra, #define HV_GRP_NIU 0x0204 #define HV_GRP_VF_CPU 0x0205 #define HV_GRP_KT_CPU 0x0209 +#define HV_GRP_VT_CPU 0x020c #define HV_GRP_DIAG 0x0300 #ifndef __ASSEMBLY__ diff --git a/arch/sparc/include/asm/mdesc.h b/arch/sparc/include/asm/mdesc.h index 9faa046713f..139097f3a67 100644 --- a/arch/sparc/include/asm/mdesc.h +++ b/arch/sparc/include/asm/mdesc.h @@ -73,6 +73,7 @@ extern void mdesc_register_notifier(struct mdesc_notifier_client *client); extern void mdesc_fill_in_cpu_data(cpumask_t *mask); extern void mdesc_populate_present_mask(cpumask_t *mask); +extern void mdesc_get_page_sizes(cpumask_t *mask, unsigned long *pgsz_mask); extern void sun4v_mdesc_init(void); diff --git a/arch/sparc/include/asm/mmu_64.h b/arch/sparc/include/asm/mmu_64.h index 9067dc50053..76092c4dd27 100644 --- a/arch/sparc/include/asm/mmu_64.h +++ b/arch/sparc/include/asm/mmu_64.h @@ -30,22 +30,8 @@ #define CTX_PGSZ_MASK ((CTX_PGSZ_BITS << CTX_PGSZ0_SHIFT) | \ (CTX_PGSZ_BITS << CTX_PGSZ1_SHIFT)) -#if defined(CONFIG_SPARC64_PAGE_SIZE_8KB) #define CTX_PGSZ_BASE CTX_PGSZ_8KB -#elif defined(CONFIG_SPARC64_PAGE_SIZE_64KB) -#define CTX_PGSZ_BASE CTX_PGSZ_64KB -#else -#error No page size specified in kernel configuration -#endif - -#if defined(CONFIG_HUGETLB_PAGE_SIZE_4MB) -#define CTX_PGSZ_HUGE CTX_PGSZ_4MB -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K) -#define CTX_PGSZ_HUGE CTX_PGSZ_512KB -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_64K) -#define CTX_PGSZ_HUGE CTX_PGSZ_64KB -#endif - +#define CTX_PGSZ_HUGE CTX_PGSZ_4MB #define CTX_PGSZ_KERN CTX_PGSZ_4MB /* Thus, when running on UltraSPARC-III+ and later, we use the following @@ -96,7 +82,7 @@ struct tsb_config { #define MM_TSB_BASE 0 -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) #define MM_TSB_HUGE 1 #define MM_NUM_TSBS 2 #else @@ -107,6 +93,7 @@ typedef struct { spinlock_t lock; unsigned long sparc64_ctx_val; unsigned long huge_pte_count; + struct page *pgtable_page; struct tsb_config tsb_block[MM_NUM_TSBS]; struct hv_tsb_descr tsb_descr[MM_NUM_TSBS]; } mm_context_t; diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h index a97fd085ceb..9191ca62ed9 100644 --- a/arch/sparc/include/asm/mmu_context_64.h +++ b/arch/sparc/include/asm/mmu_context_64.h @@ -36,7 +36,7 @@ static inline void tsb_context_switch(struct mm_struct *mm) { __tsb_context_switch(__pa(mm->pgd), &mm->context.tsb_block[0], -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) (mm->context.tsb_block[1].tsb ? &mm->context.tsb_block[1] : NULL) diff --git a/arch/sparc/include/asm/oplib_32.h b/arch/sparc/include/asm/oplib_32.h index 27517879a6c..c72f3045820 100644 --- a/arch/sparc/include/asm/oplib_32.h +++ b/arch/sparc/include/asm/oplib_32.h @@ -94,7 +94,7 @@ extern int prom_getprev(void); extern void prom_console_write_buf(const char *buf, int len); /* Prom's internal routines, don't use in kernel/boot code. */ -extern void prom_printf(const char *fmt, ...); +extern __printf(1, 2) void prom_printf(const char *fmt, ...); extern void prom_write(const char *buf, unsigned int len); /* Multiprocessor operations... */ diff --git a/arch/sparc/include/asm/oplib_64.h b/arch/sparc/include/asm/oplib_64.h index 97a90475c31..a12dbe3b776 100644 --- a/arch/sparc/include/asm/oplib_64.h +++ b/arch/sparc/include/asm/oplib_64.h @@ -98,7 +98,7 @@ extern unsigned char prom_get_idprom(char *idp_buffer, int idpbuf_size); extern void prom_console_write_buf(const char *buf, int len); /* Prom's internal routines, don't use in kernel/boot code. */ -extern void prom_printf(const char *fmt, ...); +extern __printf(1, 2) void prom_printf(const char *fmt, ...); extern void prom_write(const char *buf, unsigned int len); /* Multiprocessor operations... */ diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h index f0d09b40103..4b39f74d6ca 100644 --- a/arch/sparc/include/asm/page_64.h +++ b/arch/sparc/include/asm/page_64.h @@ -3,13 +3,7 @@ #include <linux/const.h> -#if defined(CONFIG_SPARC64_PAGE_SIZE_8KB) #define PAGE_SHIFT 13 -#elif defined(CONFIG_SPARC64_PAGE_SIZE_64KB) -#define PAGE_SHIFT 16 -#else -#error No page size specified in kernel configuration -#endif #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) @@ -21,15 +15,9 @@ #define DCACHE_ALIASING_POSSIBLE #endif -#if defined(CONFIG_HUGETLB_PAGE_SIZE_4MB) #define HPAGE_SHIFT 22 -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K) -#define HPAGE_SHIFT 19 -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_64K) -#define HPAGE_SHIFT 16 -#endif -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) #define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1UL)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) @@ -38,6 +26,11 @@ #ifndef __ASSEMBLY__ +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) +struct mm_struct; +extern void hugetlb_setup(struct mm_struct *mm); +#endif + #define WANT_PAGE_VIRTUAL extern void _clear_page(void *page); @@ -98,7 +91,7 @@ typedef unsigned long pgprot_t; #endif /* (STRICT_MM_TYPECHECKS) */ -typedef struct page *pgtable_t; +typedef pte_t *pgtable_t; #define TASK_UNMAPPED_BASE (test_thread_flag(TIF_32BIT) ? \ (_AC(0x0000000070000000,UL)) : \ diff --git a/arch/sparc/include/asm/pcr.h b/arch/sparc/include/asm/pcr.h index 288d7beba05..942bb17f60c 100644 --- a/arch/sparc/include/asm/pcr.h +++ b/arch/sparc/include/asm/pcr.h @@ -2,8 +2,13 @@ #define __PCR_H struct pcr_ops { - u64 (*read)(void); - void (*write)(u64); + u64 (*read_pcr)(unsigned long); + void (*write_pcr)(unsigned long, u64); + u64 (*read_pic)(unsigned long); + void (*write_pic)(unsigned long, u64); + u64 (*nmi_picl_value)(unsigned int nmi_hz); + u64 pcr_nmi_enable; + u64 pcr_nmi_disable; }; extern const struct pcr_ops *pcr_ops; @@ -27,21 +32,18 @@ extern void schedule_deferred_pcr_work(void); #define PCR_N2_SL1_SHIFT 27 #define PCR_N2_OV1 0x80000000 -extern unsigned int picl_shift; - -/* In order to commonize as much of the implementation as - * possible, we use PICH as our counter. Mostly this is - * to accommodate Niagara-1 which can only count insn cycles - * in PICH. - */ -static inline u64 picl_value(unsigned int nmi_hz) -{ - u32 delta = local_cpu_data().clock_tick / (nmi_hz << picl_shift); - - return ((u64)((0 - delta) & 0xffffffff)) << 32; -} - -extern u64 pcr_enable; +#define PCR_N4_OV 0x00000001 /* PIC overflow */ +#define PCR_N4_TOE 0x00000002 /* Trap On Event */ +#define PCR_N4_UTRACE 0x00000004 /* Trace user events */ +#define PCR_N4_STRACE 0x00000008 /* Trace supervisor events */ +#define PCR_N4_HTRACE 0x00000010 /* Trace hypervisor events */ +#define PCR_N4_MASK 0x000007e0 /* Event mask */ +#define PCR_N4_MASK_SHIFT 5 +#define PCR_N4_SL 0x0000f800 /* Event Select */ +#define PCR_N4_SL_SHIFT 11 +#define PCR_N4_PICNPT 0x00010000 /* PIC non-privileged trap */ +#define PCR_N4_PICNHT 0x00020000 /* PIC non-hypervisor trap */ +#define PCR_N4_NTC 0x00040000 /* Next-To-Commit wrap */ extern int pcr_arch_init(void); diff --git a/arch/sparc/include/asm/perfctr.h b/arch/sparc/include/asm/perfctr.h index 3332d2cba6c..214feefa577 100644 --- a/arch/sparc/include/asm/perfctr.h +++ b/arch/sparc/include/asm/perfctr.h @@ -54,11 +54,6 @@ enum perfctr_opcode { PERFCTR_GETPCR }; -/* I don't want the kernel's namespace to be polluted with this - * stuff when this file is included. --DaveM - */ -#ifndef __KERNEL__ - #define PRIV 0x00000001 #define SYS 0x00000002 #define USR 0x00000004 @@ -168,29 +163,4 @@ struct vcounter_struct { unsigned long long vcnt1; }; -#else /* !(__KERNEL__) */ - -#ifndef CONFIG_SPARC32 - -/* Performance counter register access. */ -#define read_pcr(__p) __asm__ __volatile__("rd %%pcr, %0" : "=r" (__p)) -#define write_pcr(__p) __asm__ __volatile__("wr %0, 0x0, %%pcr" : : "r" (__p)) -#define read_pic(__p) __asm__ __volatile__("rd %%pic, %0" : "=r" (__p)) - -/* Blackbird errata workaround. See commentary in - * arch/sparc64/kernel/smp.c:smp_percpu_timer_interrupt() - * for more information. - */ -#define write_pic(__p) \ - __asm__ __volatile__("ba,pt %%xcc, 99f\n\t" \ - " nop\n\t" \ - ".align 64\n" \ - "99:wr %0, 0x0, %%pic\n\t" \ - "rd %%pic, %%g0" : : "r" (__p)) -#define reset_pic() write_pic(0) - -#endif /* !CONFIG_SPARC32 */ - -#endif /* !(__KERNEL__) */ - #endif /* !(PERF_COUNTER_API) */ diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h index 40b2d7a7023..bcfe063bce2 100644 --- a/arch/sparc/include/asm/pgalloc_64.h +++ b/arch/sparc/include/asm/pgalloc_64.h @@ -38,51 +38,20 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) kmem_cache_free(pgtable_cache, pmd); } -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) -{ - return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO); -} - -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long address) -{ - struct page *page; - pte_t *pte; - - pte = pte_alloc_one_kernel(mm, address); - if (!pte) - return NULL; - page = virt_to_page(pte); - pgtable_page_ctor(page); - return page; -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long)pte); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) -{ - pgtable_page_dtor(ptepage); - __free_page(ptepage); -} +extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, + unsigned long address); +extern pgtable_t pte_alloc_one(struct mm_struct *mm, + unsigned long address); +extern void pte_free_kernel(struct mm_struct *mm, pte_t *pte); +extern void pte_free(struct mm_struct *mm, pgtable_t ptepage); -#define pmd_populate_kernel(MM, PMD, PTE) pmd_set(PMD, PTE) -#define pmd_populate(MM,PMD,PTE_PAGE) \ - pmd_populate_kernel(MM,PMD,page_address(PTE_PAGE)) -#define pmd_pgtable(pmd) pmd_page(pmd) +#define pmd_populate_kernel(MM, PMD, PTE) pmd_set(MM, PMD, PTE) +#define pmd_populate(MM, PMD, PTE) pmd_set(MM, PMD, PTE) +#define pmd_pgtable(PMD) ((pte_t *)__pmd_page(PMD)) #define check_pgt_cache() do { } while (0) -static inline void pgtable_free(void *table, bool is_page) -{ - if (is_page) - free_page((unsigned long)table); - else - kmem_cache_free(pgtable_cache, table); -} +extern void pgtable_free(void *table, bool is_page); #ifdef CONFIG_SMP @@ -113,11 +82,10 @@ static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, bool is } #endif /* !CONFIG_SMP */ -static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage, +static inline void __pte_free_tlb(struct mmu_gather *tlb, pte_t *pte, unsigned long address) { - pgtable_page_dtor(ptepage); - pgtable_free_tlb(tlb, page_address(ptepage), true); + pgtable_free_tlb(tlb, pte, true); } #define __pmd_free_tlb(tlb, pmd, addr) \ diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 61210db139f..95515f1e7ce 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -45,40 +45,59 @@ #define vmemmap ((struct page *)VMEMMAP_BASE) -/* XXX All of this needs to be rethought so we can take advantage - * XXX cheetah's full 64-bit virtual address space, ie. no more hole - * XXX in the middle like on spitfire. -DaveM - */ -/* - * Given a virtual address, the lowest PAGE_SHIFT bits determine offset - * into the page; the next higher PAGE_SHIFT-3 bits determine the pte# - * in the proper pagetable (the -3 is from the 8 byte ptes, and each page - * table is a single page long). The next higher PMD_BITS determine pmd# - * in the proper pmdtable (where we must have PMD_BITS <= (PAGE_SHIFT-2) - * since the pmd entries are 4 bytes, and each pmd page is a single page - * long). Finally, the higher few bits determine pgde#. - */ - /* PMD_SHIFT determines the size of the area a second-level page * table can map */ -#define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-3)) +#define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-4)) #define PMD_SIZE (_AC(1,UL) << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) #define PMD_BITS (PAGE_SHIFT - 2) /* PGDIR_SHIFT determines what a third-level page table entry can map */ -#define PGDIR_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-3) + PMD_BITS) +#define PGDIR_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-4) + PMD_BITS) #define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) #define PGDIR_BITS (PAGE_SHIFT - 2) +#if (PGDIR_SHIFT + PGDIR_BITS) != 44 +#error Page table parameters do not cover virtual address space properly. +#endif + +#if (PMD_SHIFT != HPAGE_SHIFT) +#error PMD_SHIFT must equal HPAGE_SHIFT for transparent huge pages. +#endif + +/* PMDs point to PTE tables which are 4K aligned. */ +#define PMD_PADDR _AC(0xfffffffe,UL) +#define PMD_PADDR_SHIFT _AC(11,UL) + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define PMD_ISHUGE _AC(0x00000001,UL) + +/* This is the PMD layout when PMD_ISHUGE is set. With 4MB huge + * pages, this frees up a bunch of bits in the layout that we can + * use for the protection settings and software metadata. + */ +#define PMD_HUGE_PADDR _AC(0xfffff800,UL) +#define PMD_HUGE_PROTBITS _AC(0x000007ff,UL) +#define PMD_HUGE_PRESENT _AC(0x00000400,UL) +#define PMD_HUGE_WRITE _AC(0x00000200,UL) +#define PMD_HUGE_DIRTY _AC(0x00000100,UL) +#define PMD_HUGE_ACCESSED _AC(0x00000080,UL) +#define PMD_HUGE_EXEC _AC(0x00000040,UL) +#define PMD_HUGE_SPLITTING _AC(0x00000020,UL) +#endif + +/* PGDs point to PMD tables which are 8K aligned. */ +#define PGD_PADDR _AC(0xfffffffc,UL) +#define PGD_PADDR_SHIFT _AC(11,UL) + #ifndef __ASSEMBLY__ #include <linux/sched.h> /* Entries per page directory level. */ -#define PTRS_PER_PTE (1UL << (PAGE_SHIFT-3)) +#define PTRS_PER_PTE (1UL << (PAGE_SHIFT-4)) #define PTRS_PER_PMD (1UL << PMD_BITS) #define PTRS_PER_PGD (1UL << PGDIR_BITS) @@ -160,26 +179,11 @@ #define _PAGE_SZ8K_4V _AC(0x0000000000000000,UL) /* 8K Page */ #define _PAGE_SZALL_4V _AC(0x0000000000000007,UL) /* All pgsz bits */ -#if PAGE_SHIFT == 13 #define _PAGE_SZBITS_4U _PAGE_SZ8K_4U #define _PAGE_SZBITS_4V _PAGE_SZ8K_4V -#elif PAGE_SHIFT == 16 -#define _PAGE_SZBITS_4U _PAGE_SZ64K_4U -#define _PAGE_SZBITS_4V _PAGE_SZ64K_4V -#else -#error Wrong PAGE_SHIFT specified -#endif -#if defined(CONFIG_HUGETLB_PAGE_SIZE_4MB) #define _PAGE_SZHUGE_4U _PAGE_SZ4MB_4U #define _PAGE_SZHUGE_4V _PAGE_SZ4MB_4V -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K) -#define _PAGE_SZHUGE_4U _PAGE_SZ512K_4U -#define _PAGE_SZHUGE_4V _PAGE_SZ512K_4V -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_64K) -#define _PAGE_SZHUGE_4U _PAGE_SZ64K_4U -#define _PAGE_SZHUGE_4V _PAGE_SZ64K_4V -#endif /* These are actually filled in at boot time by sun4{u,v}_pgprot_init() */ #define __P000 __pgprot(0) @@ -218,7 +222,6 @@ extern unsigned long _PAGE_CACHE; extern unsigned long pg_iobits; extern unsigned long _PAGE_ALL_SZ_BITS; -extern unsigned long _PAGE_SZBITS; extern struct page *mem_map_zero; #define ZERO_PAGE(vaddr) (mem_map_zero) @@ -231,25 +234,25 @@ extern struct page *mem_map_zero; static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) { unsigned long paddr = pfn << PAGE_SHIFT; - unsigned long sz_bits; - - sz_bits = 0UL; - if (_PAGE_SZBITS_4U != 0UL || _PAGE_SZBITS_4V != 0UL) { - __asm__ __volatile__( - "\n661: sethi %%uhi(%1), %0\n" - " sllx %0, 32, %0\n" - " .section .sun4v_2insn_patch, \"ax\"\n" - " .word 661b\n" - " mov %2, %0\n" - " nop\n" - " .previous\n" - : "=r" (sz_bits) - : "i" (_PAGE_SZBITS_4U), "i" (_PAGE_SZBITS_4V)); - } - return __pte(paddr | sz_bits | pgprot_val(prot)); + + BUILD_BUG_ON(_PAGE_SZBITS_4U != 0UL || _PAGE_SZBITS_4V != 0UL); + return __pte(paddr | pgprot_val(prot)); } #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot); +#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) + +extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); + +static inline pmd_t pmd_mkhuge(pmd_t pmd) +{ + /* Do nothing, mk_pmd() does this part. */ + return pmd; +} +#endif + /* This one can be done with two shifts. */ static inline unsigned long pte_pfn(pte_t pte) { @@ -286,6 +289,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t prot) * Note: We encode this into 3 sun4v 2-insn patch sequences. */ + BUILD_BUG_ON(_PAGE_SZBITS_4U != 0UL || _PAGE_SZBITS_4V != 0UL); __asm__ __volatile__( "\n661: sethi %%uhi(%2), %1\n" " sethi %%hi(%2), %0\n" @@ -307,10 +311,10 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t prot) : "=r" (mask), "=r" (tmp) : "i" (_PAGE_PADDR_4U | _PAGE_MODIFIED_4U | _PAGE_ACCESSED_4U | _PAGE_CP_4U | _PAGE_CV_4U | _PAGE_E_4U | _PAGE_PRESENT_4U | - _PAGE_SZBITS_4U | _PAGE_SPECIAL), + _PAGE_SPECIAL), "i" (_PAGE_PADDR_4V | _PAGE_MODIFIED_4V | _PAGE_ACCESSED_4V | _PAGE_CP_4V | _PAGE_CV_4V | _PAGE_E_4V | _PAGE_PRESENT_4V | - _PAGE_SZBITS_4V | _PAGE_SPECIAL)); + _PAGE_SPECIAL)); return __pte((pte_val(pte) & mask) | (pgprot_val(prot) & ~mask)); } @@ -618,19 +622,130 @@ static inline unsigned long pte_special(pte_t pte) return pte_val(pte) & _PAGE_SPECIAL; } -#define pmd_set(pmdp, ptep) \ - (pmd_val(*(pmdp)) = (__pa((unsigned long) (ptep)) >> 11UL)) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_young(pmd_t pmd) +{ + return pmd_val(pmd) & PMD_HUGE_ACCESSED; +} + +static inline int pmd_write(pmd_t pmd) +{ + return pmd_val(pmd) & PMD_HUGE_WRITE; +} + +static inline unsigned long pmd_pfn(pmd_t pmd) +{ + unsigned long val = pmd_val(pmd) & PMD_HUGE_PADDR; + + return val >> (PAGE_SHIFT - PMD_PADDR_SHIFT); +} + +static inline int pmd_large(pmd_t pmd) +{ + return (pmd_val(pmd) & (PMD_ISHUGE | PMD_HUGE_PRESENT)) == + (PMD_ISHUGE | PMD_HUGE_PRESENT); +} + +static inline int pmd_trans_splitting(pmd_t pmd) +{ + return (pmd_val(pmd) & (PMD_ISHUGE|PMD_HUGE_SPLITTING)) == + (PMD_ISHUGE|PMD_HUGE_SPLITTING); +} + +static inline int pmd_trans_huge(pmd_t pmd) +{ + return pmd_val(pmd) & PMD_ISHUGE; +} + +#define has_transparent_hugepage() 1 + +static inline pmd_t pmd_mkold(pmd_t pmd) +{ + pmd_val(pmd) &= ~PMD_HUGE_ACCESSED; + return pmd; +} + +static inline pmd_t pmd_wrprotect(pmd_t pmd) +{ + pmd_val(pmd) &= ~PMD_HUGE_WRITE; + return pmd; +} + +static inline pmd_t pmd_mkdirty(pmd_t pmd) +{ + pmd_val(pmd) |= PMD_HUGE_DIRTY; + return pmd; +} + +static inline pmd_t pmd_mkyoung(pmd_t pmd) +{ + pmd_val(pmd) |= PMD_HUGE_ACCESSED; + return pmd; +} + +static inline pmd_t pmd_mkwrite(pmd_t pmd) +{ + pmd_val(pmd) |= PMD_HUGE_WRITE; + return pmd; +} + +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + pmd_val(pmd) &= ~PMD_HUGE_PRESENT; + return pmd; +} + +static inline pmd_t pmd_mksplitting(pmd_t pmd) +{ + pmd_val(pmd) |= PMD_HUGE_SPLITTING; + return pmd; +} + +extern pgprot_t pmd_pgprot(pmd_t entry); +#endif + +static inline int pmd_present(pmd_t pmd) +{ + return pmd_val(pmd) != 0U; +} + +#define pmd_none(pmd) (!pmd_val(pmd)) + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd); +#else +static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ + *pmdp = pmd; +} +#endif + +static inline void pmd_set(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep) +{ + unsigned long val = __pa((unsigned long) (ptep)) >> PMD_PADDR_SHIFT; + + pmd_val(*pmdp) = val; +} + #define pud_set(pudp, pmdp) \ - (pud_val(*(pudp)) = (__pa((unsigned long) (pmdp)) >> 11UL)) -#define __pmd_page(pmd) \ - ((unsigned long) __va((((unsigned long)pmd_val(pmd))<<11UL))) + (pud_val(*(pudp)) = (__pa((unsigned long) (pmdp)) >> PGD_PADDR_SHIFT)) +static inline unsigned long __pmd_page(pmd_t pmd) +{ + unsigned long paddr = (unsigned long) pmd_val(pmd); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_val(pmd) & PMD_ISHUGE) + paddr &= PMD_HUGE_PADDR; +#endif + paddr <<= PMD_PADDR_SHIFT; + return ((unsigned long) __va(paddr)); +} #define pmd_page(pmd) virt_to_page((void *)__pmd_page(pmd)) #define pud_page_vaddr(pud) \ - ((unsigned long) __va((((unsigned long)pud_val(pud))<<11UL))) + ((unsigned long) __va((((unsigned long)pud_val(pud))<<PGD_PADDR_SHIFT))) #define pud_page(pud) virt_to_page((void *)pud_page_vaddr(pud)) -#define pmd_none(pmd) (!pmd_val(pmd)) #define pmd_bad(pmd) (0) -#define pmd_present(pmd) (pmd_val(pmd) != 0U) #define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0U) #define pud_none(pud) (!pud_val(pud)) #define pud_bad(pud) (0) @@ -664,6 +779,16 @@ static inline unsigned long pte_special(pte_t pte) extern void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, pte_t *ptep, pte_t orig, int fullmm); +#define __HAVE_ARCH_PMDP_GET_AND_CLEAR +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, + unsigned long addr, + pmd_t *pmdp) +{ + pmd_t pmd = *pmdp; + set_pmd_at(mm, addr, pmdp, __pmd(0U)); + return pmd; +} + static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, int fullmm) { @@ -719,6 +844,16 @@ extern void mmu_info(struct seq_file *); struct vm_area_struct; extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t *); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd); + +#define __HAVE_ARCH_PGTABLE_DEPOSIT +extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable); + +#define __HAVE_ARCH_PGTABLE_WITHDRAW +extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm); +#endif /* Encode and de-code a swap entry */ #define __swp_type(entry) (((entry).val >> PAGE_SHIFT) & 0xffUL) diff --git a/arch/sparc/include/asm/pstate.h b/arch/sparc/include/asm/pstate.h index a26a53777bb..4b6b998afd9 100644 --- a/arch/sparc/include/asm/pstate.h +++ b/arch/sparc/include/asm/pstate.h @@ -88,4 +88,18 @@ #define VERS_MAXTL _AC(0x000000000000ff00,UL) /* Max Trap Level. */ #define VERS_MAXWIN _AC(0x000000000000001f,UL) /* Max RegWindow Idx.*/ +/* Compatability Feature Register (%asr26), SPARC-T4 and later */ +#define CFR_AES _AC(0x0000000000000001,UL) /* Supports AES opcodes */ +#define CFR_DES _AC(0x0000000000000002,UL) /* Supports DES opcodes */ +#define CFR_KASUMI _AC(0x0000000000000004,UL) /* Supports KASUMI opcodes */ +#define CFR_CAMELLIA _AC(0x0000000000000008,UL) /* Supports CAMELLIA opcodes*/ +#define CFR_MD5 _AC(0x0000000000000010,UL) /* Supports MD5 opcodes */ +#define CFR_SHA1 _AC(0x0000000000000020,UL) /* Supports SHA1 opcodes */ +#define CFR_SHA256 _AC(0x0000000000000040,UL) /* Supports SHA256 opcodes */ +#define CFR_SHA512 _AC(0x0000000000000080,UL) /* Supports SHA512 opcodes */ +#define CFR_MPMUL _AC(0x0000000000000100,UL) /* Supports MPMUL opcodes */ +#define CFR_MONTMUL _AC(0x0000000000000200,UL) /* Supports MONTMUL opcodes */ +#define CFR_MONTSQR _AC(0x0000000000000400,UL) /* Supports MONTSQR opcodes */ +#define CFR_CRC32C _AC(0x0000000000000800,UL) /* Supports CRC32C opcodes */ + #endif /* !(_SPARC64_PSTATE_H) */ diff --git a/arch/sparc/include/asm/siginfo.h b/arch/sparc/include/asm/siginfo.h index 215900fce21..dbc182c438b 100644 --- a/arch/sparc/include/asm/siginfo.h +++ b/arch/sparc/include/asm/siginfo.h @@ -3,7 +3,6 @@ #if defined(__sparc__) && defined(__arch64__) -#define SI_PAD_SIZE32 ((SI_MAX_SIZE/sizeof(int)) - 3) #define __ARCH_SI_PREAMBLE_SIZE (4 * sizeof(int)) #define __ARCH_SI_BAND_T int diff --git a/arch/sparc/include/asm/tsb.h b/arch/sparc/include/asm/tsb.h index 1a8afd1ad04..b4c258de444 100644 --- a/arch/sparc/include/asm/tsb.h +++ b/arch/sparc/include/asm/tsb.h @@ -147,20 +147,96 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end; brz,pn REG1, FAIL_LABEL; \ sllx VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \ srlx REG2, 64 - PAGE_SHIFT, REG2; \ - sllx REG1, 11, REG1; \ + sllx REG1, PGD_PADDR_SHIFT, REG1; \ andn REG2, 0x3, REG2; \ lduwa [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \ brz,pn REG1, FAIL_LABEL; \ sllx VADDR, 64 - PMD_SHIFT, REG2; \ - srlx REG2, 64 - PAGE_SHIFT, REG2; \ - sllx REG1, 11, REG1; \ + srlx REG2, 64 - (PAGE_SHIFT - 1), REG2; \ + sllx REG1, PMD_PADDR_SHIFT, REG1; \ andn REG2, 0x7, REG2; \ add REG1, REG2, REG1; - /* Do a user page table walk in MMU globals. Leaves physical PTE - * pointer in REG1. Jumps to FAIL_LABEL on early page table walk - * termination. Physical base of page tables is in PHYS_PGD which - * will not be modified. + /* This macro exists only to make the PMD translator below easier + * to read. It hides the ELF section switch for the sun4v code + * patching. + */ +#define OR_PTE_BIT(REG, NAME) \ +661: or REG, _PAGE_##NAME##_4U, REG; \ + .section .sun4v_1insn_patch, "ax"; \ + .word 661b; \ + or REG, _PAGE_##NAME##_4V, REG; \ + .previous; + + /* Load into REG the PTE value for VALID, CACHE, and SZHUGE. */ +#define BUILD_PTE_VALID_SZHUGE_CACHE(REG) \ +661: sethi %uhi(_PAGE_VALID|_PAGE_SZHUGE_4U), REG; \ + .section .sun4v_1insn_patch, "ax"; \ + .word 661b; \ + sethi %uhi(_PAGE_VALID), REG; \ + .previous; \ + sllx REG, 32, REG; \ +661: or REG, _PAGE_CP_4U|_PAGE_CV_4U, REG; \ + .section .sun4v_1insn_patch, "ax"; \ + .word 661b; \ + or REG, _PAGE_CP_4V|_PAGE_CV_4V|_PAGE_SZHUGE_4V, REG; \ + .previous; + + /* PMD has been loaded into REG1, interpret the value, seeing + * if it is a HUGE PMD or a normal one. If it is not valid + * then jump to FAIL_LABEL. If it is a HUGE PMD, and it + * translates to a valid PTE, branch to PTE_LABEL. + * + * We translate the PMD by hand, one bit at a time, + * constructing the huge PTE. + * + * So we construct the PTE in REG2 as follows: + * + * 1) Extract the PMD PFN from REG1 and place it into REG2. + * + * 2) Translate PMD protection bits in REG1 into REG2, one bit + * at a time using andcc tests on REG1 and OR's into REG2. + * + * Only two bits to be concerned with here, EXEC and WRITE. + * Now REG1 is freed up and we can use it as a temporary. + * + * 3) Construct the VALID, CACHE, and page size PTE bits in + * REG1, OR with REG2 to form final PTE. + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define USER_PGTABLE_CHECK_PMD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, PTE_LABEL) \ + brz,pn REG1, FAIL_LABEL; \ + andcc REG1, PMD_ISHUGE, %g0; \ + be,pt %xcc, 700f; \ + and REG1, PMD_HUGE_PRESENT|PMD_HUGE_ACCESSED, REG2; \ + cmp REG2, PMD_HUGE_PRESENT|PMD_HUGE_ACCESSED; \ + bne,pn %xcc, FAIL_LABEL; \ + andn REG1, PMD_HUGE_PROTBITS, REG2; \ + sllx REG2, PMD_PADDR_SHIFT, REG2; \ + /* REG2 now holds PFN << PAGE_SHIFT */ \ + andcc REG1, PMD_HUGE_EXEC, %g0; \ + bne,a,pt %xcc, 1f; \ + OR_PTE_BIT(REG2, EXEC); \ +1: andcc REG1, PMD_HUGE_WRITE, %g0; \ + bne,a,pt %xcc, 1f; \ + OR_PTE_BIT(REG2, W); \ + /* REG1 can now be clobbered, build final PTE */ \ +1: BUILD_PTE_VALID_SZHUGE_CACHE(REG1); \ + ba,pt %xcc, PTE_LABEL; \ + or REG1, REG2, REG1; \ +700: +#else +#define USER_PGTABLE_CHECK_PMD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, PTE_LABEL) \ + brz,pn REG1, FAIL_LABEL; \ + nop; +#endif + + /* Do a user page table walk in MMU globals. Leaves final, + * valid, PTE value in REG1. Jumps to FAIL_LABEL on early + * page table walk termination or if the PTE is not valid. + * + * Physical base of page tables is in PHYS_PGD which will not + * be modified. * * VADDR will not be clobbered, but REG1 and REG2 will. */ @@ -172,15 +248,19 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end; brz,pn REG1, FAIL_LABEL; \ sllx VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \ srlx REG2, 64 - PAGE_SHIFT, REG2; \ - sllx REG1, 11, REG1; \ + sllx REG1, PGD_PADDR_SHIFT, REG1; \ andn REG2, 0x3, REG2; \ lduwa [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \ - brz,pn REG1, FAIL_LABEL; \ - sllx VADDR, 64 - PMD_SHIFT, REG2; \ - srlx REG2, 64 - PAGE_SHIFT, REG2; \ - sllx REG1, 11, REG1; \ + USER_PGTABLE_CHECK_PMD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, 800f) \ + sllx VADDR, 64 - PMD_SHIFT, REG2; \ + srlx REG2, 64 - (PAGE_SHIFT - 1), REG2; \ + sllx REG1, PMD_PADDR_SHIFT, REG1; \ andn REG2, 0x7, REG2; \ - add REG1, REG2, REG1; + add REG1, REG2, REG1; \ + ldxa [REG1] ASI_PHYS_USE_EC, REG1; \ + brgez,pn REG1, FAIL_LABEL; \ + nop; \ +800: /* Lookup a OBP mapping on VADDR in the prom_trans[] table at TL>0. * If no entry is found, FAIL_LABEL will be branched to. On success diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h index fb269346480..d9a677c5192 100644 --- a/arch/sparc/include/asm/unistd.h +++ b/arch/sparc/include/asm/unistd.h @@ -447,6 +447,7 @@ #else #define __ARCH_WANT_COMPAT_SYS_TIME #define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND +#define __ARCH_WANT_COMPAT_SYS_SENDFILE #endif /* diff --git a/arch/sparc/include/uapi/asm/Kbuild b/arch/sparc/include/uapi/asm/Kbuild new file mode 100644 index 00000000000..7518ad28696 --- /dev/null +++ b/arch/sparc/include/uapi/asm/Kbuild @@ -0,0 +1,5 @@ +# UAPI Header export list +# User exported sparc header files + +include include/uapi/asm-generic/Kbuild.asm + diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S index b42ddbf9651..2feb15c35d9 100644 --- a/arch/sparc/kernel/head_64.S +++ b/arch/sparc/kernel/head_64.S @@ -559,10 +559,10 @@ niagara_tlb_fixup: be,pt %xcc, niagara2_patch nop cmp %g1, SUN4V_CHIP_NIAGARA4 - be,pt %xcc, niagara2_patch + be,pt %xcc, niagara4_patch nop cmp %g1, SUN4V_CHIP_NIAGARA5 - be,pt %xcc, niagara2_patch + be,pt %xcc, niagara4_patch nop call generic_patch_copyops @@ -573,6 +573,16 @@ niagara_tlb_fixup: nop ba,a,pt %xcc, 80f +niagara4_patch: + call niagara4_patch_copyops + nop + call niagara4_patch_bzero + nop + call niagara4_patch_pageops + nop + + ba,a,pt %xcc, 80f + niagara2_patch: call niagara2_patch_copyops nop diff --git a/arch/sparc/kernel/hvapi.c b/arch/sparc/kernel/hvapi.c index 8593672838f..c0a2de0fd62 100644 --- a/arch/sparc/kernel/hvapi.c +++ b/arch/sparc/kernel/hvapi.c @@ -45,6 +45,7 @@ static struct api_info api_table[] = { { .group = HV_GRP_NIU, }, { .group = HV_GRP_VF_CPU, }, { .group = HV_GRP_KT_CPU, }, + { .group = HV_GRP_VT_CPU, }, { .group = HV_GRP_DIAG, .flags = FLAG_PRE_API }, }; @@ -193,7 +194,7 @@ void __init sun4v_hvapi_init(void) bad: prom_printf("HVAPI: Cannot register API group " - "%lx with major(%u) minor(%u)\n", + "%lx with major(%lu) minor(%lu)\n", group, major, minor); prom_halt(); } diff --git a/arch/sparc/kernel/hvcalls.S b/arch/sparc/kernel/hvcalls.S index 58d60de4d65..f3ab509b76a 100644 --- a/arch/sparc/kernel/hvcalls.S +++ b/arch/sparc/kernel/hvcalls.S @@ -805,3 +805,19 @@ ENTRY(sun4v_reboot_data_set) retl nop ENDPROC(sun4v_reboot_data_set) + +ENTRY(sun4v_vt_get_perfreg) + mov %o1, %o4 + mov HV_FAST_VT_GET_PERFREG, %o5 + ta HV_FAST_TRAP + stx %o1, [%o4] + retl + nop +ENDPROC(sun4v_vt_get_perfreg) + +ENTRY(sun4v_vt_set_perfreg) + mov HV_FAST_VT_SET_PERFREG, %o5 + ta HV_FAST_TRAP + retl + nop +ENDPROC(sun4v_vt_set_perfreg) diff --git a/arch/sparc/kernel/ktlb.S b/arch/sparc/kernel/ktlb.S index 79f31036484..0746e5e32b3 100644 --- a/arch/sparc/kernel/ktlb.S +++ b/arch/sparc/kernel/ktlb.S @@ -188,31 +188,26 @@ valid_addr_bitmap_patch: be,pn %xcc, kvmap_dtlb_longpath 2: sethi %hi(kpte_linear_bitmap), %g2 - or %g2, %lo(kpte_linear_bitmap), %g2 /* Get the 256MB physical address index. */ sllx %g4, 21, %g5 - mov 1, %g7 + or %g2, %lo(kpte_linear_bitmap), %g2 srlx %g5, 21 + 28, %g5 + and %g5, (32 - 1), %g7 - /* Don't try this at home kids... this depends upon srlx - * only taking the low 6 bits of the shift count in %g5. - */ - sllx %g7, %g5, %g7 - - /* Divide by 64 to get the offset into the bitmask. */ - srlx %g5, 6, %g5 + /* Divide by 32 to get the offset into the bitmask. */ + srlx %g5, 5, %g5 + add %g7, %g7, %g7 sllx %g5, 3, %g5 - /* kern_linear_pte_xor[((mask & bit) ? 1 : 0)] */ + /* kern_linear_pte_xor[(mask >> shift) & 3)] */ ldx [%g2 + %g5], %g2 - andcc %g2, %g7, %g0 + srlx %g2, %g7, %g7 sethi %hi(kern_linear_pte_xor), %g5 + and %g7, 3, %g7 or %g5, %lo(kern_linear_pte_xor), %g5 - bne,a,pt %xcc, 1f - add %g5, 8, %g5 - -1: ldx [%g5], %g2 + sllx %g7, 3, %g7 + ldx [%g5 + %g7], %g2 .globl kvmap_linear_patch kvmap_linear_patch: diff --git a/arch/sparc/kernel/leon_pci.c b/arch/sparc/kernel/leon_pci.c index 21dcda75a52..fc052116156 100644 --- a/arch/sparc/kernel/leon_pci.c +++ b/arch/sparc/kernel/leon_pci.c @@ -102,15 +102,6 @@ int pcibios_enable_device(struct pci_dev *dev, int mask) return pci_enable_resources(dev, mask); } -void __devinit pcibios_update_irq(struct pci_dev *dev, int irq) -{ -#ifdef CONFIG_PCI_DEBUG - printk(KERN_DEBUG "LEONPCI: Assigning IRQ %02d to %s\n", irq, - pci_name(dev)); -#endif - pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); -} - /* in/out routines taken from pcic.c * * This probably belongs here rather than ioport.c because diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c index 6dc79628058..831c001604e 100644 --- a/arch/sparc/kernel/mdesc.c +++ b/arch/sparc/kernel/mdesc.c @@ -817,6 +817,30 @@ void __cpuinit mdesc_populate_present_mask(cpumask_t *mask) mdesc_iterate_over_cpus(record_one_cpu, NULL, mask); } +static void * __init check_one_pgsz(struct mdesc_handle *hp, u64 mp, int cpuid, void *arg) +{ + const u64 *pgsz_prop = mdesc_get_property(hp, mp, "mmu-page-size-list", NULL); + unsigned long *pgsz_mask = arg; + u64 val; + + val = (HV_PGSZ_MASK_8K | HV_PGSZ_MASK_64K | + HV_PGSZ_MASK_512K | HV_PGSZ_MASK_4MB); + if (pgsz_prop) + val = *pgsz_prop; + + if (!*pgsz_mask) + *pgsz_mask = val; + else + *pgsz_mask &= val; + return NULL; +} + +void __init mdesc_get_page_sizes(cpumask_t *mask, unsigned long *pgsz_mask) +{ + *pgsz_mask = 0; + mdesc_iterate_over_cpus(check_one_pgsz, pgsz_mask, mask); +} + static void * __cpuinit fill_in_one_cpu(struct mdesc_handle *hp, u64 mp, int cpuid, void *arg) { const u64 *cfreq = mdesc_get_property(hp, mp, "clock-frequency", NULL); diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c index 15e0a169397..f1ddc0d2367 100644 --- a/arch/sparc/kernel/module.c +++ b/arch/sparc/kernel/module.c @@ -48,9 +48,7 @@ void *module_alloc(unsigned long size) return NULL; ret = module_map(size); - if (!ret) - ret = ERR_PTR(-ENOMEM); - else + if (ret) memset(ret, 0, size); return ret; @@ -116,6 +114,10 @@ int apply_relocate_add(Elf_Shdr *sechdrs, v = sym->st_value + rel[i].r_addend; switch (ELF_R_TYPE(rel[i].r_info) & 0xff) { + case R_SPARC_DISP32: + v -= (Elf_Addr) location; + *loc32 = v; + break; #ifdef CONFIG_SPARC64 case R_SPARC_64: location[0] = v >> 56; @@ -128,11 +130,6 @@ int apply_relocate_add(Elf_Shdr *sechdrs, location[7] = v >> 0; break; - case R_SPARC_DISP32: - v -= (Elf_Addr) location; - *loc32 = v; - break; - case R_SPARC_WDISP19: v -= (Elf_Addr) location; *loc32 = (*loc32 & ~0x7ffff) | diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c index eb1c1f010a4..6479256fd5a 100644 --- a/arch/sparc/kernel/nmi.c +++ b/arch/sparc/kernel/nmi.c @@ -22,7 +22,6 @@ #include <asm/perf_event.h> #include <asm/ptrace.h> #include <asm/pcr.h> -#include <asm/perfctr.h> #include "kstack.h" @@ -109,7 +108,7 @@ notrace __kprobes void perfctr_irq(int irq, struct pt_regs *regs) pt_regs_trap_type(regs), SIGINT) == NOTIFY_STOP) touched = 1; else - pcr_ops->write(PCR_PIC_PRIV); + pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable); sum = local_cpu_data().irq0_irqs; if (__get_cpu_var(nmi_touch)) { @@ -126,8 +125,8 @@ notrace __kprobes void perfctr_irq(int irq, struct pt_regs *regs) __this_cpu_write(alert_counter, 0); } if (__get_cpu_var(wd_enabled)) { - write_pic(picl_value(nmi_hz)); - pcr_ops->write(pcr_enable); + pcr_ops->write_pic(0, pcr_ops->nmi_picl_value(nmi_hz)); + pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_enable); } restore_hardirq_stack(orig_sp); @@ -166,7 +165,7 @@ static void report_broken_nmi(int cpu, int *prev_nmi_count) void stop_nmi_watchdog(void *unused) { - pcr_ops->write(PCR_PIC_PRIV); + pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable); __get_cpu_var(wd_enabled) = 0; atomic_dec(&nmi_active); } @@ -223,10 +222,10 @@ void start_nmi_watchdog(void *unused) __get_cpu_var(wd_enabled) = 1; atomic_inc(&nmi_active); - pcr_ops->write(PCR_PIC_PRIV); - write_pic(picl_value(nmi_hz)); + pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable); + pcr_ops->write_pic(0, pcr_ops->nmi_picl_value(nmi_hz)); - pcr_ops->write(pcr_enable); + pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_enable); } static void nmi_adjust_hz_one(void *unused) @@ -234,10 +233,10 @@ static void nmi_adjust_hz_one(void *unused) if (!__get_cpu_var(wd_enabled)) return; - pcr_ops->write(PCR_PIC_PRIV); - write_pic(picl_value(nmi_hz)); + pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable); + pcr_ops->write_pic(0, pcr_ops->nmi_picl_value(nmi_hz)); - pcr_ops->write(pcr_enable); + pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_enable); } void nmi_adjust_hz(unsigned int new_hz) diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c index 065b88c4f86..75b31bcdead 100644 --- a/arch/sparc/kernel/pci.c +++ b/arch/sparc/kernel/pci.c @@ -622,10 +622,6 @@ void __devinit pcibios_fixup_bus(struct pci_bus *pbus) { } -void pcibios_update_irq(struct pci_dev *pdev, int irq) -{ -} - resource_size_t pcibios_align_resource(void *data, const struct resource *res, resource_size_t size, resource_size_t align) { @@ -783,7 +779,7 @@ static int __pci_mmap_make_offset(struct pci_dev *pdev, static void __pci_mmap_set_flags(struct pci_dev *dev, struct vm_area_struct *vma, enum pci_mmap_state mmap_state) { - vma->vm_flags |= (VM_IO | VM_RESERVED); + vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; } /* Set vm_page_prot of VMA, as appropriate for this architecture, for a pci diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c index 7661e84a05a..051b69caeff 100644 --- a/arch/sparc/kernel/pci_sun4v.c +++ b/arch/sparc/kernel/pci_sun4v.c @@ -594,7 +594,7 @@ static int __devinit pci_sun4v_iommu_init(struct pci_pbm_info *pbm) printk(KERN_ERR PFX "Strange virtual-dma[%08x:%08x].\n", vdma[0], vdma[1]); return -EINVAL; - }; + } dma_mask = (roundup_pow_of_two(vdma[1]) - 1UL); num_tsb_entries = vdma[1] / IO_PAGE_SIZE; diff --git a/arch/sparc/kernel/pcr.c b/arch/sparc/kernel/pcr.c index 0ce0dd2332a..269af58497a 100644 --- a/arch/sparc/kernel/pcr.c +++ b/arch/sparc/kernel/pcr.c @@ -13,23 +13,14 @@ #include <asm/pil.h> #include <asm/pcr.h> #include <asm/nmi.h> +#include <asm/asi.h> #include <asm/spitfire.h> -#include <asm/perfctr.h> /* This code is shared between various users of the performance * counters. Users will be oprofile, pseudo-NMI watchdog, and the * perf_event support layer. */ -#define PCR_SUN4U_ENABLE (PCR_PIC_PRIV | PCR_STRACE | PCR_UTRACE) -#define PCR_N2_ENABLE (PCR_PIC_PRIV | PCR_STRACE | PCR_UTRACE | \ - PCR_N2_TOE_OV1 | \ - (2 << PCR_N2_SL1_SHIFT) | \ - (0xff << PCR_N2_MASK1_SHIFT)) - -u64 pcr_enable; -unsigned int picl_shift; - /* Performance counter interrupts run unmasked at PIL level 15. * Therefore we can't do things like wakeups and other work * that expects IRQ disabling to be adhered to in locking etc. @@ -60,39 +51,144 @@ void arch_irq_work_raise(void) const struct pcr_ops *pcr_ops; EXPORT_SYMBOL_GPL(pcr_ops); -static u64 direct_pcr_read(void) +static u64 direct_pcr_read(unsigned long reg_num) { u64 val; - read_pcr(val); + WARN_ON_ONCE(reg_num != 0); + __asm__ __volatile__("rd %%pcr, %0" : "=r" (val)); return val; } -static void direct_pcr_write(u64 val) +static void direct_pcr_write(unsigned long reg_num, u64 val) +{ + WARN_ON_ONCE(reg_num != 0); + __asm__ __volatile__("wr %0, 0x0, %%pcr" : : "r" (val)); +} + +static u64 direct_pic_read(unsigned long reg_num) { - write_pcr(val); + u64 val; + + WARN_ON_ONCE(reg_num != 0); + __asm__ __volatile__("rd %%pic, %0" : "=r" (val)); + return val; +} + +static void direct_pic_write(unsigned long reg_num, u64 val) +{ + WARN_ON_ONCE(reg_num != 0); + + /* Blackbird errata workaround. See commentary in + * arch/sparc64/kernel/smp.c:smp_percpu_timer_interrupt() + * for more information. + */ + __asm__ __volatile__("ba,pt %%xcc, 99f\n\t" + " nop\n\t" + ".align 64\n" + "99:wr %0, 0x0, %%pic\n\t" + "rd %%pic, %%g0" : : "r" (val)); +} + +static u64 direct_picl_value(unsigned int nmi_hz) +{ + u32 delta = local_cpu_data().clock_tick / nmi_hz; + + return ((u64)((0 - delta) & 0xffffffff)) << 32; } static const struct pcr_ops direct_pcr_ops = { - .read = direct_pcr_read, - .write = direct_pcr_write, + .read_pcr = direct_pcr_read, + .write_pcr = direct_pcr_write, + .read_pic = direct_pic_read, + .write_pic = direct_pic_write, + .nmi_picl_value = direct_picl_value, + .pcr_nmi_enable = (PCR_PIC_PRIV | PCR_STRACE | PCR_UTRACE), + .pcr_nmi_disable = PCR_PIC_PRIV, }; -static void n2_pcr_write(u64 val) +static void n2_pcr_write(unsigned long reg_num, u64 val) { unsigned long ret; + WARN_ON_ONCE(reg_num != 0); if (val & PCR_N2_HTRACE) { ret = sun4v_niagara2_setperf(HV_N2_PERF_SPARC_CTL, val); if (ret != HV_EOK) - write_pcr(val); + direct_pcr_write(reg_num, val); } else - write_pcr(val); + direct_pcr_write(reg_num, val); +} + +static u64 n2_picl_value(unsigned int nmi_hz) +{ + u32 delta = local_cpu_data().clock_tick / (nmi_hz << 2); + + return ((u64)((0 - delta) & 0xffffffff)) << 32; } static const struct pcr_ops n2_pcr_ops = { - .read = direct_pcr_read, - .write = n2_pcr_write, + .read_pcr = direct_pcr_read, + .write_pcr = n2_pcr_write, + .read_pic = direct_pic_read, + .write_pic = direct_pic_write, + .nmi_picl_value = n2_picl_value, + .pcr_nmi_enable = (PCR_PIC_PRIV | PCR_STRACE | PCR_UTRACE | + PCR_N2_TOE_OV1 | + (2 << PCR_N2_SL1_SHIFT) | + (0xff << PCR_N2_MASK1_SHIFT)), + .pcr_nmi_disable = PCR_PIC_PRIV, +}; + +static u64 n4_pcr_read(unsigned long reg_num) +{ + unsigned long val; + + (void) sun4v_vt_get_perfreg(reg_num, &val); + + return val; +} + +static void n4_pcr_write(unsigned long reg_num, u64 val) +{ + (void) sun4v_vt_set_perfreg(reg_num, val); +} + +static u64 n4_pic_read(unsigned long reg_num) +{ + unsigned long val; + + __asm__ __volatile__("ldxa [%1] %2, %0" + : "=r" (val) + : "r" (reg_num * 0x8UL), "i" (ASI_PIC)); + + return val; +} + +static void n4_pic_write(unsigned long reg_num, u64 val) +{ + __asm__ __volatile__("stxa %0, [%1] %2" + : /* no outputs */ + : "r" (val), "r" (reg_num * 0x8UL), "i" (ASI_PIC)); +} + +static u64 n4_picl_value(unsigned int nmi_hz) +{ + u32 delta = local_cpu_data().clock_tick / (nmi_hz << 2); + + return ((u64)((0 - delta) & 0xffffffff)); +} + +static const struct pcr_ops n4_pcr_ops = { + .read_pcr = n4_pcr_read, + .write_pcr = n4_pcr_write, + .read_pic = n4_pic_read, + .write_pic = n4_pic_write, + .nmi_picl_value = n4_picl_value, + .pcr_nmi_enable = (PCR_N4_PICNPT | PCR_N4_STRACE | + PCR_N4_UTRACE | PCR_N4_TOE | + (26 << PCR_N4_SL_SHIFT)), + .pcr_nmi_disable = PCR_N4_PICNPT, }; static unsigned long perf_hsvc_group; @@ -115,6 +211,10 @@ static int __init register_perf_hsvc(void) perf_hsvc_group = HV_GRP_KT_CPU; break; + case SUN4V_CHIP_NIAGARA4: + perf_hsvc_group = HV_GRP_VT_CPU; + break; + default: return -ENODEV; } @@ -139,6 +239,29 @@ static void __init unregister_perf_hsvc(void) sun4v_hvapi_unregister(perf_hsvc_group); } +static int __init setup_sun4v_pcr_ops(void) +{ + int ret = 0; + + switch (sun4v_chip_type) { + case SUN4V_CHIP_NIAGARA1: + case SUN4V_CHIP_NIAGARA2: + case SUN4V_CHIP_NIAGARA3: + pcr_ops = &n2_pcr_ops; + break; + + case SUN4V_CHIP_NIAGARA4: + pcr_ops = &n4_pcr_ops; + break; + + default: + ret = -ENODEV; + break; + } + + return ret; +} + int __init pcr_arch_init(void) { int err = register_perf_hsvc(); @@ -148,15 +271,14 @@ int __init pcr_arch_init(void) switch (tlb_type) { case hypervisor: - pcr_ops = &n2_pcr_ops; - pcr_enable = PCR_N2_ENABLE; - picl_shift = 2; + err = setup_sun4v_pcr_ops(); + if (err) + goto out_unregister; break; case cheetah: case cheetah_plus: pcr_ops = &direct_pcr_ops; - pcr_enable = PCR_SUN4U_ENABLE; break; case spitfire: diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 5713957dcb8..e48651dace1 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -25,36 +25,48 @@ #include <linux/atomic.h> #include <asm/nmi.h> #include <asm/pcr.h> -#include <asm/perfctr.h> #include <asm/cacheflush.h> #include "kernel.h" #include "kstack.h" -/* Sparc64 chips have two performance counters, 32-bits each, with - * overflow interrupts generated on transition from 0xffffffff to 0. - * The counters are accessed in one go using a 64-bit register. +/* Two classes of sparc64 chips currently exist. All of which have + * 32-bit counters which can generate overflow interrupts on the + * transition from 0xffffffff to 0. * - * Both counters are controlled using a single control register. The - * only way to stop all sampling is to clear all of the context (user, - * supervisor, hypervisor) sampling enable bits. But these bits apply - * to both counters, thus the two counters can't be enabled/disabled - * individually. + * All chips upto and including SPARC-T3 have two performance + * counters. The two 32-bit counters are accessed in one go using a + * single 64-bit register. * - * The control register has two event fields, one for each of the two - * counters. It's thus nearly impossible to have one counter going - * while keeping the other one stopped. Therefore it is possible to - * get overflow interrupts for counters not currently "in use" and - * that condition must be checked in the overflow interrupt handler. + * On these older chips both counters are controlled using a single + * control register. The only way to stop all sampling is to clear + * all of the context (user, supervisor, hypervisor) sampling enable + * bits. But these bits apply to both counters, thus the two counters + * can't be enabled/disabled individually. + * + * Furthermore, the control register on these older chips have two + * event fields, one for each of the two counters. It's thus nearly + * impossible to have one counter going while keeping the other one + * stopped. Therefore it is possible to get overflow interrupts for + * counters not currently "in use" and that condition must be checked + * in the overflow interrupt handler. * * So we use a hack, in that we program inactive counters with the * "sw_count0" and "sw_count1" events. These count how many times * the instruction "sethi %hi(0xfc000), %g0" is executed. It's an * unusual way to encode a NOP and therefore will not trigger in * normal code. + * + * Starting with SPARC-T4 we have one control register per counter. + * And the counters are stored in individual registers. The registers + * for the counters are 64-bit but only a 32-bit counter is + * implemented. The event selections on SPARC-T4 lack any + * restrictions, therefore we can elide all of the complicated + * conflict resolution code we have for SPARC-T3 and earlier chips. */ -#define MAX_HWEVENTS 2 +#define MAX_HWEVENTS 4 +#define MAX_PCRS 4 #define MAX_PERIOD ((1UL << 32) - 1) #define PIC_UPPER_INDEX 0 @@ -90,8 +102,8 @@ struct cpu_hw_events { */ int current_idx[MAX_HWEVENTS]; - /* Software copy of %pcr register on this cpu. */ - u64 pcr; + /* Software copy of %pcr register(s) on this cpu. */ + u64 pcr[MAX_HWEVENTS]; /* Enabled/disable state. */ int enabled; @@ -103,6 +115,8 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; /* An event map describes the characteristics of a performance * counter event. In particular it gives the encoding as well as * a mask telling which counters the event can be measured on. + * + * The mask is unused on SPARC-T4 and later. */ struct perf_event_map { u16 encoding; @@ -142,15 +156,53 @@ struct sparc_pmu { const struct perf_event_map *(*event_map)(int); const cache_map_t *cache_map; int max_events; + u32 (*read_pmc)(int); + void (*write_pmc)(int, u64); int upper_shift; int lower_shift; int event_mask; + int user_bit; + int priv_bit; int hv_bit; int irq_bit; int upper_nop; int lower_nop; + unsigned int flags; +#define SPARC_PMU_ALL_EXCLUDES_SAME 0x00000001 +#define SPARC_PMU_HAS_CONFLICTS 0x00000002 + int max_hw_events; + int num_pcrs; + int num_pic_regs; }; +static u32 sparc_default_read_pmc(int idx) +{ + u64 val; + + val = pcr_ops->read_pic(0); + if (idx == PIC_UPPER_INDEX) + val >>= 32; + + return val & 0xffffffff; +} + +static void sparc_default_write_pmc(int idx, u64 val) +{ + u64 shift, mask, pic; + + shift = 0; + if (idx == PIC_UPPER_INDEX) + shift = 32; + + mask = ((u64) 0xffffffff) << shift; + val <<= shift; + + pic = pcr_ops->read_pic(0); + pic &= ~mask; + pic |= val; + pcr_ops->write_pic(0, pic); +} + static const struct perf_event_map ultra3_perfmon_event_map[] = { [PERF_COUNT_HW_CPU_CYCLES] = { 0x0000, PIC_UPPER | PIC_LOWER }, [PERF_COUNT_HW_INSTRUCTIONS] = { 0x0001, PIC_UPPER | PIC_LOWER }, @@ -268,11 +320,20 @@ static const struct sparc_pmu ultra3_pmu = { .event_map = ultra3_event_map, .cache_map = &ultra3_cache_map, .max_events = ARRAY_SIZE(ultra3_perfmon_event_map), + .read_pmc = sparc_default_read_pmc, + .write_pmc = sparc_default_write_pmc, .upper_shift = 11, .lower_shift = 4, .event_mask = 0x3f, + .user_bit = PCR_UTRACE, + .priv_bit = PCR_STRACE, .upper_nop = 0x1c, .lower_nop = 0x14, + .flags = (SPARC_PMU_ALL_EXCLUDES_SAME | + SPARC_PMU_HAS_CONFLICTS), + .max_hw_events = 2, + .num_pcrs = 1, + .num_pic_regs = 1, }; /* Niagara1 is very limited. The upper PIC is hard-locked to count @@ -397,11 +458,20 @@ static const struct sparc_pmu niagara1_pmu = { .event_map = niagara1_event_map, .cache_map = &niagara1_cache_map, .max_events = ARRAY_SIZE(niagara1_perfmon_event_map), + .read_pmc = sparc_default_read_pmc, + .write_pmc = sparc_default_write_pmc, .upper_shift = 0, .lower_shift = 4, .event_mask = 0x7, + .user_bit = PCR_UTRACE, + .priv_bit = PCR_STRACE, .upper_nop = 0x0, .lower_nop = 0x0, + .flags = (SPARC_PMU_ALL_EXCLUDES_SAME | + SPARC_PMU_HAS_CONFLICTS), + .max_hw_events = 2, + .num_pcrs = 1, + .num_pic_regs = 1, }; static const struct perf_event_map niagara2_perfmon_event_map[] = { @@ -523,13 +593,203 @@ static const struct sparc_pmu niagara2_pmu = { .event_map = niagara2_event_map, .cache_map = &niagara2_cache_map, .max_events = ARRAY_SIZE(niagara2_perfmon_event_map), + .read_pmc = sparc_default_read_pmc, + .write_pmc = sparc_default_write_pmc, .upper_shift = 19, .lower_shift = 6, .event_mask = 0xfff, - .hv_bit = 0x8, + .user_bit = PCR_UTRACE, + .priv_bit = PCR_STRACE, + .hv_bit = PCR_N2_HTRACE, .irq_bit = 0x30, .upper_nop = 0x220, .lower_nop = 0x220, + .flags = (SPARC_PMU_ALL_EXCLUDES_SAME | + SPARC_PMU_HAS_CONFLICTS), + .max_hw_events = 2, + .num_pcrs = 1, + .num_pic_regs = 1, +}; + +static const struct perf_event_map niagara4_perfmon_event_map[] = { + [PERF_COUNT_HW_CPU_CYCLES] = { (26 << 6) }, + [PERF_COUNT_HW_INSTRUCTIONS] = { (3 << 6) | 0x3f }, + [PERF_COUNT_HW_CACHE_REFERENCES] = { (3 << 6) | 0x04 }, + [PERF_COUNT_HW_CACHE_MISSES] = { (16 << 6) | 0x07 }, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = { (4 << 6) | 0x01 }, + [PERF_COUNT_HW_BRANCH_MISSES] = { (25 << 6) | 0x0f }, +}; + +static const struct perf_event_map *niagara4_event_map(int event_id) +{ + return &niagara4_perfmon_event_map[event_id]; +} + +static const cache_map_t niagara4_cache_map = { +[C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = { (3 << 6) | 0x04 }, + [C(RESULT_MISS)] = { (16 << 6) | 0x07 }, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = { (3 << 6) | 0x08 }, + [C(RESULT_MISS)] = { (16 << 6) | 0x07 }, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED }, + [C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED }, + }, +}, +[C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = { (3 << 6) | 0x3f }, + [C(RESULT_MISS)] = { (11 << 6) | 0x03 }, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = { CACHE_OP_NONSENSE }, + [ C(RESULT_MISS) ] = { CACHE_OP_NONSENSE }, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED }, + [ C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, + }, +}, +[C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = { (3 << 6) | 0x04 }, + [C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED }, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = { (3 << 6) | 0x08 }, + [C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED }, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED }, + [C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED }, + }, +}, +[C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED }, + [C(RESULT_MISS)] = { (17 << 6) | 0x3f }, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED }, + [ C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED }, + [ C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, + }, +}, +[C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED }, + [C(RESULT_MISS)] = { (6 << 6) | 0x3f }, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED }, + [ C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED }, + [ C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, + }, +}, +[C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED }, + [C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED }, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED }, + [ C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED }, + [ C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, + }, +}, +[C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED }, + [C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED }, + [ C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED }, + [ C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, + }, +}, +}; + +static u32 sparc_vt_read_pmc(int idx) +{ + u64 val = pcr_ops->read_pic(idx); + + return val & 0xffffffff; +} + +static void sparc_vt_write_pmc(int idx, u64 val) +{ + u64 pcr; + + /* There seems to be an internal latch on the overflow event + * on SPARC-T4 that prevents it from triggering unless you + * update the PIC exactly as we do here. The requirement + * seems to be that you have to turn off event counting in the + * PCR around the PIC update. + * + * For example, after the following sequence: + * + * 1) set PIC to -1 + * 2) enable event counting and overflow reporting in PCR + * 3) overflow triggers, softint 15 handler invoked + * 4) clear OV bit in PCR + * 5) write PIC to -1 + * + * a subsequent overflow event will not trigger. This + * sequence works on SPARC-T3 and previous chips. + */ + pcr = pcr_ops->read_pcr(idx); + pcr_ops->write_pcr(idx, PCR_N4_PICNPT); + + pcr_ops->write_pic(idx, val & 0xffffffff); + + pcr_ops->write_pcr(idx, pcr); +} + +static const struct sparc_pmu niagara4_pmu = { + .event_map = niagara4_event_map, + .cache_map = &niagara4_cache_map, + .max_events = ARRAY_SIZE(niagara4_perfmon_event_map), + .read_pmc = sparc_vt_read_pmc, + .write_pmc = sparc_vt_write_pmc, + .upper_shift = 5, + .lower_shift = 5, + .event_mask = 0x7ff, + .user_bit = PCR_N4_UTRACE, + .priv_bit = PCR_N4_STRACE, + + /* We explicitly don't support hypervisor tracing. The T4 + * generates the overflow event for precise events via a trap + * which will not be generated (ie. it's completely lost) if + * we happen to be in the hypervisor when the event triggers. + * Essentially, the overflow event reporting is completely + * unusable when you have hypervisor mode tracing enabled. + */ + .hv_bit = 0, + + .irq_bit = PCR_N4_TOE, + .upper_nop = 0, + .lower_nop = 0, + .flags = 0, + .max_hw_events = 4, + .num_pcrs = 4, + .num_pic_regs = 4, }; static const struct sparc_pmu *sparc_pmu __read_mostly; @@ -558,55 +818,35 @@ static u64 nop_for_index(int idx) static inline void sparc_pmu_enable_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc, int idx) { u64 val, mask = mask_for_index(idx); + int pcr_index = 0; - val = cpuc->pcr; + if (sparc_pmu->num_pcrs > 1) + pcr_index = idx; + + val = cpuc->pcr[pcr_index]; val &= ~mask; val |= hwc->config; - cpuc->pcr = val; + cpuc->pcr[pcr_index] = val; - pcr_ops->write(cpuc->pcr); + pcr_ops->write_pcr(pcr_index, cpuc->pcr[pcr_index]); } static inline void sparc_pmu_disable_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc, int idx) { u64 mask = mask_for_index(idx); u64 nop = nop_for_index(idx); + int pcr_index = 0; u64 val; - val = cpuc->pcr; + if (sparc_pmu->num_pcrs > 1) + pcr_index = idx; + + val = cpuc->pcr[pcr_index]; val &= ~mask; val |= nop; - cpuc->pcr = val; + cpuc->pcr[pcr_index] = val; - pcr_ops->write(cpuc->pcr); -} - -static u32 read_pmc(int idx) -{ - u64 val; - - read_pic(val); - if (idx == PIC_UPPER_INDEX) - val >>= 32; - - return val & 0xffffffff; -} - -static void write_pmc(int idx, u64 val) -{ - u64 shift, mask, pic; - - shift = 0; - if (idx == PIC_UPPER_INDEX) - shift = 32; - - mask = ((u64) 0xffffffff) << shift; - val <<= shift; - - read_pic(pic); - pic &= ~mask; - pic |= val; - write_pic(pic); + pcr_ops->write_pcr(pcr_index, cpuc->pcr[pcr_index]); } static u64 sparc_perf_event_update(struct perf_event *event, @@ -618,7 +858,7 @@ static u64 sparc_perf_event_update(struct perf_event *event, again: prev_raw_count = local64_read(&hwc->prev_count); - new_raw_count = read_pmc(idx); + new_raw_count = sparc_pmu->read_pmc(idx); if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) @@ -658,25 +898,17 @@ static int sparc_perf_event_set_period(struct perf_event *event, local64_set(&hwc->prev_count, (u64)-left); - write_pmc(idx, (u64)(-left) & 0xffffffff); + sparc_pmu->write_pmc(idx, (u64)(-left) & 0xffffffff); perf_event_update_userpage(event); return ret; } -/* If performance event entries have been added, move existing - * events around (if necessary) and then assign new entries to - * counters. - */ -static u64 maybe_change_configuration(struct cpu_hw_events *cpuc, u64 pcr) +static void read_in_all_counters(struct cpu_hw_events *cpuc) { int i; - if (!cpuc->n_added) - goto out; - - /* Read in the counters which are moving. */ for (i = 0; i < cpuc->n_events; i++) { struct perf_event *cp = cpuc->event[i]; @@ -687,6 +919,20 @@ static u64 maybe_change_configuration(struct cpu_hw_events *cpuc, u64 pcr) cpuc->current_idx[i] = PIC_NO_INDEX; } } +} + +/* On this PMU all PICs are programmed using a single PCR. Calculate + * the combined control register value. + * + * For such chips we require that all of the events have the same + * configuration, so just fetch the settings from the first entry. + */ +static void calculate_single_pcr(struct cpu_hw_events *cpuc) +{ + int i; + + if (!cpuc->n_added) + goto out; /* Assign to counters all unassigned events. */ for (i = 0; i < cpuc->n_events; i++) { @@ -702,20 +948,71 @@ static u64 maybe_change_configuration(struct cpu_hw_events *cpuc, u64 pcr) cpuc->current_idx[i] = idx; enc = perf_event_get_enc(cpuc->events[i]); - pcr &= ~mask_for_index(idx); + cpuc->pcr[0] &= ~mask_for_index(idx); if (hwc->state & PERF_HES_STOPPED) - pcr |= nop_for_index(idx); + cpuc->pcr[0] |= nop_for_index(idx); else - pcr |= event_encoding(enc, idx); + cpuc->pcr[0] |= event_encoding(enc, idx); } out: - return pcr; + cpuc->pcr[0] |= cpuc->event[0]->hw.config_base; +} + +/* On this PMU each PIC has it's own PCR control register. */ +static void calculate_multiple_pcrs(struct cpu_hw_events *cpuc) +{ + int i; + + if (!cpuc->n_added) + goto out; + + for (i = 0; i < cpuc->n_events; i++) { + struct perf_event *cp = cpuc->event[i]; + struct hw_perf_event *hwc = &cp->hw; + int idx = hwc->idx; + u64 enc; + + if (cpuc->current_idx[i] != PIC_NO_INDEX) + continue; + + sparc_perf_event_set_period(cp, hwc, idx); + cpuc->current_idx[i] = idx; + + enc = perf_event_get_enc(cpuc->events[i]); + cpuc->pcr[idx] &= ~mask_for_index(idx); + if (hwc->state & PERF_HES_STOPPED) + cpuc->pcr[idx] |= nop_for_index(idx); + else + cpuc->pcr[idx] |= event_encoding(enc, idx); + } +out: + for (i = 0; i < cpuc->n_events; i++) { + struct perf_event *cp = cpuc->event[i]; + int idx = cp->hw.idx; + + cpuc->pcr[idx] |= cp->hw.config_base; + } +} + +/* If performance event entries have been added, move existing events + * around (if necessary) and then assign new entries to counters. + */ +static void update_pcrs_for_enable(struct cpu_hw_events *cpuc) +{ + if (cpuc->n_added) + read_in_all_counters(cpuc); + + if (sparc_pmu->num_pcrs == 1) { + calculate_single_pcr(cpuc); + } else { + calculate_multiple_pcrs(cpuc); + } } static void sparc_pmu_enable(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - u64 pcr; + int i; if (cpuc->enabled) return; @@ -723,26 +1020,17 @@ static void sparc_pmu_enable(struct pmu *pmu) cpuc->enabled = 1; barrier(); - pcr = cpuc->pcr; - if (!cpuc->n_events) { - pcr = 0; - } else { - pcr = maybe_change_configuration(cpuc, pcr); - - /* We require that all of the events have the same - * configuration, so just fetch the settings from the - * first entry. - */ - cpuc->pcr = pcr | cpuc->event[0]->hw.config_base; - } + if (cpuc->n_events) + update_pcrs_for_enable(cpuc); - pcr_ops->write(cpuc->pcr); + for (i = 0; i < sparc_pmu->num_pcrs; i++) + pcr_ops->write_pcr(i, cpuc->pcr[i]); } static void sparc_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - u64 val; + int i; if (!cpuc->enabled) return; @@ -750,12 +1038,14 @@ static void sparc_pmu_disable(struct pmu *pmu) cpuc->enabled = 0; cpuc->n_added = 0; - val = cpuc->pcr; - val &= ~(PCR_UTRACE | PCR_STRACE | - sparc_pmu->hv_bit | sparc_pmu->irq_bit); - cpuc->pcr = val; + for (i = 0; i < sparc_pmu->num_pcrs; i++) { + u64 val = cpuc->pcr[i]; - pcr_ops->write(cpuc->pcr); + val &= ~(sparc_pmu->user_bit | sparc_pmu->priv_bit | + sparc_pmu->hv_bit | sparc_pmu->irq_bit); + cpuc->pcr[i] = val; + pcr_ops->write_pcr(i, cpuc->pcr[i]); + } } static int active_event_index(struct cpu_hw_events *cpuc, @@ -854,9 +1144,11 @@ static DEFINE_MUTEX(pmc_grab_mutex); static void perf_stop_nmi_watchdog(void *unused) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int i; stop_nmi_watchdog(NULL); - cpuc->pcr = pcr_ops->read(); + for (i = 0; i < sparc_pmu->num_pcrs; i++) + cpuc->pcr[i] = pcr_ops->read_pcr(i); } void perf_event_grab_pmc(void) @@ -942,9 +1234,17 @@ static int sparc_check_constraints(struct perf_event **evts, if (!n_ev) return 0; - if (n_ev > MAX_HWEVENTS) + if (n_ev > sparc_pmu->max_hw_events) return -1; + if (!(sparc_pmu->flags & SPARC_PMU_HAS_CONFLICTS)) { + int i; + + for (i = 0; i < n_ev; i++) + evts[i]->hw.idx = i; + return 0; + } + msk0 = perf_event_get_msk(events[0]); if (n_ev == 1) { if (msk0 & PIC_LOWER) @@ -1000,6 +1300,9 @@ static int check_excludes(struct perf_event **evts, int n_prev, int n_new) struct perf_event *event; int i, n, first; + if (!(sparc_pmu->flags & SPARC_PMU_ALL_EXCLUDES_SAME)) + return 0; + n = n_prev + n_new; if (n <= 1) return 0; @@ -1059,7 +1362,7 @@ static int sparc_pmu_add(struct perf_event *event, int ef_flags) perf_pmu_disable(event->pmu); n0 = cpuc->n_events; - if (n0 >= MAX_HWEVENTS) + if (n0 >= sparc_pmu->max_hw_events) goto out; cpuc->event[n0] = event; @@ -1146,16 +1449,16 @@ static int sparc_pmu_event_init(struct perf_event *event) /* We save the enable bits in the config_base. */ hwc->config_base = sparc_pmu->irq_bit; if (!attr->exclude_user) - hwc->config_base |= PCR_UTRACE; + hwc->config_base |= sparc_pmu->user_bit; if (!attr->exclude_kernel) - hwc->config_base |= PCR_STRACE; + hwc->config_base |= sparc_pmu->priv_bit; if (!attr->exclude_hv) hwc->config_base |= sparc_pmu->hv_bit; n = 0; if (event->group_leader != event) { n = collect_events(event->group_leader, - MAX_HWEVENTS - 1, + sparc_pmu->max_hw_events - 1, evts, events, current_idx_dmy); if (n < 0) return -EINVAL; @@ -1254,8 +1557,7 @@ static struct pmu pmu = { void perf_event_print_debug(void) { unsigned long flags; - u64 pcr, pic; - int cpu; + int cpu, i; if (!sparc_pmu) return; @@ -1264,12 +1566,13 @@ void perf_event_print_debug(void) cpu = smp_processor_id(); - pcr = pcr_ops->read(); - read_pic(pic); - pr_info("\n"); - pr_info("CPU#%d: PCR[%016llx] PIC[%016llx]\n", - cpu, pcr, pic); + for (i = 0; i < sparc_pmu->num_pcrs; i++) + pr_info("CPU#%d: PCR%d[%016llx]\n", + cpu, i, pcr_ops->read_pcr(i)); + for (i = 0; i < sparc_pmu->num_pic_regs; i++) + pr_info("CPU#%d: PIC%d[%016llx]\n", + cpu, i, pcr_ops->read_pic(i)); local_irq_restore(flags); } @@ -1305,8 +1608,9 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self, * Do this before we peek at the counters to determine * overflow so we don't lose any events. */ - if (sparc_pmu->irq_bit) - pcr_ops->write(cpuc->pcr); + if (sparc_pmu->irq_bit && + sparc_pmu->num_pcrs == 1) + pcr_ops->write_pcr(0, cpuc->pcr[0]); for (i = 0; i < cpuc->n_events; i++) { struct perf_event *event = cpuc->event[i]; @@ -1314,6 +1618,10 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self, struct hw_perf_event *hwc; u64 val; + if (sparc_pmu->irq_bit && + sparc_pmu->num_pcrs > 1) + pcr_ops->write_pcr(idx, cpuc->pcr[idx]); + hwc = &event->hw; val = sparc_perf_event_update(event, hwc, idx); if (val & (1ULL << 31)) @@ -1352,6 +1660,10 @@ static bool __init supported_pmu(void) sparc_pmu = &niagara2_pmu; return true; } + if (!strcmp(sparc_pmu_type, "niagara4")) { + sparc_pmu = &niagara4_pmu; + return true; + } return false; } diff --git a/arch/sparc/kernel/prom_64.c b/arch/sparc/kernel/prom_64.c index 340c5b976d2..d397d7fc5c2 100644 --- a/arch/sparc/kernel/prom_64.c +++ b/arch/sparc/kernel/prom_64.c @@ -37,7 +37,7 @@ void * __init prom_early_alloc(unsigned long size) void *ret; if (!paddr) { - prom_printf("prom_early_alloc(%lu) failed\n"); + prom_printf("prom_early_alloc(%lu) failed\n", size); prom_halt(); } diff --git a/arch/sparc/kernel/setup_64.c b/arch/sparc/kernel/setup_64.c index 1414d16712b..0800e71d8a8 100644 --- a/arch/sparc/kernel/setup_64.c +++ b/arch/sparc/kernel/setup_64.c @@ -340,7 +340,12 @@ static const char *hwcaps[] = { */ "mul32", "div32", "fsmuld", "v8plus", "popc", "vis", "vis2", "ASIBlkInit", "fmaf", "vis3", "hpc", "random", "trans", "fjfmau", - "ima", "cspare", + "ima", "cspare", "pause", "cbcond", +}; + +static const char *crypto_hwcaps[] = { + "aes", "des", "kasumi", "camellia", "md5", "sha1", "sha256", + "sha512", "mpmul", "montmul", "montsqr", "crc32c", }; void cpucap_info(struct seq_file *m) @@ -357,27 +362,61 @@ void cpucap_info(struct seq_file *m) printed++; } } + if (caps & HWCAP_SPARC_CRYPTO) { + unsigned long cfr; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + for (i = 0; i < ARRAY_SIZE(crypto_hwcaps); i++) { + unsigned long bit = 1UL << i; + if (cfr & bit) { + seq_printf(m, "%s%s", + printed ? "," : "", crypto_hwcaps[i]); + printed++; + } + } + } seq_putc(m, '\n'); } +static void __init report_one_hwcap(int *printed, const char *name) +{ + if ((*printed) == 0) + printk(KERN_INFO "CPU CAPS: ["); + printk(KERN_CONT "%s%s", + (*printed) ? "," : "", name); + if (++(*printed) == 8) { + printk(KERN_CONT "]\n"); + *printed = 0; + } +} + +static void __init report_crypto_hwcaps(int *printed) +{ + unsigned long cfr; + int i; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + + for (i = 0; i < ARRAY_SIZE(crypto_hwcaps); i++) { + unsigned long bit = 1UL << i; + if (cfr & bit) + report_one_hwcap(printed, crypto_hwcaps[i]); + } +} + static void __init report_hwcaps(unsigned long caps) { int i, printed = 0; - printk(KERN_INFO "CPU CAPS: ["); for (i = 0; i < ARRAY_SIZE(hwcaps); i++) { unsigned long bit = 1UL << i; - if (caps & bit) { - printk(KERN_CONT "%s%s", - printed ? "," : "", hwcaps[i]); - if (++printed == 8) { - printk(KERN_CONT "]\n"); - printk(KERN_INFO "CPU CAPS: ["); - printed = 0; - } - } + if (caps & bit) + report_one_hwcap(&printed, hwcaps[i]); } - printk(KERN_CONT "]\n"); + if (caps & HWCAP_SPARC_CRYPTO) + report_crypto_hwcaps(&printed); + if (printed != 0) + printk(KERN_CONT "]\n"); } static unsigned long __init mdesc_cpu_hwcap_list(void) @@ -411,6 +450,10 @@ static unsigned long __init mdesc_cpu_hwcap_list(void) break; } } + for (i = 0; i < ARRAY_SIZE(crypto_hwcaps); i++) { + if (!strcmp(prop, crypto_hwcaps[i])) + caps |= HWCAP_SPARC_CRYPTO; + } plen = strlen(prop) + 1; prop += plen; diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index a53e0a5fd3a..53e48f721ce 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -54,58 +54,6 @@ struct signal_frame32 { /* __siginfo_rwin_t * */u32 rwin_save; } __attribute__((aligned(8))); -typedef struct compat_siginfo{ - int si_signo; - int si_errno; - int si_code; - - union { - int _pad[SI_PAD_SIZE32]; - - /* kill() */ - struct { - compat_pid_t _pid; /* sender's pid */ - unsigned int _uid; /* sender's uid */ - } _kill; - - /* POSIX.1b timers */ - struct { - compat_timer_t _tid; /* timer id */ - int _overrun; /* overrun count */ - compat_sigval_t _sigval; /* same as below */ - int _sys_private; /* not to be passed to user */ - } _timer; - - /* POSIX.1b signals */ - struct { - compat_pid_t _pid; /* sender's pid */ - unsigned int _uid; /* sender's uid */ - compat_sigval_t _sigval; - } _rt; - - /* SIGCHLD */ - struct { - compat_pid_t _pid; /* which child */ - unsigned int _uid; /* sender's uid */ - int _status; /* exit code */ - compat_clock_t _utime; - compat_clock_t _stime; - } _sigchld; - - /* SIGILL, SIGFPE, SIGSEGV, SIGBUS, SIGEMT */ - struct { - u32 _addr; /* faulting insn/memory ref. */ - int _trapno; - } _sigfault; - - /* SIGPOLL */ - struct { - int _band; /* POLL_IN, POLL_OUT, POLL_MSG */ - int _fd; - } _sigpoll; - } _sifields; -}compat_siginfo_t; - struct rt_signal_frame32 { struct sparc_stackf32 ss; compat_siginfo_t info; diff --git a/arch/sparc/kernel/sun4v_tlb_miss.S b/arch/sparc/kernel/sun4v_tlb_miss.S index e1fbf8c7578..bde867fd71e 100644 --- a/arch/sparc/kernel/sun4v_tlb_miss.S +++ b/arch/sparc/kernel/sun4v_tlb_miss.S @@ -176,7 +176,7 @@ sun4v_tsb_miss_common: sub %g2, TRAP_PER_CPU_FAULT_INFO, %g2 -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) mov SCRATCHPAD_UTSBREG2, %g5 ldxa [%g5] ASI_SCRATCHPAD, %g5 cmp %g5, -1 diff --git a/arch/sparc/kernel/sys32.S b/arch/sparc/kernel/sys32.S index d97f3eb72e0..44025f4ba41 100644 --- a/arch/sparc/kernel/sys32.S +++ b/arch/sparc/kernel/sys32.S @@ -90,7 +90,7 @@ SIGN1(sys32_mkdir, sys_mkdir, %o1) SIGN3(sys32_futex, compat_sys_futex, %o1, %o2, %o5) SIGN1(sys32_sysfs, compat_sys_sysfs, %o0) SIGN2(sys32_sendfile, compat_sys_sendfile, %o0, %o1) -SIGN2(sys32_sendfile64, compat_sys_sendfile64, %o0, %o1) +SIGN2(sys32_sendfile64, sys_sendfile, %o0, %o1) SIGN1(sys32_prctl, sys_prctl, %o0) SIGN1(sys32_sched_rr_get_interval, compat_sys_sched_rr_get_interval, %o0) SIGN2(sys32_waitpid, sys_waitpid, %o0, %o2) diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c index f7392336961..d862499eb01 100644 --- a/arch/sparc/kernel/sys_sparc32.c +++ b/arch/sparc/kernel/sys_sparc32.c @@ -506,52 +506,6 @@ long compat_sys_fadvise64_64(int fd, advice); } -asmlinkage long compat_sys_sendfile(int out_fd, int in_fd, - compat_off_t __user *offset, - compat_size_t count) -{ - mm_segment_t old_fs = get_fs(); - int ret; - off_t of; - - if (offset && get_user(of, offset)) - return -EFAULT; - - set_fs(KERNEL_DS); - ret = sys_sendfile(out_fd, in_fd, - offset ? (off_t __user *) &of : NULL, - count); - set_fs(old_fs); - - if (offset && put_user(of, offset)) - return -EFAULT; - - return ret; -} - -asmlinkage long compat_sys_sendfile64(int out_fd, int in_fd, - compat_loff_t __user *offset, - compat_size_t count) -{ - mm_segment_t old_fs = get_fs(); - int ret; - loff_t lof; - - if (offset && get_user(lof, offset)) - return -EFAULT; - - set_fs(KERNEL_DS); - ret = sys_sendfile64(out_fd, in_fd, - offset ? (loff_t __user *) &lof : NULL, - count); - set_fs(old_fs); - - if (offset && put_user(lof, offset)) - return -EFAULT; - - return ret; -} - /* This is just a version for 32-bit applications which does * not force O_LARGEFILE on. */ diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 3b05e669771..fa1f1d375ff 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -850,7 +850,7 @@ void __init cheetah_ecache_flush_init(void) ecache_flush_physbase = find_ecache_flush_span(ecache_flush_size); if (ecache_flush_physbase == ~0UL) { - prom_printf("cheetah_ecache_flush_init: Cannot find %d byte " + prom_printf("cheetah_ecache_flush_init: Cannot find %ld byte " "contiguous physical memory.\n", ecache_flush_size); prom_halt(); diff --git a/arch/sparc/kernel/tsb.S b/arch/sparc/kernel/tsb.S index db15d123f05..d4bdc7a6237 100644 --- a/arch/sparc/kernel/tsb.S +++ b/arch/sparc/kernel/tsb.S @@ -49,7 +49,7 @@ tsb_miss_page_table_walk: /* Before committing to a full page table walk, * check the huge page TSB. */ -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) 661: ldx [%g7 + TRAP_PER_CPU_TSB_HUGE], %g5 nop @@ -110,12 +110,9 @@ tsb_miss_page_table_walk: tsb_miss_page_table_walk_sun4v_fastpath: USER_PGTABLE_WALK_TL1(%g4, %g7, %g5, %g2, tsb_do_fault) - /* Load and check PTE. */ - ldxa [%g5] ASI_PHYS_USE_EC, %g5 - brgez,pn %g5, tsb_do_fault - nop + /* Valid PTE is now in %g5. */ -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) 661: sethi %uhi(_PAGE_SZALL_4U), %g7 sllx %g7, 32, %g7 .section .sun4v_2insn_patch, "ax" diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile index dff4096f3de..8410065f286 100644 --- a/arch/sparc/lib/Makefile +++ b/arch/sparc/lib/Makefile @@ -32,6 +32,9 @@ lib-$(CONFIG_SPARC64) += NGpatch.o NGpage.o NGbzero.o lib-$(CONFIG_SPARC64) += NG2memcpy.o NG2copy_from_user.o NG2copy_to_user.o lib-$(CONFIG_SPARC64) += NG2patch.o +lib-$(CONFIG_SPARC64) += NG4memcpy.o NG4copy_from_user.o NG4copy_to_user.o +lib-$(CONFIG_SPARC64) += NG4patch.o NG4copy_page.o NG4clear_page.o NG4memset.o + lib-$(CONFIG_SPARC64) += GENmemcpy.o GENcopy_from_user.o GENcopy_to_user.o lib-$(CONFIG_SPARC64) += GENpatch.o GENpage.o GENbzero.o diff --git a/arch/sparc/lib/NG2memcpy.S b/arch/sparc/lib/NG2memcpy.S index 03eadf66b0d..2c20ad63ddb 100644 --- a/arch/sparc/lib/NG2memcpy.S +++ b/arch/sparc/lib/NG2memcpy.S @@ -14,7 +14,7 @@ #define FPRS_FEF 0x04 #ifdef MEMCPY_DEBUG #define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \ - clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0; + clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0; #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs #else #define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs @@ -182,13 +182,13 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ cmp %g2, 0 tne %xcc, 5 PREAMBLE - mov %o0, GLOBAL_SPARE + mov %o0, %o3 cmp %o2, 0 be,pn %XCC, 85f - or %o0, %o1, %o3 + or %o0, %o1, GLOBAL_SPARE cmp %o2, 16 blu,a,pn %XCC, 80f - or %o3, %o2, %o3 + or GLOBAL_SPARE, %o2, GLOBAL_SPARE /* 2 blocks (128 bytes) is the minimum we can do the block * copy with. We need to ensure that we'll iterate at least @@ -202,7 +202,7 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ */ cmp %o2, (4 * 64) blu,pt %XCC, 75f - andcc %o3, 0x7, %g0 + andcc GLOBAL_SPARE, 0x7, %g0 /* %o0: dst * %o1: src @@ -404,13 +404,13 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ * over. If anything is left, we copy it one byte at a time. */ brz,pt %o2, 85f - sub %o0, %o1, %o3 + sub %o0, %o1, GLOBAL_SPARE ba,a,pt %XCC, 90f .align 64 75: /* 16 < len <= 64 */ bne,pn %XCC, 75f - sub %o0, %o1, %o3 + sub %o0, %o1, GLOBAL_SPARE 72: andn %o2, 0xf, %o4 @@ -420,9 +420,9 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ add %o1, 0x08, %o1 EX_LD(LOAD(ldx, %o1, %g1)) sub %o1, 0x08, %o1 - EX_ST(STORE(stx, %o5, %o1 + %o3)) + EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE)) add %o1, 0x8, %o1 - EX_ST(STORE(stx, %g1, %o1 + %o3)) + EX_ST(STORE(stx, %g1, %o1 + GLOBAL_SPARE)) bgu,pt %XCC, 1b add %o1, 0x8, %o1 73: andcc %o2, 0x8, %g0 @@ -430,14 +430,14 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ nop sub %o2, 0x8, %o2 EX_LD(LOAD(ldx, %o1, %o5)) - EX_ST(STORE(stx, %o5, %o1 + %o3)) + EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE)) add %o1, 0x8, %o1 1: andcc %o2, 0x4, %g0 be,pt %XCC, 1f nop sub %o2, 0x4, %o2 EX_LD(LOAD(lduw, %o1, %o5)) - EX_ST(STORE(stw, %o5, %o1 + %o3)) + EX_ST(STORE(stw, %o5, %o1 + GLOBAL_SPARE)) add %o1, 0x4, %o1 1: cmp %o2, 0 be,pt %XCC, 85f @@ -454,11 +454,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ 1: subcc %g1, 1, %g1 EX_LD(LOAD(ldub, %o1, %o5)) - EX_ST(STORE(stb, %o5, %o1 + %o3)) + EX_ST(STORE(stb, %o5, %o1 + GLOBAL_SPARE)) bgu,pt %icc, 1b add %o1, 1, %o1 -2: add %o1, %o3, %o0 +2: add %o1, GLOBAL_SPARE, %o0 andcc %o1, 0x7, %g1 bne,pt %icc, 8f sll %g1, 3, %g1 @@ -468,16 +468,16 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ nop ba,a,pt %xcc, 73b -8: mov 64, %o3 +8: mov 64, GLOBAL_SPARE andn %o1, 0x7, %o1 EX_LD(LOAD(ldx, %o1, %g2)) - sub %o3, %g1, %o3 + sub GLOBAL_SPARE, %g1, GLOBAL_SPARE andn %o2, 0x7, %o4 sllx %g2, %g1, %g2 1: add %o1, 0x8, %o1 EX_LD(LOAD(ldx, %o1, %g3)) subcc %o4, 0x8, %o4 - srlx %g3, %o3, %o5 + srlx %g3, GLOBAL_SPARE, %o5 or %o5, %g2, %o5 EX_ST(STORE(stx, %o5, %o0)) add %o0, 0x8, %o0 @@ -489,32 +489,32 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ be,pn %icc, 85f add %o1, %g1, %o1 ba,pt %xcc, 90f - sub %o0, %o1, %o3 + sub %o0, %o1, GLOBAL_SPARE .align 64 80: /* 0 < len <= 16 */ - andcc %o3, 0x3, %g0 + andcc GLOBAL_SPARE, 0x3, %g0 bne,pn %XCC, 90f - sub %o0, %o1, %o3 + sub %o0, %o1, GLOBAL_SPARE 1: subcc %o2, 4, %o2 EX_LD(LOAD(lduw, %o1, %g1)) - EX_ST(STORE(stw, %g1, %o1 + %o3)) + EX_ST(STORE(stw, %g1, %o1 + GLOBAL_SPARE)) bgu,pt %XCC, 1b add %o1, 4, %o1 85: retl - mov EX_RETVAL(GLOBAL_SPARE), %o0 + mov EX_RETVAL(%o3), %o0 .align 32 90: subcc %o2, 1, %o2 EX_LD(LOAD(ldub, %o1, %g1)) - EX_ST(STORE(stb, %g1, %o1 + %o3)) + EX_ST(STORE(stb, %g1, %o1 + GLOBAL_SPARE)) bgu,pt %XCC, 90b add %o1, 1, %o1 retl - mov EX_RETVAL(GLOBAL_SPARE), %o0 + mov EX_RETVAL(%o3), %o0 .size FUNC_NAME, .-FUNC_NAME diff --git a/arch/sparc/lib/NG4clear_page.S b/arch/sparc/lib/NG4clear_page.S new file mode 100644 index 00000000000..e16c88204a4 --- /dev/null +++ b/arch/sparc/lib/NG4clear_page.S @@ -0,0 +1,29 @@ +/* NG4copy_page.S: Niagara-4 optimized clear page. + * + * Copyright (C) 2012 (davem@davemloft.net) + */ + +#include <asm/asi.h> +#include <asm/page.h> + + .text + + .register %g3, #scratch + + .align 32 + .globl NG4clear_page + .globl NG4clear_user_page +NG4clear_page: /* %o0=dest */ +NG4clear_user_page: /* %o0=dest, %o1=vaddr */ + set PAGE_SIZE, %g7 + mov 0x20, %g3 +1: stxa %g0, [%o0 + %g0] ASI_ST_BLKINIT_MRU_P + subcc %g7, 0x40, %g7 + stxa %g0, [%o0 + %g3] ASI_ST_BLKINIT_MRU_P + bne,pt %xcc, 1b + add %o0, 0x40, %o0 + membar #StoreLoad|#StoreStore + retl + nop + .size NG4clear_page,.-NG4clear_page + .size NG4clear_user_page,.-NG4clear_user_page
\ No newline at end of file diff --git a/arch/sparc/lib/NG4copy_from_user.S b/arch/sparc/lib/NG4copy_from_user.S new file mode 100644 index 00000000000..fd9f903ffa3 --- /dev/null +++ b/arch/sparc/lib/NG4copy_from_user.S @@ -0,0 +1,30 @@ +/* NG4copy_from_user.S: Niagara-4 optimized copy from userspace. + * + * Copyright (C) 2012 David S. Miller (davem@davemloft.net) + */ + +#define EX_LD(x) \ +98: x; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, __retl_one_asi;\ + .text; \ + .align 4; + +#ifndef ASI_AIUS +#define ASI_AIUS 0x11 +#endif + +#define FUNC_NAME NG4copy_from_user +#define LOAD(type,addr,dest) type##a [addr] %asi, dest +#define EX_RETVAL(x) 0 + +#ifdef __KERNEL__ +#define PREAMBLE \ + rd %asi, %g1; \ + cmp %g1, ASI_AIUS; \ + bne,pn %icc, ___copy_in_user; \ + nop +#endif + +#include "NG4memcpy.S" diff --git a/arch/sparc/lib/NG4copy_page.S b/arch/sparc/lib/NG4copy_page.S new file mode 100644 index 00000000000..28504e88c53 --- /dev/null +++ b/arch/sparc/lib/NG4copy_page.S @@ -0,0 +1,57 @@ +/* NG4copy_page.S: Niagara-4 optimized copy page. + * + * Copyright (C) 2012 (davem@davemloft.net) + */ + +#include <asm/asi.h> +#include <asm/page.h> + + .text + .align 32 + + .register %g2, #scratch + .register %g3, #scratch + + .globl NG4copy_user_page +NG4copy_user_page: /* %o0=dest, %o1=src, %o2=vaddr */ + prefetch [%o1 + 0x000], #n_reads_strong + prefetch [%o1 + 0x040], #n_reads_strong + prefetch [%o1 + 0x080], #n_reads_strong + prefetch [%o1 + 0x0c0], #n_reads_strong + set PAGE_SIZE, %g7 + prefetch [%o1 + 0x100], #n_reads_strong + prefetch [%o1 + 0x140], #n_reads_strong + prefetch [%o1 + 0x180], #n_reads_strong + prefetch [%o1 + 0x1c0], #n_reads_strong +1: + ldx [%o1 + 0x00], %o2 + subcc %g7, 0x40, %g7 + ldx [%o1 + 0x08], %o3 + ldx [%o1 + 0x10], %o4 + ldx [%o1 + 0x18], %o5 + ldx [%o1 + 0x20], %g1 + stxa %o2, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + ldx [%o1 + 0x28], %g2 + stxa %o3, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + ldx [%o1 + 0x30], %g3 + stxa %o4, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + ldx [%o1 + 0x38], %o2 + add %o1, 0x40, %o1 + stxa %o5, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + stxa %g1, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + stxa %g2, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + stxa %g3, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + stxa %o2, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + bne,pt %icc, 1b + prefetch [%o1 + 0x200], #n_reads_strong + retl + membar #StoreLoad | #StoreStore + .size NG4copy_user_page,.-NG4copy_user_page diff --git a/arch/sparc/lib/NG4copy_to_user.S b/arch/sparc/lib/NG4copy_to_user.S new file mode 100644 index 00000000000..9744c4540a8 --- /dev/null +++ b/arch/sparc/lib/NG4copy_to_user.S @@ -0,0 +1,39 @@ +/* NG4copy_to_user.S: Niagara-4 optimized copy to userspace. + * + * Copyright (C) 2012 David S. Miller (davem@davemloft.net) + */ + +#define EX_ST(x) \ +98: x; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, __retl_one_asi;\ + .text; \ + .align 4; + +#ifndef ASI_AIUS +#define ASI_AIUS 0x11 +#endif + +#ifndef ASI_BLK_INIT_QUAD_LDD_AIUS +#define ASI_BLK_INIT_QUAD_LDD_AIUS 0x23 +#endif + +#define FUNC_NAME NG4copy_to_user +#define STORE(type,src,addr) type##a src, [addr] %asi +#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_AIUS +#define EX_RETVAL(x) 0 + +#ifdef __KERNEL__ + /* Writing to %asi is _expensive_ so we hardcode it. + * Reading %asi to check for KERNEL_DS is comparatively + * cheap. + */ +#define PREAMBLE \ + rd %asi, %g1; \ + cmp %g1, ASI_AIUS; \ + bne,pn %icc, ___copy_in_user; \ + nop +#endif + +#include "NG4memcpy.S" diff --git a/arch/sparc/lib/NG4memcpy.S b/arch/sparc/lib/NG4memcpy.S new file mode 100644 index 00000000000..9cf2ee01cee --- /dev/null +++ b/arch/sparc/lib/NG4memcpy.S @@ -0,0 +1,360 @@ +/* NG4memcpy.S: Niagara-4 optimized memcpy. + * + * Copyright (C) 2012 David S. Miller (davem@davemloft.net) + */ + +#ifdef __KERNEL__ +#include <asm/visasm.h> +#include <asm/asi.h> +#define GLOBAL_SPARE %g7 +#else +#define ASI_BLK_INIT_QUAD_LDD_P 0xe2 +#define FPRS_FEF 0x04 + +/* On T4 it is very expensive to access ASRs like %fprs and + * %asi, avoiding a read or a write can save ~50 cycles. + */ +#define FPU_ENTER \ + rd %fprs, %o5; \ + andcc %o5, FPRS_FEF, %g0; \ + be,a,pn %icc, 999f; \ + wr %g0, FPRS_FEF, %fprs; \ + 999: + +#ifdef MEMCPY_DEBUG +#define VISEntryHalf FPU_ENTER; \ + clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0; +#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs +#else +#define VISEntryHalf FPU_ENTER +#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs +#endif + +#define GLOBAL_SPARE %g5 +#endif + +#ifndef STORE_ASI +#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA +#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P +#else +#define STORE_ASI 0x80 /* ASI_P */ +#endif +#endif + +#ifndef EX_LD +#define EX_LD(x) x +#endif + +#ifndef EX_ST +#define EX_ST(x) x +#endif + +#ifndef EX_RETVAL +#define EX_RETVAL(x) x +#endif + +#ifndef LOAD +#define LOAD(type,addr,dest) type [addr], dest +#endif + +#ifndef STORE +#ifndef MEMCPY_DEBUG +#define STORE(type,src,addr) type src, [addr] +#else +#define STORE(type,src,addr) type##a src, [addr] %asi +#endif +#endif + +#ifndef STORE_INIT +#define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI +#endif + +#ifndef FUNC_NAME +#define FUNC_NAME NG4memcpy +#endif +#ifndef PREAMBLE +#define PREAMBLE +#endif + +#ifndef XCC +#define XCC xcc +#endif + + .register %g2,#scratch + .register %g3,#scratch + + .text + .align 64 + + .globl FUNC_NAME + .type FUNC_NAME,#function +FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ +#ifdef MEMCPY_DEBUG + wr %g0, 0x80, %asi +#endif + srlx %o2, 31, %g2 + cmp %g2, 0 + tne %XCC, 5 + PREAMBLE + mov %o0, %o3 + brz,pn %o2, .Lexit + cmp %o2, 3 + ble,pn %icc, .Ltiny + cmp %o2, 19 + ble,pn %icc, .Lsmall + or %o0, %o1, %g2 + cmp %o2, 128 + bl,pn %icc, .Lmedium + nop + +.Llarge:/* len >= 0x80 */ + /* First get dest 8 byte aligned. */ + sub %g0, %o0, %g1 + and %g1, 0x7, %g1 + brz,pt %g1, 51f + sub %o2, %g1, %o2 + +1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2)) + add %o1, 1, %o1 + subcc %g1, 1, %g1 + add %o0, 1, %o0 + bne,pt %icc, 1b + EX_ST(STORE(stb, %g2, %o0 - 0x01)) + +51: LOAD(prefetch, %o1 + 0x040, #n_reads_strong) + LOAD(prefetch, %o1 + 0x080, #n_reads_strong) + LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong) + LOAD(prefetch, %o1 + 0x100, #n_reads_strong) + LOAD(prefetch, %o1 + 0x140, #n_reads_strong) + LOAD(prefetch, %o1 + 0x180, #n_reads_strong) + LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong) + LOAD(prefetch, %o1 + 0x200, #n_reads_strong) + + /* Check if we can use the straight fully aligned + * loop, or we require the alignaddr/faligndata variant. + */ + andcc %o1, 0x7, %o5 + bne,pn %icc, .Llarge_src_unaligned + sub %g0, %o0, %g1 + + /* Legitimize the use of initializing stores by getting dest + * to be 64-byte aligned. + */ + and %g1, 0x3f, %g1 + brz,pt %g1, .Llarge_aligned + sub %o2, %g1, %o2 + +1: EX_LD(LOAD(ldx, %o1 + 0x00, %g2)) + add %o1, 8, %o1 + subcc %g1, 8, %g1 + add %o0, 8, %o0 + bne,pt %icc, 1b + EX_ST(STORE(stx, %g2, %o0 - 0x08)) + +.Llarge_aligned: + /* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */ + andn %o2, 0x3f, %o4 + sub %o2, %o4, %o2 + +1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) + add %o1, 0x40, %o1 + EX_LD(LOAD(ldx, %o1 - 0x38, %g2)) + subcc %o4, 0x40, %o4 + EX_LD(LOAD(ldx, %o1 - 0x30, %g3)) + EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE)) + EX_LD(LOAD(ldx, %o1 - 0x20, %o5)) + EX_ST(STORE_INIT(%g1, %o0)) + add %o0, 0x08, %o0 + EX_ST(STORE_INIT(%g2, %o0)) + add %o0, 0x08, %o0 + EX_LD(LOAD(ldx, %o1 - 0x18, %g2)) + EX_ST(STORE_INIT(%g3, %o0)) + add %o0, 0x08, %o0 + EX_LD(LOAD(ldx, %o1 - 0x10, %g3)) + EX_ST(STORE_INIT(GLOBAL_SPARE, %o0)) + add %o0, 0x08, %o0 + EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE)) + EX_ST(STORE_INIT(%o5, %o0)) + add %o0, 0x08, %o0 + EX_ST(STORE_INIT(%g2, %o0)) + add %o0, 0x08, %o0 + EX_ST(STORE_INIT(%g3, %o0)) + add %o0, 0x08, %o0 + EX_ST(STORE_INIT(GLOBAL_SPARE, %o0)) + add %o0, 0x08, %o0 + bne,pt %icc, 1b + LOAD(prefetch, %o1 + 0x200, #n_reads_strong) + + membar #StoreLoad | #StoreStore + + brz,pn %o2, .Lexit + cmp %o2, 19 + ble,pn %icc, .Lsmall_unaligned + nop + ba,a,pt %icc, .Lmedium_noprefetch + +.Lexit: retl + mov EX_RETVAL(%o3), %o0 + +.Llarge_src_unaligned: + andn %o2, 0x3f, %o4 + sub %o2, %o4, %o2 + VISEntryHalf + alignaddr %o1, %g0, %g1 + add %o1, %o4, %o1 + EX_LD(LOAD(ldd, %g1 + 0x00, %f0)) +1: EX_LD(LOAD(ldd, %g1 + 0x08, %f2)) + subcc %o4, 0x40, %o4 + EX_LD(LOAD(ldd, %g1 + 0x10, %f4)) + EX_LD(LOAD(ldd, %g1 + 0x18, %f6)) + EX_LD(LOAD(ldd, %g1 + 0x20, %f8)) + EX_LD(LOAD(ldd, %g1 + 0x28, %f10)) + EX_LD(LOAD(ldd, %g1 + 0x30, %f12)) + EX_LD(LOAD(ldd, %g1 + 0x38, %f14)) + faligndata %f0, %f2, %f16 + EX_LD(LOAD(ldd, %g1 + 0x40, %f0)) + faligndata %f2, %f4, %f18 + add %g1, 0x40, %g1 + faligndata %f4, %f6, %f20 + faligndata %f6, %f8, %f22 + faligndata %f8, %f10, %f24 + faligndata %f10, %f12, %f26 + faligndata %f12, %f14, %f28 + faligndata %f14, %f0, %f30 + EX_ST(STORE(std, %f16, %o0 + 0x00)) + EX_ST(STORE(std, %f18, %o0 + 0x08)) + EX_ST(STORE(std, %f20, %o0 + 0x10)) + EX_ST(STORE(std, %f22, %o0 + 0x18)) + EX_ST(STORE(std, %f24, %o0 + 0x20)) + EX_ST(STORE(std, %f26, %o0 + 0x28)) + EX_ST(STORE(std, %f28, %o0 + 0x30)) + EX_ST(STORE(std, %f30, %o0 + 0x38)) + add %o0, 0x40, %o0 + bne,pt %icc, 1b + LOAD(prefetch, %g1 + 0x200, #n_reads_strong) + VISExitHalf + + brz,pn %o2, .Lexit + cmp %o2, 19 + ble,pn %icc, .Lsmall_unaligned + nop + ba,a,pt %icc, .Lmedium_unaligned + +.Lmedium: + LOAD(prefetch, %o1 + 0x40, #n_reads_strong) + andcc %g2, 0x7, %g0 + bne,pn %icc, .Lmedium_unaligned + nop +.Lmedium_noprefetch: + andncc %o2, 0x20 - 1, %o5 + be,pn %icc, 2f + sub %o2, %o5, %o2 +1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) + EX_LD(LOAD(ldx, %o1 + 0x08, %g2)) + EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE)) + EX_LD(LOAD(ldx, %o1 + 0x18, %o4)) + add %o1, 0x20, %o1 + subcc %o5, 0x20, %o5 + EX_ST(STORE(stx, %g1, %o0 + 0x00)) + EX_ST(STORE(stx, %g2, %o0 + 0x08)) + EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10)) + EX_ST(STORE(stx, %o4, %o0 + 0x18)) + bne,pt %icc, 1b + add %o0, 0x20, %o0 +2: andcc %o2, 0x18, %o5 + be,pt %icc, 3f + sub %o2, %o5, %o2 +1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) + add %o1, 0x08, %o1 + add %o0, 0x08, %o0 + subcc %o5, 0x08, %o5 + bne,pt %icc, 1b + EX_ST(STORE(stx, %g1, %o0 - 0x08)) +3: brz,pt %o2, .Lexit + cmp %o2, 0x04 + bl,pn %icc, .Ltiny + nop + EX_LD(LOAD(lduw, %o1 + 0x00, %g1)) + add %o1, 0x04, %o1 + add %o0, 0x04, %o0 + subcc %o2, 0x04, %o2 + bne,pn %icc, .Ltiny + EX_ST(STORE(stw, %g1, %o0 - 0x04)) + ba,a,pt %icc, .Lexit +.Lmedium_unaligned: + /* First get dest 8 byte aligned. */ + sub %g0, %o0, %g1 + and %g1, 0x7, %g1 + brz,pt %g1, 2f + sub %o2, %g1, %o2 + +1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2)) + add %o1, 1, %o1 + subcc %g1, 1, %g1 + add %o0, 1, %o0 + bne,pt %icc, 1b + EX_ST(STORE(stb, %g2, %o0 - 0x01)) +2: + and %o1, 0x7, %g1 + brz,pn %g1, .Lmedium_noprefetch + sll %g1, 3, %g1 + mov 64, %g2 + sub %g2, %g1, %g2 + andn %o1, 0x7, %o1 + EX_LD(LOAD(ldx, %o1 + 0x00, %o4)) + sllx %o4, %g1, %o4 + andn %o2, 0x08 - 1, %o5 + sub %o2, %o5, %o2 +1: EX_LD(LOAD(ldx, %o1 + 0x08, %g3)) + add %o1, 0x08, %o1 + subcc %o5, 0x08, %o5 + srlx %g3, %g2, GLOBAL_SPARE + or GLOBAL_SPARE, %o4, GLOBAL_SPARE + EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00)) + add %o0, 0x08, %o0 + bne,pt %icc, 1b + sllx %g3, %g1, %o4 + srl %g1, 3, %g1 + add %o1, %g1, %o1 + brz,pn %o2, .Lexit + nop + ba,pt %icc, .Lsmall_unaligned + +.Ltiny: + EX_LD(LOAD(ldub, %o1 + 0x00, %g1)) + subcc %o2, 1, %o2 + be,pn %icc, .Lexit + EX_ST(STORE(stb, %g1, %o0 + 0x00)) + EX_LD(LOAD(ldub, %o1 + 0x01, %g1)) + subcc %o2, 1, %o2 + be,pn %icc, .Lexit + EX_ST(STORE(stb, %g1, %o0 + 0x01)) + EX_LD(LOAD(ldub, %o1 + 0x02, %g1)) + ba,pt %icc, .Lexit + EX_ST(STORE(stb, %g1, %o0 + 0x02)) + +.Lsmall: + andcc %g2, 0x3, %g0 + bne,pn %icc, .Lsmall_unaligned + andn %o2, 0x4 - 1, %o5 + sub %o2, %o5, %o2 +1: + EX_LD(LOAD(lduw, %o1 + 0x00, %g1)) + add %o1, 0x04, %o1 + subcc %o5, 0x04, %o5 + add %o0, 0x04, %o0 + bne,pt %icc, 1b + EX_ST(STORE(stw, %g1, %o0 - 0x04)) + brz,pt %o2, .Lexit + nop + ba,a,pt %icc, .Ltiny + +.Lsmall_unaligned: +1: EX_LD(LOAD(ldub, %o1 + 0x00, %g1)) + add %o1, 1, %o1 + add %o0, 1, %o0 + subcc %o2, 1, %o2 + bne,pt %icc, 1b + EX_ST(STORE(stb, %g1, %o0 - 0x01)) + ba,a,pt %icc, .Lexit + .size FUNC_NAME, .-FUNC_NAME diff --git a/arch/sparc/lib/NG4memset.S b/arch/sparc/lib/NG4memset.S new file mode 100644 index 00000000000..41da4bdd95c --- /dev/null +++ b/arch/sparc/lib/NG4memset.S @@ -0,0 +1,105 @@ +/* NG4memset.S: Niagara-4 optimized memset/bzero. + * + * Copyright (C) 2012 David S. Miller (davem@davemloft.net) + */ + +#include <asm/asi.h> + + .register %g2, #scratch + .register %g3, #scratch + + .text + .align 32 + .globl NG4memset +NG4memset: + andcc %o1, 0xff, %o4 + be,pt %icc, 1f + mov %o2, %o1 + sllx %o4, 8, %g1 + or %g1, %o4, %o2 + sllx %o2, 16, %g1 + or %g1, %o2, %o2 + sllx %o2, 32, %g1 + ba,pt %icc, 1f + or %g1, %o2, %o4 + .size NG4memset,.-NG4memset + + .align 32 + .globl NG4bzero +NG4bzero: + clr %o4 +1: cmp %o1, 16 + ble %icc, .Ltiny + mov %o0, %o3 + sub %g0, %o0, %g1 + and %g1, 0x7, %g1 + brz,pt %g1, .Laligned8 + sub %o1, %g1, %o1 +1: stb %o4, [%o0 + 0x00] + subcc %g1, 1, %g1 + bne,pt %icc, 1b + add %o0, 1, %o0 +.Laligned8: + cmp %o1, 64 + (64 - 8) + ble .Lmedium + sub %g0, %o0, %g1 + andcc %g1, (64 - 1), %g1 + brz,pn %g1, .Laligned64 + sub %o1, %g1, %o1 +1: stx %o4, [%o0 + 0x00] + subcc %g1, 8, %g1 + bne,pt %icc, 1b + add %o0, 0x8, %o0 +.Laligned64: + andn %o1, 64 - 1, %g1 + sub %o1, %g1, %o1 + brnz,pn %o4, .Lnon_bzero_loop + mov 0x20, %g2 +1: stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P + subcc %g1, 0x40, %g1 + stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P + bne,pt %icc, 1b + add %o0, 0x40, %o0 +.Lpostloop: + cmp %o1, 8 + bl,pn %icc, .Ltiny + membar #StoreStore|#StoreLoad +.Lmedium: + andn %o1, 0x7, %g1 + sub %o1, %g1, %o1 +1: stx %o4, [%o0 + 0x00] + subcc %g1, 0x8, %g1 + bne,pt %icc, 1b + add %o0, 0x08, %o0 + andcc %o1, 0x4, %g1 + be,pt %icc, .Ltiny + sub %o1, %g1, %o1 + stw %o4, [%o0 + 0x00] + add %o0, 0x4, %o0 +.Ltiny: + cmp %o1, 0 + be,pn %icc, .Lexit +1: subcc %o1, 1, %o1 + stb %o4, [%o0 + 0x00] + bne,pt %icc, 1b + add %o0, 1, %o0 +.Lexit: + retl + mov %o3, %o0 +.Lnon_bzero_loop: + mov 0x08, %g3 + mov 0x28, %o5 +1: stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P + subcc %g1, 0x40, %g1 + stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P + add %o0, 0x10, %o0 + stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P + bne,pt %icc, 1b + add %o0, 0x30, %o0 + ba,a,pt %icc, .Lpostloop + .size NG4bzero,.-NG4bzero diff --git a/arch/sparc/lib/NG4patch.S b/arch/sparc/lib/NG4patch.S new file mode 100644 index 00000000000..a114cbcf2a4 --- /dev/null +++ b/arch/sparc/lib/NG4patch.S @@ -0,0 +1,54 @@ +/* NG4patch.S: Patch Ultra-I routines with Niagara-4 variant. + * + * Copyright (C) 2012 David S. Miller <davem@davemloft.net> + */ + +#define BRANCH_ALWAYS 0x10680000 +#define NOP 0x01000000 +#define NG_DO_PATCH(OLD, NEW) \ + sethi %hi(NEW), %g1; \ + or %g1, %lo(NEW), %g1; \ + sethi %hi(OLD), %g2; \ + or %g2, %lo(OLD), %g2; \ + sub %g1, %g2, %g1; \ + sethi %hi(BRANCH_ALWAYS), %g3; \ + sll %g1, 11, %g1; \ + srl %g1, 11 + 2, %g1; \ + or %g3, %lo(BRANCH_ALWAYS), %g3; \ + or %g3, %g1, %g3; \ + stw %g3, [%g2]; \ + sethi %hi(NOP), %g3; \ + or %g3, %lo(NOP), %g3; \ + stw %g3, [%g2 + 0x4]; \ + flush %g2; + + .globl niagara4_patch_copyops + .type niagara4_patch_copyops,#function +niagara4_patch_copyops: + NG_DO_PATCH(memcpy, NG4memcpy) + NG_DO_PATCH(___copy_from_user, NG4copy_from_user) + NG_DO_PATCH(___copy_to_user, NG4copy_to_user) + retl + nop + .size niagara4_patch_copyops,.-niagara4_patch_copyops + + .globl niagara4_patch_bzero + .type niagara4_patch_bzero,#function +niagara4_patch_bzero: + NG_DO_PATCH(memset, NG4memset) + NG_DO_PATCH(__bzero, NG4bzero) + NG_DO_PATCH(__clear_user, NGclear_user) + NG_DO_PATCH(tsb_init, NGtsb_init) + retl + nop + .size niagara4_patch_bzero,.-niagara4_patch_bzero + + .globl niagara4_patch_pageops + .type niagara4_patch_pageops,#function +niagara4_patch_pageops: + NG_DO_PATCH(copy_user_page, NG4copy_user_page) + NG_DO_PATCH(_clear_page, NG4clear_page) + NG_DO_PATCH(clear_user_page, NG4clear_user_page) + retl + nop + .size niagara4_patch_pageops,.-niagara4_patch_pageops diff --git a/arch/sparc/lib/NGpage.S b/arch/sparc/lib/NGpage.S index b9e790b9c6b..423d46e2258 100644 --- a/arch/sparc/lib/NGpage.S +++ b/arch/sparc/lib/NGpage.S @@ -59,6 +59,8 @@ NGcopy_user_page: /* %o0=dest, %o1=src, %o2=vaddr */ restore .align 32 + .globl NGclear_page + .globl NGclear_user_page NGclear_page: /* %o0=dest */ NGclear_user_page: /* %o0=dest, %o1=vaddr */ rd %asi, %g3 diff --git a/arch/sparc/lib/ksyms.c b/arch/sparc/lib/ksyms.c index 3b31218cafc..ee31b884c61 100644 --- a/arch/sparc/lib/ksyms.c +++ b/arch/sparc/lib/ksyms.c @@ -134,6 +134,10 @@ EXPORT_SYMBOL(copy_user_page); void VISenter(void); EXPORT_SYMBOL(VISenter); +/* CRYPTO code needs this */ +void VISenterhalf(void); +EXPORT_SYMBOL(VISenterhalf); + extern void xor_vis_2(unsigned long, unsigned long *, unsigned long *); extern void xor_vis_3(unsigned long, unsigned long *, unsigned long *, unsigned long *); diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index 77ac917be15..e98bfda205a 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -265,6 +265,7 @@ good_area: } if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 1fe0429b631..2976dba1eba 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -452,6 +452,7 @@ good_area: } if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry @@ -464,13 +465,13 @@ good_area: up_read(&mm->mmap_sem); mm_rss = get_mm_rss(mm); -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) mm_rss -= (mm->context.huge_pte_count * (HPAGE_SIZE / PAGE_SIZE)); #endif if (unlikely(mm_rss > mm->context.tsb_block[MM_TSB_BASE].tsb_rss_limit)) tsb_grow(mm, MM_TSB_BASE, mm_rss); -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) mm_rss = mm->context.huge_pte_count; if (unlikely(mm_rss > mm->context.tsb_block[MM_TSB_HUGE].tsb_rss_limit)) diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 07e14535375..f76f83d5ac6 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -303,53 +303,3 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, { return NULL; } - -static void context_reload(void *__data) -{ - struct mm_struct *mm = __data; - - if (mm == current->mm) - load_secondary_context(mm); -} - -void hugetlb_prefault_arch_hook(struct mm_struct *mm) -{ - struct tsb_config *tp = &mm->context.tsb_block[MM_TSB_HUGE]; - - if (likely(tp->tsb != NULL)) - return; - - tsb_grow(mm, MM_TSB_HUGE, 0); - tsb_context_switch(mm); - smp_tsb_sync(mm); - - /* On UltraSPARC-III+ and later, configure the second half of - * the Data-TLB for huge pages. - */ - if (tlb_type == cheetah_plus) { - unsigned long ctx; - - spin_lock(&ctx_alloc_lock); - ctx = mm->context.sparc64_ctx_val; - ctx &= ~CTX_PGSZ_MASK; - ctx |= CTX_PGSZ_BASE << CTX_PGSZ0_SHIFT; - ctx |= CTX_PGSZ_HUGE << CTX_PGSZ1_SHIFT; - - if (ctx != mm->context.sparc64_ctx_val) { - /* When changing the page size fields, we - * must perform a context flush so that no - * stale entries match. This flush must - * occur with the original context register - * settings. - */ - do_flush_tlb_mm(mm); - - /* Reload the context register of all processors - * also executing in this address space. - */ - mm->context.sparc64_ctx_val = ctx; - on_each_cpu(context_reload, mm, 0); - } - spin_unlock(&ctx_alloc_lock); - } -} diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index d58edf5fefd..9e28a118e6a 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -51,22 +51,40 @@ #include "init_64.h" -unsigned long kern_linear_pte_xor[2] __read_mostly; +unsigned long kern_linear_pte_xor[4] __read_mostly; -/* A bitmap, one bit for every 256MB of physical memory. If the bit - * is clear, we should use a 4MB page (via kern_linear_pte_xor[0]) else - * if set we should use a 256MB page (via kern_linear_pte_xor[1]). +/* A bitmap, two bits for every 256MB of physical memory. These two + * bits determine what page size we use for kernel linear + * translations. They form an index into kern_linear_pte_xor[]. The + * value in the indexed slot is XOR'd with the TLB miss virtual + * address to form the resulting TTE. The mapping is: + * + * 0 ==> 4MB + * 1 ==> 256MB + * 2 ==> 2GB + * 3 ==> 16GB + * + * All sun4v chips support 256MB pages. Only SPARC-T4 and later + * support 2GB pages, and hopefully future cpus will support the 16GB + * pages as well. For slots 2 and 3, we encode a 256MB TTE xor there + * if these larger page sizes are not supported by the cpu. + * + * It would be nice to determine this from the machine description + * 'cpu' properties, but we need to have this table setup before the + * MDESC is initialized. */ unsigned long kpte_linear_bitmap[KPTE_BITMAP_BYTES / sizeof(unsigned long)]; #ifndef CONFIG_DEBUG_PAGEALLOC -/* A special kernel TSB for 4MB and 256MB linear mappings. - * Space is allocated for this right after the trap table - * in arch/sparc64/kernel/head.S +/* A special kernel TSB for 4MB, 256MB, 2GB and 16GB linear mappings. + * Space is allocated for this right after the trap table in + * arch/sparc64/kernel/head.S */ extern struct tsb swapper_4m_tsb[KERNEL_TSB4M_NENTRIES]; #endif +static unsigned long cpu_pgsz_mask; + #define MAX_BANKS 32 static struct linux_prom64_registers pavail[MAX_BANKS] __devinitdata; @@ -101,7 +119,8 @@ static void __init read_obp_memory(const char *property, ret = prom_getproperty(node, property, (char *) regs, prop_size); if (ret == -1) { - prom_printf("Couldn't get %s property from /memory.\n"); + prom_printf("Couldn't get %s property from /memory.\n", + property); prom_halt(); } @@ -257,7 +276,6 @@ static inline void tsb_insert(struct tsb *ent, unsigned long tag, unsigned long } unsigned long _PAGE_ALL_SZ_BITS __read_mostly; -unsigned long _PAGE_SZBITS __read_mostly; static void flush_dcache(unsigned long pfn) { @@ -288,12 +306,24 @@ static void flush_dcache(unsigned long pfn) } } +/* mm->context.lock must be held */ +static void __update_mmu_tsb_insert(struct mm_struct *mm, unsigned long tsb_index, + unsigned long tsb_hash_shift, unsigned long address, + unsigned long tte) +{ + struct tsb *tsb = mm->context.tsb_block[tsb_index].tsb; + unsigned long tag; + + tsb += ((address >> tsb_hash_shift) & + (mm->context.tsb_block[tsb_index].tsb_nentries - 1UL)); + tag = (address >> 22UL); + tsb_insert(tsb, tag, tte); +} + void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { + unsigned long tsb_index, tsb_hash_shift, flags; struct mm_struct *mm; - struct tsb *tsb; - unsigned long tag, flags; - unsigned long tsb_index, tsb_hash_shift; pte_t pte = *ptep; if (tlb_type != hypervisor) { @@ -310,7 +340,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t * spin_lock_irqsave(&mm->context.lock, flags); -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL) { if ((tlb_type == hypervisor && (pte_val(pte) & _PAGE_SZALL_4V) == _PAGE_SZHUGE_4V) || @@ -322,11 +352,8 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t * } #endif - tsb = mm->context.tsb_block[tsb_index].tsb; - tsb += ((address >> tsb_hash_shift) & - (mm->context.tsb_block[tsb_index].tsb_nentries - 1UL)); - tag = (address >> 22UL); - tsb_insert(tsb, tag, pte_val(pte)); + __update_mmu_tsb_insert(mm, tsb_index, tsb_hash_shift, + address, pte_val(pte)); spin_unlock_irqrestore(&mm->context.lock, flags); } @@ -403,6 +430,12 @@ EXPORT_SYMBOL(flush_icache_range); void mmu_info(struct seq_file *m) { + static const char *pgsz_strings[] = { + "8K", "64K", "512K", "4MB", "32MB", + "256MB", "2GB", "16GB", + }; + int i, printed; + if (tlb_type == cheetah) seq_printf(m, "MMU Type\t: Cheetah\n"); else if (tlb_type == cheetah_plus) @@ -414,6 +447,17 @@ void mmu_info(struct seq_file *m) else seq_printf(m, "MMU Type\t: ???\n"); + seq_printf(m, "MMU PGSZs\t: "); + printed = 0; + for (i = 0; i < ARRAY_SIZE(pgsz_strings); i++) { + if (cpu_pgsz_mask & (1UL << i)) { + seq_printf(m, "%s%s", + printed ? "," : "", pgsz_strings[i]); + printed++; + } + } + seq_putc(m, '\n'); + #ifdef CONFIG_DEBUG_DCFLUSH seq_printf(m, "DCPageFlushes\t: %d\n", atomic_read(&dcpage_flushes)); @@ -462,7 +506,7 @@ static void __init read_obp_translations(void) prom_halt(); } if (unlikely(n > sizeof(prom_trans))) { - prom_printf("prom_mappings: Size %Zd is too big.\n", n); + prom_printf("prom_mappings: Size %d is too big.\n", n); prom_halt(); } @@ -524,7 +568,7 @@ static void __init hypervisor_tlb_lock(unsigned long vaddr, unsigned long ret = sun4v_mmu_map_perm_addr(vaddr, 0, pte, mmu); if (ret != 0) { - prom_printf("hypervisor_tlb_lock[%lx:%lx:%lx:%lx]: " + prom_printf("hypervisor_tlb_lock[%lx:%x:%lx:%lx]: " "errors with %lx\n", vaddr, 0, pte, mmu, ret); prom_halt(); } @@ -1358,32 +1402,75 @@ static unsigned long __ref kernel_map_range(unsigned long pstart, extern unsigned int kvmap_linear_patch[1]; #endif /* CONFIG_DEBUG_PAGEALLOC */ -static void __init mark_kpte_bitmap(unsigned long start, unsigned long end) +static void __init kpte_set_val(unsigned long index, unsigned long val) { - const unsigned long shift_256MB = 28; - const unsigned long mask_256MB = ((1UL << shift_256MB) - 1UL); - const unsigned long size_256MB = (1UL << shift_256MB); + unsigned long *ptr = kpte_linear_bitmap; - while (start < end) { - long remains; + val <<= ((index % (BITS_PER_LONG / 2)) * 2); + ptr += (index / (BITS_PER_LONG / 2)); - remains = end - start; - if (remains < size_256MB) - break; + *ptr |= val; +} - if (start & mask_256MB) { - start = (start + size_256MB) & ~mask_256MB; - continue; - } +static const unsigned long kpte_shift_min = 28; /* 256MB */ +static const unsigned long kpte_shift_max = 34; /* 16GB */ +static const unsigned long kpte_shift_incr = 3; - while (remains >= size_256MB) { - unsigned long index = start >> shift_256MB; +static unsigned long kpte_mark_using_shift(unsigned long start, unsigned long end, + unsigned long shift) +{ + unsigned long size = (1UL << shift); + unsigned long mask = (size - 1UL); + unsigned long remains = end - start; + unsigned long val; - __set_bit(index, kpte_linear_bitmap); + if (remains < size || (start & mask)) + return start; - start += size_256MB; - remains -= size_256MB; + /* VAL maps: + * + * shift 28 --> kern_linear_pte_xor index 1 + * shift 31 --> kern_linear_pte_xor index 2 + * shift 34 --> kern_linear_pte_xor index 3 + */ + val = ((shift - kpte_shift_min) / kpte_shift_incr) + 1; + + remains &= ~mask; + if (shift != kpte_shift_max) + remains = size; + + while (remains) { + unsigned long index = start >> kpte_shift_min; + + kpte_set_val(index, val); + + start += 1UL << kpte_shift_min; + remains -= 1UL << kpte_shift_min; + } + + return start; +} + +static void __init mark_kpte_bitmap(unsigned long start, unsigned long end) +{ + unsigned long smallest_size, smallest_mask; + unsigned long s; + + smallest_size = (1UL << kpte_shift_min); + smallest_mask = (smallest_size - 1UL); + + while (start < end) { + unsigned long orig_start = start; + + for (s = kpte_shift_max; s >= kpte_shift_min; s -= kpte_shift_incr) { + start = kpte_mark_using_shift(start, end, s); + + if (start != orig_start) + break; } + + if (start == orig_start) + start = (start + smallest_size) & ~smallest_mask; } } @@ -1577,13 +1664,16 @@ static void __init sun4v_ktsb_init(void) ktsb_descr[0].resv = 0; #ifndef CONFIG_DEBUG_PAGEALLOC - /* Second KTSB for 4MB/256MB mappings. */ + /* Second KTSB for 4MB/256MB/2GB/16GB mappings. */ ktsb_pa = (kern_base + ((unsigned long)&swapper_4m_tsb[0] - KERNBASE)); ktsb_descr[1].pgsz_idx = HV_PGSZ_IDX_4MB; - ktsb_descr[1].pgsz_mask = (HV_PGSZ_MASK_4MB | - HV_PGSZ_MASK_256MB); + ktsb_descr[1].pgsz_mask = ((HV_PGSZ_MASK_4MB | + HV_PGSZ_MASK_256MB | + HV_PGSZ_MASK_2GB | + HV_PGSZ_MASK_16GB) & + cpu_pgsz_mask); ktsb_descr[1].assoc = 1; ktsb_descr[1].num_ttes = KERNEL_TSB4M_NENTRIES; ktsb_descr[1].ctx_idx = 0; @@ -1606,6 +1696,47 @@ void __cpuinit sun4v_ktsb_register(void) } } +static void __init sun4u_linear_pte_xor_finalize(void) +{ +#ifndef CONFIG_DEBUG_PAGEALLOC + /* This is where we would add Panther support for + * 32MB and 256MB pages. + */ +#endif +} + +static void __init sun4v_linear_pte_xor_finalize(void) +{ +#ifndef CONFIG_DEBUG_PAGEALLOC + if (cpu_pgsz_mask & HV_PGSZ_MASK_256MB) { + kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZ256MB_4V) ^ + 0xfffff80000000000UL; + kern_linear_pte_xor[1] |= (_PAGE_CP_4V | _PAGE_CV_4V | + _PAGE_P_4V | _PAGE_W_4V); + } else { + kern_linear_pte_xor[1] = kern_linear_pte_xor[0]; + } + + if (cpu_pgsz_mask & HV_PGSZ_MASK_2GB) { + kern_linear_pte_xor[2] = (_PAGE_VALID | _PAGE_SZ2GB_4V) ^ + 0xfffff80000000000UL; + kern_linear_pte_xor[2] |= (_PAGE_CP_4V | _PAGE_CV_4V | + _PAGE_P_4V | _PAGE_W_4V); + } else { + kern_linear_pte_xor[2] = kern_linear_pte_xor[1]; + } + + if (cpu_pgsz_mask & HV_PGSZ_MASK_16GB) { + kern_linear_pte_xor[3] = (_PAGE_VALID | _PAGE_SZ16GB_4V) ^ + 0xfffff80000000000UL; + kern_linear_pte_xor[3] |= (_PAGE_CP_4V | _PAGE_CV_4V | + _PAGE_P_4V | _PAGE_W_4V); + } else { + kern_linear_pte_xor[3] = kern_linear_pte_xor[2]; + } +#endif +} + /* paging_init() sets up the page tables */ static unsigned long last_valid_pfn; @@ -1665,10 +1796,8 @@ void __init paging_init(void) ktsb_phys_patch(); } - if (tlb_type == hypervisor) { + if (tlb_type == hypervisor) sun4v_patch_tlb_handlers(); - sun4v_ktsb_init(); - } /* Find available physical memory... * @@ -1727,9 +1856,6 @@ void __init paging_init(void) __flush_tlb_all(); - if (tlb_type == hypervisor) - sun4v_ktsb_register(); - prom_build_devicetree(); of_populate_present_mask(); #ifndef CONFIG_SMP @@ -1742,8 +1868,36 @@ void __init paging_init(void) #ifndef CONFIG_SMP mdesc_fill_in_cpu_data(cpu_all_mask); #endif + mdesc_get_page_sizes(cpu_all_mask, &cpu_pgsz_mask); + + sun4v_linear_pte_xor_finalize(); + + sun4v_ktsb_init(); + sun4v_ktsb_register(); + } else { + unsigned long impl, ver; + + cpu_pgsz_mask = (HV_PGSZ_MASK_8K | HV_PGSZ_MASK_64K | + HV_PGSZ_MASK_512K | HV_PGSZ_MASK_4MB); + + __asm__ __volatile__("rdpr %%ver, %0" : "=r" (ver)); + impl = ((ver >> 32) & 0xffff); + if (impl == PANTHER_IMPL) + cpu_pgsz_mask |= (HV_PGSZ_MASK_32MB | + HV_PGSZ_MASK_256MB); + + sun4u_linear_pte_xor_finalize(); } + /* Flush the TLBs and the 4M TSB so that the updated linear + * pte XOR settings are realized for all mappings. + */ + __flush_tlb_all(); +#ifndef CONFIG_DEBUG_PAGEALLOC + memset(swapper_4m_tsb, 0x40, sizeof(swapper_4m_tsb)); +#endif + __flush_tlb_all(); + /* Setup bootmem... */ last_valid_pfn = end_pfn = bootmem_init(phys_base); @@ -2110,6 +2264,7 @@ static void __init sun4u_pgprot_init(void) { unsigned long page_none, page_shared, page_copy, page_readonly; unsigned long page_exec_bit; + int i; PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4U | _PAGE_VALID | _PAGE_CACHE_4U | _PAGE_P_4U | @@ -2128,8 +2283,7 @@ static void __init sun4u_pgprot_init(void) __ACCESS_BITS_4U | _PAGE_E_4U); #ifdef CONFIG_DEBUG_PAGEALLOC - kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZBITS_4U) ^ - 0xfffff80000000000UL; + kern_linear_pte_xor[0] = _PAGE_VALID ^ 0xfffff80000000000UL; #else kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4U) ^ 0xfffff80000000000UL; @@ -2137,10 +2291,9 @@ static void __init sun4u_pgprot_init(void) kern_linear_pte_xor[0] |= (_PAGE_CP_4U | _PAGE_CV_4U | _PAGE_P_4U | _PAGE_W_4U); - /* XXX Should use 256MB on Panther. XXX */ - kern_linear_pte_xor[1] = kern_linear_pte_xor[0]; + for (i = 1; i < 4; i++) + kern_linear_pte_xor[i] = kern_linear_pte_xor[0]; - _PAGE_SZBITS = _PAGE_SZBITS_4U; _PAGE_ALL_SZ_BITS = (_PAGE_SZ4MB_4U | _PAGE_SZ512K_4U | _PAGE_SZ64K_4U | _PAGE_SZ8K_4U | _PAGE_SZ32MB_4U | _PAGE_SZ256MB_4U); @@ -2164,6 +2317,7 @@ static void __init sun4v_pgprot_init(void) { unsigned long page_none, page_shared, page_copy, page_readonly; unsigned long page_exec_bit; + int i; PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4V | _PAGE_VALID | _PAGE_CACHE_4V | _PAGE_P_4V | @@ -2176,8 +2330,7 @@ static void __init sun4v_pgprot_init(void) _PAGE_CACHE = _PAGE_CACHE_4V; #ifdef CONFIG_DEBUG_PAGEALLOC - kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZBITS_4V) ^ - 0xfffff80000000000UL; + kern_linear_pte_xor[0] = _PAGE_VALID ^ 0xfffff80000000000UL; #else kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4V) ^ 0xfffff80000000000UL; @@ -2185,20 +2338,12 @@ static void __init sun4v_pgprot_init(void) kern_linear_pte_xor[0] |= (_PAGE_CP_4V | _PAGE_CV_4V | _PAGE_P_4V | _PAGE_W_4V); -#ifdef CONFIG_DEBUG_PAGEALLOC - kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZBITS_4V) ^ - 0xfffff80000000000UL; -#else - kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZ256MB_4V) ^ - 0xfffff80000000000UL; -#endif - kern_linear_pte_xor[1] |= (_PAGE_CP_4V | _PAGE_CV_4V | - _PAGE_P_4V | _PAGE_W_4V); + for (i = 1; i < 4; i++) + kern_linear_pte_xor[i] = kern_linear_pte_xor[0]; pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4V | __DIRTY_BITS_4V | __ACCESS_BITS_4V | _PAGE_E_4V); - _PAGE_SZBITS = _PAGE_SZBITS_4V; _PAGE_ALL_SZ_BITS = (_PAGE_SZ16GB_4V | _PAGE_SZ2GB_4V | _PAGE_SZ256MB_4V | _PAGE_SZ32MB_4V | _PAGE_SZ4MB_4V | _PAGE_SZ512K_4V | @@ -2331,3 +2476,281 @@ void __flush_tlb_all(void) __asm__ __volatile__("wrpr %0, 0, %%pstate" : : "r" (pstate)); } + +static pte_t *get_from_cache(struct mm_struct *mm) +{ + struct page *page; + pte_t *ret; + + spin_lock(&mm->page_table_lock); + page = mm->context.pgtable_page; + ret = NULL; + if (page) { + void *p = page_address(page); + + mm->context.pgtable_page = NULL; + + ret = (pte_t *) (p + (PAGE_SIZE / 2)); + } + spin_unlock(&mm->page_table_lock); + + return ret; +} + +static struct page *__alloc_for_cache(struct mm_struct *mm) +{ + struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | + __GFP_REPEAT | __GFP_ZERO); + + if (page) { + spin_lock(&mm->page_table_lock); + if (!mm->context.pgtable_page) { + atomic_set(&page->_count, 2); + mm->context.pgtable_page = page; + } + spin_unlock(&mm->page_table_lock); + } + return page; +} + +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, + unsigned long address) +{ + struct page *page; + pte_t *pte; + + pte = get_from_cache(mm); + if (pte) + return pte; + + page = __alloc_for_cache(mm); + if (page) + pte = (pte_t *) page_address(page); + + return pte; +} + +pgtable_t pte_alloc_one(struct mm_struct *mm, + unsigned long address) +{ + struct page *page; + pte_t *pte; + + pte = get_from_cache(mm); + if (pte) + return pte; + + page = __alloc_for_cache(mm); + if (page) { + pgtable_page_ctor(page); + pte = (pte_t *) page_address(page); + } + + return pte; +} + +void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + struct page *page = virt_to_page(pte); + if (put_page_testzero(page)) + free_hot_cold_page(page, 0); +} + +static void __pte_free(pgtable_t pte) +{ + struct page *page = virt_to_page(pte); + if (put_page_testzero(page)) { + pgtable_page_dtor(page); + free_hot_cold_page(page, 0); + } +} + +void pte_free(struct mm_struct *mm, pgtable_t pte) +{ + __pte_free(pte); +} + +void pgtable_free(void *table, bool is_page) +{ + if (is_page) + __pte_free(table); + else + kmem_cache_free(pgtable_cache, table); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot, bool for_modify) +{ + if (pgprot_val(pgprot) & _PAGE_VALID) + pmd_val(pmd) |= PMD_HUGE_PRESENT; + if (tlb_type == hypervisor) { + if (pgprot_val(pgprot) & _PAGE_WRITE_4V) + pmd_val(pmd) |= PMD_HUGE_WRITE; + if (pgprot_val(pgprot) & _PAGE_EXEC_4V) + pmd_val(pmd) |= PMD_HUGE_EXEC; + + if (!for_modify) { + if (pgprot_val(pgprot) & _PAGE_ACCESSED_4V) + pmd_val(pmd) |= PMD_HUGE_ACCESSED; + if (pgprot_val(pgprot) & _PAGE_MODIFIED_4V) + pmd_val(pmd) |= PMD_HUGE_DIRTY; + } + } else { + if (pgprot_val(pgprot) & _PAGE_WRITE_4U) + pmd_val(pmd) |= PMD_HUGE_WRITE; + if (pgprot_val(pgprot) & _PAGE_EXEC_4U) + pmd_val(pmd) |= PMD_HUGE_EXEC; + + if (!for_modify) { + if (pgprot_val(pgprot) & _PAGE_ACCESSED_4U) + pmd_val(pmd) |= PMD_HUGE_ACCESSED; + if (pgprot_val(pgprot) & _PAGE_MODIFIED_4U) + pmd_val(pmd) |= PMD_HUGE_DIRTY; + } + } + + return pmd; +} + +pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) +{ + pmd_t pmd; + + pmd_val(pmd) = (page_nr << ((PAGE_SHIFT - PMD_PADDR_SHIFT))); + pmd_val(pmd) |= PMD_ISHUGE; + pmd = pmd_set_protbits(pmd, pgprot, false); + return pmd; +} + +pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + pmd_val(pmd) &= ~(PMD_HUGE_PRESENT | + PMD_HUGE_WRITE | + PMD_HUGE_EXEC); + pmd = pmd_set_protbits(pmd, newprot, true); + return pmd; +} + +pgprot_t pmd_pgprot(pmd_t entry) +{ + unsigned long pte = 0; + + if (pmd_val(entry) & PMD_HUGE_PRESENT) + pte |= _PAGE_VALID; + + if (tlb_type == hypervisor) { + if (pmd_val(entry) & PMD_HUGE_PRESENT) + pte |= _PAGE_PRESENT_4V; + if (pmd_val(entry) & PMD_HUGE_EXEC) + pte |= _PAGE_EXEC_4V; + if (pmd_val(entry) & PMD_HUGE_WRITE) + pte |= _PAGE_W_4V; + if (pmd_val(entry) & PMD_HUGE_ACCESSED) + pte |= _PAGE_ACCESSED_4V; + if (pmd_val(entry) & PMD_HUGE_DIRTY) + pte |= _PAGE_MODIFIED_4V; + pte |= _PAGE_CP_4V|_PAGE_CV_4V; + } else { + if (pmd_val(entry) & PMD_HUGE_PRESENT) + pte |= _PAGE_PRESENT_4U; + if (pmd_val(entry) & PMD_HUGE_EXEC) + pte |= _PAGE_EXEC_4U; + if (pmd_val(entry) & PMD_HUGE_WRITE) + pte |= _PAGE_W_4U; + if (pmd_val(entry) & PMD_HUGE_ACCESSED) + pte |= _PAGE_ACCESSED_4U; + if (pmd_val(entry) & PMD_HUGE_DIRTY) + pte |= _PAGE_MODIFIED_4U; + pte |= _PAGE_CP_4U|_PAGE_CV_4U; + } + + return __pgprot(pte); +} + +void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd) +{ + unsigned long pte, flags; + struct mm_struct *mm; + pmd_t entry = *pmd; + pgprot_t prot; + + if (!pmd_large(entry) || !pmd_young(entry)) + return; + + pte = (pmd_val(entry) & ~PMD_HUGE_PROTBITS); + pte <<= PMD_PADDR_SHIFT; + pte |= _PAGE_VALID; + + prot = pmd_pgprot(entry); + + if (tlb_type == hypervisor) + pgprot_val(prot) |= _PAGE_SZHUGE_4V; + else + pgprot_val(prot) |= _PAGE_SZHUGE_4U; + + pte |= pgprot_val(prot); + + mm = vma->vm_mm; + + spin_lock_irqsave(&mm->context.lock, flags); + + if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL) + __update_mmu_tsb_insert(mm, MM_TSB_HUGE, HPAGE_SHIFT, + addr, pte); + + spin_unlock_irqrestore(&mm->context.lock, flags); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) +static void context_reload(void *__data) +{ + struct mm_struct *mm = __data; + + if (mm == current->mm) + load_secondary_context(mm); +} + +void hugetlb_setup(struct mm_struct *mm) +{ + struct tsb_config *tp = &mm->context.tsb_block[MM_TSB_HUGE]; + + if (likely(tp->tsb != NULL)) + return; + + tsb_grow(mm, MM_TSB_HUGE, 0); + tsb_context_switch(mm); + smp_tsb_sync(mm); + + /* On UltraSPARC-III+ and later, configure the second half of + * the Data-TLB for huge pages. + */ + if (tlb_type == cheetah_plus) { + unsigned long ctx; + + spin_lock(&ctx_alloc_lock); + ctx = mm->context.sparc64_ctx_val; + ctx &= ~CTX_PGSZ_MASK; + ctx |= CTX_PGSZ_BASE << CTX_PGSZ0_SHIFT; + ctx |= CTX_PGSZ_HUGE << CTX_PGSZ1_SHIFT; + + if (ctx != mm->context.sparc64_ctx_val) { + /* When changing the page size fields, we + * must perform a context flush so that no + * stale entries match. This flush must + * occur with the original context register + * settings. + */ + do_flush_tlb_mm(mm); + + /* Reload the context register of all processors + * also executing in this address space. + */ + mm->context.sparc64_ctx_val = ctx; + on_each_cpu(context_reload, mm, 0); + } + spin_unlock(&ctx_alloc_lock); + } +} +#endif diff --git a/arch/sparc/mm/init_64.h b/arch/sparc/mm/init_64.h index 3e1ac8b96ca..0661aa606de 100644 --- a/arch/sparc/mm/init_64.h +++ b/arch/sparc/mm/init_64.h @@ -8,12 +8,12 @@ #define MAX_PHYS_ADDRESS (1UL << 41UL) #define KPTE_BITMAP_CHUNK_SZ (256UL * 1024UL * 1024UL) #define KPTE_BITMAP_BYTES \ - ((MAX_PHYS_ADDRESS / KPTE_BITMAP_CHUNK_SZ) / 8) + ((MAX_PHYS_ADDRESS / KPTE_BITMAP_CHUNK_SZ) / 4) #define VALID_ADDR_BITMAP_CHUNK_SZ (4UL * 1024UL * 1024UL) #define VALID_ADDR_BITMAP_BYTES \ ((MAX_PHYS_ADDRESS / VALID_ADDR_BITMAP_CHUNK_SZ) / 8) -extern unsigned long kern_linear_pte_xor[2]; +extern unsigned long kern_linear_pte_xor[4]; extern unsigned long kpte_linear_bitmap[KPTE_BITMAP_BYTES / sizeof(unsigned long)]; extern unsigned int sparc64_highest_unlocked_tlb_ent; extern unsigned long sparc64_kern_pri_context; diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index a8a58cad9d2..0f4f7191fbb 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -90,8 +90,8 @@ static void __init sbus_iommu_init(struct platform_device *op) it to us. */ tmp = __get_free_pages(GFP_KERNEL, IOMMU_ORDER); if (!tmp) { - prom_printf("Unable to allocate iommu table [0x%08x]\n", - IOMMU_NPTES*sizeof(iopte_t)); + prom_printf("Unable to allocate iommu table [0x%lx]\n", + IOMMU_NPTES * sizeof(iopte_t)); prom_halt(); } iommu->page_table = (iopte_t *)tmp; diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index b1f279cd00b..3e8fec391fe 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -43,16 +43,37 @@ void flush_tlb_pending(void) put_cpu_var(tlb_batch); } -void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, - pte_t *ptep, pte_t orig, int fullmm) +static void tlb_batch_add_one(struct mm_struct *mm, unsigned long vaddr, + bool exec) { struct tlb_batch *tb = &get_cpu_var(tlb_batch); unsigned long nr; vaddr &= PAGE_MASK; - if (pte_exec(orig)) + if (exec) vaddr |= 0x1UL; + nr = tb->tlb_nr; + + if (unlikely(nr != 0 && mm != tb->mm)) { + flush_tlb_pending(); + nr = 0; + } + + if (nr == 0) + tb->mm = mm; + + tb->vaddrs[nr] = vaddr; + tb->tlb_nr = ++nr; + if (nr >= TLB_BATCH_NR) + flush_tlb_pending(); + + put_cpu_var(tlb_batch); +} + +void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, + pte_t *ptep, pte_t orig, int fullmm) +{ if (tlb_type != hypervisor && pte_dirty(orig)) { unsigned long paddr, pfn = pte_pfn(orig); @@ -77,26 +98,91 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, } no_cache_flush: + if (!fullmm) + tlb_batch_add_one(mm, vaddr, pte_exec(orig)); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void tlb_batch_pmd_scan(struct mm_struct *mm, unsigned long vaddr, + pmd_t pmd, bool exec) +{ + unsigned long end; + pte_t *pte; + + pte = pte_offset_map(&pmd, vaddr); + end = vaddr + HPAGE_SIZE; + while (vaddr < end) { + if (pte_val(*pte) & _PAGE_VALID) + tlb_batch_add_one(mm, vaddr, exec); + pte++; + vaddr += PAGE_SIZE; + } + pte_unmap(pte); +} - if (fullmm) { - put_cpu_var(tlb_batch); +void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ + pmd_t orig = *pmdp; + + *pmdp = pmd; + + if (mm == &init_mm) return; + + if ((pmd_val(pmd) ^ pmd_val(orig)) & PMD_ISHUGE) { + if (pmd_val(pmd) & PMD_ISHUGE) + mm->context.huge_pte_count++; + else + mm->context.huge_pte_count--; + if (mm->context.huge_pte_count == 1) + hugetlb_setup(mm); } - nr = tb->tlb_nr; + if (!pmd_none(orig)) { + bool exec = ((pmd_val(orig) & PMD_HUGE_EXEC) != 0); - if (unlikely(nr != 0 && mm != tb->mm)) { - flush_tlb_pending(); - nr = 0; + addr &= HPAGE_MASK; + if (pmd_val(orig) & PMD_ISHUGE) + tlb_batch_add_one(mm, addr, exec); + else + tlb_batch_pmd_scan(mm, addr, orig, exec); } +} - if (nr == 0) - tb->mm = mm; +void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) +{ + struct list_head *lh = (struct list_head *) pgtable; - tb->vaddrs[nr] = vaddr; - tb->tlb_nr = ++nr; - if (nr >= TLB_BATCH_NR) - flush_tlb_pending(); + assert_spin_locked(&mm->page_table_lock); - put_cpu_var(tlb_batch); + /* FIFO */ + if (!mm->pmd_huge_pte) + INIT_LIST_HEAD(lh); + else + list_add(lh, (struct list_head *) mm->pmd_huge_pte); + mm->pmd_huge_pte = pgtable; +} + +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) +{ + struct list_head *lh; + pgtable_t pgtable; + + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + pgtable = mm->pmd_huge_pte; + lh = (struct list_head *) pgtable; + if (list_empty(lh)) + mm->pmd_huge_pte = NULL; + else { + mm->pmd_huge_pte = (pgtable_t) lh->next; + list_del(lh); + } + pte_val(pgtable[0]) = 0; + pte_val(pgtable[1]) = 0; + + return pgtable; } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c index c52add79b83..7f647434749 100644 --- a/arch/sparc/mm/tsb.c +++ b/arch/sparc/mm/tsb.c @@ -78,7 +78,7 @@ void flush_tsb_user(struct tlb_batch *tb) base = __pa(base); __flush_tsb_one(tb, PAGE_SHIFT, base, nentries); -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) if (mm->context.tsb_block[MM_TSB_HUGE].tsb) { base = (unsigned long) mm->context.tsb_block[MM_TSB_HUGE].tsb; nentries = mm->context.tsb_block[MM_TSB_HUGE].tsb_nentries; @@ -90,29 +90,12 @@ void flush_tsb_user(struct tlb_batch *tb) spin_unlock_irqrestore(&mm->context.lock, flags); } -#if defined(CONFIG_SPARC64_PAGE_SIZE_8KB) #define HV_PGSZ_IDX_BASE HV_PGSZ_IDX_8K #define HV_PGSZ_MASK_BASE HV_PGSZ_MASK_8K -#elif defined(CONFIG_SPARC64_PAGE_SIZE_64KB) -#define HV_PGSZ_IDX_BASE HV_PGSZ_IDX_64K -#define HV_PGSZ_MASK_BASE HV_PGSZ_MASK_64K -#else -#error Broken base page size setting... -#endif -#ifdef CONFIG_HUGETLB_PAGE -#if defined(CONFIG_HUGETLB_PAGE_SIZE_64K) -#define HV_PGSZ_IDX_HUGE HV_PGSZ_IDX_64K -#define HV_PGSZ_MASK_HUGE HV_PGSZ_MASK_64K -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K) -#define HV_PGSZ_IDX_HUGE HV_PGSZ_IDX_512K -#define HV_PGSZ_MASK_HUGE HV_PGSZ_MASK_512K -#elif defined(CONFIG_HUGETLB_PAGE_SIZE_4MB) +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) #define HV_PGSZ_IDX_HUGE HV_PGSZ_IDX_4MB #define HV_PGSZ_MASK_HUGE HV_PGSZ_MASK_4MB -#else -#error Broken huge page size setting... -#endif #endif static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_idx, unsigned long tsb_bytes) @@ -207,7 +190,7 @@ static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_idx, unsign case MM_TSB_BASE: hp->pgsz_idx = HV_PGSZ_IDX_BASE; break; -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) case MM_TSB_HUGE: hp->pgsz_idx = HV_PGSZ_IDX_HUGE; break; @@ -222,7 +205,7 @@ static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_idx, unsign case MM_TSB_BASE: hp->pgsz_mask = HV_PGSZ_MASK_BASE; break; -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) case MM_TSB_HUGE: hp->pgsz_mask = HV_PGSZ_MASK_HUGE; break; @@ -444,7 +427,7 @@ retry_tsb_alloc: int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) unsigned long huge_pte_count; #endif unsigned int i; @@ -453,7 +436,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) mm->context.sparc64_ctx_val = 0UL; -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) /* We reset it to zero because the fork() page copying * will re-increment the counters as the parent PTEs are * copied into the child address space. @@ -462,6 +445,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) mm->context.huge_pte_count = 0; #endif + mm->context.pgtable_page = NULL; + /* copy_mm() copies over the parent's mm_struct before calling * us, so we need to zero out the TSB pointer or else tsb_grow() * will be confused and think there is an older TSB to free up. @@ -474,7 +459,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) */ tsb_grow(mm, MM_TSB_BASE, get_mm_rss(mm)); -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) if (unlikely(huge_pte_count)) tsb_grow(mm, MM_TSB_HUGE, huge_pte_count); #endif @@ -500,10 +485,17 @@ static void tsb_destroy_one(struct tsb_config *tp) void destroy_context(struct mm_struct *mm) { unsigned long flags, i; + struct page *page; for (i = 0; i < MM_NUM_TSBS; i++) tsb_destroy_one(&mm->context.tsb_block[i]); + page = mm->context.pgtable_page; + if (page && put_page_testzero(page)) { + pgtable_page_dtor(page); + free_hot_cold_page(page, 0); + } + spin_lock_irqsave(&ctx_alloc_lock, flags); if (CTX_VALID(mm->context)) { diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c index e9073e9501b..28368701ef7 100644 --- a/arch/sparc/net/bpf_jit_comp.c +++ b/arch/sparc/net/bpf_jit_comp.c @@ -464,8 +464,12 @@ void bpf_jit_compile(struct sk_filter *fp) emit_alu_K(OR, K); break; case BPF_S_ANC_ALU_XOR_X: /* A ^= X; */ + case BPF_S_ALU_XOR_X: emit_alu_X(XOR); break; + case BPF_S_ALU_XOR_K: /* A ^= K */ + emit_alu_K(XOR, K); + break; case BPF_S_ALU_LSH_X: /* A <<= X */ emit_alu_X(SLL); break; |