Diffstat (limited to 'arch/sh/lib64')
-rw-r--r--  arch/sh/lib64/.gitignore          |   1
-rw-r--r--  arch/sh/lib64/Makefile            |   8
-rw-r--r--  arch/sh/lib64/c-checksum.c        | 214
-rw-r--r--  arch/sh/lib64/clear_page.S        |  54
-rw-r--r--  arch/sh/lib64/copy_user_memcpy.S  |   2
-rw-r--r--  arch/sh/lib64/dbg.c               | 430
-rw-r--r--  arch/sh/lib64/memcpy.S            | 201
-rw-r--r--  arch/sh/lib64/memcpy.c            |  81
-rw-r--r--  arch/sh/lib64/memset.S            |  91
-rw-r--r--  arch/sh/lib64/panic.c             |  43
-rw-r--r--  arch/sh/lib64/sdivsi3.S           | 135
-rw-r--r--  arch/sh/lib64/strcpy.S            |  97
-rw-r--r--  arch/sh/lib64/strlen.S            |  33
-rw-r--r--  arch/sh/lib64/udelay.c            |   2
-rw-r--r--  arch/sh/lib64/udivdi3.S           | 120
-rw-r--r--  arch/sh/lib64/udivsi3.S           |  59
16 files changed, 743 insertions, 828 deletions
diff --git a/arch/sh/lib64/.gitignore b/arch/sh/lib64/.gitignore deleted file mode 100644 index 3508c2cb23c..00000000000 --- a/arch/sh/lib64/.gitignore +++ /dev/null @@ -1 +0,0 @@ -syscalltab.h diff --git a/arch/sh/lib64/Makefile b/arch/sh/lib64/Makefile index 9950966923a..69779ff741d 100644 --- a/arch/sh/lib64/Makefile +++ b/arch/sh/lib64/Makefile @@ -2,7 +2,7 @@ # Makefile for the SH-5 specific library files.. # # Copyright (C) 2000, 2001 Paolo Alberelli -# Copyright (C) 2003 Paul Mundt +# Copyright (C) 2003 - 2008 Paul Mundt # # This file is subject to the terms and conditions of the GNU General Public # License. See the file "COPYING" in the main directory of this archive @@ -10,6 +10,8 @@ # # Panic should really be compiled as PIC -lib-y := udelay.o c-checksum.o dbg.o panic.o memcpy.o copy_user_memcpy.o \ - copy_page.o clear_page.o +lib-y := udelay.o panic.o memcpy.o memset.o \ + copy_user_memcpy.o copy_page.o strcpy.o strlen.o +# Extracted from libgcc +lib-y += udivsi3.o udivdi3.o sdivsi3.o diff --git a/arch/sh/lib64/c-checksum.c b/arch/sh/lib64/c-checksum.c deleted file mode 100644 index 5c284e0cff9..00000000000 --- a/arch/sh/lib64/c-checksum.c +++ /dev/null @@ -1,214 +0,0 @@ -/* - * arch/sh/lib64/c-checksum.c - * - * This file contains network checksum routines that are better done - * in an architecture-specific manner due to speed.. - */ -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <asm/byteorder.h> -#include <asm/uaccess.h> - -static inline unsigned short from64to16(unsigned long long x) -{ - /* add up 32-bit words for 33 bits */ - x = (x & 0xffffffff) + (x >> 32); - /* add up 16-bit and 17-bit words for 17+c bits */ - x = (x & 0xffff) + (x >> 16); - /* add up 16-bit and 2-bit for 16+c bit */ - x = (x & 0xffff) + (x >> 16); - /* add up carry.. */ - x = (x & 0xffff) + (x >> 16); - return x; -} - -static inline unsigned short foldto16(unsigned long x) -{ - /* add up 16-bit for 17 bits */ - x = (x & 0xffff) + (x >> 16); - /* add up carry.. */ - x = (x & 0xffff) + (x >> 16); - return x; -} - -static inline unsigned short myfoldto16(unsigned long long x) -{ - /* Fold down to 32-bits so we don't loose in the typedef-less - network stack. */ - /* 64 to 33 */ - x = (x & 0xffffffff) + (x >> 32); - /* 33 to 32 */ - x = (x & 0xffffffff) + (x >> 32); - - /* add up 16-bit for 17 bits */ - x = (x & 0xffff) + (x >> 16); - /* add up carry.. */ - x = (x & 0xffff) + (x >> 16); - return x; -} - -#define odd(x) ((x)&1) -#define U16(x) ntohs(x) - -static unsigned long do_csum(const unsigned char *buff, int len) -{ - int odd, count; - unsigned long result = 0; - - pr_debug("do_csum buff %p, len %d (0x%x)\n", buff, len, len); -#ifdef DEBUG - for (i = 0; i < len; i++) { - if ((i % 26) == 0) - printk("\n"); - printk("%02X ", buff[i]); - } -#endif - - if (len <= 0) - goto out; - - odd = 1 & (unsigned long) buff; - if (odd) { - result = *buff << 8; - len--; - buff++; - } - count = len >> 1; /* nr of 16-bit words.. */ - if (count) { - if (2 & (unsigned long) buff) { - result += *(unsigned short *) buff; - count--; - len -= 2; - buff += 2; - } - count >>= 1; /* nr of 32-bit words.. 
*/ - if (count) { - unsigned long carry = 0; - do { - unsigned long w = *(unsigned long *) buff; - buff += 4; - count--; - result += carry; - result += w; - carry = (w > result); - } while (count); - result += carry; - result = (result & 0xffff) + (result >> 16); - } - if (len & 2) { - result += *(unsigned short *) buff; - buff += 2; - } - } - if (len & 1) - result += *buff; - result = foldto16(result); - if (odd) - result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); - - pr_debug("\nCHECKSUM is 0x%lx\n", result); - - out: - return result; -} - -/* computes the checksum of a memory block at buff, length len, - and adds in "sum" (32-bit) */ -__wsum csum_partial(const void *buff, int len, __wsum sum) -{ - unsigned long long result = do_csum(buff, len); - - /* add in old sum, and carry.. */ - result += (__force u32)sum; - /* 32+c bits -> 32 bits */ - result = (result & 0xffffffff) + (result >> 32); - - pr_debug("csum_partial, buff %p len %d sum 0x%x result=0x%016Lx\n", - buff, len, sum, result); - - return (__force __wsum)result; -} - -/* Copy while checksumming, otherwise like csum_partial. */ -__wsum -csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum) -{ - sum = csum_partial(src, len, sum); - memcpy(dst, src, len); - - return sum; -} - -/* Copy from userspace and compute checksum. If we catch an exception - then zero the rest of the buffer. */ -__wsum -csum_partial_copy_from_user(const void __user *src, void *dst, int len, - __wsum sum, int *err_ptr) -{ - int missing; - - pr_debug - ("csum_partial_copy_from_user src %p, dest %p, len %d, sum %08x, err_ptr %p\n", - src, dst, len, sum, err_ptr); - missing = copy_from_user(dst, src, len); - pr_debug(" access_ok %d\n", __access_ok((unsigned long) src, len)); - pr_debug(" missing %d\n", missing); - if (missing) { - memset(dst + len - missing, 0, missing); - *err_ptr = -EFAULT; - } - - return csum_partial(dst, len, sum); -} - -/* Copy to userspace and compute checksum. */ -__wsum -csum_partial_copy_to_user(const unsigned char *src, unsigned char *dst, int len, - __wsum sum, int *err_ptr) -{ - sum = csum_partial(src, len, sum); - - if (copy_to_user(dst, src, len)) - *err_ptr = -EFAULT; - - return sum; -} - -/* - * This is a version of ip_compute_csum() optimized for IP headers, - * which always checksum on 4 octet boundaries. - */ -__sum16 ip_fast_csum(const void *iph, unsigned int ihl) -{ - pr_debug("ip_fast_csum %p,%d\n", iph, ihl); - - return (__force __sum16)~do_csum(iph, ihl * 4); -} - -__wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, - unsigned short len, - unsigned short proto, __wsum sum) -{ - unsigned long long result; - - pr_debug("ntohs(0x%x)=0x%x\n", 0xdead, ntohs(0xdead)); - pr_debug("htons(0x%x)=0x%x\n", 0xdead, htons(0xdead)); - - result = (__force u64) saddr + (__force u64) daddr + - (__force u64) sum + ((len + proto) << 8); - - /* Fold down to 32-bits so we don't loose in the typedef-less - network stack. */ - /* 64 to 33 */ - result = (result & 0xffffffff) + (result >> 32); - /* 33 to 32 */ - result = (result & 0xffffffff) + (result >> 32); - - pr_debug("%s saddr %x daddr %x len %x proto %x sum %x result %08Lx\n", - __func__, saddr, daddr, len, proto, sum, result); - - return (__wsum)result; -} -EXPORT_SYMBOL(csum_tcpudp_nofold); diff --git a/arch/sh/lib64/clear_page.S b/arch/sh/lib64/clear_page.S deleted file mode 100644 index 007ab48ecc1..00000000000 --- a/arch/sh/lib64/clear_page.S +++ /dev/null @@ -1,54 +0,0 @@ -/* - Copyright 2003 Richard Curnow, SuperH (UK) Ltd. 
- - This file is subject to the terms and conditions of the GNU General Public - License. See the file "COPYING" in the main directory of this archive - for more details. - - Tight version of memset for the case of just clearing a page. It turns out - that having the alloco's spaced out slightly due to the increment/branch - pair causes them to contend less for access to the cache. Similarly, - keeping the stores apart from the allocos causes less contention. => Do two - separate loops. Do multiple stores per loop to amortise the - increment/branch cost a little. - - Parameters: - r2 : source effective address (start of page) - - Always clears 4096 bytes. - - Note : alloco guarded by synco to avoid TAKum03020 erratum - -*/ - - .section .text..SHmedia32,"ax" - .little - - .balign 8 - .global clear_page -clear_page: - pta/l 1f, tr1 - pta/l 2f, tr2 - ptabs/l r18, tr0 - - movi 4096, r7 - add r2, r7, r7 - add r2, r63, r6 -1: - alloco r6, 0 - synco ! TAKum03020 - addi r6, 32, r6 - bgt/l r7, r6, tr1 - - add r2, r63, r6 -2: - st.q r6, 0, r63 - st.q r6, 8, r63 - st.q r6, 16, r63 - st.q r6, 24, r63 - addi r6, 32, r6 - bgt/l r7, r6, tr2 - - blink tr0, r63 - - diff --git a/arch/sh/lib64/copy_user_memcpy.S b/arch/sh/lib64/copy_user_memcpy.S index 2a62816d2dd..49aeabeba2c 100644 --- a/arch/sh/lib64/copy_user_memcpy.S +++ b/arch/sh/lib64/copy_user_memcpy.S @@ -27,7 +27,7 @@ ! 2.: When there are two or three bytes in the last word of an 11-or-more ! bytes memory chunk to b copied, the rest of the word can be read ! without side effects. -! This could be easily changed by increasing the minumum size of +! This could be easily changed by increasing the minimum size of ! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2, ! however, this would cost a few extra cyles on average. ! For SHmedia, the assumption is that any quadword can be read in its diff --git a/arch/sh/lib64/dbg.c b/arch/sh/lib64/dbg.c deleted file mode 100644 index 75825ef6e08..00000000000 --- a/arch/sh/lib64/dbg.c +++ /dev/null @@ -1,430 +0,0 @@ -/*-------------------------------------------------------------------------- --- --- Identity : Linux50 Debug Funcions --- --- File : arch/sh/lib64/dbg.c --- --- Copyright 2000, 2001 STMicroelectronics Limited. 
--- Copyright 2004 Richard Curnow (evt_debug etc) --- ---------------------------------------------------------------------------*/ -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/fs.h> -#include <asm/mmu_context.h> - -typedef u64 regType_t; - -static regType_t getConfigReg(u64 id) -{ - register u64 reg __asm__("r2"); - asm volatile ("getcfg %1, 0, %0":"=r" (reg):"r"(id)); - return (reg); -} - -/* ======================================================================= */ - -static char *szTab[] = { "4k", "64k", "1M", "512M" }; -static char *protTab[] = { "----", - "---R", - "--X-", - "--XR", - "-W--", - "-W-R", - "-WX-", - "-WXR", - "U---", - "U--R", - "U-X-", - "U-XR", - "UW--", - "UW-R", - "UWX-", - "UWXR" -}; -#define ITLB_BASE 0x00000000 -#define DTLB_BASE 0x00800000 -#define MAX_TLBs 64 -/* PTE High */ -#define GET_VALID(pte) ((pte) & 0x1) -#define GET_SHARED(pte) ((pte) & 0x2) -#define GET_ASID(pte) ((pte >> 2) & 0x0ff) -#define GET_EPN(pte) ((pte) & 0xfffff000) - -/* PTE Low */ -#define GET_CBEHAVIOR(pte) ((pte) & 0x3) -#define GET_PAGE_SIZE(pte) szTab[((pte >> 3) & 0x3)] -#define GET_PROTECTION(pte) protTab[((pte >> 6) & 0xf)] -#define GET_PPN(pte) ((pte) & 0xfffff000) - -#define PAGE_1K_MASK 0x00000000 -#define PAGE_4K_MASK 0x00000010 -#define PAGE_64K_MASK 0x00000080 -#define MMU_PAGESIZE_MASK (PAGE_64K_MASK | PAGE_4K_MASK) -#define PAGE_1MB_MASK MMU_PAGESIZE_MASK -#define PAGE_1K (1024) -#define PAGE_4K (1024 * 4) -#define PAGE_64K (1024 * 64) -#define PAGE_1MB (1024 * 1024) - -#define HOW_TO_READ_TLB_CONTENT \ - "[ ID] PPN EPN ASID Share CB P.Size PROT.\n" - -void print_single_tlb(unsigned long tlb, int single_print) -{ - regType_t pteH; - regType_t pteL; - unsigned int valid, shared, asid, epn, cb, ppn; - char *pSize; - char *pProt; - - /* - ** in case of single print <single_print> is true, this implies: - ** 1) print the TLB in any case also if NOT VALID - ** 2) print out the header - */ - - pteH = getConfigReg(tlb); - valid = GET_VALID(pteH); - if (single_print) - printk(HOW_TO_READ_TLB_CONTENT); - else if (!valid) - return; - - pteL = getConfigReg(tlb + 1); - - shared = GET_SHARED(pteH); - asid = GET_ASID(pteH); - epn = GET_EPN(pteH); - cb = GET_CBEHAVIOR(pteL); - pSize = GET_PAGE_SIZE(pteL); - pProt = GET_PROTECTION(pteL); - ppn = GET_PPN(pteL); - printk("[%c%2ld] 0x%08x 0x%08x %03d %02x %02x %4s %s\n", - ((valid) ? 
' ' : 'u'), ((tlb & 0x0ffff) / TLB_STEP), - ppn, epn, asid, shared, cb, pSize, pProt); -} - -void print_dtlb(void) -{ - int count; - unsigned long tlb; - - printk(" ================= SH-5 D-TLBs Status ===================\n"); - printk(HOW_TO_READ_TLB_CONTENT); - tlb = DTLB_BASE; - for (count = 0; count < MAX_TLBs; count++, tlb += TLB_STEP) - print_single_tlb(tlb, 0); - printk - (" =============================================================\n"); -} - -void print_itlb(void) -{ - int count; - unsigned long tlb; - - printk(" ================= SH-5 I-TLBs Status ===================\n"); - printk(HOW_TO_READ_TLB_CONTENT); - tlb = ITLB_BASE; - for (count = 0; count < MAX_TLBs; count++, tlb += TLB_STEP) - print_single_tlb(tlb, 0); - printk - (" =============================================================\n"); -} - -/* ======================================================================= */ - -#ifdef CONFIG_POOR_MANS_STRACE - -#include "syscalltab.h" - -struct ring_node { - int evt; - int ret_addr; - int event; - int tra; - int pid; - unsigned long sp; - unsigned long pc; -}; - -static struct ring_node event_ring[16]; -static int event_ptr = 0; - -struct stored_syscall_data { - int pid; - int syscall_number; -}; - -#define N_STORED_SYSCALLS 16 - -static struct stored_syscall_data stored_syscalls[N_STORED_SYSCALLS]; -static int syscall_next=0; -static int syscall_next_print=0; - -void evt_debug(int evt, int ret_addr, int event, int tra, struct pt_regs *regs) -{ - int syscallno = tra & 0xff; - unsigned long sp; - unsigned long stack_bottom; - int pid; - struct ring_node *rr; - - pid = current->pid; - stack_bottom = (unsigned long) task_stack_page(current); - asm volatile("ori r15, 0, %0" : "=r" (sp)); - rr = event_ring + event_ptr; - rr->evt = evt; - rr->ret_addr = ret_addr; - rr->event = event; - rr->tra = tra; - rr->pid = pid; - rr->sp = sp; - rr->pc = regs->pc; - - if (sp < stack_bottom + 3092) { - printk("evt_debug : stack underflow report\n"); - int i, j; - for (j=0, i = event_ptr; j<16; j++) { - rr = event_ring + i; - printk("evt=%08x event=%08x tra=%08x pid=%5d sp=%08lx pc=%08lx\n", - rr->evt, rr->event, rr->tra, rr->pid, rr->sp, rr->pc); - i--; - i &= 15; - } - panic("STACK UNDERFLOW\n"); - } - - event_ptr = (event_ptr + 1) & 15; - - if ((event == 2) && (evt == 0x160)) { - if (syscallno < NUM_SYSCALL_INFO_ENTRIES) { - /* Store the syscall information to print later. We - * can't print this now - currently we're running with - * SR.BL=1, so we can't take a tlbmiss (which could occur - * in the console drivers under printk). - * - * Just overwrite old entries on ring overflow - this - * is only for last-hope debugging. 
*/ - stored_syscalls[syscall_next].pid = current->pid; - stored_syscalls[syscall_next].syscall_number = syscallno; - syscall_next++; - syscall_next &= (N_STORED_SYSCALLS - 1); - } - } -} - -static void drain_syscalls(void) { - while (syscall_next_print != syscall_next) { - printk("Task %d: %s()\n", - stored_syscalls[syscall_next_print].pid, - syscall_info_table[stored_syscalls[syscall_next_print].syscall_number].name); - syscall_next_print++; - syscall_next_print &= (N_STORED_SYSCALLS - 1); - } -} - -void evt_debug2(unsigned int ret) -{ - drain_syscalls(); - printk("Task %d: syscall returns %08x\n", current->pid, ret); -} - -void evt_debug_ret_from_irq(struct pt_regs *regs) -{ - int pid; - struct ring_node *rr; - - pid = current->pid; - rr = event_ring + event_ptr; - rr->evt = 0xffff; - rr->ret_addr = 0; - rr->event = 0; - rr->tra = 0; - rr->pid = pid; - rr->pc = regs->pc; - event_ptr = (event_ptr + 1) & 15; -} - -void evt_debug_ret_from_exc(struct pt_regs *regs) -{ - int pid; - struct ring_node *rr; - - pid = current->pid; - rr = event_ring + event_ptr; - rr->evt = 0xfffe; - rr->ret_addr = 0; - rr->event = 0; - rr->tra = 0; - rr->pid = pid; - rr->pc = regs->pc; - event_ptr = (event_ptr + 1) & 15; -} - -#endif /* CONFIG_POOR_MANS_STRACE */ - -/* ======================================================================= */ - -void show_excp_regs(char *from, int trapnr, int signr, struct pt_regs *regs) -{ - - unsigned long long ah, al, bh, bl, ch, cl; - - printk("\n"); - printk("EXCEPTION - %s: task %d; Linux trap # %d; signal = %d\n", - ((from) ? from : "???"), current->pid, trapnr, signr); - - asm volatile ("getcon " __EXPEVT ", %0":"=r"(ah)); - asm volatile ("getcon " __EXPEVT ", %0":"=r"(al)); - ah = (ah) >> 32; - al = (al) & 0xffffffff; - asm volatile ("getcon " __KCR1 ", %0":"=r"(bh)); - asm volatile ("getcon " __KCR1 ", %0":"=r"(bl)); - bh = (bh) >> 32; - bl = (bl) & 0xffffffff; - asm volatile ("getcon " __INTEVT ", %0":"=r"(ch)); - asm volatile ("getcon " __INTEVT ", %0":"=r"(cl)); - ch = (ch) >> 32; - cl = (cl) & 0xffffffff; - printk("EXPE: %08Lx%08Lx KCR1: %08Lx%08Lx INTE: %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - asm volatile ("getcon " __PEXPEVT ", %0":"=r"(ah)); - asm volatile ("getcon " __PEXPEVT ", %0":"=r"(al)); - ah = (ah) >> 32; - al = (al) & 0xffffffff; - asm volatile ("getcon " __PSPC ", %0":"=r"(bh)); - asm volatile ("getcon " __PSPC ", %0":"=r"(bl)); - bh = (bh) >> 32; - bl = (bl) & 0xffffffff; - asm volatile ("getcon " __PSSR ", %0":"=r"(ch)); - asm volatile ("getcon " __PSSR ", %0":"=r"(cl)); - ch = (ch) >> 32; - cl = (cl) & 0xffffffff; - printk("PEXP: %08Lx%08Lx PSPC: %08Lx%08Lx PSSR: %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->pc) >> 32; - al = (regs->pc) & 0xffffffff; - bh = (regs->regs[18]) >> 32; - bl = (regs->regs[18]) & 0xffffffff; - ch = (regs->regs[15]) >> 32; - cl = (regs->regs[15]) & 0xffffffff; - printk("PC : %08Lx%08Lx LINK: %08Lx%08Lx SP : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->sr) >> 32; - al = (regs->sr) & 0xffffffff; - asm volatile ("getcon " __TEA ", %0":"=r"(bh)); - asm volatile ("getcon " __TEA ", %0":"=r"(bl)); - bh = (bh) >> 32; - bl = (bl) & 0xffffffff; - asm volatile ("getcon " __KCR0 ", %0":"=r"(ch)); - asm volatile ("getcon " __KCR0 ", %0":"=r"(cl)); - ch = (ch) >> 32; - cl = (cl) & 0xffffffff; - printk("SR : %08Lx%08Lx TEA : %08Lx%08Lx KCR0: %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[0]) >> 32; - al = (regs->regs[0]) & 0xffffffff; - bh = (regs->regs[1]) >> 32; - bl = (regs->regs[1]) & 
0xffffffff; - ch = (regs->regs[2]) >> 32; - cl = (regs->regs[2]) & 0xffffffff; - printk("R0 : %08Lx%08Lx R1 : %08Lx%08Lx R2 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[3]) >> 32; - al = (regs->regs[3]) & 0xffffffff; - bh = (regs->regs[4]) >> 32; - bl = (regs->regs[4]) & 0xffffffff; - ch = (regs->regs[5]) >> 32; - cl = (regs->regs[5]) & 0xffffffff; - printk("R3 : %08Lx%08Lx R4 : %08Lx%08Lx R5 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[6]) >> 32; - al = (regs->regs[6]) & 0xffffffff; - bh = (regs->regs[7]) >> 32; - bl = (regs->regs[7]) & 0xffffffff; - ch = (regs->regs[8]) >> 32; - cl = (regs->regs[8]) & 0xffffffff; - printk("R6 : %08Lx%08Lx R7 : %08Lx%08Lx R8 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - - ah = (regs->regs[9]) >> 32; - al = (regs->regs[9]) & 0xffffffff; - bh = (regs->regs[10]) >> 32; - bl = (regs->regs[10]) & 0xffffffff; - ch = (regs->regs[11]) >> 32; - cl = (regs->regs[11]) & 0xffffffff; - printk("R9 : %08Lx%08Lx R10 : %08Lx%08Lx R11 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - printk("....\n"); - - ah = (regs->tregs[0]) >> 32; - al = (regs->tregs[0]) & 0xffffffff; - bh = (regs->tregs[1]) >> 32; - bl = (regs->tregs[1]) & 0xffffffff; - ch = (regs->tregs[2]) >> 32; - cl = (regs->tregs[2]) & 0xffffffff; - printk("T0 : %08Lx%08Lx T1 : %08Lx%08Lx T2 : %08Lx%08Lx\n", - ah, al, bh, bl, ch, cl); - printk("....\n"); - - print_dtlb(); - print_itlb(); -} - -/* ======================================================================= */ - -/* -** Depending on <base> scan the MMU, Data or Instruction side -** looking for a valid mapping matching Eaddr & asid. -** Return -1 if not found or the TLB id entry otherwise. -** Note: it works only for 4k pages! -*/ -static unsigned long -lookup_mmu_side(unsigned long base, unsigned long Eaddr, unsigned long asid) -{ - regType_t pteH; - unsigned long epn; - int count; - - epn = Eaddr & 0xfffff000; - - for (count = 0; count < MAX_TLBs; count++, base += TLB_STEP) { - pteH = getConfigReg(base); - if (GET_VALID(pteH)) - if ((unsigned long) GET_EPN(pteH) == epn) - if ((unsigned long) GET_ASID(pteH) == asid) - break; - } - return ((unsigned long) ((count < MAX_TLBs) ? base : -1)); -} - -unsigned long lookup_dtlb(unsigned long Eaddr) -{ - unsigned long asid = get_asid(); - return (lookup_mmu_side((u64) DTLB_BASE, Eaddr, asid)); -} - -unsigned long lookup_itlb(unsigned long Eaddr) -{ - unsigned long asid = get_asid(); - return (lookup_mmu_side((u64) ITLB_BASE, Eaddr, asid)); -} - -void print_page(struct page *page) -{ - printk(" page[%p] -> index 0x%lx, count 0x%x, flags 0x%lx\n", - page, page->index, page_count(page), page->flags); - printk(" address_space = %p, pages =%ld\n", page->mapping, - page->mapping->nrpages); - -} diff --git a/arch/sh/lib64/memcpy.S b/arch/sh/lib64/memcpy.S new file mode 100644 index 00000000000..5d682e0ee24 --- /dev/null +++ b/arch/sh/lib64/memcpy.S @@ -0,0 +1,201 @@ +/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */ +/* Modified by SuperH, Inc. September 2003 */ +! +! Fast SH memcpy +! +! by Toshiyasu Morita (tm@netcom.com) +! hacked by J"orn Rernnecke (joern.rennecke@superh.com) ("o for o-umlaut) +! SH5 code Copyright 2002 SuperH Ltd. +! +! Entry: ARG0: destination pointer +! ARG1: source pointer +! ARG2: byte count +! +! Exit: RESULT: destination pointer +! any other registers in the range r0-r7: trashed +! +! Notes: Usually one wants to do small reads and write a longword, but +! unfortunately it is difficult in some cases to concatanate bytes +! 
into a longword on the SH, so this does a longword read and small +! writes. +! +! This implementation makes two assumptions about how it is called: +! +! 1.: If the byte count is nonzero, the address of the last byte to be +! copied is unsigned greater than the address of the first byte to +! be copied. This could be easily swapped for a signed comparison, +! but the algorithm used needs some comparison. +! +! 2.: When there are two or three bytes in the last word of an 11-or-more +! bytes memory chunk to b copied, the rest of the word can be read +! without side effects. +! This could be easily changed by increasing the minimum size of +! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2, +! however, this would cost a few extra cyles on average. +! For SHmedia, the assumption is that any quadword can be read in its +! enirety if at least one byte is included in the copy. +! + + .section .text..SHmedia32,"ax" + .globl memcpy + .type memcpy, @function + .align 5 + +memcpy: + +#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1 +#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1 +#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1 +#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1 + + ld.b r3,0,r63 + pta/l Large,tr0 + movi 25,r0 + bgeu/u r4,r0,tr0 + nsb r4,r0 + shlli r0,5,r0 + movi (L1-L0+63*32 + 1) & 0xffff,r1 + sub r1, r0, r0 +L0: ptrel r0,tr0 + add r2,r4,r5 + ptabs r18,tr1 + add r3,r4,r6 + blink tr0,r63 + +/* Rearranged to make cut2 safe */ + .balign 8 +L4_7: /* 4..7 byte memcpy cntd. */ + stlo.l r2, 0, r0 + or r6, r7, r6 + sthi.l r5, -1, r6 + stlo.l r5, -4, r6 + blink tr1,r63 + + .balign 8 +L1: /* 0 byte memcpy */ + nop + blink tr1,r63 + nop + nop + nop + nop + +L2_3: /* 2 or 3 byte memcpy cntd. */ + st.b r5,-1,r6 + blink tr1,r63 + + /* 1 byte memcpy */ + ld.b r3,0,r0 + st.b r2,0,r0 + blink tr1,r63 + +L8_15: /* 8..15 byte memcpy cntd. */ + stlo.q r2, 0, r0 + or r6, r7, r6 + sthi.q r5, -1, r6 + stlo.q r5, -8, r6 + blink tr1,r63 + + /* 2 or 3 byte memcpy */ + ld.b r3,0,r0 + ld.b r2,0,r63 + ld.b r3,1,r1 + st.b r2,0,r0 + pta/l L2_3,tr0 + ld.b r6,-1,r6 + st.b r2,1,r1 + blink tr0, r63 + + /* 4 .. 7 byte memcpy */ + LDUAL (r3, 0, r0, r1) + pta L4_7, tr0 + ldlo.l r6, -4, r7 + or r0, r1, r0 + sthi.l r2, 3, r0 + ldhi.l r6, -1, r6 + blink tr0, r63 + + /* 8 .. 15 byte memcpy */ + LDUAQ (r3, 0, r0, r1) + pta L8_15, tr0 + ldlo.q r6, -8, r7 + or r0, r1, r0 + sthi.q r2, 7, r0 + ldhi.q r6, -1, r6 + blink tr0, r63 + + /* 16 .. 24 byte memcpy */ + LDUAQ (r3, 0, r0, r1) + LDUAQ (r3, 8, r8, r9) + or r0, r1, r0 + sthi.q r2, 7, r0 + or r8, r9, r8 + sthi.q r2, 15, r8 + ldlo.q r6, -8, r7 + ldhi.q r6, -1, r6 + stlo.q r2, 8, r8 + stlo.q r2, 0, r0 + or r6, r7, r6 + sthi.q r5, -1, r6 + stlo.q r5, -8, r6 + blink tr1,r63 + +Large: + ld.b r2, 0, r63 + pta/l Loop_ua, tr1 + ori r3, -8, r7 + sub r2, r7, r22 + sub r3, r2, r6 + add r2, r4, r5 + ldlo.q r3, 0, r0 + addi r5, -16, r5 + movi 64+8, r27 // could subtract r7 from that. 
+ stlo.q r2, 0, r0 + sthi.q r2, 7, r0 + ldx.q r22, r6, r0 + bgtu/l r27, r4, tr1 + + addi r5, -48, r27 + pta/l Loop_line, tr0 + addi r6, 64, r36 + addi r6, -24, r19 + addi r6, -16, r20 + addi r6, -8, r21 + +Loop_line: + ldx.q r22, r36, r63 + alloco r22, 32 + addi r22, 32, r22 + ldx.q r22, r19, r23 + sthi.q r22, -25, r0 + ldx.q r22, r20, r24 + ldx.q r22, r21, r25 + stlo.q r22, -32, r0 + ldx.q r22, r6, r0 + sthi.q r22, -17, r23 + sthi.q r22, -9, r24 + sthi.q r22, -1, r25 + stlo.q r22, -24, r23 + stlo.q r22, -16, r24 + stlo.q r22, -8, r25 + bgeu r27, r22, tr0 + +Loop_ua: + addi r22, 8, r22 + sthi.q r22, -1, r0 + stlo.q r22, -8, r0 + ldx.q r22, r6, r0 + bgtu/l r5, r22, tr1 + + add r3, r4, r7 + ldlo.q r7, -8, r1 + sthi.q r22, 7, r0 + ldhi.q r7, -1, r7 + ptabs r18,tr1 + stlo.q r22, 0, r0 + or r1, r7, r1 + sthi.q r5, 15, r1 + stlo.q r5, 8, r1 + blink tr1, r63 + + .size memcpy,.-memcpy diff --git a/arch/sh/lib64/memcpy.c b/arch/sh/lib64/memcpy.c deleted file mode 100644 index fba436a92bf..00000000000 --- a/arch/sh/lib64/memcpy.c +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (C) 2002 Mark Debbage (Mark.Debbage@superh.com) - * - * May be copied or modified under the terms of the GNU General Public - * License. See linux/COPYING for more information. - * - */ - -#include <linux/types.h> -#include <asm/string.h> - -// This is a simplistic optimization of memcpy to increase the -// granularity of access beyond one byte using aligned -// loads and stores. This is not an optimal implementation -// for SH-5 (especially with regard to prefetching and the cache), -// and a better version should be provided later ... - -void *memcpy(void *dest, const void *src, size_t count) -{ - char *d = (char *) dest, *s = (char *) src; - - if (count >= 32) { - int i = 8 - (((unsigned long) d) & 0x7); - - if (i != 8) - while (i-- && count--) { - *d++ = *s++; - } - - if (((((unsigned long) d) & 0x7) == 0) && - ((((unsigned long) s) & 0x7) == 0)) { - while (count >= 32) { - unsigned long long t1, t2, t3, t4; - t1 = *(unsigned long long *) (s); - t2 = *(unsigned long long *) (s + 8); - t3 = *(unsigned long long *) (s + 16); - t4 = *(unsigned long long *) (s + 24); - *(unsigned long long *) (d) = t1; - *(unsigned long long *) (d + 8) = t2; - *(unsigned long long *) (d + 16) = t3; - *(unsigned long long *) (d + 24) = t4; - d += 32; - s += 32; - count -= 32; - } - while (count >= 8) { - *(unsigned long long *) d = - *(unsigned long long *) s; - d += 8; - s += 8; - count -= 8; - } - } - - if (((((unsigned long) d) & 0x3) == 0) && - ((((unsigned long) s) & 0x3) == 0)) { - while (count >= 4) { - *(unsigned long *) d = *(unsigned long *) s; - d += 4; - s += 4; - count -= 4; - } - } - - if (((((unsigned long) d) & 0x1) == 0) && - ((((unsigned long) s) & 0x1) == 0)) { - while (count >= 2) { - *(unsigned short *) d = *(unsigned short *) s; - d += 2; - s += 2; - count -= 2; - } - } - } - - while (count--) { - *d++ = *s++; - } - - return d; -} diff --git a/arch/sh/lib64/memset.S b/arch/sh/lib64/memset.S new file mode 100644 index 00000000000..2d37b048855 --- /dev/null +++ b/arch/sh/lib64/memset.S @@ -0,0 +1,91 @@ +/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */ +/* Modified by SuperH, Inc. September 2003 */ +! +! Fast SH memset +! +! by Toshiyasu Morita (tm@netcom.com) +! +! SH5 code by J"orn Rennecke (joern.rennecke@superh.com) +! Copyright 2002 SuperH Ltd. +! 
+ +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define SHHI shlld +#define SHLO shlrd +#else +#define SHHI shlrd +#define SHLO shlld +#endif + + .section .text..SHmedia32,"ax" + .globl memset + .type memset, @function + + .align 5 + +memset: + pta/l multiquad, tr0 + andi r2, 7, r22 + ptabs r18, tr2 + mshflo.b r3,r3,r3 + add r4, r22, r23 + mperm.w r3, r63, r3 // Fill pattern now in every byte of r3 + + movi 8, r9 + bgtu/u r23, r9, tr0 // multiquad + + beqi/u r4, 0, tr2 // Return with size 0 - ensures no mem accesses + ldlo.q r2, 0, r7 + shlli r4, 2, r4 + movi -1, r8 + SHHI r8, r4, r8 + SHHI r8, r4, r8 + mcmv r7, r8, r3 + stlo.q r2, 0, r3 + blink tr2, r63 + +multiquad: + pta/l lastquad, tr0 + stlo.q r2, 0, r3 + shlri r23, 3, r24 + add r2, r4, r5 + beqi/u r24, 1, tr0 // lastquad + pta/l loop, tr1 + sub r2, r22, r25 + andi r5, -8, r20 // calculate end address and + addi r20, -7*8, r8 // loop end address; This might overflow, so we need + // to use a different test before we start the loop + bge/u r24, r9, tr1 // loop + st.q r25, 8, r3 + st.q r20, -8, r3 + shlri r24, 1, r24 + beqi/u r24, 1, tr0 // lastquad + st.q r25, 16, r3 + st.q r20, -16, r3 + beqi/u r24, 2, tr0 // lastquad + st.q r25, 24, r3 + st.q r20, -24, r3 +lastquad: + sthi.q r5, -1, r3 + blink tr2,r63 + +loop: +!!! alloco r25, 32 // QQQ comment out for short-term fix to SHUK #3895. + // QQQ commenting out is locically correct, but sub-optimal + // QQQ Sean McGoogan - 4th April 2003. + st.q r25, 8, r3 + st.q r25, 16, r3 + st.q r25, 24, r3 + st.q r25, 32, r3 + addi r25, 32, r25 + bgeu/l r8, r25, tr1 // loop + + st.q r20, -40, r3 + st.q r20, -32, r3 + st.q r20, -24, r3 + st.q r20, -16, r3 + st.q r20, -8, r3 + sthi.q r5, -1, r3 + blink tr2,r63 + + .size memset,.-memset diff --git a/arch/sh/lib64/panic.c b/arch/sh/lib64/panic.c index ff559e2a96f..38c954e04f6 100644 --- a/arch/sh/lib64/panic.c +++ b/arch/sh/lib64/panic.c @@ -6,53 +6,10 @@ * for more details. */ -#include <linux/kernel.h> -#include <asm/io.h> -#include <asm/cpu/registers.h> - -/* THIS IS A PHYSICAL ADDRESS */ -#define HDSP2534_ADDR (0x04002100) - -#ifdef CONFIG_SH_CAYMAN - -static void poor_mans_delay(void) -{ - int i; - for (i = 0; i < 2500000; i++) { - } /* poor man's delay */ -} - -static void show_value(unsigned long x) -{ - int i; - unsigned nibble; - for (i = 0; i < 8; i++) { - nibble = ((x >> (i * 4)) & 0xf); - - ctrl_outb(nibble + ((nibble > 9) ? 
55 : 48), - HDSP2534_ADDR + 0xe0 + ((7 - i) << 2)); - } -} - -#endif - void panic_handler(unsigned long panicPC, unsigned long panicSSR, unsigned long panicEXPEVT) { -#ifdef CONFIG_SH_CAYMAN - while (1) { - /* This piece of code displays the PC on the LED display */ - show_value(panicPC); - poor_mans_delay(); - show_value(panicSSR); - poor_mans_delay(); - show_value(panicEXPEVT); - poor_mans_delay(); - } -#endif - /* Never return from the panic handler */ for (;;) ; - } diff --git a/arch/sh/lib64/sdivsi3.S b/arch/sh/lib64/sdivsi3.S new file mode 100644 index 00000000000..1963bbd4228 --- /dev/null +++ b/arch/sh/lib64/sdivsi3.S @@ -0,0 +1,135 @@ + .global __sdivsi3 + .global __sdivsi3_1 + .global __sdivsi3_2 + .section .text..SHmedia32,"ax" + .align 2 + + /* inputs: r4,r5 */ + /* clobbered: r1,r18,r19,r20,r21,r25,tr0 */ + /* result in r0 */ +__sdivsi3: +__sdivsi3_1: + ptb __div_table,tr0 + gettr tr0,r20 + +__sdivsi3_2: + nsb r5, r1 + shlld r5, r1, r25 /* normalize; [-2 ..1, 1..2) in s2.62 */ + shari r25, 58, r21 /* extract 5(6) bit index (s2.4 with hole -1..1) */ + /* bubble */ + ldx.ub r20, r21, r19 /* u0.8 */ + shari r25, 32, r25 /* normalize to s2.30 */ + shlli r21, 1, r21 + muls.l r25, r19, r19 /* s2.38 */ + ldx.w r20, r21, r21 /* s2.14 */ + ptabs r18, tr0 + shari r19, 24, r19 /* truncate to s2.14 */ + sub r21, r19, r19 /* some 11 bit inverse in s1.14 */ + muls.l r19, r19, r21 /* u0.28 */ + sub r63, r1, r1 + addi r1, 92, r1 + muls.l r25, r21, r18 /* s2.58 */ + shlli r19, 45, r19 /* multiply by two and convert to s2.58 */ + /* bubble */ + sub r19, r18, r18 + shari r18, 28, r18 /* some 22 bit inverse in s1.30 */ + muls.l r18, r25, r0 /* s2.60 */ + muls.l r18, r4, r25 /* s32.30 */ + /* bubble */ + shari r0, 16, r19 /* s-16.44 */ + muls.l r19, r18, r19 /* s-16.74 */ + shari r25, 63, r0 + shari r4, 14, r18 /* s19.-14 */ + shari r19, 30, r19 /* s-16.44 */ + muls.l r19, r18, r19 /* s15.30 */ + xor r21, r0, r21 /* You could also use the constant 1 << 27. */ + add r21, r25, r21 + sub r21, r19, r21 + shard r21, r1, r21 + sub r21, r0, r0 + blink tr0, r63 + +/* This table has been generated by divtab.c . 
+Defects for bias -330: + Max defect: 6.081536e-07 at -1.000000e+00 + Min defect: 2.849516e-08 at 1.030651e+00 + Max 2nd step defect: 9.606539e-12 at -1.000000e+00 + Min 2nd step defect: 0.000000e+00 at 0.000000e+00 + Defect at 1: 1.238659e-07 + Defect at -2: 1.061708e-07 */ + + .balign 2 + .type __div_table,@object + .size __div_table,128 +/* negative division constants */ + .word -16638 + .word -17135 + .word -17737 + .word -18433 + .word -19103 + .word -19751 + .word -20583 + .word -21383 + .word -22343 + .word -23353 + .word -24407 + .word -25582 + .word -26863 + .word -28382 + .word -29965 + .word -31800 +/* negative division factors */ + .byte 66 + .byte 70 + .byte 75 + .byte 81 + .byte 87 + .byte 93 + .byte 101 + .byte 109 + .byte 119 + .byte 130 + .byte 142 + .byte 156 + .byte 172 + .byte 192 + .byte 214 + .byte 241 + .skip 16 + .global __div_table +__div_table: + .skip 16 +/* positive division factors */ + .byte 241 + .byte 214 + .byte 192 + .byte 172 + .byte 156 + .byte 142 + .byte 130 + .byte 119 + .byte 109 + .byte 101 + .byte 93 + .byte 87 + .byte 81 + .byte 75 + .byte 70 + .byte 66 +/* positive division constants */ + .word 31801 + .word 29966 + .word 28383 + .word 26864 + .word 25583 + .word 24408 + .word 23354 + .word 22344 + .word 21384 + .word 20584 + .word 19752 + .word 19104 + .word 18434 + .word 17738 + .word 17136 + .word 16639 diff --git a/arch/sh/lib64/strcpy.S b/arch/sh/lib64/strcpy.S new file mode 100644 index 00000000000..ea7c9c533ee --- /dev/null +++ b/arch/sh/lib64/strcpy.S @@ -0,0 +1,97 @@ +/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */ +/* Modified by SuperH, Inc. September 2003 */ +! Entry: arg0: destination +! arg1: source +! Exit: result: destination +! +! SH5 code Copyright 2002 SuperH Ltd. + +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define SHHI shlld +#define SHLO shlrd +#else +#define SHHI shlrd +#define SHLO shlld +#endif + + .section .text..SHmedia32,"ax" + .globl strcpy + .type strcpy, @function + .align 5 + +strcpy: + + pta/l shortstring,tr1 + ldlo.q r3,0,r4 + ptabs r18,tr4 + shlli r3,3,r7 + addi r2, 8, r0 + mcmpeq.b r4,r63,r6 + SHHI r6,r7,r6 + bnei/u r6,0,tr1 // shortstring + pta/l no_lddst, tr2 + ori r3,-8,r23 + sub r2, r23, r0 + sub r3, r2, r21 + addi r21, 8, r20 + ldx.q r0, r21, r5 + pta/l loop, tr0 + ori r2,-8,r22 + mcmpeq.b r5, r63, r6 + bgt/u r22, r23, tr2 // no_lddst + + // r22 < r23 : Need to do a load from the destination. + // r22 == r23 : Doesn't actually need to load from destination, + // but still can be handled here. + ldlo.q r2, 0, r9 + movi -1, r8 + SHLO r8, r7, r8 + mcmv r4, r8, r9 + stlo.q r2, 0, r9 + beqi/l r6, 0, tr0 // loop + + add r5, r63, r4 + addi r0, 8, r0 + blink tr1, r63 // shortstring +no_lddst: + // r22 > r23: note that for r22 == r23 the sthi.q would clobber + // bytes before the destination region. 
+ stlo.q r2, 0, r4 + SHHI r4, r7, r4 + sthi.q r0, -1, r4 + beqi/l r6, 0, tr0 // loop + + add r5, r63, r4 + addi r0, 8, r0 +shortstring: +#if __BYTE_ORDER != __LITTLE_ENDIAN + pta/l shortstring2,tr1 + byterev r4,r4 +#endif +shortstring2: + st.b r0,-8,r4 + andi r4,0xff,r5 + shlri r4,8,r4 + addi r0,1,r0 + bnei/l r5,0,tr1 + blink tr4,r63 // return + + .balign 8 +loop: + stlo.q r0, 0, r5 + ldx.q r0, r20, r4 + addi r0, 16, r0 + sthi.q r0, -9, r5 + mcmpeq.b r4, r63, r6 + bnei/u r6, 0, tr1 // shortstring + ldx.q r0, r21, r5 + stlo.q r0, -8, r4 + sthi.q r0, -1, r4 + mcmpeq.b r5, r63, r6 + beqi/l r6, 0, tr0 // loop + + add r5, r63, r4 + addi r0, 8, r0 + blink tr1, r63 // shortstring + + .size strcpy,.-strcpy diff --git a/arch/sh/lib64/strlen.S b/arch/sh/lib64/strlen.S new file mode 100644 index 00000000000..cbc0d912e5f --- /dev/null +++ b/arch/sh/lib64/strlen.S @@ -0,0 +1,33 @@ +/* + * Simplistic strlen() implementation for SHmedia. + * + * Copyright (C) 2003 Paul Mundt <lethal@linux-sh.org> + */ + + .section .text..SHmedia32,"ax" + .globl strlen + .type strlen,@function + + .balign 16 +strlen: + ptabs r18, tr4 + + /* + * Note: We could easily deal with the NULL case here with a simple + * sanity check, though it seems that the behavior we want is to fault + * in the event that r2 == NULL, so we don't bother. + */ +/* beqi r2, 0, tr4 */ ! Sanity check + + movi -1, r0 + pta/l loop, tr0 +loop: + ld.b r2, 0, r1 + addi r2, 1, r2 + addi r0, 1, r0 + bnei/l r1, 0, tr0 + + or r0, r63, r2 + blink tr4, r63 + + .size strlen,.-strlen diff --git a/arch/sh/lib64/udelay.c b/arch/sh/lib64/udelay.c index d76bd801194..f215b063da7 100644 --- a/arch/sh/lib64/udelay.c +++ b/arch/sh/lib64/udelay.c @@ -33,7 +33,7 @@ void __delay(unsigned long loops) :"0"(loops)); } -inline void __const_udelay(unsigned long xloops) +void __const_udelay(unsigned long xloops) { __delay(xloops * (HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy)); } diff --git a/arch/sh/lib64/udivdi3.S b/arch/sh/lib64/udivdi3.S new file mode 100644 index 00000000000..6895c0225b8 --- /dev/null +++ b/arch/sh/lib64/udivdi3.S @@ -0,0 +1,120 @@ + .section .text..SHmedia32,"ax" + .align 2 + .global __udivdi3 +__udivdi3: + shlri r3,1,r4 + nsb r4,r22 + shlld r3,r22,r6 + shlri r6,49,r5 + movi 0xffffffffffffbaf1,r21 /* .l shift count 17. */ + sub r21,r5,r1 + mmulfx.w r1,r1,r4 + mshflo.w r1,r63,r1 + sub r63,r22,r20 // r63 == 64 % 64 + mmulfx.w r5,r4,r4 + pta large_divisor,tr0 + addi r20,32,r9 + msub.w r1,r4,r1 + madd.w r1,r1,r1 + mmulfx.w r1,r1,r4 + shlri r6,32,r7 + bgt/u r9,r63,tr0 // large_divisor + mmulfx.w r5,r4,r4 + shlri r2,32+14,r19 + addi r22,-31,r0 + msub.w r1,r4,r1 + + mulu.l r1,r7,r4 + addi r1,-3,r5 + mulu.l r5,r19,r5 + sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 + shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as + the case may be, %0000000000000000 000.11111111111, still */ + muls.l r1,r4,r4 /* leaving at least one sign bit. */ + mulu.l r5,r3,r8 + mshalds.l r1,r21,r1 + shari r4,26,r4 + shlld r8,r0,r8 + add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5) + sub r2,r8,r2 + /* Can do second step of 64 : 32 div now, using r1 and the rest in r2. 
*/ + + shlri r2,22,r21 + mulu.l r21,r1,r21 + shlld r5,r0,r8 + addi r20,30-22,r0 + shlrd r21,r0,r21 + mulu.l r21,r3,r5 + add r8,r21,r8 + mcmpgt.l r21,r63,r21 // See Note 1 + addi r20,30,r0 + mshfhi.l r63,r21,r21 + sub r2,r5,r2 + andc r2,r21,r2 + + /* small divisor: need a third divide step */ + mulu.l r2,r1,r7 + ptabs r18,tr0 + addi r2,1,r2 + shlrd r7,r0,r7 + mulu.l r7,r3,r5 + add r8,r7,r8 + sub r2,r3,r2 + cmpgt r2,r5,r5 + add r8,r5,r2 + /* could test r3 here to check for divide by zero. */ + blink tr0,r63 + +large_divisor: + mmulfx.w r5,r4,r4 + shlrd r2,r9,r25 + shlri r25,32,r8 + msub.w r1,r4,r1 + + mulu.l r1,r7,r4 + addi r1,-3,r5 + mulu.l r5,r8,r5 + sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 + shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as + the case may be, %0000000000000000 000.11111111111, still */ + muls.l r1,r4,r4 /* leaving at least one sign bit. */ + shlri r5,14-1,r8 + mulu.l r8,r7,r5 + mshalds.l r1,r21,r1 + shari r4,26,r4 + add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5) + sub r25,r5,r25 + /* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */ + + shlri r25,22,r21 + mulu.l r21,r1,r21 + pta no_lo_adj,tr0 + addi r22,32,r0 + shlri r21,40,r21 + mulu.l r21,r7,r5 + add r8,r21,r8 + shlld r2,r0,r2 + sub r25,r5,r25 + bgtu/u r7,r25,tr0 // no_lo_adj + addi r8,1,r8 + sub r25,r7,r25 +no_lo_adj: + mextr4 r2,r25,r2 + + /* large_divisor: only needs a few adjustments. */ + mulu.l r8,r6,r5 + ptabs r18,tr0 + /* bubble */ + cmpgtu r5,r2,r5 + sub r8,r5,r2 + blink tr0,r63 + +/* Note 1: To shift the result of the second divide stage so that the result + always fits into 32 bits, yet we still reduce the rest sufficiently + would require a lot of instructions to do the shifts just right. Using + the full 64 bit shift result to multiply with the divisor would require + four extra instructions for the upper 32 bits (shift / mulu / shift / sub). + Fortunately, if the upper 32 bits of the shift result are nonzero, we + know that the rest after taking this partial result into account will + fit into 32 bits. So we just clear the upper 32 bits of the rest if the + upper 32 bits of the partial result are nonzero. */ diff --git a/arch/sh/lib64/udivsi3.S b/arch/sh/lib64/udivsi3.S new file mode 100644 index 00000000000..e68120e4b84 --- /dev/null +++ b/arch/sh/lib64/udivsi3.S @@ -0,0 +1,59 @@ + .global __udivsi3 + .section .text..SHmedia32,"ax" + .align 2 + +/* + inputs: r4,r5 + clobbered: r18,r19,r20,r21,r22,r25,tr0 + result in r0. + */ +__udivsi3: + addz.l r5,r63,r22 + nsb r22,r0 + shlld r22,r0,r25 + shlri r25,48,r25 + movi 0xffffffffffffbb0c,r20 /* shift count eqiv 76 */ + sub r20,r25,r21 + mmulfx.w r21,r21,r19 + mshflo.w r21,r63,r21 + ptabs r18,tr0 + mmulfx.w r25,r19,r19 + sub r20,r0,r0 + /* bubble */ + msub.w r21,r19,r19 + + /* + * It would be nice for scheduling to do this add to r21 before + * the msub.w, but we need a different value for r19 to keep + * errors under control. + */ + addi r19,-2,r21 + mulu.l r4,r21,r18 + mmulfx.w r19,r19,r19 + shlli r21,15,r21 + shlrd r18,r0,r18 + mulu.l r18,r22,r20 + mmacnfx.wl r25,r19,r21 + /* bubble */ + sub r4,r20,r25 + + mulu.l r25,r21,r19 + addi r0,14,r0 + /* bubble */ + shlrd r19,r0,r19 + mulu.l r19,r22,r20 + add r18,r19,r18 + /* bubble */ + sub.l r25,r20,r25 + + mulu.l r25,r21,r19 + addz.l r25,r63,r25 + sub r25,r22,r25 + shlrd r19,r0,r19 + mulu.l r19,r22,r20 + addi r25,1,r25 + add r18,r19,r18 + + cmpgt r25,r20,r25 + add.l r18,r25,r0 + blink tr0,r63 |
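For reference, the three libgcc-derived routines added above (__udivsi3, __sdivsi3, __udivdi3) share one pattern: normalize the divisor, form a fixed-point reciprocal estimate (a __div_table lookup in __sdivsi3, short multiply-accumulate steps in the others), refine it toward full precision, and repair the last bits of the quotient from the remainder. Below is a minimal C sketch of that reciprocal-refinement idea, not the kernel's exact fixed-point formats or table; it assumes a GCC/Clang host (__uint128_t, __builtin_clz), and recip_div32 is a hypothetical name used only for this illustration.

#include <stdint.h>
#include <stdio.h>

static uint32_t recip_div32(uint32_t n, uint32_t d)	/* d != 0 */
{
	int s = __builtin_clz(d);
	uint64_t dn = (uint64_t)d << s;	/* normalize: dn in [2^31, 2^32) */

	/* Crude linear seed for R = 2^63/dn; the constant is floor(2^32.5),
	 * which keeps the seed at or below R across the whole range. */
	uint64_t r = 6074000999ull - dn;

	/* Newton-Raphson: r <- r*(2 - dn*r/2^63) roughly doubles the
	 * number of correct bits per step; five steps suffice here. */
	for (int i = 0; i < 5; i++)
		r = 2*r - (uint64_t)(((__uint128_t)dn * r * r) >> 63);

	/* Quotient estimate, then fix it up from the remainder; rounding
	 * in the steps above can leave it off by a few either way. */
	uint64_t q = (uint64_t)(((__uint128_t)n * r) >> (63 - s));
	int64_t rem = (int64_t)n - (int64_t)(q * d);
	while (rem < 0)			{ q--; rem += d; }
	while (rem >= (int64_t)d)	{ q++; rem -= d; }
	return (uint32_t)q;
}

int main(void)
{
	uint32_t n = 1000000007u, d = 97u;
	printf("%u %u\n", recip_div32(n, d), n / d);	/* both: 10309278 */
	return 0;
}

The assembly versions avoid the final loop by carrying enough reciprocal precision that a single conditional correction is enough, which is where most of their instruction count goes.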

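On the checksum side, everything the removed c-checksum.c did rests on one identity: a wide ones'-complement sum can be folded to 16 bits by repeatedly adding the high half back into the low half, because 2^16 is congruent to 1 modulo 2^16 - 1, so a carry out of bit 15 is worth exactly 1 back at bit 0 (end-around carry). A small self-contained check of that equivalence follows; plain host C, nothing here is kernel API, and fold64to16 is just a local name mirroring the removed from64to16().

#include <assert.h>
#include <stdint.h>

/* Fold a 64-bit accumulator to the 16-bit ones'-complement sum,
 * the same steps as the removed from64to16(). */
static uint16_t fold64to16(uint64_t x)
{
	x = (x & 0xffffffffull) + (x >> 32);	/* 64 -> at most 33 bits */
	x = (x & 0xffff) + (x >> 16);		/* -> 16 bits plus carries */
	x = (x & 0xffff) + (x >> 16);
	x = (x & 0xffff) + (x >> 16);		/* absorb the final carry */
	return (uint16_t)x;
}

int main(void)
{
	/* Sum some 16-bit words two ways: wide-then-fold, and
	 * step-by-step with an explicit end-around carry. */
	uint16_t words[] = { 0xffff, 0x1234, 0xfffe, 0x8001 };
	uint64_t wide = 0;
	uint32_t narrow = 0;

	for (int i = 0; i < 4; i++) {
		wide += words[i];
		narrow += words[i];
		narrow = (narrow & 0xffff) + (narrow >> 16);
	}
	assert(fold64to16(wide) == narrow);	/* both 0x9234 */
	return 0;
}

This congruence is also why do_csum could sum 32-bit words with a separate carry accumulator and fold only once at the end, and why the byte swap for odd starting addresses commutes with the fold.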