aboutsummaryrefslogtreecommitdiff
path: root/arch/sh/lib64
diff options
context:
space:
mode:
Diffstat (limited to 'arch/sh/lib64')
-rw-r--r--arch/sh/lib64/.gitignore1
-rw-r--r--arch/sh/lib64/Makefile8
-rw-r--r--arch/sh/lib64/c-checksum.c214
-rw-r--r--arch/sh/lib64/clear_page.S54
-rw-r--r--arch/sh/lib64/copy_user_memcpy.S2
-rw-r--r--arch/sh/lib64/dbg.c430
-rw-r--r--arch/sh/lib64/memcpy.S201
-rw-r--r--arch/sh/lib64/memcpy.c81
-rw-r--r--arch/sh/lib64/memset.S91
-rw-r--r--arch/sh/lib64/panic.c43
-rw-r--r--arch/sh/lib64/sdivsi3.S135
-rw-r--r--arch/sh/lib64/strcpy.S97
-rw-r--r--arch/sh/lib64/strlen.S33
-rw-r--r--arch/sh/lib64/udelay.c2
-rw-r--r--arch/sh/lib64/udivdi3.S120
-rw-r--r--arch/sh/lib64/udivsi3.S59
16 files changed, 743 insertions, 828 deletions
diff --git a/arch/sh/lib64/.gitignore b/arch/sh/lib64/.gitignore
deleted file mode 100644
index 3508c2cb23c..00000000000
--- a/arch/sh/lib64/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-syscalltab.h
diff --git a/arch/sh/lib64/Makefile b/arch/sh/lib64/Makefile
index 9950966923a..69779ff741d 100644
--- a/arch/sh/lib64/Makefile
+++ b/arch/sh/lib64/Makefile
@@ -2,7 +2,7 @@
# Makefile for the SH-5 specific library files..
#
# Copyright (C) 2000, 2001 Paolo Alberelli
-# Copyright (C) 2003 Paul Mundt
+# Copyright (C) 2003 - 2008 Paul Mundt
#
# This file is subject to the terms and conditions of the GNU General Public
# License. See the file "COPYING" in the main directory of this archive
@@ -10,6 +10,8 @@
#
# Panic should really be compiled as PIC
-lib-y := udelay.o c-checksum.o dbg.o panic.o memcpy.o copy_user_memcpy.o \
- copy_page.o clear_page.o
+lib-y := udelay.o panic.o memcpy.o memset.o \
+ copy_user_memcpy.o copy_page.o strcpy.o strlen.o
+# Extracted from libgcc
+lib-y += udivsi3.o udivdi3.o sdivsi3.o
diff --git a/arch/sh/lib64/c-checksum.c b/arch/sh/lib64/c-checksum.c
deleted file mode 100644
index 5c284e0cff9..00000000000
--- a/arch/sh/lib64/c-checksum.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * arch/sh/lib64/c-checksum.c
- *
- * This file contains network checksum routines that are better done
- * in an architecture-specific manner due to speed..
- */
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <asm/byteorder.h>
-#include <asm/uaccess.h>
-
-static inline unsigned short from64to16(unsigned long long x)
-{
- /* add up 32-bit words for 33 bits */
- x = (x & 0xffffffff) + (x >> 32);
- /* add up 16-bit and 17-bit words for 17+c bits */
- x = (x & 0xffff) + (x >> 16);
- /* add up 16-bit and 2-bit for 16+c bit */
- x = (x & 0xffff) + (x >> 16);
- /* add up carry.. */
- x = (x & 0xffff) + (x >> 16);
- return x;
-}
-
-static inline unsigned short foldto16(unsigned long x)
-{
- /* add up 16-bit for 17 bits */
- x = (x & 0xffff) + (x >> 16);
- /* add up carry.. */
- x = (x & 0xffff) + (x >> 16);
- return x;
-}
-
-static inline unsigned short myfoldto16(unsigned long long x)
-{
- /* Fold down to 32-bits so we don't loose in the typedef-less
- network stack. */
- /* 64 to 33 */
- x = (x & 0xffffffff) + (x >> 32);
- /* 33 to 32 */
- x = (x & 0xffffffff) + (x >> 32);
-
- /* add up 16-bit for 17 bits */
- x = (x & 0xffff) + (x >> 16);
- /* add up carry.. */
- x = (x & 0xffff) + (x >> 16);
- return x;
-}
-
-#define odd(x) ((x)&1)
-#define U16(x) ntohs(x)
-
-static unsigned long do_csum(const unsigned char *buff, int len)
-{
- int odd, count;
- unsigned long result = 0;
-
- pr_debug("do_csum buff %p, len %d (0x%x)\n", buff, len, len);
-#ifdef DEBUG
- for (i = 0; i < len; i++) {
- if ((i % 26) == 0)
- printk("\n");
- printk("%02X ", buff[i]);
- }
-#endif
-
- if (len <= 0)
- goto out;
-
- odd = 1 & (unsigned long) buff;
- if (odd) {
- result = *buff << 8;
- len--;
- buff++;
- }
- count = len >> 1; /* nr of 16-bit words.. */
- if (count) {
- if (2 & (unsigned long) buff) {
- result += *(unsigned short *) buff;
- count--;
- len -= 2;
- buff += 2;
- }
- count >>= 1; /* nr of 32-bit words.. */
- if (count) {
- unsigned long carry = 0;
- do {
- unsigned long w = *(unsigned long *) buff;
- buff += 4;
- count--;
- result += carry;
- result += w;
- carry = (w > result);
- } while (count);
- result += carry;
- result = (result & 0xffff) + (result >> 16);
- }
- if (len & 2) {
- result += *(unsigned short *) buff;
- buff += 2;
- }
- }
- if (len & 1)
- result += *buff;
- result = foldto16(result);
- if (odd)
- result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-
- pr_debug("\nCHECKSUM is 0x%lx\n", result);
-
- out:
- return result;
-}
-
-/* computes the checksum of a memory block at buff, length len,
- and adds in "sum" (32-bit) */
-__wsum csum_partial(const void *buff, int len, __wsum sum)
-{
- unsigned long long result = do_csum(buff, len);
-
- /* add in old sum, and carry.. */
- result += (__force u32)sum;
- /* 32+c bits -> 32 bits */
- result = (result & 0xffffffff) + (result >> 32);
-
- pr_debug("csum_partial, buff %p len %d sum 0x%x result=0x%016Lx\n",
- buff, len, sum, result);
-
- return (__force __wsum)result;
-}
-
-/* Copy while checksumming, otherwise like csum_partial. */
-__wsum
-csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum)
-{
- sum = csum_partial(src, len, sum);
- memcpy(dst, src, len);
-
- return sum;
-}
-
-/* Copy from userspace and compute checksum. If we catch an exception
- then zero the rest of the buffer. */
-__wsum
-csum_partial_copy_from_user(const void __user *src, void *dst, int len,
- __wsum sum, int *err_ptr)
-{
- int missing;
-
- pr_debug
- ("csum_partial_copy_from_user src %p, dest %p, len %d, sum %08x, err_ptr %p\n",
- src, dst, len, sum, err_ptr);
- missing = copy_from_user(dst, src, len);
- pr_debug(" access_ok %d\n", __access_ok((unsigned long) src, len));
- pr_debug(" missing %d\n", missing);
- if (missing) {
- memset(dst + len - missing, 0, missing);
- *err_ptr = -EFAULT;
- }
-
- return csum_partial(dst, len, sum);
-}
-
-/* Copy to userspace and compute checksum. */
-__wsum
-csum_partial_copy_to_user(const unsigned char *src, unsigned char *dst, int len,
- __wsum sum, int *err_ptr)
-{
- sum = csum_partial(src, len, sum);
-
- if (copy_to_user(dst, src, len))
- *err_ptr = -EFAULT;
-
- return sum;
-}
-
-/*
- * This is a version of ip_compute_csum() optimized for IP headers,
- * which always checksum on 4 octet boundaries.
- */
-__sum16 ip_fast_csum(const void *iph, unsigned int ihl)
-{
- pr_debug("ip_fast_csum %p,%d\n", iph, ihl);
-
- return (__force __sum16)~do_csum(iph, ihl * 4);
-}
-
-__wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
- unsigned short len,
- unsigned short proto, __wsum sum)
-{
- unsigned long long result;
-
- pr_debug("ntohs(0x%x)=0x%x\n", 0xdead, ntohs(0xdead));
- pr_debug("htons(0x%x)=0x%x\n", 0xdead, htons(0xdead));
-
- result = (__force u64) saddr + (__force u64) daddr +
- (__force u64) sum + ((len + proto) << 8);
-
- /* Fold down to 32-bits so we don't loose in the typedef-less
- network stack. */
- /* 64 to 33 */
- result = (result & 0xffffffff) + (result >> 32);
- /* 33 to 32 */
- result = (result & 0xffffffff) + (result >> 32);
-
- pr_debug("%s saddr %x daddr %x len %x proto %x sum %x result %08Lx\n",
- __func__, saddr, daddr, len, proto, sum, result);
-
- return (__wsum)result;
-}
-EXPORT_SYMBOL(csum_tcpudp_nofold);
diff --git a/arch/sh/lib64/clear_page.S b/arch/sh/lib64/clear_page.S
deleted file mode 100644
index 007ab48ecc1..00000000000
--- a/arch/sh/lib64/clear_page.S
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- Copyright 2003 Richard Curnow, SuperH (UK) Ltd.
-
- This file is subject to the terms and conditions of the GNU General Public
- License. See the file "COPYING" in the main directory of this archive
- for more details.
-
- Tight version of memset for the case of just clearing a page. It turns out
- that having the alloco's spaced out slightly due to the increment/branch
- pair causes them to contend less for access to the cache. Similarly,
- keeping the stores apart from the allocos causes less contention. => Do two
- separate loops. Do multiple stores per loop to amortise the
- increment/branch cost a little.
-
- Parameters:
- r2 : source effective address (start of page)
-
- Always clears 4096 bytes.
-
- Note : alloco guarded by synco to avoid TAKum03020 erratum
-
-*/
-
- .section .text..SHmedia32,"ax"
- .little
-
- .balign 8
- .global clear_page
-clear_page:
- pta/l 1f, tr1
- pta/l 2f, tr2
- ptabs/l r18, tr0
-
- movi 4096, r7
- add r2, r7, r7
- add r2, r63, r6
-1:
- alloco r6, 0
- synco ! TAKum03020
- addi r6, 32, r6
- bgt/l r7, r6, tr1
-
- add r2, r63, r6
-2:
- st.q r6, 0, r63
- st.q r6, 8, r63
- st.q r6, 16, r63
- st.q r6, 24, r63
- addi r6, 32, r6
- bgt/l r7, r6, tr2
-
- blink tr0, r63
-
-
diff --git a/arch/sh/lib64/copy_user_memcpy.S b/arch/sh/lib64/copy_user_memcpy.S
index 2a62816d2dd..49aeabeba2c 100644
--- a/arch/sh/lib64/copy_user_memcpy.S
+++ b/arch/sh/lib64/copy_user_memcpy.S
@@ -27,7 +27,7 @@
! 2.: When there are two or three bytes in the last word of an 11-or-more
! bytes memory chunk to b copied, the rest of the word can be read
! without side effects.
-! This could be easily changed by increasing the minumum size of
+! This could be easily changed by increasing the minimum size of
! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
! however, this would cost a few extra cyles on average.
! For SHmedia, the assumption is that any quadword can be read in its
diff --git a/arch/sh/lib64/dbg.c b/arch/sh/lib64/dbg.c
deleted file mode 100644
index 75825ef6e08..00000000000
--- a/arch/sh/lib64/dbg.c
+++ /dev/null
@@ -1,430 +0,0 @@
-/*--------------------------------------------------------------------------
---
--- Identity : Linux50 Debug Funcions
---
--- File : arch/sh/lib64/dbg.c
---
--- Copyright 2000, 2001 STMicroelectronics Limited.
--- Copyright 2004 Richard Curnow (evt_debug etc)
---
---------------------------------------------------------------------------*/
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <asm/mmu_context.h>
-
-typedef u64 regType_t;
-
-static regType_t getConfigReg(u64 id)
-{
- register u64 reg __asm__("r2");
- asm volatile ("getcfg %1, 0, %0":"=r" (reg):"r"(id));
- return (reg);
-}
-
-/* ======================================================================= */
-
-static char *szTab[] = { "4k", "64k", "1M", "512M" };
-static char *protTab[] = { "----",
- "---R",
- "--X-",
- "--XR",
- "-W--",
- "-W-R",
- "-WX-",
- "-WXR",
- "U---",
- "U--R",
- "U-X-",
- "U-XR",
- "UW--",
- "UW-R",
- "UWX-",
- "UWXR"
-};
-#define ITLB_BASE 0x00000000
-#define DTLB_BASE 0x00800000
-#define MAX_TLBs 64
-/* PTE High */
-#define GET_VALID(pte) ((pte) & 0x1)
-#define GET_SHARED(pte) ((pte) & 0x2)
-#define GET_ASID(pte) ((pte >> 2) & 0x0ff)
-#define GET_EPN(pte) ((pte) & 0xfffff000)
-
-/* PTE Low */
-#define GET_CBEHAVIOR(pte) ((pte) & 0x3)
-#define GET_PAGE_SIZE(pte) szTab[((pte >> 3) & 0x3)]
-#define GET_PROTECTION(pte) protTab[((pte >> 6) & 0xf)]
-#define GET_PPN(pte) ((pte) & 0xfffff000)
-
-#define PAGE_1K_MASK 0x00000000
-#define PAGE_4K_MASK 0x00000010
-#define PAGE_64K_MASK 0x00000080
-#define MMU_PAGESIZE_MASK (PAGE_64K_MASK | PAGE_4K_MASK)
-#define PAGE_1MB_MASK MMU_PAGESIZE_MASK
-#define PAGE_1K (1024)
-#define PAGE_4K (1024 * 4)
-#define PAGE_64K (1024 * 64)
-#define PAGE_1MB (1024 * 1024)
-
-#define HOW_TO_READ_TLB_CONTENT \
- "[ ID] PPN EPN ASID Share CB P.Size PROT.\n"
-
-void print_single_tlb(unsigned long tlb, int single_print)
-{
- regType_t pteH;
- regType_t pteL;
- unsigned int valid, shared, asid, epn, cb, ppn;
- char *pSize;
- char *pProt;
-
- /*
- ** in case of single print <single_print> is true, this implies:
- ** 1) print the TLB in any case also if NOT VALID
- ** 2) print out the header
- */
-
- pteH = getConfigReg(tlb);
- valid = GET_VALID(pteH);
- if (single_print)
- printk(HOW_TO_READ_TLB_CONTENT);
- else if (!valid)
- return;
-
- pteL = getConfigReg(tlb + 1);
-
- shared = GET_SHARED(pteH);
- asid = GET_ASID(pteH);
- epn = GET_EPN(pteH);
- cb = GET_CBEHAVIOR(pteL);
- pSize = GET_PAGE_SIZE(pteL);
- pProt = GET_PROTECTION(pteL);
- ppn = GET_PPN(pteL);
- printk("[%c%2ld] 0x%08x 0x%08x %03d %02x %02x %4s %s\n",
- ((valid) ? ' ' : 'u'), ((tlb & 0x0ffff) / TLB_STEP),
- ppn, epn, asid, shared, cb, pSize, pProt);
-}
-
-void print_dtlb(void)
-{
- int count;
- unsigned long tlb;
-
- printk(" ================= SH-5 D-TLBs Status ===================\n");
- printk(HOW_TO_READ_TLB_CONTENT);
- tlb = DTLB_BASE;
- for (count = 0; count < MAX_TLBs; count++, tlb += TLB_STEP)
- print_single_tlb(tlb, 0);
- printk
- (" =============================================================\n");
-}
-
-void print_itlb(void)
-{
- int count;
- unsigned long tlb;
-
- printk(" ================= SH-5 I-TLBs Status ===================\n");
- printk(HOW_TO_READ_TLB_CONTENT);
- tlb = ITLB_BASE;
- for (count = 0; count < MAX_TLBs; count++, tlb += TLB_STEP)
- print_single_tlb(tlb, 0);
- printk
- (" =============================================================\n");
-}
-
-/* ======================================================================= */
-
-#ifdef CONFIG_POOR_MANS_STRACE
-
-#include "syscalltab.h"
-
-struct ring_node {
- int evt;
- int ret_addr;
- int event;
- int tra;
- int pid;
- unsigned long sp;
- unsigned long pc;
-};
-
-static struct ring_node event_ring[16];
-static int event_ptr = 0;
-
-struct stored_syscall_data {
- int pid;
- int syscall_number;
-};
-
-#define N_STORED_SYSCALLS 16
-
-static struct stored_syscall_data stored_syscalls[N_STORED_SYSCALLS];
-static int syscall_next=0;
-static int syscall_next_print=0;
-
-void evt_debug(int evt, int ret_addr, int event, int tra, struct pt_regs *regs)
-{
- int syscallno = tra & 0xff;
- unsigned long sp;
- unsigned long stack_bottom;
- int pid;
- struct ring_node *rr;
-
- pid = current->pid;
- stack_bottom = (unsigned long) task_stack_page(current);
- asm volatile("ori r15, 0, %0" : "=r" (sp));
- rr = event_ring + event_ptr;
- rr->evt = evt;
- rr->ret_addr = ret_addr;
- rr->event = event;
- rr->tra = tra;
- rr->pid = pid;
- rr->sp = sp;
- rr->pc = regs->pc;
-
- if (sp < stack_bottom + 3092) {
- printk("evt_debug : stack underflow report\n");
- int i, j;
- for (j=0, i = event_ptr; j<16; j++) {
- rr = event_ring + i;
- printk("evt=%08x event=%08x tra=%08x pid=%5d sp=%08lx pc=%08lx\n",
- rr->evt, rr->event, rr->tra, rr->pid, rr->sp, rr->pc);
- i--;
- i &= 15;
- }
- panic("STACK UNDERFLOW\n");
- }
-
- event_ptr = (event_ptr + 1) & 15;
-
- if ((event == 2) && (evt == 0x160)) {
- if (syscallno < NUM_SYSCALL_INFO_ENTRIES) {
- /* Store the syscall information to print later. We
- * can't print this now - currently we're running with
- * SR.BL=1, so we can't take a tlbmiss (which could occur
- * in the console drivers under printk).
- *
- * Just overwrite old entries on ring overflow - this
- * is only for last-hope debugging. */
- stored_syscalls[syscall_next].pid = current->pid;
- stored_syscalls[syscall_next].syscall_number = syscallno;
- syscall_next++;
- syscall_next &= (N_STORED_SYSCALLS - 1);
- }
- }
-}
-
-static void drain_syscalls(void) {
- while (syscall_next_print != syscall_next) {
- printk("Task %d: %s()\n",
- stored_syscalls[syscall_next_print].pid,
- syscall_info_table[stored_syscalls[syscall_next_print].syscall_number].name);
- syscall_next_print++;
- syscall_next_print &= (N_STORED_SYSCALLS - 1);
- }
-}
-
-void evt_debug2(unsigned int ret)
-{
- drain_syscalls();
- printk("Task %d: syscall returns %08x\n", current->pid, ret);
-}
-
-void evt_debug_ret_from_irq(struct pt_regs *regs)
-{
- int pid;
- struct ring_node *rr;
-
- pid = current->pid;
- rr = event_ring + event_ptr;
- rr->evt = 0xffff;
- rr->ret_addr = 0;
- rr->event = 0;
- rr->tra = 0;
- rr->pid = pid;
- rr->pc = regs->pc;
- event_ptr = (event_ptr + 1) & 15;
-}
-
-void evt_debug_ret_from_exc(struct pt_regs *regs)
-{
- int pid;
- struct ring_node *rr;
-
- pid = current->pid;
- rr = event_ring + event_ptr;
- rr->evt = 0xfffe;
- rr->ret_addr = 0;
- rr->event = 0;
- rr->tra = 0;
- rr->pid = pid;
- rr->pc = regs->pc;
- event_ptr = (event_ptr + 1) & 15;
-}
-
-#endif /* CONFIG_POOR_MANS_STRACE */
-
-/* ======================================================================= */
-
-void show_excp_regs(char *from, int trapnr, int signr, struct pt_regs *regs)
-{
-
- unsigned long long ah, al, bh, bl, ch, cl;
-
- printk("\n");
- printk("EXCEPTION - %s: task %d; Linux trap # %d; signal = %d\n",
- ((from) ? from : "???"), current->pid, trapnr, signr);
-
- asm volatile ("getcon " __EXPEVT ", %0":"=r"(ah));
- asm volatile ("getcon " __EXPEVT ", %0":"=r"(al));
- ah = (ah) >> 32;
- al = (al) & 0xffffffff;
- asm volatile ("getcon " __KCR1 ", %0":"=r"(bh));
- asm volatile ("getcon " __KCR1 ", %0":"=r"(bl));
- bh = (bh) >> 32;
- bl = (bl) & 0xffffffff;
- asm volatile ("getcon " __INTEVT ", %0":"=r"(ch));
- asm volatile ("getcon " __INTEVT ", %0":"=r"(cl));
- ch = (ch) >> 32;
- cl = (cl) & 0xffffffff;
- printk("EXPE: %08Lx%08Lx KCR1: %08Lx%08Lx INTE: %08Lx%08Lx\n",
- ah, al, bh, bl, ch, cl);
-
- asm volatile ("getcon " __PEXPEVT ", %0":"=r"(ah));
- asm volatile ("getcon " __PEXPEVT ", %0":"=r"(al));
- ah = (ah) >> 32;
- al = (al) & 0xffffffff;
- asm volatile ("getcon " __PSPC ", %0":"=r"(bh));
- asm volatile ("getcon " __PSPC ", %0":"=r"(bl));
- bh = (bh) >> 32;
- bl = (bl) & 0xffffffff;
- asm volatile ("getcon " __PSSR ", %0":"=r"(ch));
- asm volatile ("getcon " __PSSR ", %0":"=r"(cl));
- ch = (ch) >> 32;
- cl = (cl) & 0xffffffff;
- printk("PEXP: %08Lx%08Lx PSPC: %08Lx%08Lx PSSR: %08Lx%08Lx\n",
- ah, al, bh, bl, ch, cl);
-
- ah = (regs->pc) >> 32;
- al = (regs->pc) & 0xffffffff;
- bh = (regs->regs[18]) >> 32;
- bl = (regs->regs[18]) & 0xffffffff;
- ch = (regs->regs[15]) >> 32;
- cl = (regs->regs[15]) & 0xffffffff;
- printk("PC : %08Lx%08Lx LINK: %08Lx%08Lx SP : %08Lx%08Lx\n",
- ah, al, bh, bl, ch, cl);
-
- ah = (regs->sr) >> 32;
- al = (regs->sr) & 0xffffffff;
- asm volatile ("getcon " __TEA ", %0":"=r"(bh));
- asm volatile ("getcon " __TEA ", %0":"=r"(bl));
- bh = (bh) >> 32;
- bl = (bl) & 0xffffffff;
- asm volatile ("getcon " __KCR0 ", %0":"=r"(ch));
- asm volatile ("getcon " __KCR0 ", %0":"=r"(cl));
- ch = (ch) >> 32;
- cl = (cl) & 0xffffffff;
- printk("SR : %08Lx%08Lx TEA : %08Lx%08Lx KCR0: %08Lx%08Lx\n",
- ah, al, bh, bl, ch, cl);
-
- ah = (regs->regs[0]) >> 32;
- al = (regs->regs[0]) & 0xffffffff;
- bh = (regs->regs[1]) >> 32;
- bl = (regs->regs[1]) & 0xffffffff;
- ch = (regs->regs[2]) >> 32;
- cl = (regs->regs[2]) & 0xffffffff;
- printk("R0 : %08Lx%08Lx R1 : %08Lx%08Lx R2 : %08Lx%08Lx\n",
- ah, al, bh, bl, ch, cl);
-
- ah = (regs->regs[3]) >> 32;
- al = (regs->regs[3]) & 0xffffffff;
- bh = (regs->regs[4]) >> 32;
- bl = (regs->regs[4]) & 0xffffffff;
- ch = (regs->regs[5]) >> 32;
- cl = (regs->regs[5]) & 0xffffffff;
- printk("R3 : %08Lx%08Lx R4 : %08Lx%08Lx R5 : %08Lx%08Lx\n",
- ah, al, bh, bl, ch, cl);
-
- ah = (regs->regs[6]) >> 32;
- al = (regs->regs[6]) & 0xffffffff;
- bh = (regs->regs[7]) >> 32;
- bl = (regs->regs[7]) & 0xffffffff;
- ch = (regs->regs[8]) >> 32;
- cl = (regs->regs[8]) & 0xffffffff;
- printk("R6 : %08Lx%08Lx R7 : %08Lx%08Lx R8 : %08Lx%08Lx\n",
- ah, al, bh, bl, ch, cl);
-
- ah = (regs->regs[9]) >> 32;
- al = (regs->regs[9]) & 0xffffffff;
- bh = (regs->regs[10]) >> 32;
- bl = (regs->regs[10]) & 0xffffffff;
- ch = (regs->regs[11]) >> 32;
- cl = (regs->regs[11]) & 0xffffffff;
- printk("R9 : %08Lx%08Lx R10 : %08Lx%08Lx R11 : %08Lx%08Lx\n",
- ah, al, bh, bl, ch, cl);
- printk("....\n");
-
- ah = (regs->tregs[0]) >> 32;
- al = (regs->tregs[0]) & 0xffffffff;
- bh = (regs->tregs[1]) >> 32;
- bl = (regs->tregs[1]) & 0xffffffff;
- ch = (regs->tregs[2]) >> 32;
- cl = (regs->tregs[2]) & 0xffffffff;
- printk("T0 : %08Lx%08Lx T1 : %08Lx%08Lx T2 : %08Lx%08Lx\n",
- ah, al, bh, bl, ch, cl);
- printk("....\n");
-
- print_dtlb();
- print_itlb();
-}
-
-/* ======================================================================= */
-
-/*
-** Depending on <base> scan the MMU, Data or Instruction side
-** looking for a valid mapping matching Eaddr & asid.
-** Return -1 if not found or the TLB id entry otherwise.
-** Note: it works only for 4k pages!
-*/
-static unsigned long
-lookup_mmu_side(unsigned long base, unsigned long Eaddr, unsigned long asid)
-{
- regType_t pteH;
- unsigned long epn;
- int count;
-
- epn = Eaddr & 0xfffff000;
-
- for (count = 0; count < MAX_TLBs; count++, base += TLB_STEP) {
- pteH = getConfigReg(base);
- if (GET_VALID(pteH))
- if ((unsigned long) GET_EPN(pteH) == epn)
- if ((unsigned long) GET_ASID(pteH) == asid)
- break;
- }
- return ((unsigned long) ((count < MAX_TLBs) ? base : -1));
-}
-
-unsigned long lookup_dtlb(unsigned long Eaddr)
-{
- unsigned long asid = get_asid();
- return (lookup_mmu_side((u64) DTLB_BASE, Eaddr, asid));
-}
-
-unsigned long lookup_itlb(unsigned long Eaddr)
-{
- unsigned long asid = get_asid();
- return (lookup_mmu_side((u64) ITLB_BASE, Eaddr, asid));
-}
-
-void print_page(struct page *page)
-{
- printk(" page[%p] -> index 0x%lx, count 0x%x, flags 0x%lx\n",
- page, page->index, page_count(page), page->flags);
- printk(" address_space = %p, pages =%ld\n", page->mapping,
- page->mapping->nrpages);
-
-}
diff --git a/arch/sh/lib64/memcpy.S b/arch/sh/lib64/memcpy.S
new file mode 100644
index 00000000000..5d682e0ee24
--- /dev/null
+++ b/arch/sh/lib64/memcpy.S
@@ -0,0 +1,201 @@
+/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
+/* Modified by SuperH, Inc. September 2003 */
+!
+! Fast SH memcpy
+!
+! by Toshiyasu Morita (tm@netcom.com)
+! hacked by J"orn Rernnecke (joern.rennecke@superh.com) ("o for o-umlaut)
+! SH5 code Copyright 2002 SuperH Ltd.
+!
+! Entry: ARG0: destination pointer
+! ARG1: source pointer
+! ARG2: byte count
+!
+! Exit: RESULT: destination pointer
+! any other registers in the range r0-r7: trashed
+!
+! Notes: Usually one wants to do small reads and write a longword, but
+! unfortunately it is difficult in some cases to concatanate bytes
+! into a longword on the SH, so this does a longword read and small
+! writes.
+!
+! This implementation makes two assumptions about how it is called:
+!
+! 1.: If the byte count is nonzero, the address of the last byte to be
+! copied is unsigned greater than the address of the first byte to
+! be copied. This could be easily swapped for a signed comparison,
+! but the algorithm used needs some comparison.
+!
+! 2.: When there are two or three bytes in the last word of an 11-or-more
+! bytes memory chunk to b copied, the rest of the word can be read
+! without side effects.
+! This could be easily changed by increasing the minimum size of
+! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
+! however, this would cost a few extra cyles on average.
+! For SHmedia, the assumption is that any quadword can be read in its
+! enirety if at least one byte is included in the copy.
+!
+
+ .section .text..SHmedia32,"ax"
+ .globl memcpy
+ .type memcpy, @function
+ .align 5
+
+memcpy:
+
+#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
+#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
+#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
+#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
+
+ ld.b r3,0,r63
+ pta/l Large,tr0
+ movi 25,r0
+ bgeu/u r4,r0,tr0
+ nsb r4,r0
+ shlli r0,5,r0
+ movi (L1-L0+63*32 + 1) & 0xffff,r1
+ sub r1, r0, r0
+L0: ptrel r0,tr0
+ add r2,r4,r5
+ ptabs r18,tr1
+ add r3,r4,r6
+ blink tr0,r63
+
+/* Rearranged to make cut2 safe */
+ .balign 8
+L4_7: /* 4..7 byte memcpy cntd. */
+ stlo.l r2, 0, r0
+ or r6, r7, r6
+ sthi.l r5, -1, r6
+ stlo.l r5, -4, r6
+ blink tr1,r63
+
+ .balign 8
+L1: /* 0 byte memcpy */
+ nop
+ blink tr1,r63
+ nop
+ nop
+ nop
+ nop
+
+L2_3: /* 2 or 3 byte memcpy cntd. */
+ st.b r5,-1,r6
+ blink tr1,r63
+
+ /* 1 byte memcpy */
+ ld.b r3,0,r0
+ st.b r2,0,r0
+ blink tr1,r63
+
+L8_15: /* 8..15 byte memcpy cntd. */
+ stlo.q r2, 0, r0
+ or r6, r7, r6
+ sthi.q r5, -1, r6
+ stlo.q r5, -8, r6
+ blink tr1,r63
+
+ /* 2 or 3 byte memcpy */
+ ld.b r3,0,r0
+ ld.b r2,0,r63
+ ld.b r3,1,r1
+ st.b r2,0,r0
+ pta/l L2_3,tr0
+ ld.b r6,-1,r6
+ st.b r2,1,r1
+ blink tr0, r63
+
+ /* 4 .. 7 byte memcpy */
+ LDUAL (r3, 0, r0, r1)
+ pta L4_7, tr0
+ ldlo.l r6, -4, r7
+ or r0, r1, r0
+ sthi.l r2, 3, r0
+ ldhi.l r6, -1, r6
+ blink tr0, r63
+
+ /* 8 .. 15 byte memcpy */
+ LDUAQ (r3, 0, r0, r1)
+ pta L8_15, tr0
+ ldlo.q r6, -8, r7
+ or r0, r1, r0
+ sthi.q r2, 7, r0
+ ldhi.q r6, -1, r6
+ blink tr0, r63
+
+ /* 16 .. 24 byte memcpy */
+ LDUAQ (r3, 0, r0, r1)
+ LDUAQ (r3, 8, r8, r9)
+ or r0, r1, r0
+ sthi.q r2, 7, r0
+ or r8, r9, r8
+ sthi.q r2, 15, r8
+ ldlo.q r6, -8, r7
+ ldhi.q r6, -1, r6
+ stlo.q r2, 8, r8
+ stlo.q r2, 0, r0
+ or r6, r7, r6
+ sthi.q r5, -1, r6
+ stlo.q r5, -8, r6
+ blink tr1,r63
+
+Large:
+ ld.b r2, 0, r63
+ pta/l Loop_ua, tr1
+ ori r3, -8, r7
+ sub r2, r7, r22
+ sub r3, r2, r6
+ add r2, r4, r5
+ ldlo.q r3, 0, r0
+ addi r5, -16, r5
+ movi 64+8, r27 // could subtract r7 from that.
+ stlo.q r2, 0, r0
+ sthi.q r2, 7, r0
+ ldx.q r22, r6, r0
+ bgtu/l r27, r4, tr1
+
+ addi r5, -48, r27
+ pta/l Loop_line, tr0
+ addi r6, 64, r36
+ addi r6, -24, r19
+ addi r6, -16, r20
+ addi r6, -8, r21
+
+Loop_line:
+ ldx.q r22, r36, r63
+ alloco r22, 32
+ addi r22, 32, r22
+ ldx.q r22, r19, r23
+ sthi.q r22, -25, r0
+ ldx.q r22, r20, r24
+ ldx.q r22, r21, r25
+ stlo.q r22, -32, r0
+ ldx.q r22, r6, r0
+ sthi.q r22, -17, r23
+ sthi.q r22, -9, r24
+ sthi.q r22, -1, r25
+ stlo.q r22, -24, r23
+ stlo.q r22, -16, r24
+ stlo.q r22, -8, r25
+ bgeu r27, r22, tr0
+
+Loop_ua:
+ addi r22, 8, r22
+ sthi.q r22, -1, r0
+ stlo.q r22, -8, r0
+ ldx.q r22, r6, r0
+ bgtu/l r5, r22, tr1
+
+ add r3, r4, r7
+ ldlo.q r7, -8, r1
+ sthi.q r22, 7, r0
+ ldhi.q r7, -1, r7
+ ptabs r18,tr1
+ stlo.q r22, 0, r0
+ or r1, r7, r1
+ sthi.q r5, 15, r1
+ stlo.q r5, 8, r1
+ blink tr1, r63
+
+ .size memcpy,.-memcpy
diff --git a/arch/sh/lib64/memcpy.c b/arch/sh/lib64/memcpy.c
deleted file mode 100644
index fba436a92bf..00000000000
--- a/arch/sh/lib64/memcpy.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (C) 2002 Mark Debbage (Mark.Debbage@superh.com)
- *
- * May be copied or modified under the terms of the GNU General Public
- * License. See linux/COPYING for more information.
- *
- */
-
-#include <linux/types.h>
-#include <asm/string.h>
-
-// This is a simplistic optimization of memcpy to increase the
-// granularity of access beyond one byte using aligned
-// loads and stores. This is not an optimal implementation
-// for SH-5 (especially with regard to prefetching and the cache),
-// and a better version should be provided later ...
-
-void *memcpy(void *dest, const void *src, size_t count)
-{
- char *d = (char *) dest, *s = (char *) src;
-
- if (count >= 32) {
- int i = 8 - (((unsigned long) d) & 0x7);
-
- if (i != 8)
- while (i-- && count--) {
- *d++ = *s++;
- }
-
- if (((((unsigned long) d) & 0x7) == 0) &&
- ((((unsigned long) s) & 0x7) == 0)) {
- while (count >= 32) {
- unsigned long long t1, t2, t3, t4;
- t1 = *(unsigned long long *) (s);
- t2 = *(unsigned long long *) (s + 8);
- t3 = *(unsigned long long *) (s + 16);
- t4 = *(unsigned long long *) (s + 24);
- *(unsigned long long *) (d) = t1;
- *(unsigned long long *) (d + 8) = t2;
- *(unsigned long long *) (d + 16) = t3;
- *(unsigned long long *) (d + 24) = t4;
- d += 32;
- s += 32;
- count -= 32;
- }
- while (count >= 8) {
- *(unsigned long long *) d =
- *(unsigned long long *) s;
- d += 8;
- s += 8;
- count -= 8;
- }
- }
-
- if (((((unsigned long) d) & 0x3) == 0) &&
- ((((unsigned long) s) & 0x3) == 0)) {
- while (count >= 4) {
- *(unsigned long *) d = *(unsigned long *) s;
- d += 4;
- s += 4;
- count -= 4;
- }
- }
-
- if (((((unsigned long) d) & 0x1) == 0) &&
- ((((unsigned long) s) & 0x1) == 0)) {
- while (count >= 2) {
- *(unsigned short *) d = *(unsigned short *) s;
- d += 2;
- s += 2;
- count -= 2;
- }
- }
- }
-
- while (count--) {
- *d++ = *s++;
- }
-
- return d;
-}
diff --git a/arch/sh/lib64/memset.S b/arch/sh/lib64/memset.S
new file mode 100644
index 00000000000..2d37b048855
--- /dev/null
+++ b/arch/sh/lib64/memset.S
@@ -0,0 +1,91 @@
+/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
+/* Modified by SuperH, Inc. September 2003 */
+!
+! Fast SH memset
+!
+! by Toshiyasu Morita (tm@netcom.com)
+!
+! SH5 code by J"orn Rennecke (joern.rennecke@superh.com)
+! Copyright 2002 SuperH Ltd.
+!
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define SHHI shlld
+#define SHLO shlrd
+#else
+#define SHHI shlrd
+#define SHLO shlld
+#endif
+
+ .section .text..SHmedia32,"ax"
+ .globl memset
+ .type memset, @function
+
+ .align 5
+
+memset:
+ pta/l multiquad, tr0
+ andi r2, 7, r22
+ ptabs r18, tr2
+ mshflo.b r3,r3,r3
+ add r4, r22, r23
+ mperm.w r3, r63, r3 // Fill pattern now in every byte of r3
+
+ movi 8, r9
+ bgtu/u r23, r9, tr0 // multiquad
+
+ beqi/u r4, 0, tr2 // Return with size 0 - ensures no mem accesses
+ ldlo.q r2, 0, r7
+ shlli r4, 2, r4
+ movi -1, r8
+ SHHI r8, r4, r8
+ SHHI r8, r4, r8
+ mcmv r7, r8, r3
+ stlo.q r2, 0, r3
+ blink tr2, r63
+
+multiquad:
+ pta/l lastquad, tr0
+ stlo.q r2, 0, r3
+ shlri r23, 3, r24
+ add r2, r4, r5
+ beqi/u r24, 1, tr0 // lastquad
+ pta/l loop, tr1
+ sub r2, r22, r25
+ andi r5, -8, r20 // calculate end address and
+ addi r20, -7*8, r8 // loop end address; This might overflow, so we need
+ // to use a different test before we start the loop
+ bge/u r24, r9, tr1 // loop
+ st.q r25, 8, r3
+ st.q r20, -8, r3
+ shlri r24, 1, r24
+ beqi/u r24, 1, tr0 // lastquad
+ st.q r25, 16, r3
+ st.q r20, -16, r3
+ beqi/u r24, 2, tr0 // lastquad
+ st.q r25, 24, r3
+ st.q r20, -24, r3
+lastquad:
+ sthi.q r5, -1, r3
+ blink tr2,r63
+
+loop:
+!!! alloco r25, 32 // QQQ comment out for short-term fix to SHUK #3895.
+ // QQQ commenting out is locically correct, but sub-optimal
+ // QQQ Sean McGoogan - 4th April 2003.
+ st.q r25, 8, r3
+ st.q r25, 16, r3
+ st.q r25, 24, r3
+ st.q r25, 32, r3
+ addi r25, 32, r25
+ bgeu/l r8, r25, tr1 // loop
+
+ st.q r20, -40, r3
+ st.q r20, -32, r3
+ st.q r20, -24, r3
+ st.q r20, -16, r3
+ st.q r20, -8, r3
+ sthi.q r5, -1, r3
+ blink tr2,r63
+
+ .size memset,.-memset
diff --git a/arch/sh/lib64/panic.c b/arch/sh/lib64/panic.c
index ff559e2a96f..38c954e04f6 100644
--- a/arch/sh/lib64/panic.c
+++ b/arch/sh/lib64/panic.c
@@ -6,53 +6,10 @@
* for more details.
*/
-#include <linux/kernel.h>
-#include <asm/io.h>
-#include <asm/cpu/registers.h>
-
-/* THIS IS A PHYSICAL ADDRESS */
-#define HDSP2534_ADDR (0x04002100)
-
-#ifdef CONFIG_SH_CAYMAN
-
-static void poor_mans_delay(void)
-{
- int i;
- for (i = 0; i < 2500000; i++) {
- } /* poor man's delay */
-}
-
-static void show_value(unsigned long x)
-{
- int i;
- unsigned nibble;
- for (i = 0; i < 8; i++) {
- nibble = ((x >> (i * 4)) & 0xf);
-
- ctrl_outb(nibble + ((nibble > 9) ? 55 : 48),
- HDSP2534_ADDR + 0xe0 + ((7 - i) << 2));
- }
-}
-
-#endif
-
void
panic_handler(unsigned long panicPC, unsigned long panicSSR,
unsigned long panicEXPEVT)
{
-#ifdef CONFIG_SH_CAYMAN
- while (1) {
- /* This piece of code displays the PC on the LED display */
- show_value(panicPC);
- poor_mans_delay();
- show_value(panicSSR);
- poor_mans_delay();
- show_value(panicEXPEVT);
- poor_mans_delay();
- }
-#endif
-
/* Never return from the panic handler */
for (;;) ;
-
}
diff --git a/arch/sh/lib64/sdivsi3.S b/arch/sh/lib64/sdivsi3.S
new file mode 100644
index 00000000000..1963bbd4228
--- /dev/null
+++ b/arch/sh/lib64/sdivsi3.S
@@ -0,0 +1,135 @@
+ .global __sdivsi3
+ .global __sdivsi3_1
+ .global __sdivsi3_2
+ .section .text..SHmedia32,"ax"
+ .align 2
+
+ /* inputs: r4,r5 */
+ /* clobbered: r1,r18,r19,r20,r21,r25,tr0 */
+ /* result in r0 */
+__sdivsi3:
+__sdivsi3_1:
+ ptb __div_table,tr0
+ gettr tr0,r20
+
+__sdivsi3_2:
+ nsb r5, r1
+ shlld r5, r1, r25 /* normalize; [-2 ..1, 1..2) in s2.62 */
+ shari r25, 58, r21 /* extract 5(6) bit index (s2.4 with hole -1..1) */
+ /* bubble */
+ ldx.ub r20, r21, r19 /* u0.8 */
+ shari r25, 32, r25 /* normalize to s2.30 */
+ shlli r21, 1, r21
+ muls.l r25, r19, r19 /* s2.38 */
+ ldx.w r20, r21, r21 /* s2.14 */
+ ptabs r18, tr0
+ shari r19, 24, r19 /* truncate to s2.14 */
+ sub r21, r19, r19 /* some 11 bit inverse in s1.14 */
+ muls.l r19, r19, r21 /* u0.28 */
+ sub r63, r1, r1
+ addi r1, 92, r1
+ muls.l r25, r21, r18 /* s2.58 */
+ shlli r19, 45, r19 /* multiply by two and convert to s2.58 */
+ /* bubble */
+ sub r19, r18, r18
+ shari r18, 28, r18 /* some 22 bit inverse in s1.30 */
+ muls.l r18, r25, r0 /* s2.60 */
+ muls.l r18, r4, r25 /* s32.30 */
+ /* bubble */
+ shari r0, 16, r19 /* s-16.44 */
+ muls.l r19, r18, r19 /* s-16.74 */
+ shari r25, 63, r0
+ shari r4, 14, r18 /* s19.-14 */
+ shari r19, 30, r19 /* s-16.44 */
+ muls.l r19, r18, r19 /* s15.30 */
+ xor r21, r0, r21 /* You could also use the constant 1 << 27. */
+ add r21, r25, r21
+ sub r21, r19, r21
+ shard r21, r1, r21
+ sub r21, r0, r0
+ blink tr0, r63
+
+/* This table has been generated by divtab.c .
+Defects for bias -330:
+ Max defect: 6.081536e-07 at -1.000000e+00
+ Min defect: 2.849516e-08 at 1.030651e+00
+ Max 2nd step defect: 9.606539e-12 at -1.000000e+00
+ Min 2nd step defect: 0.000000e+00 at 0.000000e+00
+ Defect at 1: 1.238659e-07
+ Defect at -2: 1.061708e-07 */
+
+ .balign 2
+ .type __div_table,@object
+ .size __div_table,128
+/* negative division constants */
+ .word -16638
+ .word -17135
+ .word -17737
+ .word -18433
+ .word -19103
+ .word -19751
+ .word -20583
+ .word -21383
+ .word -22343
+ .word -23353
+ .word -24407
+ .word -25582
+ .word -26863
+ .word -28382
+ .word -29965
+ .word -31800
+/* negative division factors */
+ .byte 66
+ .byte 70
+ .byte 75
+ .byte 81
+ .byte 87
+ .byte 93
+ .byte 101
+ .byte 109
+ .byte 119
+ .byte 130
+ .byte 142
+ .byte 156
+ .byte 172
+ .byte 192
+ .byte 214
+ .byte 241
+ .skip 16
+ .global __div_table
+__div_table:
+ .skip 16
+/* positive division factors */
+ .byte 241
+ .byte 214
+ .byte 192
+ .byte 172
+ .byte 156
+ .byte 142
+ .byte 130
+ .byte 119
+ .byte 109
+ .byte 101
+ .byte 93
+ .byte 87
+ .byte 81
+ .byte 75
+ .byte 70
+ .byte 66
+/* positive division constants */
+ .word 31801
+ .word 29966
+ .word 28383
+ .word 26864
+ .word 25583
+ .word 24408
+ .word 23354
+ .word 22344
+ .word 21384
+ .word 20584
+ .word 19752
+ .word 19104
+ .word 18434
+ .word 17738
+ .word 17136
+ .word 16639
diff --git a/arch/sh/lib64/strcpy.S b/arch/sh/lib64/strcpy.S
new file mode 100644
index 00000000000..ea7c9c533ee
--- /dev/null
+++ b/arch/sh/lib64/strcpy.S
@@ -0,0 +1,97 @@
+/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
+/* Modified by SuperH, Inc. September 2003 */
+! Entry: arg0: destination
+! arg1: source
+! Exit: result: destination
+!
+! SH5 code Copyright 2002 SuperH Ltd.
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define SHHI shlld
+#define SHLO shlrd
+#else
+#define SHHI shlrd
+#define SHLO shlld
+#endif
+
+ .section .text..SHmedia32,"ax"
+ .globl strcpy
+ .type strcpy, @function
+ .align 5
+
+strcpy:
+
+ pta/l shortstring,tr1
+ ldlo.q r3,0,r4
+ ptabs r18,tr4
+ shlli r3,3,r7
+ addi r2, 8, r0
+ mcmpeq.b r4,r63,r6
+ SHHI r6,r7,r6
+ bnei/u r6,0,tr1 // shortstring
+ pta/l no_lddst, tr2
+ ori r3,-8,r23
+ sub r2, r23, r0
+ sub r3, r2, r21
+ addi r21, 8, r20
+ ldx.q r0, r21, r5
+ pta/l loop, tr0
+ ori r2,-8,r22
+ mcmpeq.b r5, r63, r6
+ bgt/u r22, r23, tr2 // no_lddst
+
+ // r22 < r23 : Need to do a load from the destination.
+ // r22 == r23 : Doesn't actually need to load from destination,
+ // but still can be handled here.
+ ldlo.q r2, 0, r9
+ movi -1, r8
+ SHLO r8, r7, r8
+ mcmv r4, r8, r9
+ stlo.q r2, 0, r9
+ beqi/l r6, 0, tr0 // loop
+
+ add r5, r63, r4
+ addi r0, 8, r0
+ blink tr1, r63 // shortstring
+no_lddst:
+ // r22 > r23: note that for r22 == r23 the sthi.q would clobber
+ // bytes before the destination region.
+ stlo.q r2, 0, r4
+ SHHI r4, r7, r4
+ sthi.q r0, -1, r4
+ beqi/l r6, 0, tr0 // loop
+
+ add r5, r63, r4
+ addi r0, 8, r0
+shortstring:
+#if __BYTE_ORDER != __LITTLE_ENDIAN
+ pta/l shortstring2,tr1
+ byterev r4,r4
+#endif
+shortstring2:
+ st.b r0,-8,r4
+ andi r4,0xff,r5
+ shlri r4,8,r4
+ addi r0,1,r0
+ bnei/l r5,0,tr1
+ blink tr4,r63 // return
+
+ .balign 8
+loop:
+ stlo.q r0, 0, r5
+ ldx.q r0, r20, r4
+ addi r0, 16, r0
+ sthi.q r0, -9, r5
+ mcmpeq.b r4, r63, r6
+ bnei/u r6, 0, tr1 // shortstring
+ ldx.q r0, r21, r5
+ stlo.q r0, -8, r4
+ sthi.q r0, -1, r4
+ mcmpeq.b r5, r63, r6
+ beqi/l r6, 0, tr0 // loop
+
+ add r5, r63, r4
+ addi r0, 8, r0
+ blink tr1, r63 // shortstring
+
+ .size strcpy,.-strcpy
diff --git a/arch/sh/lib64/strlen.S b/arch/sh/lib64/strlen.S
new file mode 100644
index 00000000000..cbc0d912e5f
--- /dev/null
+++ b/arch/sh/lib64/strlen.S
@@ -0,0 +1,33 @@
+/*
+ * Simplistic strlen() implementation for SHmedia.
+ *
+ * Copyright (C) 2003 Paul Mundt <lethal@linux-sh.org>
+ */
+
+ .section .text..SHmedia32,"ax"
+ .globl strlen
+ .type strlen,@function
+
+ .balign 16
+strlen:
+ ptabs r18, tr4
+
+ /*
+ * Note: We could easily deal with the NULL case here with a simple
+ * sanity check, though it seems that the behavior we want is to fault
+ * in the event that r2 == NULL, so we don't bother.
+ */
+/* beqi r2, 0, tr4 */ ! Sanity check
+
+ movi -1, r0
+ pta/l loop, tr0
+loop:
+ ld.b r2, 0, r1
+ addi r2, 1, r2
+ addi r0, 1, r0
+ bnei/l r1, 0, tr0
+
+ or r0, r63, r2
+ blink tr4, r63
+
+ .size strlen,.-strlen
diff --git a/arch/sh/lib64/udelay.c b/arch/sh/lib64/udelay.c
index d76bd801194..f215b063da7 100644
--- a/arch/sh/lib64/udelay.c
+++ b/arch/sh/lib64/udelay.c
@@ -33,7 +33,7 @@ void __delay(unsigned long loops)
:"0"(loops));
}
-inline void __const_udelay(unsigned long xloops)
+void __const_udelay(unsigned long xloops)
{
__delay(xloops * (HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy));
}
diff --git a/arch/sh/lib64/udivdi3.S b/arch/sh/lib64/udivdi3.S
new file mode 100644
index 00000000000..6895c0225b8
--- /dev/null
+++ b/arch/sh/lib64/udivdi3.S
@@ -0,0 +1,120 @@
+ .section .text..SHmedia32,"ax"
+ .align 2
+ .global __udivdi3
+__udivdi3:
+ shlri r3,1,r4
+ nsb r4,r22
+ shlld r3,r22,r6
+ shlri r6,49,r5
+ movi 0xffffffffffffbaf1,r21 /* .l shift count 17. */
+ sub r21,r5,r1
+ mmulfx.w r1,r1,r4
+ mshflo.w r1,r63,r1
+ sub r63,r22,r20 // r63 == 64 % 64
+ mmulfx.w r5,r4,r4
+ pta large_divisor,tr0
+ addi r20,32,r9
+ msub.w r1,r4,r1
+ madd.w r1,r1,r1
+ mmulfx.w r1,r1,r4
+ shlri r6,32,r7
+ bgt/u r9,r63,tr0 // large_divisor
+ mmulfx.w r5,r4,r4
+ shlri r2,32+14,r19
+ addi r22,-31,r0
+ msub.w r1,r4,r1
+
+ mulu.l r1,r7,r4
+ addi r1,-3,r5
+ mulu.l r5,r19,r5
+ sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+ shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+ the case may be, %0000000000000000 000.11111111111, still */
+ muls.l r1,r4,r4 /* leaving at least one sign bit. */
+ mulu.l r5,r3,r8
+ mshalds.l r1,r21,r1
+ shari r4,26,r4
+ shlld r8,r0,r8
+ add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+ sub r2,r8,r2
+ /* Can do second step of 64 : 32 div now, using r1 and the rest in r2. */
+
+ shlri r2,22,r21
+ mulu.l r21,r1,r21
+ shlld r5,r0,r8
+ addi r20,30-22,r0
+ shlrd r21,r0,r21
+ mulu.l r21,r3,r5
+ add r8,r21,r8
+ mcmpgt.l r21,r63,r21 // See Note 1
+ addi r20,30,r0
+ mshfhi.l r63,r21,r21
+ sub r2,r5,r2
+ andc r2,r21,r2
+
+ /* small divisor: need a third divide step */
+ mulu.l r2,r1,r7
+ ptabs r18,tr0
+ addi r2,1,r2
+ shlrd r7,r0,r7
+ mulu.l r7,r3,r5
+ add r8,r7,r8
+ sub r2,r3,r2
+ cmpgt r2,r5,r5
+ add r8,r5,r2
+ /* could test r3 here to check for divide by zero. */
+ blink tr0,r63
+
+large_divisor:
+ mmulfx.w r5,r4,r4
+ shlrd r2,r9,r25
+ shlri r25,32,r8
+ msub.w r1,r4,r1
+
+ mulu.l r1,r7,r4
+ addi r1,-3,r5
+ mulu.l r5,r8,r5
+ sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+ shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+ the case may be, %0000000000000000 000.11111111111, still */
+ muls.l r1,r4,r4 /* leaving at least one sign bit. */
+ shlri r5,14-1,r8
+ mulu.l r8,r7,r5
+ mshalds.l r1,r21,r1
+ shari r4,26,r4
+ add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+ sub r25,r5,r25
+ /* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */
+
+ shlri r25,22,r21
+ mulu.l r21,r1,r21
+ pta no_lo_adj,tr0
+ addi r22,32,r0
+ shlri r21,40,r21
+ mulu.l r21,r7,r5
+ add r8,r21,r8
+ shlld r2,r0,r2
+ sub r25,r5,r25
+ bgtu/u r7,r25,tr0 // no_lo_adj
+ addi r8,1,r8
+ sub r25,r7,r25
+no_lo_adj:
+ mextr4 r2,r25,r2
+
+ /* large_divisor: only needs a few adjustments. */
+ mulu.l r8,r6,r5
+ ptabs r18,tr0
+ /* bubble */
+ cmpgtu r5,r2,r5
+ sub r8,r5,r2
+ blink tr0,r63
+
+/* Note 1: To shift the result of the second divide stage so that the result
+ always fits into 32 bits, yet we still reduce the rest sufficiently
+ would require a lot of instructions to do the shifts just right. Using
+ the full 64 bit shift result to multiply with the divisor would require
+ four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
+ Fortunately, if the upper 32 bits of the shift result are nonzero, we
+ know that the rest after taking this partial result into account will
+ fit into 32 bits. So we just clear the upper 32 bits of the rest if the
+ upper 32 bits of the partial result are nonzero. */
diff --git a/arch/sh/lib64/udivsi3.S b/arch/sh/lib64/udivsi3.S
new file mode 100644
index 00000000000..e68120e4b84
--- /dev/null
+++ b/arch/sh/lib64/udivsi3.S
@@ -0,0 +1,59 @@
+ .global __udivsi3
+ .section .text..SHmedia32,"ax"
+ .align 2
+
+/*
+ inputs: r4,r5
+ clobbered: r18,r19,r20,r21,r22,r25,tr0
+ result in r0.
+ */
+__udivsi3:
+ addz.l r5,r63,r22
+ nsb r22,r0
+ shlld r22,r0,r25
+ shlri r25,48,r25
+ movi 0xffffffffffffbb0c,r20 /* shift count eqiv 76 */
+ sub r20,r25,r21
+ mmulfx.w r21,r21,r19
+ mshflo.w r21,r63,r21
+ ptabs r18,tr0
+ mmulfx.w r25,r19,r19
+ sub r20,r0,r0
+ /* bubble */
+ msub.w r21,r19,r19
+
+ /*
+ * It would be nice for scheduling to do this add to r21 before
+ * the msub.w, but we need a different value for r19 to keep
+ * errors under control.
+ */
+ addi r19,-2,r21
+ mulu.l r4,r21,r18
+ mmulfx.w r19,r19,r19
+ shlli r21,15,r21
+ shlrd r18,r0,r18
+ mulu.l r18,r22,r20
+ mmacnfx.wl r25,r19,r21
+ /* bubble */
+ sub r4,r20,r25
+
+ mulu.l r25,r21,r19
+ addi r0,14,r0
+ /* bubble */
+ shlrd r19,r0,r19
+ mulu.l r19,r22,r20
+ add r18,r19,r18
+ /* bubble */
+ sub.l r25,r20,r25
+
+ mulu.l r25,r21,r19
+ addz.l r25,r63,r25
+ sub r25,r22,r25
+ shlrd r19,r0,r19
+ mulu.l r19,r22,r20
+ addi r25,1,r25
+ add r18,r19,r18
+
+ cmpgt r25,r20,r25
+ add.l r18,r25,r0
+ blink tr0,r63