Linux-2.6.12-rc2

Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.

Let it rip!

Author: Linus Torvalds
Date:   2005-04-16 15:20:36 -07:00
Commit: 1da177e4c3
17291 changed files with 6718755 additions and 0 deletions

arch/alpha/lib/Makefile

@@ -0,0 +1,58 @@
#
# Makefile for alpha-specific library files..
#
EXTRA_AFLAGS := $(CFLAGS)
EXTRA_CFLAGS := -Werror
# Many of these routines have implementations tuned for ev6.
# Choose them iff we're targeting ev6 specifically.
ev6-$(CONFIG_ALPHA_EV6) := ev6-
# Several make use of the cttz instruction introduced in ev67.
ev67-$(CONFIG_ALPHA_EV67) := ev67-
lib-y = __divqu.o __remqu.o __divlu.o __remlu.o \
udelay.o \
$(ev6-y)memset.o \
$(ev6-y)memcpy.o \
memmove.o \
checksum.o \
csum_partial_copy.o \
$(ev67-y)strlen.o \
$(ev67-y)strcat.o \
strcpy.o \
$(ev67-y)strncat.o \
strncpy.o \
$(ev6-y)stxcpy.o \
$(ev6-y)stxncpy.o \
$(ev67-y)strchr.o \
$(ev67-y)strrchr.o \
$(ev6-y)memchr.o \
$(ev6-y)copy_user.o \
$(ev6-y)clear_user.o \
$(ev6-y)strncpy_from_user.o \
$(ev67-y)strlen_user.o \
$(ev6-y)csum_ipv6_magic.o \
$(ev6-y)clear_page.o \
$(ev6-y)copy_page.o \
strcasecmp.o \
fpreg.o \
callback_srm.o srm_puts.o srm_printk.o
lib-$(CONFIG_SMP) += dec_and_lock.o
# The division routines are built from single source, with different defines.
AFLAGS___divqu.o = -DDIV
AFLAGS___remqu.o = -DREM
AFLAGS___divlu.o = -DDIV -DINTSIZE
AFLAGS___remlu.o = -DREM -DINTSIZE
$(obj)/__divqu.o: $(obj)/$(ev6-y)divide.S
$(cmd_as_o_S)
$(obj)/__remqu.o: $(obj)/$(ev6-y)divide.S
$(cmd_as_o_S)
$(obj)/__divlu.o: $(obj)/$(ev6-y)divide.S
$(cmd_as_o_S)
$(obj)/__remlu.o: $(obj)/$(ev6-y)divide.S
$(cmd_as_o_S)


@@ -0,0 +1,104 @@
/*
* arch/alpha/lib/callback_srm.S
*/
#include <linux/config.h>
#include <asm/console.h>
.text
#define HWRPB_CRB_OFFSET 0xc0
#if defined(CONFIG_ALPHA_SRM) || defined(CONFIG_ALPHA_GENERIC)
.align 4
srm_dispatch:
#if defined(CONFIG_ALPHA_GENERIC)
ldl $4,alpha_using_srm
beq $4,nosrm
#endif
ldq $0,hwrpb # gp is set up by CALLBACK macro.
ldl $25,0($25) # Pick up the wrapper data.
mov $20,$21 # Shift arguments right.
mov $19,$20
ldq $1,HWRPB_CRB_OFFSET($0)
mov $18,$19
mov $17,$18
mov $16,$17
addq $0,$1,$2 # CRB address
ldq $27,0($2) # DISPATCH procedure descriptor (VMS call std)
extwl $25,0,$16 # SRM callback function code
ldq $3,8($27) # call address
extwl $25,2,$25 # argument information (VMS calling std)
jmp ($3) # Return directly to caller of wrapper.
.align 4
.globl srm_fixup
.ent srm_fixup
srm_fixup:
ldgp $29,0($27)
#if defined(CONFIG_ALPHA_GENERIC)
ldl $4,alpha_using_srm
beq $4,nosrm
#endif
ldq $0,hwrpb
ldq $1,HWRPB_CRB_OFFSET($0)
addq $0,$1,$2 # CRB address
ldq $27,16($2) # VA of FIXUP procedure descriptor
ldq $3,8($27) # call address
lda $25,2($31) # two integer arguments
jmp ($3) # Return directly to caller of srm_fixup.
.end srm_fixup
#if defined(CONFIG_ALPHA_GENERIC)
.align 3
nosrm:
lda $0,-1($31)
ret
#endif
#define CALLBACK(NAME, CODE, ARG_CNT) \
.align 4; .globl callback_##NAME; .ent callback_##NAME; callback_##NAME##: \
ldgp $29,0($27); br $25,srm_dispatch; .word CODE, ARG_CNT; .end callback_##NAME
#else /* defined(CONFIG_ALPHA_SRM) || defined(CONFIG_ALPHA_GENERIC) */
#define CALLBACK(NAME, CODE, ARG_CNT) \
.align 3; .globl callback_##NAME; .ent callback_##NAME; callback_##NAME##: \
lda $0,-1($31); ret; .end callback_##NAME
.align 3
.globl srm_fixup
.ent srm_fixup
srm_fixup:
lda $0,-1($31)
ret
.end srm_fixup
#endif /* defined(CONFIG_ALPHA_SRM) || defined(CONFIG_ALPHA_GENERIC) */
CALLBACK(puts, CCB_PUTS, 4)
CALLBACK(open, CCB_OPEN, 3)
CALLBACK(close, CCB_CLOSE, 2)
CALLBACK(read, CCB_READ, 5)
CALLBACK(open_console, CCB_OPEN_CONSOLE, 1)
CALLBACK(close_console, CCB_CLOSE_CONSOLE, 1)
CALLBACK(getenv, CCB_GET_ENV, 4)
CALLBACK(setenv, CCB_SET_ENV, 4)
CALLBACK(getc, CCB_GETC, 2)
CALLBACK(reset_term, CCB_RESET_TERM, 2)
CALLBACK(term_int, CCB_SET_TERM_INT, 3)
CALLBACK(term_ctl, CCB_SET_TERM_CTL, 3)
CALLBACK(process_keycode, CCB_PROCESS_KEYCODE, 3)
CALLBACK(ioctl, CCB_IOCTL, 6)
CALLBACK(write, CCB_WRITE, 5)
CALLBACK(reset_env, CCB_RESET_ENV, 4)
CALLBACK(save_env, CCB_SAVE_ENV, 1)
CALLBACK(pswitch, CCB_PSWITCH, 3)
CALLBACK(bios_emul, CCB_BIOS_EMUL, 5)
.data
__alpha_using_srm: # For use by bootpheader
.long 7 # value is not 1 for link debugging
.weak alpha_using_srm; alpha_using_srm = __alpha_using_srm
__callback_init_done: # For use by bootpheader
.long 7 # value is not 1 for link debugging
.weak callback_init_done; callback_init_done = __callback_init_done

arch/alpha/lib/checksum.c

@@ -0,0 +1,186 @@
/*
* arch/alpha/lib/checksum.c
*
* This file contains network checksum routines that are better done
* in an architecture-specific manner due to speed..
* Comments in other versions indicate that the algorithms are from RFC1071
*
* accelerated versions (and 21264 assembly versions) contributed by
* Rick Gorton <rick.gorton@alpha-processor.com>
*/
#include <linux/module.h>
#include <linux/string.h>
#include <asm/byteorder.h>
static inline unsigned short from64to16(unsigned long x)
{
/* Using extract instructions is a bit more efficient
than the original shift/bitmask version. */
union {
unsigned long ul;
unsigned int ui[2];
unsigned short us[4];
} in_v, tmp_v, out_v;
in_v.ul = x;
tmp_v.ul = (unsigned long) in_v.ui[0] + (unsigned long) in_v.ui[1];
/* Since the bits of tmp_v.us[3] are always going to be zero,
we don't have to bother to add that in. */
out_v.ul = (unsigned long) tmp_v.us[0] + (unsigned long) tmp_v.us[1]
+ (unsigned long) tmp_v.us[2];
/* Similarly, out_v.us[2] is always zero for the final add. */
return out_v.us[0] + out_v.us[1];
}
/*
* computes the checksum of the TCP/UDP pseudo-header
* returns a 16-bit checksum, already complemented.
*/
unsigned short int csum_tcpudp_magic(unsigned long saddr,
unsigned long daddr,
unsigned short len,
unsigned short proto,
unsigned int sum)
{
return ~from64to16(saddr + daddr + sum +
((unsigned long) ntohs(len) << 16) +
((unsigned long) proto << 8));
}
unsigned int csum_tcpudp_nofold(unsigned long saddr,
unsigned long daddr,
unsigned short len,
unsigned short proto,
unsigned int sum)
{
unsigned long result;
result = (saddr + daddr + sum +
((unsigned long) ntohs(len) << 16) +
((unsigned long) proto << 8));
/* Fold down to 32-bits so we don't lose in the typedef-less
network stack. */
/* 64 to 33 */
result = (result & 0xffffffff) + (result >> 32);
/* 33 to 32 */
result = (result & 0xffffffff) + (result >> 32);
return result;
}
/*
* Do a 64-bit checksum on an arbitrary memory area..
*
* This isn't a great routine, but it's not _horrible_ either. The
* inner loop could be unrolled a bit further, and there are better
* ways to do the carry, but this is reasonable.
*/
static inline unsigned long do_csum(const unsigned char * buff, int len)
{
int odd, count;
unsigned long result = 0;
if (len <= 0)
goto out;
odd = 1 & (unsigned long) buff;
if (odd) {
result = *buff << 8;
len--;
buff++;
}
count = len >> 1; /* nr of 16-bit words.. */
if (count) {
if (2 & (unsigned long) buff) {
result += *(unsigned short *) buff;
count--;
len -= 2;
buff += 2;
}
count >>= 1; /* nr of 32-bit words.. */
if (count) {
if (4 & (unsigned long) buff) {
result += *(unsigned int *) buff;
count--;
len -= 4;
buff += 4;
}
count >>= 1; /* nr of 64-bit words.. */
if (count) {
unsigned long carry = 0;
do {
unsigned long w = *(unsigned long *) buff;
count--;
buff += 8;
result += carry;
result += w;
carry = (w > result);
} while (count);
result += carry;
result = (result & 0xffffffff) + (result >> 32);
}
if (len & 4) {
result += *(unsigned int *) buff;
buff += 4;
}
}
if (len & 2) {
result += *(unsigned short *) buff;
buff += 2;
}
}
if (len & 1)
result += *buff;
result = from64to16(result);
if (odd)
result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
out:
return result;
}
/*
* This is a version of ip_compute_csum() optimized for IP headers,
* which always checksum on 4 octet boundaries.
*/
unsigned short ip_fast_csum(unsigned char * iph, unsigned int ihl)
{
return ~do_csum(iph,ihl*4);
}
/*
* computes the checksum of a memory block at buff, length len,
* and adds in "sum" (32-bit)
*
* returns a 32-bit number suitable for feeding into itself
* or csum_tcpudp_magic
*
* this function must be called with even lengths, except
* for the last fragment, which may be odd
*
* it's best to have buff aligned on a 32-bit boundary
*/
unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
{
unsigned long result = do_csum(buff, len);
/* add in old sum, and carry.. */
result += sum;
/* 32+c bits -> 32 bits */
result = (result & 0xffffffff) + (result >> 32);
return result;
}
EXPORT_SYMBOL(csum_partial);
/*
* this routine is used for miscellaneous IP-like checksums, mainly
* in icmp.c
*/
unsigned short ip_compute_csum(unsigned char * buff, int len)
{
return ~from64to16(do_csum(buff,len));
}
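
The union-based from64to16() above replaces what its own comment calls the original shift/bitmask version. For reference, a minimal portable sketch of that conventional fold (illustrative only, not part of the commit; fold64to16_sketch is a made-up name):

#include <stdint.h>

/* Fold a 64-bit one's-complement accumulator down to 16 bits by
 * repeatedly adding the upper half back into the lower half. */
static uint16_t fold64to16_sketch(uint64_t sum)
{
	sum = (sum & 0xffffffffULL) + (sum >> 32);	/* 64 -> 33 bits */
	sum = (sum & 0xffffffffULL) + (sum >> 32);	/* 33 -> 32 bits */
	sum = (sum & 0xffff) + (sum >> 16);		/* 32 -> 17 bits */
	sum = (sum & 0xffff) + (sum >> 16);		/* 17 -> 16 bits */
	return (uint16_t)sum;
}

csum_tcpudp_magic() then simply complements such a fold of the pseudo-header sum, as the code above shows.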


@@ -0,0 +1,39 @@
/*
* arch/alpha/lib/clear_page.S
*
* Zero an entire page.
*/
.text
.align 4
.global clear_page
.ent clear_page
clear_page:
.prologue 0
lda $0,128
nop
unop
nop
1: stq $31,0($16)
stq $31,8($16)
stq $31,16($16)
stq $31,24($16)
stq $31,32($16)
stq $31,40($16)
stq $31,48($16)
subq $0,1,$0
stq $31,56($16)
addq $16,64,$16
unop
bne $0,1b
ret
nop
unop
nop
.end clear_page

arch/alpha/lib/clear_user.S

@@ -0,0 +1,113 @@
/*
* arch/alpha/lib/clear_user.S
* Contributed by Richard Henderson <rth@tamu.edu>
*
* Zero user space, handling exceptions as we go.
*
* We have to make sure that $0 is always up-to-date and contains the
* right "bytes left to zero" value (and that it is updated only _after_
* a successful copy). There is also some rather minor exception setup
* stuff.
*
* NOTE! This is not directly C-callable, because the calling semantics
* are different:
*
* Inputs:
* length in $0
* destination address in $6
* exception pointer in $7
* return address in $28 (exceptions expect it there)
*
* Outputs:
* bytes left to copy in $0
*
* Clobbers:
* $1,$2,$3,$4,$5,$6
*/
/* Allow an exception for an insn; exit if we get one. */
#define EX(x,y...) \
99: x,##y; \
.section __ex_table,"a"; \
.long 99b - .; \
lda $31, $exception-99b($31); \
.previous
.set noat
.set noreorder
.align 4
.globl __do_clear_user
.ent __do_clear_user
.frame $30, 0, $28
.prologue 0
$loop:
and $1, 3, $4 # e0 :
beq $4, 1f # .. e1 :
0: EX( stq_u $31, 0($6) ) # e0 : zero one word
subq $0, 8, $0 # .. e1 :
subq $4, 1, $4 # e0 :
addq $6, 8, $6 # .. e1 :
bne $4, 0b # e1 :
unop # :
1: bic $1, 3, $1 # e0 :
beq $1, $tail # .. e1 :
2: EX( stq_u $31, 0($6) ) # e0 : zero four words
subq $0, 8, $0 # .. e1 :
EX( stq_u $31, 8($6) ) # e0 :
subq $0, 8, $0 # .. e1 :
EX( stq_u $31, 16($6) ) # e0 :
subq $0, 8, $0 # .. e1 :
EX( stq_u $31, 24($6) ) # e0 :
subq $0, 8, $0 # .. e1 :
subq $1, 4, $1 # e0 :
addq $6, 32, $6 # .. e1 :
bne $1, 2b # e1 :
$tail:
bne $2, 1f # e1 : is there a tail to do?
ret $31, ($28), 1 # .. e1 :
1: EX( ldq_u $5, 0($6) ) # e0 :
clr $0 # .. e1 :
nop # e1 :
mskqh $5, $0, $5 # e0 :
EX( stq_u $5, 0($6) ) # e0 :
ret $31, ($28), 1 # .. e1 :
__do_clear_user:
and $6, 7, $4 # e0 : find dest misalignment
beq $0, $zerolength # .. e1 :
addq $0, $4, $1 # e0 : bias counter
and $1, 7, $2 # e1 : number of bytes in tail
srl $1, 3, $1 # e0 :
beq $4, $loop # .. e1 :
EX( ldq_u $5, 0($6) ) # e0 : load dst word to mask back in
beq $1, $oneword # .. e1 : sub-word store?
mskql $5, $6, $5 # e0 : take care of misaligned head
addq $6, 8, $6 # .. e1 :
EX( stq_u $5, -8($6) ) # e0 :
addq $0, $4, $0 # .. e1 : bytes left -= 8 - misalignment
subq $1, 1, $1 # e0 :
subq $0, 8, $0 # .. e1 :
br $loop # e1 :
unop # :
$oneword:
mskql $5, $6, $4 # e0 :
mskqh $5, $2, $5 # e0 :
or $5, $4, $5 # e1 :
EX( stq_u $5, 0($6) ) # e0 :
clr $0 # .. e1 :
$zerolength:
$exception:
ret $31, ($28), 1 # .. e1 :
.end __do_clear_user
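
As the header comment says, $0 must always hold the number of bytes still to be zeroed and is updated only after each store has succeeded, so a fault in the middle reports exactly how much work remains. A rough C model of that contract (illustrative only; clear_user_model is a made-up name):

/* Zero `len` bytes at `to`; return how many bytes could NOT be
 * zeroed (0 on full success). In the assembly, a faulting store
 * branches to $exception and returns whatever is left in $0. */
static unsigned long clear_user_model(unsigned char *to, unsigned long len)
{
	while (len) {
		*to++ = 0;	/* this store may fault in the real routine */
		len--;		/* count drops only after the store worked */
	}
	return len;		/* bytes left to zero */
}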


@@ -0,0 +1,49 @@
/*
* arch/alpha/lib/copy_page.S
*
* Copy an entire page.
*/
.text
.align 4
.global copy_page
.ent copy_page
copy_page:
.prologue 0
lda $18,128
nop
unop
nop
1: ldq $0,0($17)
ldq $1,8($17)
ldq $2,16($17)
ldq $3,24($17)
ldq $4,32($17)
ldq $5,40($17)
ldq $6,48($17)
ldq $7,56($17)
stq $0,0($16)
subq $18,1,$18
stq $1,8($16)
addq $17,64,$17
stq $2,16($16)
stq $3,24($16)
stq $4,32($16)
stq $5,40($16)
stq $6,48($16)
stq $7,56($16)
addq $16,64,$16
bne $18, 1b
ret
nop
unop
nop
.end copy_page

arch/alpha/lib/copy_user.S

@@ -0,0 +1,145 @@
/*
* arch/alpha/lib/copy_user.S
*
* Copy to/from user space, handling exceptions as we go.. This
* isn't exactly pretty.
*
* This is essentially the same as "memcpy()", but with a few twists.
* Notably, we have to make sure that $0 is always up-to-date and
* contains the right "bytes left to copy" value (and that it is updated
* only _after_ a successful copy). There is also some rather minor
* exception setup stuff..
*
* NOTE! This is not directly C-callable, because the calling semantics are
* different:
*
* Inputs:
* length in $0
* destination address in $6
* source address in $7
* return address in $28
*
* Outputs:
* bytes left to copy in $0
*
* Clobbers:
* $1,$2,$3,$4,$5,$6,$7
*/
/* Allow an exception for an insn; exit if we get one. */
#define EXI(x,y...) \
99: x,##y; \
.section __ex_table,"a"; \
.long 99b - .; \
lda $31, $exitin-99b($31); \
.previous
#define EXO(x,y...) \
99: x,##y; \
.section __ex_table,"a"; \
.long 99b - .; \
lda $31, $exitout-99b($31); \
.previous
.set noat
.align 4
.globl __copy_user
.ent __copy_user
__copy_user:
.prologue 0
and $6,7,$3
beq $0,$35
beq $3,$36
subq $3,8,$3
.align 4
$37:
EXI( ldq_u $1,0($7) )
EXO( ldq_u $2,0($6) )
extbl $1,$7,$1
mskbl $2,$6,$2
insbl $1,$6,$1
addq $3,1,$3
bis $1,$2,$1
EXO( stq_u $1,0($6) )
subq $0,1,$0
addq $6,1,$6
addq $7,1,$7
beq $0,$41
bne $3,$37
$36:
and $7,7,$1
bic $0,7,$4
beq $1,$43
beq $4,$48
EXI( ldq_u $3,0($7) )
.align 4
$50:
EXI( ldq_u $2,8($7) )
subq $4,8,$4
extql $3,$7,$3
extqh $2,$7,$1
bis $3,$1,$1
EXO( stq $1,0($6) )
addq $7,8,$7
subq $0,8,$0
addq $6,8,$6
bis $2,$2,$3
bne $4,$50
$48:
beq $0,$41
.align 4
$57:
EXI( ldq_u $1,0($7) )
EXO( ldq_u $2,0($6) )
extbl $1,$7,$1
mskbl $2,$6,$2
insbl $1,$6,$1
bis $1,$2,$1
EXO( stq_u $1,0($6) )
subq $0,1,$0
addq $6,1,$6
addq $7,1,$7
bne $0,$57
br $31,$41
.align 4
$43:
beq $4,$65
.align 4
$66:
EXI( ldq $1,0($7) )
subq $4,8,$4
EXO( stq $1,0($6) )
addq $7,8,$7
subq $0,8,$0
addq $6,8,$6
bne $4,$66
$65:
beq $0,$41
EXI( ldq $2,0($7) )
EXO( ldq $1,0($6) )
mskql $2,$0,$2
mskqh $1,$0,$1
bis $2,$1,$2
EXO( stq $2,0($6) )
bis $31,$31,$0
$41:
$35:
$exitout:
ret $31,($28),1
$exitin:
/* A stupid byte-by-byte zeroing of the rest of the output
buffer. This cures security holes by never leaving
random kernel data around to be copied elsewhere. */
mov $0,$1
$101:
EXO ( ldq_u $2,0($6) )
subq $1,1,$1
mskbl $2,$6,$2
EXO ( stq_u $2,0($6) )
addq $6,1,$6
bgt $1,$101
ret $31,($28),1
.end __copy_user


@@ -0,0 +1,92 @@
/*
* arch/alpha/lib/csum_ipv6_magic.S
* Contributed by Richard Henderson <rth@tamu.edu>
*
* unsigned short csum_ipv6_magic(struct in6_addr *saddr,
* struct in6_addr *daddr,
* __u32 len,
* unsigned short proto,
* unsigned int csum);
*/
.globl csum_ipv6_magic
.align 4
.ent csum_ipv6_magic
.frame $30,0,$26,0
csum_ipv6_magic:
.prologue 0
ldq $0,0($16) # e0 : load src & dst addr words
zapnot $20,15,$20 # .. e1 : zero extend incoming csum
extqh $18,1,$4 # e0 : byte swap len & proto while we wait
ldq $1,8($16) # .. e1 :
extbl $18,1,$5 # e0 :
ldq $2,0($17) # .. e1 :
extbl $18,2,$6 # e0 :
ldq $3,8($17) # .. e1 :
extbl $18,3,$18 # e0 :
sra $4,32,$4 # e0 :
sll $5,16,$5 # e0 :
addq $20,$0,$20 # .. e1 : begin summing the words
sll $6,8,$6 # e0 :
cmpult $20,$0,$0 # .. e1 :
extwh $19,7,$7 # e0 :
or $4,$18,$18 # .. e1 :
extbl $19,1,$19 # e0 :
or $5,$6,$5 # .. e1 :
or $18,$5,$18 # e0 : len complete
or $19,$7,$19 # .. e1 :
sll $19,48,$19 # e0 :
addq $20,$1,$20 # .. e1 :
sra $19,32,$19 # e0 : proto complete
cmpult $20,$1,$1 # .. e1 :
nop # e0 :
addq $20,$2,$20 # .. e1 :
cmpult $20,$2,$2 # e0 :
addq $20,$3,$20 # .. e1 :
cmpult $20,$3,$3 # e0 :
addq $20,$18,$20 # .. e1 :
cmpult $20,$18,$18 # e0 :
addq $20,$19,$20 # .. e1 :
cmpult $20,$19,$19 # e0 :
addq $0,$1,$0 # .. e1 : merge the carries back into the csum
addq $2,$3,$2 # e0 :
addq $18,$19,$18 # .. e1 :
addq $0,$2,$0 # e0 :
addq $20,$18,$20 # .. e1 :
addq $0,$20,$0 # e0 :
unop # :
extwl $0,2,$2 # e0 : begin folding the 64-bit value
zapnot $0,3,$3 # .. e1 :
extwl $0,4,$1 # e0 :
addq $2,$3,$3 # .. e1 :
extwl $0,6,$0 # e0 :
addq $3,$1,$3 # .. e1 :
addq $0,$3,$0 # e0 :
unop # :
extwl $0,2,$1 # e0 : fold 18-bit value
zapnot $0,3,$0 # .. e1 :
addq $0,$1,$0 # e0 :
unop # :
extwl $0,2,$1 # e0 : fold 17-bit value
zapnot $0,3,$0 # .. e1 :
addq $0,$1,$0 # e0 :
not $0,$0 # e1 : and complement.
zapnot $0,3,$0 # e0 :
ret # .. e1 :
.end csum_ipv6_magic


@@ -0,0 +1,391 @@
/*
* csum_partial_copy - do IP checksumming and copy
*
* (C) Copyright 1996 Linus Torvalds
* accelerated versions (and 21264 assembly versions) contributed by
* Rick Gorton <rick.gorton@alpha-processor.com>
*
* Don't look at this too closely - you'll go mad. The things
* we do for performance..
*/
#include <linux/types.h>
#include <linux/string.h>
#include <asm/uaccess.h>
#define ldq_u(x,y) \
__asm__ __volatile__("ldq_u %0,%1":"=r" (x):"m" (*(const unsigned long *)(y)))
#define stq_u(x,y) \
__asm__ __volatile__("stq_u %1,%0":"=m" (*(unsigned long *)(y)):"r" (x))
#define extql(x,y,z) \
__asm__ __volatile__("extql %1,%2,%0":"=r" (z):"r" (x),"r" (y))
#define extqh(x,y,z) \
__asm__ __volatile__("extqh %1,%2,%0":"=r" (z):"r" (x),"r" (y))
#define mskql(x,y,z) \
__asm__ __volatile__("mskql %1,%2,%0":"=r" (z):"r" (x),"r" (y))
#define mskqh(x,y,z) \
__asm__ __volatile__("mskqh %1,%2,%0":"=r" (z):"r" (x),"r" (y))
#define insql(x,y,z) \
__asm__ __volatile__("insql %1,%2,%0":"=r" (z):"r" (x),"r" (y))
#define insqh(x,y,z) \
__asm__ __volatile__("insqh %1,%2,%0":"=r" (z):"r" (x),"r" (y))
#define __get_user_u(x,ptr) \
({ \
long __guu_err; \
__asm__ __volatile__( \
"1: ldq_u %0,%2\n" \
"2:\n" \
".section __ex_table,\"a\"\n" \
" .long 1b - .\n" \
" lda %0,2b-1b(%1)\n" \
".previous" \
: "=r"(x), "=r"(__guu_err) \
: "m"(__m(ptr)), "1"(0)); \
__guu_err; \
})
#define __put_user_u(x,ptr) \
({ \
long __puu_err; \
__asm__ __volatile__( \
"1: stq_u %2,%1\n" \
"2:\n" \
".section __ex_table,\"a\"\n" \
" .long 1b - ." \
" lda $31,2b-1b(%0)\n" \
".previous" \
: "=r"(__puu_err) \
: "m"(__m(addr)), "rJ"(x), "0"(0)); \
__puu_err; \
})
static inline unsigned short from64to16(unsigned long x)
{
/* Using extract instructions is a bit more efficient
than the original shift/bitmask version. */
union {
unsigned long ul;
unsigned int ui[2];
unsigned short us[4];
} in_v, tmp_v, out_v;
in_v.ul = x;
tmp_v.ul = (unsigned long) in_v.ui[0] + (unsigned long) in_v.ui[1];
/* Since the bits of tmp_v.us[3] are always going to be zero,
we don't have to bother to add that in. */
out_v.ul = (unsigned long) tmp_v.us[0] + (unsigned long) tmp_v.us[1]
+ (unsigned long) tmp_v.us[2];
/* Similarly, out_v.us[2] is always zero for the final add. */
return out_v.us[0] + out_v.us[1];
}
/*
* Ok. This isn't fun, but this is the EASY case.
*/
static inline unsigned long
csum_partial_cfu_aligned(const unsigned long __user *src, unsigned long *dst,
long len, unsigned long checksum,
int *errp)
{
unsigned long carry = 0;
int err = 0;
while (len >= 0) {
unsigned long word;
err |= __get_user(word, src);
checksum += carry;
src++;
checksum += word;
len -= 8;
carry = checksum < word;
*dst = word;
dst++;
}
len += 8;
checksum += carry;
if (len) {
unsigned long word, tmp;
err |= __get_user(word, src);
tmp = *dst;
mskql(word, len, word);
checksum += word;
mskqh(tmp, len, tmp);
carry = checksum < word;
*dst = word | tmp;
checksum += carry;
}
if (err) *errp = err;
return checksum;
}
/*
* This is even less fun, but this is still reasonably
* easy.
*/
static inline unsigned long
csum_partial_cfu_dest_aligned(const unsigned long __user *src,
unsigned long *dst,
unsigned long soff,
long len, unsigned long checksum,
int *errp)
{
unsigned long first;
unsigned long word, carry;
unsigned long lastsrc = 7+len+(unsigned long)src;
int err = 0;
err |= __get_user_u(first,src);
carry = 0;
while (len >= 0) {
unsigned long second;
err |= __get_user_u(second, src+1);
extql(first, soff, word);
len -= 8;
src++;
extqh(second, soff, first);
checksum += carry;
word |= first;
first = second;
checksum += word;
*dst = word;
dst++;
carry = checksum < word;
}
len += 8;
checksum += carry;
if (len) {
unsigned long tmp;
unsigned long second;
err |= __get_user_u(second, lastsrc);
tmp = *dst;
extql(first, soff, word);
extqh(second, soff, first);
word |= first;
mskql(word, len, word);
checksum += word;
mskqh(tmp, len, tmp);
carry = checksum < word;
*dst = word | tmp;
checksum += carry;
}
if (err) *errp = err;
return checksum;
}
/*
* This is slightly less fun than the above..
*/
static inline unsigned long
csum_partial_cfu_src_aligned(const unsigned long __user *src,
unsigned long *dst,
unsigned long doff,
long len, unsigned long checksum,
unsigned long partial_dest,
int *errp)
{
unsigned long carry = 0;
unsigned long word;
unsigned long second_dest;
int err = 0;
mskql(partial_dest, doff, partial_dest);
while (len >= 0) {
err |= __get_user(word, src);
len -= 8;
insql(word, doff, second_dest);
checksum += carry;
stq_u(partial_dest | second_dest, dst);
src++;
checksum += word;
insqh(word, doff, partial_dest);
carry = checksum < word;
dst++;
}
len += 8;
if (len) {
checksum += carry;
err |= __get_user(word, src);
mskql(word, len, word);
len -= 8;
checksum += word;
insql(word, doff, second_dest);
len += doff;
carry = checksum < word;
partial_dest |= second_dest;
if (len >= 0) {
stq_u(partial_dest, dst);
if (!len) goto out;
dst++;
insqh(word, doff, partial_dest);
}
doff = len;
}
ldq_u(second_dest, dst);
mskqh(second_dest, doff, second_dest);
stq_u(partial_dest | second_dest, dst);
out:
checksum += carry;
if (err) *errp = err;
return checksum;
}
/*
* This is so totally un-fun that it's frightening. Don't
* look at this too closely, you'll go blind.
*/
static inline unsigned long
csum_partial_cfu_unaligned(const unsigned long __user * src,
unsigned long * dst,
unsigned long soff, unsigned long doff,
long len, unsigned long checksum,
unsigned long partial_dest,
int *errp)
{
unsigned long carry = 0;
unsigned long first;
unsigned long lastsrc;
int err = 0;
err |= __get_user_u(first, src);
lastsrc = 7+len+(unsigned long)src;
mskql(partial_dest, doff, partial_dest);
while (len >= 0) {
unsigned long second, word;
unsigned long second_dest;
err |= __get_user_u(second, src+1);
extql(first, soff, word);
checksum += carry;
len -= 8;
extqh(second, soff, first);
src++;
word |= first;
first = second;
insql(word, doff, second_dest);
checksum += word;
stq_u(partial_dest | second_dest, dst);
carry = checksum < word;
insqh(word, doff, partial_dest);
dst++;
}
len += doff;
checksum += carry;
if (len >= 0) {
unsigned long second, word;
unsigned long second_dest;
err |= __get_user_u(second, lastsrc);
extql(first, soff, word);
extqh(second, soff, first);
word |= first;
first = second;
mskql(word, len-doff, word);
checksum += word;
insql(word, doff, second_dest);
carry = checksum < word;
stq_u(partial_dest | second_dest, dst);
if (len) {
ldq_u(second_dest, dst+1);
insqh(word, doff, partial_dest);
mskqh(second_dest, len, second_dest);
stq_u(partial_dest | second_dest, dst+1);
}
checksum += carry;
} else {
unsigned long second, word;
unsigned long second_dest;
err |= __get_user_u(second, lastsrc);
extql(first, soff, word);
extqh(second, soff, first);
word |= first;
ldq_u(second_dest, dst);
mskql(word, len-doff, word);
checksum += word;
mskqh(second_dest, len, second_dest);
carry = checksum < word;
insql(word, doff, word);
stq_u(partial_dest | word | second_dest, dst);
checksum += carry;
}
if (err) *errp = err;
return checksum;
}
static unsigned int
do_csum_partial_copy_from_user(const char __user *src, char *dst, int len,
unsigned int sum, int *errp)
{
unsigned long checksum = (unsigned) sum;
unsigned long soff = 7 & (unsigned long) src;
unsigned long doff = 7 & (unsigned long) dst;
if (len) {
if (!doff) {
if (!soff)
checksum = csum_partial_cfu_aligned(
(const unsigned long __user *) src,
(unsigned long *) dst,
len-8, checksum, errp);
else
checksum = csum_partial_cfu_dest_aligned(
(const unsigned long __user *) src,
(unsigned long *) dst,
soff, len-8, checksum, errp);
} else {
unsigned long partial_dest;
ldq_u(partial_dest, dst);
if (!soff)
checksum = csum_partial_cfu_src_aligned(
(const unsigned long __user *) src,
(unsigned long *) dst,
doff, len-8, checksum,
partial_dest, errp);
else
checksum = csum_partial_cfu_unaligned(
(const unsigned long __user *) src,
(unsigned long *) dst,
soff, doff, len-8, checksum,
partial_dest, errp);
}
checksum = from64to16 (checksum);
}
return checksum;
}
unsigned int
csum_partial_copy_from_user(const char __user *src, char *dst, int len,
unsigned int sum, int *errp)
{
if (!access_ok(VERIFY_READ, src, len)) {
*errp = -EFAULT;
memset(dst, 0, len);
return sum;
}
return do_csum_partial_copy_from_user(src, dst, len, sum, errp);
}
unsigned int
csum_partial_copy_nocheck(const char __user *src, char *dst, int len,
unsigned int sum)
{
return do_csum_partial_copy_from_user(src, dst, len, sum, NULL);
}
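
The ldq_u/extql/extqh macros above are the Alpha idiom for reading a quadword that is not 8-byte aligned: load the two aligned quadwords that straddle the address, shift each so the wanted bytes line up, and OR the halves together. A rough little-endian C equivalent (illustrative only; load_unaligned_sketch is a made-up name):

#include <stdint.h>

static uint64_t load_unaligned_sketch(const uint8_t *p)
{
	uintptr_t addr = (uintptr_t)p;
	const uint64_t *q = (const uint64_t *)(addr & ~(uintptr_t)7);
	unsigned int shift = (addr & 7) * 8;

	if (!shift)				/* already aligned */
		return q[0];
	/* extql-like: wanted low bytes from the first quadword;
	 * extqh-like: remaining high bytes from the second one. */
	return (q[0] >> shift) | (q[1] << (64 - shift));
}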


@@ -0,0 +1,29 @@
/*
* arch/alpha/lib/dbg_current.S
* Contributed by Richard Henderson (rth@cygnus.com)
*
* Trap if we find current not correct.
*/
#include <asm/pal.h>
.text
.set noat
.globl _mcount
.ent _mcount
_mcount:
.frame $30, 0, $28, 0
.prologue 0
lda $0, -0x4000($30)
cmpult $8, $30, $1
cmpule $0, $30, $2
and $1, $2, $3
bne $3, 1f
call_pal PAL_bugchk
1: ret $31, ($28), 1
.end _mcount


@@ -0,0 +1,27 @@
/*
* arch/alpha/lib/stackcheck.S
* Contributed by Richard Henderson (rth@tamu.edu)
*
* Verify that we have not overflowed the stack. Oops if we have.
*/
#include <asm/asm_offsets.h>
.text
.set noat
.align 3
.globl _mcount
.ent _mcount
_mcount:
.frame $30, 0, $28, 0
.prologue 0
lda $0, TASK_SIZE($8)
cmpult $30, $0, $0
bne $0, 1f
ret ($28)
1: stq $31, -8($31) # oops me, damn it.
br 1b
.end _mcount


@@ -0,0 +1,35 @@
/*
* arch/alpha/lib/killstack.S
* Contributed by Richard Henderson (rth@cygnus.com)
*
* Clobber the balance of the kernel stack, hoping to catch
* uninitialized local variables in the act.
*/
#include <asm/asm_offsets.h>
.text
.set noat
.align 5
.globl _mcount
.ent _mcount
_mcount:
.frame $30, 0, $28, 0
.prologue 0
ldi $0, 0xdeadbeef
lda $2, -STACK_SIZE
sll $0, 32, $1
and $30, $2, $2
or $0, $1, $0
lda $2, TASK_SIZE($2)
cmpult $2, $30, $1
beq $1, 2f
1: stq $0, 0($2)
addq $2, 8, $2
cmpult $2, $30, $1
bne $1, 1b
2: ret ($28)
.end _mcount


@@ -0,0 +1,42 @@
/*
* arch/alpha/lib/dec_and_lock.c
*
* ll/sc version of atomic_dec_and_lock()
*
*/
#include <linux/spinlock.h>
#include <asm/atomic.h>
asm (".text \n\
.global _atomic_dec_and_lock \n\
.ent _atomic_dec_and_lock \n\
.align 4 \n\
_atomic_dec_and_lock: \n\
.prologue 0 \n\
1: ldl_l $1, 0($16) \n\
subl $1, 1, $1 \n\
beq $1, 2f \n\
stl_c $1, 0($16) \n\
beq $1, 4f \n\
mb \n\
clr $0 \n\
ret \n\
2: br $29, 3f \n\
3: ldgp $29, 0($29) \n\
br $atomic_dec_and_lock_1..ng \n\
.subsection 2 \n\
4: br 1b \n\
.previous \n\
.end _atomic_dec_and_lock");
static int __attribute_used__
atomic_dec_and_lock_1(atomic_t *atomic, spinlock_t *lock)
{
/* Slow path */
spin_lock(lock);
if (atomic_dec_and_test(atomic))
return 1;
spin_unlock(lock);
return 0;
}
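
The assembly fast path above decrements with ldl_l/stl_c and only falls back to taking the lock when the count would reach zero. A userspace model of the same contract, using C11 atomics and a pthread mutex in place of the kernel spinlock (illustrative only; dec_and_lock_model is a made-up name):

#include <pthread.h>
#include <stdatomic.h>

/* Decrement *cnt; return 1 with *lock held iff the count hit zero,
 * otherwise return 0 without the lock. */
static int dec_and_lock_model(atomic_int *cnt, pthread_mutex_t *lock)
{
	int old = atomic_load(cnt);

	/* Fast path: if the result stays above zero, just decrement. */
	while (old > 1) {
		if (atomic_compare_exchange_weak(cnt, &old, old - 1))
			return 0;
	}
	/* Slow path: take the lock, then do the final decrement under it. */
	pthread_mutex_lock(lock);
	if (atomic_fetch_sub(cnt, 1) == 1)
		return 1;		/* caller unlocks after cleanup */
	pthread_mutex_unlock(lock);
	return 0;
}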

arch/alpha/lib/divide.S

@@ -0,0 +1,195 @@
/*
* arch/alpha/lib/divide.S
*
* (C) 1995 Linus Torvalds
*
* Alpha division..
*/
/*
* The alpha chip doesn't provide hardware division, so we have to do it
* by hand. The compiler expects the functions
*
* __divqu: 64-bit unsigned long divide
* __remqu: 64-bit unsigned long remainder
* __divqs/__remqs: signed 64-bit
* __divlu/__remlu: unsigned 32-bit
* __divls/__remls: signed 32-bit
*
* These are not normal C functions: instead of the normal
* calling sequence, these expect their arguments in registers
* $24 and $25, and return the result in $27. Register $28 may
* be clobbered (assembly temporary), anything else must be saved.
*
* In short: painful.
*
* This is a rather simple bit-at-a-time algorithm: it's very good
* at dividing random 64-bit numbers, but the more usual case where
* the divisor is small is handled better by the DEC algorithm
* using lookup tables. This uses much less memory, though, and is
* nicer on the cache.. Besides, I don't know the copyright status
* of the DEC code.
*/
/*
* My temporaries:
* $0 - current bit
* $1 - shifted divisor
* $2 - modulus/quotient
*
* $23 - return address
* $24 - dividend
* $25 - divisor
*
* $27 - quotient/modulus
* $28 - compare status
*/
#define halt .long 0
/*
* Select function type and registers
*/
#define mask $0
#define divisor $1
#define compare $28
#define tmp1 $3
#define tmp2 $4
#ifdef DIV
#define DIV_ONLY(x,y...) x,##y
#define MOD_ONLY(x,y...)
#define func(x) __div##x
#define modulus $2
#define quotient $27
#define GETSIGN(x) xor $24,$25,x
#define STACK 48
#else
#define DIV_ONLY(x,y...)
#define MOD_ONLY(x,y...) x,##y
#define func(x) __rem##x
#define modulus $27
#define quotient $2
#define GETSIGN(x) bis $24,$24,x
#define STACK 32
#endif
/*
* For 32-bit operations, we need to extend to 64-bit
*/
#ifdef INTSIZE
#define ufunction func(lu)
#define sfunction func(l)
#define LONGIFY(x) zapnot x,15,x
#define SLONGIFY(x) addl x,0,x
#else
#define ufunction func(qu)
#define sfunction func(q)
#define LONGIFY(x)
#define SLONGIFY(x)
#endif
.set noat
.align 3
.globl ufunction
.ent ufunction
ufunction:
subq $30,STACK,$30
.frame $30,STACK,$23
.prologue 0
7: stq $1, 0($30)
bis $25,$25,divisor
stq $2, 8($30)
bis $24,$24,modulus
stq $0,16($30)
bis $31,$31,quotient
LONGIFY(divisor)
stq tmp1,24($30)
LONGIFY(modulus)
bis $31,1,mask
DIV_ONLY(stq tmp2,32($30))
beq divisor, 9f /* div by zero */
#ifdef INTSIZE
/*
* shift divisor left, using 3-bit shifts for
* 32-bit divides as we can't overflow. Three-bit
* shifts will result in looping three times less
* here, but can result in two loops more later.
* Thus using a large shift isn't worth it (and
* s8add pairs better than a sll..)
*/
1: cmpult divisor,modulus,compare
s8addq divisor,$31,divisor
s8addq mask,$31,mask
bne compare,1b
#else
1: cmpult divisor,modulus,compare
blt divisor, 2f
addq divisor,divisor,divisor
addq mask,mask,mask
bne compare,1b
unop
#endif
/* ok, start to go right again.. */
2: DIV_ONLY(addq quotient,mask,tmp2)
srl mask,1,mask
cmpule divisor,modulus,compare
subq modulus,divisor,tmp1
DIV_ONLY(cmovne compare,tmp2,quotient)
srl divisor,1,divisor
cmovne compare,tmp1,modulus
bne mask,2b
9: ldq $1, 0($30)
ldq $2, 8($30)
ldq $0,16($30)
ldq tmp1,24($30)
DIV_ONLY(ldq tmp2,32($30))
addq $30,STACK,$30
ret $31,($23),1
.end ufunction
/*
* Uhh.. Ugly signed division. I'd rather not have it at all, but
* it's needed in some circumstances. There are different ways to
* handle this, really. This does:
* -a / b = a / -b = -(a / b)
* -a % b = -(a % b)
* a % -b = a % b
* which is probably not the best solution, but at least should
* have the property that (x/y)*y + (x%y) = x.
*/
.align 3
.globl sfunction
.ent sfunction
sfunction:
subq $30,STACK,$30
.frame $30,STACK,$23
.prologue 0
bis $24,$25,$28
SLONGIFY($28)
bge $28,7b
stq $24,0($30)
subq $31,$24,$28
stq $25,8($30)
cmovlt $24,$28,$24 /* abs($24) */
stq $23,16($30)
subq $31,$25,$28
stq tmp1,24($30)
cmovlt $25,$28,$25 /* abs($25) */
unop
bsr $23,ufunction
ldq $24,0($30)
ldq $25,8($30)
GETSIGN($28)
subq $31,$27,tmp1
SLONGIFY($28)
ldq $23,16($30)
cmovlt $28,tmp1,$27
ldq tmp1,24($30)
addq $30,STACK,$30
ret $31,($23),1
.end sfunction
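
The same shift/subtract idea in plain C, for reference (illustrative only; divqu_sketch is a made-up name and does not follow the register-based calling convention described above):

#include <stdint.h>

static uint64_t divqu_sketch(uint64_t dividend, uint64_t divisor, uint64_t *rem)
{
	uint64_t mask = 1, quotient = 0;

	if (divisor == 0) {		/* the assembly just bails out here */
		*rem = dividend;
		return 0;
	}
	/* Shift the divisor up until it no longer fits under the dividend. */
	while (divisor < dividend && !(divisor >> 63)) {
		divisor <<= 1;
		mask <<= 1;
	}
	/* Walk back down, subtracting wherever the divisor still fits. */
	while (mask) {
		if (divisor <= dividend) {
			dividend -= divisor;
			quotient += mask;
		}
		divisor >>= 1;
		mask >>= 1;
	}
	*rem = dividend;
	return quotient;
}

The signed entry points reduce to this by taking absolute values first and fixing the sign of the result afterwards, which is what gives the (x/y)*y + (x%y) = x property noted above.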


@@ -0,0 +1,54 @@
/*
* arch/alpha/lib/ev6-clear_page.S
*
* Zero an entire page.
*/
.text
.align 4
.global clear_page
.ent clear_page
clear_page:
.prologue 0
lda $0,128
lda $1,125
addq $16,64,$2
addq $16,128,$3
addq $16,192,$17
wh64 ($16)
wh64 ($2)
wh64 ($3)
1: wh64 ($17)
stq $31,0($16)
subq $0,1,$0
subq $1,1,$1
stq $31,8($16)
stq $31,16($16)
addq $17,64,$2
nop
stq $31,24($16)
stq $31,32($16)
cmovgt $1,$2,$17
nop
stq $31,40($16)
stq $31,48($16)
nop
nop
stq $31,56($16)
addq $16,64,$16
nop
bne $0,1b
ret
nop
nop
nop
.end clear_page


@@ -0,0 +1,225 @@
/*
* arch/alpha/lib/ev6-clear_user.S
* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
*
* Zero user space, handling exceptions as we go.
*
* We have to make sure that $0 is always up-to-date and contains the
* right "bytes left to zero" value (and that it is updated only _after_
* a successful copy). There is also some rather minor exception setup
* stuff.
*
* NOTE! This is not directly C-callable, because the calling semantics
* are different:
*
* Inputs:
* length in $0
* destination address in $6
* exception pointer in $7
* return address in $28 (exceptions expect it there)
*
* Outputs:
* bytes left to copy in $0
*
* Clobbers:
* $1,$2,$3,$4,$5,$6
*
* Much of the information about 21264 scheduling/coding comes from:
* Compiler Writer's Guide for the Alpha 21264
* abbreviated as 'CWG' in other comments here
* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
* Scheduling notation:
* E - either cluster
* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
* Try not to change the actual algorithm if possible for consistency.
* Determining actual stalls (other than slotting) doesn't appear to be easy to do.
* From perusing the source code context where this routine is called, it is
* a fair assumption that significant fractions of entire pages are zeroed, so
* it's going to be worth the effort to hand-unroll a big loop, and use wh64.
* ASSUMPTION:
* The believed purpose of only updating $0 after a store is that a signal
* may come along during the execution of this chunk of code, and we don't
* want to leave a hole (and we also want to avoid repeating lots of work)
*/
/* Allow an exception for an insn; exit if we get one. */
#define EX(x,y...) \
99: x,##y; \
.section __ex_table,"a"; \
.long 99b - .; \
lda $31, $exception-99b($31); \
.previous
.set noat
.set noreorder
.align 4
.globl __do_clear_user
.ent __do_clear_user
.frame $30, 0, $28
.prologue 0
# Pipeline info : Slotting & Comments
__do_clear_user:
and $6, 7, $4 # .. E .. .. : find dest head misalignment
beq $0, $zerolength # U .. .. .. : U L U L
addq $0, $4, $1 # .. .. .. E : bias counter
and $1, 7, $2 # .. .. E .. : number of misaligned bytes in tail
# Note - we never actually use $2, so this is a moot computation
# and we can rewrite this later...
srl $1, 3, $1 # .. E .. .. : number of quadwords to clear
beq $4, $headalign # U .. .. .. : U L U L
/*
* Head is not aligned. Write (8 - $4) bytes to head of destination
* This means $6 is known to be misaligned
*/
EX( ldq_u $5, 0($6) ) # .. .. .. L : load dst word to mask back in
beq $1, $onebyte # .. .. U .. : sub-word store?
mskql $5, $6, $5 # .. U .. .. : take care of misaligned head
addq $6, 8, $6 # E .. .. .. : L U U L
EX( stq_u $5, -8($6) ) # .. .. .. L :
subq $1, 1, $1 # .. .. E .. :
addq $0, $4, $0 # .. E .. .. : bytes left -= 8 - misalignment
subq $0, 8, $0 # E .. .. .. : U L U L
.align 4
/*
* (The .align directive ought to be a moot point)
* values upon initial entry to the loop
* $1 is number of quadwords to clear (zero is a valid value)
* $2 is number of trailing bytes (0..7) ($2 never used...)
* $6 is known to be aligned 0mod8
*/
$headalign:
subq $1, 16, $4 # .. .. .. E : If < 16, we can not use the huge loop
and $6, 0x3f, $2 # .. .. E .. : Forward work for huge loop
subq $2, 0x40, $3 # .. E .. .. : bias counter (huge loop)
blt $4, $trailquad # U .. .. .. : U L U L
/*
* We know that we're going to do at least 16 quads, which means we are
* going to be able to use the large block clear loop at least once.
* Figure out how many quads we need to clear before we are 0mod64 aligned
* so we can use the wh64 instruction.
*/
nop # .. .. .. E
nop # .. .. E ..
nop # .. E .. ..
beq $3, $bigalign # U .. .. .. : U L U L : Aligned 0mod64
$alignmod64:
EX( stq_u $31, 0($6) ) # .. .. .. L
addq $3, 8, $3 # .. .. E ..
subq $0, 8, $0 # .. E .. ..
nop # E .. .. .. : U L U L
nop # .. .. .. E
subq $1, 1, $1 # .. .. E ..
addq $6, 8, $6 # .. E .. ..
blt $3, $alignmod64 # U .. .. .. : U L U L
$bigalign:
/*
* $0 is the number of bytes left
* $1 is the number of quads left
* $6 is aligned 0mod64
* we know that we'll be taking a minimum of one trip through
* CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
* We are _not_ going to update $0 after every single store. That
* would be silly, because there will be cross-cluster dependencies
* no matter how the code is scheduled. By doing it in slightly
* staggered fashion, we can still do this loop in 5 fetches
* The worst case will be doing two extra quads in some future execution,
* in the event of an interrupted clear.
* Assumes the wh64 needs to be for 2 trips through the loop in the future
* The wh64 is issued for the starting destination address for trip +2
* through the loop, and if there are less than two trips left, the target
* address will be for the current trip.
*/
nop # E :
nop # E :
nop # E :
bis $6,$6,$3 # E : U L U L : Initial wh64 address is dest
/* This might actually help for the current trip... */
$do_wh64:
wh64 ($3) # .. .. .. L1 : memory subsystem hint
subq $1, 16, $4 # .. .. E .. : Forward calculation - repeat the loop?
EX( stq_u $31, 0($6) ) # .. L .. ..
subq $0, 8, $0 # E .. .. .. : U L U L
addq $6, 128, $3 # E : Target address of wh64
EX( stq_u $31, 8($6) ) # L :
EX( stq_u $31, 16($6) ) # L :
subq $0, 16, $0 # E : U L L U
nop # E :
EX( stq_u $31, 24($6) ) # L :
EX( stq_u $31, 32($6) ) # L :
subq $0, 168, $5 # E : U L L U : two trips through the loop left?
/* 168 = 192 - 24, since we've already completed some stores */
subq $0, 16, $0 # E :
EX( stq_u $31, 40($6) ) # L :
EX( stq_u $31, 48($6) ) # L :
cmovlt $5, $6, $3 # E : U L L U : Latency 2, extra mapping cycle
subq $1, 8, $1 # E :
subq $0, 16, $0 # E :
EX( stq_u $31, 56($6) ) # L :
nop # E : U L U L
nop # E :
subq $0, 8, $0 # E :
addq $6, 64, $6 # E :
bge $4, $do_wh64 # U : U L U L
$trailquad:
# zero to 16 quadwords left to store, plus any trailing bytes
# $1 is the number of quadwords left to go.
#
nop # .. .. .. E
nop # .. .. E ..
nop # .. E .. ..
beq $1, $trailbytes # U .. .. .. : U L U L : Only 0..7 bytes to go
$onequad:
EX( stq_u $31, 0($6) ) # .. .. .. L
subq $1, 1, $1 # .. .. E ..
subq $0, 8, $0 # .. E .. ..
nop # E .. .. .. : U L U L
nop # .. .. .. E
nop # .. .. E ..
addq $6, 8, $6 # .. E .. ..
bgt $1, $onequad # U .. .. .. : U L U L
# We have an unknown number of bytes left to go.
$trailbytes:
nop # .. .. .. E
nop # .. .. E ..
nop # .. E .. ..
beq $0, $zerolength # U .. .. .. : U L U L
# $0 contains the number of bytes left to copy (0..31)
# so we will use $0 as the loop counter
# We know for a fact that $0 > 0 due to previous context
$onebyte:
EX( stb $31, 0($6) ) # .. .. .. L
subq $0, 1, $0 # .. .. E .. :
addq $6, 1, $6 # .. E .. .. :
bgt $0, $onebyte # U .. .. .. : U L U L
$zerolength:
$exception: # Destination for exception recovery(?)
nop # .. .. .. E :
nop # .. .. E .. :
nop # .. E .. .. :
ret $31, ($28), 1 # L0 .. .. .. : L U L U
.end __do_clear_user


@@ -0,0 +1,203 @@
/*
* arch/alpha/lib/ev6-copy_page.S
*
* Copy an entire page.
*/
/* The following comparison of this routine vs the normal copy_page.S
was written by an unnamed ev6 hardware designer and forwarded to me
via Steven Hobbs <hobbs@steven.zko.dec.com>.
First Problem: STQ overflows.
-----------------------------
It would be nice if EV6 handled every resource overflow efficiently,
but for some it doesn't. Including store queue overflows. It causes
a trap and a restart of the pipe.
To get around this we sometimes use (to borrow a term from a VSSAD
researcher) "aeration". The idea is to slow the rate at which the
processor receives valid instructions by inserting nops in the fetch
path. In doing so, you can prevent the overflow and actually make
the code run faster. You can, of course, take advantage of the fact
that the processor can fetch at most 4 aligned instructions per cycle.
I inserted enough nops to force it to take 10 cycles to fetch the
loop code. In theory, EV6 should be able to execute this loop in
9 cycles but I was not able to get it to run that fast -- the initial
conditions were such that I could not reach this optimum rate on
(chaotic) EV6. I wrote the code such that everything would issue
in order.
Second Problem: Dcache index matches.
-------------------------------------
If you are going to use this routine on random aligned pages, there
is a 25% chance that the pages will be at the same dcache indices.
This results in many nasty memory traps without care.
The solution is to schedule the prefetches to avoid the memory
conflicts. I schedule the wh64 prefetches farther ahead of the
read prefetches to avoid this problem.
Third Problem: Needs more prefetching.
--------------------------------------
In order to improve the code I added deeper prefetching to take the
most advantage of EV6's bandwidth.
I also prefetched the read stream. Note that adding the read prefetch
forced me to add another cycle to the inner-most kernel - up to 11
from the original 8 cycles per iteration. We could improve performance
further by unrolling the loop and doing multiple prefetches per cycle.
I think that the code below will be very robust and fast code for the
purposes of copying aligned pages. It is slower when both source and
destination pages are in the dcache, but it is my guess that this is
less important than the dcache miss case. */
.text
.align 4
.global copy_page
.ent copy_page
copy_page:
.prologue 0
/* Prefetch 5 read cachelines; write-hint 10 cache lines. */
wh64 ($16)
ldl $31,0($17)
ldl $31,64($17)
lda $1,1*64($16)
wh64 ($1)
ldl $31,128($17)
ldl $31,192($17)
lda $1,2*64($16)
wh64 ($1)
ldl $31,256($17)
lda $18,118
lda $1,3*64($16)
wh64 ($1)
nop
lda $1,4*64($16)
lda $2,5*64($16)
wh64 ($1)
wh64 ($2)
lda $1,6*64($16)
lda $2,7*64($16)
wh64 ($1)
wh64 ($2)
lda $1,8*64($16)
lda $2,9*64($16)
wh64 ($1)
wh64 ($2)
lda $19,10*64($16)
nop
/* Main prefetching/write-hinting loop. */
1: ldq $0,0($17)
ldq $1,8($17)
unop
unop
unop
unop
ldq $2,16($17)
ldq $3,24($17)
ldq $4,32($17)
ldq $5,40($17)
unop
unop
unop
unop
ldq $6,48($17)
ldq $7,56($17)
ldl $31,320($17)
unop
unop
unop
/* This gives the extra cycle of aeration above the minimum. */
unop
unop
unop
unop
wh64 ($19)
unop
unop
unop
stq $0,0($16)
subq $18,1,$18
stq $1,8($16)
unop
unop
stq $2,16($16)
addq $17,64,$17
stq $3,24($16)
stq $4,32($16)
stq $5,40($16)
addq $19,64,$19
unop
stq $6,48($16)
stq $7,56($16)
addq $16,64,$16
bne $18, 1b
/* Prefetch the final 5 cache lines of the read stream. */
lda $18,10
ldl $31,320($17)
ldl $31,384($17)
ldl $31,448($17)
ldl $31,512($17)
ldl $31,576($17)
nop
nop
/* Non-prefetching, non-write-hinting cleanup loop for the
final 10 cache lines. */
2: ldq $0,0($17)
ldq $1,8($17)
ldq $2,16($17)
ldq $3,24($17)
ldq $4,32($17)
ldq $5,40($17)
ldq $6,48($17)
ldq $7,56($17)
stq $0,0($16)
subq $18,1,$18
stq $1,8($16)
addq $17,64,$17
stq $2,16($16)
stq $3,24($16)
stq $4,32($16)
stq $5,40($16)
stq $6,48($16)
stq $7,56($16)
addq $16,64,$16
bne $18, 2b
ret
nop
unop
nop
.end copy_page


@@ -0,0 +1,259 @@
/*
* arch/alpha/lib/ev6-copy_user.S
*
* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
*
* Copy to/from user space, handling exceptions as we go.. This
* isn't exactly pretty.
*
* This is essentially the same as "memcpy()", but with a few twists.
* Notably, we have to make sure that $0 is always up-to-date and
* contains the right "bytes left to copy" value (and that it is updated
* only _after_ a successful copy). There is also some rather minor
* exception setup stuff..
*
* NOTE! This is not directly C-callable, because the calling semantics are
* different:
*
* Inputs:
* length in $0
* destination address in $6
* source address in $7
* return address in $28
*
* Outputs:
* bytes left to copy in $0
*
* Clobbers:
* $1,$2,$3,$4,$5,$6,$7
*
* Much of the information about 21264 scheduling/coding comes from:
* Compiler Writer's Guide for the Alpha 21264
* abbreviated as 'CWG' in other comments here
* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
* Scheduling notation:
* E - either cluster
* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
*/
/* Allow an exception for an insn; exit if we get one. */
#define EXI(x,y...) \
99: x,##y; \
.section __ex_table,"a"; \
.long 99b - .; \
lda $31, $exitin-99b($31); \
.previous
#define EXO(x,y...) \
99: x,##y; \
.section __ex_table,"a"; \
.long 99b - .; \
lda $31, $exitout-99b($31); \
.previous
.set noat
.align 4
.globl __copy_user
.ent __copy_user
# Pipeline info: Slotting & Comments
__copy_user:
.prologue 0
subq $0, 32, $1 # .. E .. .. : Is this going to be a small copy?
beq $0, $zerolength # U .. .. .. : U L U L
and $6,7,$3 # .. .. .. E : is leading dest misalignment
ble $1, $onebyteloop # .. .. U .. : 1st branch : small amount of data
beq $3, $destaligned # .. U .. .. : 2nd (one cycle fetcher stall)
subq $3, 8, $3 # E .. .. .. : L U U L : trip counter
/*
* The fetcher stall also hides the 1 cycle cross-cluster stall for $3 (L --> U)
* This loop aligns the destination a byte at a time
* We know we have at least one trip through this loop
*/
$aligndest:
EXI( ldbu $1,0($7) ) # .. .. .. L : Keep loads separate from stores
addq $6,1,$6 # .. .. E .. : Section 3.8 in the CWG
addq $3,1,$3 # .. E .. .. :
nop # E .. .. .. : U L U L
/*
* the -1 is to compensate for the inc($6) done in a previous quadpack
* which allows us zero dependencies within either quadpack in the loop
*/
EXO( stb $1,-1($6) ) # .. .. .. L :
addq $7,1,$7 # .. .. E .. : Section 3.8 in the CWG
subq $0,1,$0 # .. E .. .. :
bne $3, $aligndest # U .. .. .. : U L U L
/*
* If we fell through into here, we have a minimum of 33 - 7 bytes
* If we arrived via branch, we have a minimum of 32 bytes
*/
$destaligned:
and $7,7,$1 # .. .. .. E : Check _current_ source alignment
bic $0,7,$4 # .. .. E .. : number bytes as a quadword loop
EXI( ldq_u $3,0($7) ) # .. L .. .. : Forward fetch for fallthrough code
beq $1,$quadaligned # U .. .. .. : U L U L
/*
* In the worst case, we've just executed an ldq_u here from 0($7)
* and we'll repeat it once if we take the branch
*/
/* Misaligned quadword loop - not unrolled. Leave it that way. */
$misquad:
EXI( ldq_u $2,8($7) ) # .. .. .. L :
subq $4,8,$4 # .. .. E .. :
extql $3,$7,$3 # .. U .. .. :
extqh $2,$7,$1 # U .. .. .. : U U L L
bis $3,$1,$1 # .. .. .. E :
EXO( stq $1,0($6) ) # .. .. L .. :
addq $7,8,$7 # .. E .. .. :
subq $0,8,$0 # E .. .. .. : U L L U
addq $6,8,$6 # .. .. .. E :
bis $2,$2,$3 # .. .. E .. :
nop # .. E .. .. :
bne $4,$misquad # U .. .. .. : U L U L
nop # .. .. .. E
nop # .. .. E ..
nop # .. E .. ..
beq $0,$zerolength # U .. .. .. : U L U L
/* We know we have at least one trip through the byte loop */
EXI ( ldbu $2,0($7) ) # .. .. .. L : No loads in the same quad
addq $6,1,$6 # .. .. E .. : as the store (Section 3.8 in CWG)
nop # .. E .. .. :
br $31, $dirtyentry # L0 .. .. .. : L U U L
/* Do the trailing byte loop load, then hop into the store part of the loop */
/*
* A minimum of (33 - 7) bytes to do a quad at a time.
* Based upon the usage context, it's worth the effort to unroll this loop
* $0 - number of bytes to be moved
* $4 - number of bytes to move as quadwords
* $6 is current destination address
* $7 is current source address
*/
$quadaligned:
subq $4, 32, $2 # .. .. .. E : do not unroll for small stuff
nop # .. .. E ..
nop # .. E .. ..
blt $2, $onequad # U .. .. .. : U L U L
/*
* There is a significant assumption here that the source and destination
* addresses differ by more than 32 bytes. In this particular case, a
* sparsity of registers further bounds this to be a minimum of 8 bytes.
* But if this isn't met, then the output result will be incorrect.
* Furthermore, due to a lack of available registers, we really can't
* unroll this to be an 8x loop (which would enable us to use the wh64
* instruction memory hint instruction).
*/
$unroll4:
EXI( ldq $1,0($7) ) # .. .. .. L
EXI( ldq $2,8($7) ) # .. .. L ..
subq $4,32,$4 # .. E .. ..
nop # E .. .. .. : U U L L
addq $7,16,$7 # .. .. .. E
EXO( stq $1,0($6) ) # .. .. L ..
EXO( stq $2,8($6) ) # .. L .. ..
subq $0,16,$0 # E .. .. .. : U L L U
addq $6,16,$6 # .. .. .. E
EXI( ldq $1,0($7) ) # .. .. L ..
EXI( ldq $2,8($7) ) # .. L .. ..
subq $4, 32, $3 # E .. .. .. : U U L L : is there enough for another trip?
EXO( stq $1,0($6) ) # .. .. .. L
EXO( stq $2,8($6) ) # .. .. L ..
subq $0,16,$0 # .. E .. ..
addq $7,16,$7 # E .. .. .. : U L L U
nop # .. .. .. E
nop # .. .. E ..
addq $6,16,$6 # .. E .. ..
bgt $3,$unroll4 # U .. .. .. : U L U L
nop
nop
nop
beq $4, $noquads
$onequad:
EXI( ldq $1,0($7) )
subq $4,8,$4
addq $7,8,$7
nop
EXO( stq $1,0($6) )
subq $0,8,$0
addq $6,8,$6
bne $4,$onequad
$noquads:
nop
nop
nop
beq $0,$zerolength
/*
* For small copies (or the tail of a larger copy), do a very simple byte loop.
* There's no point in doing a lot of complex alignment calculations to try to
* do quadword stuff for a small amount of data.
* $0 - remaining number of bytes left to copy
* $6 - current dest addr
* $7 - current source addr
*/
$onebyteloop:
EXI ( ldbu $2,0($7) ) # .. .. .. L : No loads in the same quad
addq $6,1,$6 # .. .. E .. : as the store (Section 3.8 in CWG)
nop # .. E .. .. :
nop # E .. .. .. : U L U L
$dirtyentry:
/*
* the -1 is to compensate for the inc($6) done in a previous quadpack
* which allows us zero dependencies within either quadpack in the loop
*/
EXO ( stb $2,-1($6) ) # .. .. .. L :
addq $7,1,$7 # .. .. E .. : quadpack as the load
subq $0,1,$0 # .. E .. .. : change count _after_ copy
bgt $0,$onebyteloop # U .. .. .. : U L U L
$zerolength:
$exitout: # Destination for exception recovery(?)
nop # .. .. .. E
nop # .. .. E ..
nop # .. E .. ..
ret $31,($28),1 # L0 .. .. .. : L U L U
$exitin:
/* A stupid byte-by-byte zeroing of the rest of the output
buffer. This cures security holes by never leaving
random kernel data around to be copied elsewhere. */
nop
nop
nop
mov $0,$1
$101:
EXO ( stb $31,0($6) ) # L
subq $1,1,$1 # E
addq $6,1,$6 # E
bgt $1,$101 # U
nop
nop
nop
ret $31,($28),1 # L0
.end __copy_user


@@ -0,0 +1,126 @@
/*
* arch/alpha/lib/ev6-csum_ipv6_magic.S
* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
*
* unsigned short csum_ipv6_magic(struct in6_addr *saddr,
* struct in6_addr *daddr,
* __u32 len,
* unsigned short proto,
* unsigned int csum);
*
* Much of the information about 21264 scheduling/coding comes from:
* Compiler Writer's Guide for the Alpha 21264
* abbreviated as 'CWG' in other comments here
* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
* Scheduling notation:
* E - either cluster
* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
* Try not to change the actual algorithm if possible for consistency.
* Determining actual stalls (other than slotting) doesn't appear to be easy to do.
*
* unsigned short csum_ipv6_magic(struct in6_addr *saddr,
* struct in6_addr *daddr,
* __u32 len,
* unsigned short proto,
* unsigned int csum);
*
* Swap <proto> (takes form 0xaabb)
* Then shift it left by 48, so result is:
* 0xbbaa0000 00000000
* Then turn it back into a sign extended 32-bit item
* 0xbbaa0000
*
* Swap <len> (an unsigned int) using Mike Burrows' 7-instruction sequence
* (we can't hide the 3-cycle latency of the unpkbw in the 6-instruction sequence)
* Assume input takes form 0xAABBCCDD
*
* Finally, original 'folding' approach is to split the long into 4 unsigned shorts
* add 4 ushorts, resulting in ushort/carry
* add carry bits + ushort --> ushort
* add carry bits + ushort --> ushort (in case the carry results in an overflow)
* Truncate to a ushort. (took 13 instructions)
* From doing some testing, using the approach in checksum.c:from64to16()
* results in the same outcome:
* split into 2 uints, add those, generating a ulong
* add the 3 low ushorts together, generating a uint
* a final add of the 2 lower ushorts
* truncating the result.
*/
.globl csum_ipv6_magic
.align 4
.ent csum_ipv6_magic
.frame $30,0,$26,0
csum_ipv6_magic:
.prologue 0
ldq $0,0($16) # L : Latency: 3
inslh $18,7,$4 # U : 0000000000AABBCC
ldq $1,8($16) # L : Latency: 3
sll $19,8,$7 # U : U L U L : 0x00000000 00aabb00
zapnot $20,15,$20 # U : zero extend incoming csum
ldq $2,0($17) # L : Latency: 3
sll $19,24,$19 # U : U L L U : 0x000000aa bb000000
inswl $18,3,$18 # U : 000000CCDD000000
ldq $3,8($17) # L : Latency: 3
bis $18,$4,$18 # E : 000000CCDDAABBCC
addl $19,$7,$19 # E : <sign bits>bbaabb00
nop # E : U L U L
addq $20,$0,$20 # E : begin summing the words
srl $18,16,$4 # U : 0000000000CCDDAA
zap $19,0x3,$19 # U : <sign bits>bbaa0000
nop # E : L U U L
cmpult $20,$0,$0 # E :
addq $20,$1,$20 # E :
zapnot $18,0xa,$18 # U : 00000000DD00BB00
zap $4,0xa,$4 # U : U U L L : 0000000000CC00AA
or $18,$4,$18 # E : 00000000DDCCBBAA
nop # E :
cmpult $20,$1,$1 # E :
addq $20,$2,$20 # E : U L U L
cmpult $20,$2,$2 # E :
addq $20,$3,$20 # E :
cmpult $20,$3,$3 # E : (1 cycle stall on $20)
addq $20,$18,$20 # E : U L U L (1 cycle stall on $20)
cmpult $20,$18,$18 # E :
addq $20,$19,$20 # E : (1 cycle stall on $20)
addq $0,$1,$0 # E : merge the carries back into the csum
addq $2,$3,$2 # E :
cmpult $20,$19,$19 # E :
addq $18,$19,$18 # E : (1 cycle stall on $19)
addq $0,$2,$0 # E :
addq $20,$18,$20 # E : U L U L :
/* (1 cycle stall on $18, 2 cycles on $20) */
addq $0,$20,$0 # E :
zapnot $0,15,$1 # U : Start folding output (1 cycle stall on $0)
nop # E :
srl $0,32,$0 # U : U L U L : (1 cycle stall on $0)
addq $1,$0,$1 # E : Finished generating ulong
extwl $1,2,$2 # U : ushort[1] (1 cycle stall on $1)
zapnot $1,3,$0 # U : ushort[0] (1 cycle stall on $1)
extwl $1,4,$1 # U : ushort[2] (1 cycle stall on $1)
addq $0,$2,$0 # E
addq $0,$1,$3 # E : Finished generating uint
/* (1 cycle stall on $0) */
extwl $3,2,$1 # U : ushort[1] (1 cycle stall on $3)
nop # E : L U L U
addq $1,$3,$0 # E : Final carry
not $0,$4 # E : complement (1 cycle stall on $0)
zapnot $4,3,$0 # U : clear upper garbage bits
/* (1 cycle stall on $4) */
ret # L0 : L U L U
.end csum_ipv6_magic

arch/alpha/lib/ev6-divide.S

@@ -0,0 +1,259 @@
/*
* arch/alpha/lib/ev6-divide.S
*
* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
*
* Alpha division..
*/
/*
* The alpha chip doesn't provide hardware division, so we have to do it
* by hand. The compiler expects the functions
*
* __divqu: 64-bit unsigned long divide
* __remqu: 64-bit unsigned long remainder
* __divqs/__remqs: signed 64-bit
* __divlu/__remlu: unsigned 32-bit
* __divls/__remls: signed 32-bit
*
* These are not normal C functions: instead of the normal
* calling sequence, these expect their arguments in registers
* $24 and $25, and return the result in $27. Register $28 may
* be clobbered (assembly temporary), anything else must be saved.
*
* In short: painful.
*
* This is a rather simple bit-at-a-time algorithm: it's very good
* at dividing random 64-bit numbers, but the more usual case where
* the divisor is small is handled better by the DEC algorithm
* using lookup tables. This uses much less memory, though, and is
* nicer on the cache.. Besides, I don't know the copyright status
* of the DEC code.
*/
/*
* My temporaries:
* $0 - current bit
* $1 - shifted divisor
* $2 - modulus/quotient
*
* $23 - return address
* $24 - dividend
* $25 - divisor
*
* $27 - quotient/modulus
* $28 - compare status
*
* Much of the information about 21264 scheduling/coding comes from:
* Compiler Writer's Guide for the Alpha 21264
* abbreviated as 'CWG' in other comments here
* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
* Scheduling notation:
* E - either cluster
* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
* Try not to change the actual algorithm if possible for consistency.
*/
#define halt .long 0
/*
* Select function type and registers
*/
#define mask $0
#define divisor $1
#define compare $28
#define tmp1 $3
#define tmp2 $4
#ifdef DIV
#define DIV_ONLY(x,y...) x,##y
#define MOD_ONLY(x,y...)
#define func(x) __div##x
#define modulus $2
#define quotient $27
#define GETSIGN(x) xor $24,$25,x
#define STACK 48
#else
#define DIV_ONLY(x,y...)
#define MOD_ONLY(x,y...) x,##y
#define func(x) __rem##x
#define modulus $27
#define quotient $2
#define GETSIGN(x) bis $24,$24,x
#define STACK 32
#endif
/*
* For 32-bit operations, we need to extend to 64-bit
*/
#ifdef INTSIZE
#define ufunction func(lu)
#define sfunction func(l)
#define LONGIFY(x) zapnot x,15,x
#define SLONGIFY(x) addl x,0,x
#else
#define ufunction func(qu)
#define sfunction func(q)
#define LONGIFY(x)
#define SLONGIFY(x)
#endif
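/*
 * In C terms, LONGIFY zero-extends the low 32 bits (zapnot x,15,x) and
 * SLONGIFY sign-extends them (addl x,0,x).  A small sketch with invented
 * names, purely for illustration:
 */
#include <stdint.h>

static uint64_t longify(uint64_t x)             /* zapnot x,15,x */
{
        return x & 0xffffffffULL;
}

static int64_t slongify(uint64_t x)             /* addl x,0,x */
{
        return (int64_t)(int32_t)(uint32_t)x;   /* sign-extend bits 0..31 */
}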
.set noat
.align 4
.globl ufunction
.ent ufunction
ufunction:
subq $30,STACK,$30 # E :
.frame $30,STACK,$23
.prologue 0
7: stq $1, 0($30) # L :
bis $25,$25,divisor # E :
stq $2, 8($30) # L : L U L U
bis $24,$24,modulus # E :
stq $0,16($30) # L :
bis $31,$31,quotient # E :
LONGIFY(divisor) # E : U L L U
stq tmp1,24($30) # L :
LONGIFY(modulus) # E :
bis $31,1,mask # E :
DIV_ONLY(stq tmp2,32($30)) # L : L U U L
beq divisor, 9f /* div by zero */
/*
* In spite of the DIV_ONLY being either a non-instruction
* or an actual stq, the addition of the .align directive
* below ensures that label 1 is going to be nicely aligned
*/
.align 4
#ifdef INTSIZE
/*
* shift the divisor left, using 3-bit shifts for
* 32-bit divides, since we can't overflow. Three-bit
* shifts mean roughly a third as many iterations
* here, but can cost up to two extra iterations later.

* Thus using a large shift isn't worth it (and
* s8add pairs better than a sll..)
*/
1: cmpult divisor,modulus,compare # E :
s8addq divisor,$31,divisor # E :
s8addq mask,$31,mask # E :
bne compare,1b # U : U L U L
#else
1: cmpult divisor,modulus,compare # E :
nop # E :
nop # E :
blt divisor, 2f # U : U L U L
addq divisor,divisor,divisor # E :
addq mask,mask,mask # E :
unop # E :
bne compare,1b # U : U L U L
#endif
/* ok, start to go right again.. */
2:
/*
* Keep things nicely bundled... use a nop instead of not
* having an instruction for DIV_ONLY
*/
#ifdef DIV
DIV_ONLY(addq quotient,mask,tmp2) # E :
#else
nop # E :
#endif
srl mask,1,mask # U :
cmpule divisor,modulus,compare # E :
subq modulus,divisor,tmp1 # E :
#ifdef DIV
DIV_ONLY(cmovne compare,tmp2,quotient) # E : Latency 2, extra map slot
nop # E : as part of the cmovne
srl divisor,1,divisor # U :
nop # E : L U L U
nop # E :
cmovne compare,tmp1,modulus # E : Latency 2, extra map slot
nop # E : as part of the cmovne
bne mask,2b # U : U L U L
#else
srl divisor,1,divisor # U :
cmovne compare,tmp1,modulus # E : Latency 2, extra map slot
nop # E : as part of the cmovne
bne mask,2b # U : U L L U
#endif
9: ldq $1, 0($30) # L :
ldq $2, 8($30) # L :
nop # E :
nop # E : U U L L
ldq $0,16($30) # L :
ldq tmp1,24($30) # L :
nop # E :
nop # E :
#ifdef DIV
DIV_ONLY(ldq tmp2,32($30)) # L :
#else
nop # E :
#endif
addq $30,STACK,$30 # E :
ret $31,($23),1 # L0 : L U U L
.end ufunction
/*
* Uhh.. Ugly signed division. I'd rather not have it at all, but
* it's needed in some circumstances. There are different ways to
* handle this, really. This does:
* -a / b = a / -b = -(a / b)
* -a % b = -(a % b)
* a % -b = a % b
* which is probably not the best solution, but at least should
* have the property that (x/y)*y + (x%y) = x.
*/
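/*
 * The same sign rules, spelled out in C so the promised property can be
 * checked directly.  sdiv_sketch/smod_sketch are invented names built on
 * an unsigned core, and the INT64_MIN corner case is ignored here.
 */
#include <assert.h>
#include <stdint.h>

static int64_t sdiv_sketch(int64_t a, int64_t b)
{
        uint64_t q = (uint64_t)(a < 0 ? -a : a) / (uint64_t)(b < 0 ? -b : b);
        return ((a < 0) != (b < 0)) ? -(int64_t)q : (int64_t)q;
}

static int64_t smod_sketch(int64_t a, int64_t b)
{
        uint64_t r = (uint64_t)(a < 0 ? -a : a) % (uint64_t)(b < 0 ? -b : b);
        return (a < 0) ? -(int64_t)r : (int64_t)r;  /* -a%b = -(a%b), a%-b = a%b */
}

int main(void)
{
        /* The property claimed above: (x/y)*y + (x%y) == x. */
        assert(sdiv_sketch(-7, 3) * 3 + smod_sketch(-7, 3) == -7);
        assert(sdiv_sketch(7, -3) * -3 + smod_sketch(7, -3) == 7);
        return 0;
}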
.align 4
.globl sfunction
.ent sfunction
sfunction:
subq $30,STACK,$30 # E :
.frame $30,STACK,$23
.prologue 0
bis $24,$25,$28 # E :
SLONGIFY($28) # E :
bge $28,7b # U :
stq $24,0($30) # L :
subq $31,$24,$28 # E :
stq $25,8($30) # L :
nop # E : U L U L
cmovlt $24,$28,$24 /* abs($24) */ # E : Latency 2, extra map slot
nop # E : as part of the cmov
stq $23,16($30) # L :
subq $31,$25,$28 # E : U L U L
stq tmp1,24($30) # L :
cmovlt $25,$28,$25 /* abs($25) */ # E : Latency 2, extra map slot
nop # E :
bsr $23,ufunction # L0: L U L U
ldq $24,0($30) # L :
ldq $25,8($30) # L :
GETSIGN($28) # E :
subq $31,$27,tmp1 # E : U U L L
SLONGIFY($28) # E :
ldq $23,16($30) # L :
cmovlt $28,tmp1,$27 # E : Latency 2, extra map slot
nop # E : U L L U : as part of the cmov
ldq tmp1,24($30) # L :
nop # E : as part of the cmov
addq $30,STACK,$30 # E :
ret $31,($23),1 # L0 : L U U L
.end sfunction

191
arch/alpha/lib/ev6-memchr.S Normal file
View File

@@ -0,0 +1,191 @@
/*
* arch/alpha/lib/ev6-memchr.S
*
* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
*
* Finds characters in a memory area. Optimized for the Alpha:
*
* - memory accessed as aligned quadwords only
* - uses cmpbge to compare 8 bytes in parallel
* - does binary search to find 0 byte in last
* quadword (HAKMEM needed 12 instructions to
* do this instead of the 9 instructions that
* binary search needs).
*
* For correctness consider that:
*
* - only minimum number of quadwords may be accessed
* - the third argument is an unsigned long
*
* Much of the information about 21264 scheduling/coding comes from:
* Compiler Writer's Guide for the Alpha 21264
* abbreviated as 'CWG' in other comments here
* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
* Scheduling notation:
* E - either cluster
* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
* Try not to change the actual algorithm if possible for consistency.
*/
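/*
 * The core trick: replicate the target byte into every lane of a
 * quadword, XOR it with the data, and let cmpbge against zero report
 * all eight matches at once.  A C model of that inner test follows;
 * cmpbge_zero, spread_byte and find_in_quads are invented names, and
 * the head/tail masking done by the real code for unaligned starts and
 * partial last quadwords is omitted.
 */
#include <stddef.h>
#include <stdint.h>

/* Model of "cmpbge $31, x": bit i set iff byte i of x is zero. */
static unsigned cmpbge_zero(uint64_t x)
{
        unsigned mask = 0;

        for (int i = 0; i < 8; i++)
                if (((x >> (8 * i)) & 0xff) == 0)
                        mask |= 1u << i;
        return mask;
}

/* Replicate c into all eight byte lanes, as the sll/or sequence does. */
static uint64_t spread_byte(unsigned char c)
{
        uint64_t v = c;

        v |= v << 8;
        v |= v << 16;
        v |= v << 32;
        return v;
}

/* One aligned quadword per iteration; lowest set bit = first match. */
static const unsigned char *find_in_quads(const uint64_t *p, size_t nquads,
                                          unsigned char c)
{
        uint64_t pattern = spread_byte(c);

        for (size_t i = 0; i < nquads; i++) {
                unsigned hits = cmpbge_zero(p[i] ^ pattern);

                if (hits)
                        return (const unsigned char *)&p[i] +
                               __builtin_ctz(hits);
        }
        return NULL;
}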
.set noreorder
.set noat
.align 4
.globl memchr
.ent memchr
memchr:
.frame $30,0,$26,0
.prologue 0
# Hack -- if someone passes in (size_t)-1, hoping to just
# search til the end of the address space, we will overflow
# below when we find the address of the last byte. Given
# that we will never have a 56-bit address space, cropping
# the length is the easiest way to avoid trouble.
zap $18, 0x80, $5 # U : Bound length
beq $18, $not_found # U :
ldq_u $1, 0($16) # L : load first quadword Latency=3
and $17, 0xff, $17 # E : L L U U : 00000000000000ch
insbl $17, 1, $2 # U : 000000000000ch00
cmpult $18, 9, $4 # E : small (< 1 quad) string?
or $2, $17, $17 # E : 000000000000chch
lda $3, -1($31) # E : U L L U
sll $17, 16, $2 # U : 00000000chch0000
addq $16, $5, $5 # E : Max search address
or $2, $17, $17 # E : 00000000chchchch
sll $17, 32, $2 # U : U L L U : chchchch00000000
or $2, $17, $17 # E : chchchchchchchch
extql $1, $16, $7 # U : $7 is upper bits
beq $4, $first_quad # U :
ldq_u $6, -1($5) # L : L U U L : eight or less bytes to search Latency=3
extqh $6, $16, $6 # U : 2 cycle stall for $6
mov $16, $0 # E :
nop # E :
or $7, $6, $1 # E : L U L U $1 = quadword starting at $16
# Deal with the case where at most 8 bytes remain to be searched
# in $1. E.g.:
# $18 = 6
# $1 = ????c6c5c4c3c2c1
$last_quad:
negq $18, $6 # E :
xor $17, $1, $1 # E :
srl $3, $6, $6 # U : $6 = mask of $18 bits set
cmpbge $31, $1, $2 # E : L U L U
nop
nop
and $2, $6, $2 # E :
beq $2, $not_found # U : U L U L
$found_it:
#if defined(__alpha_fix__) && defined(__alpha_cix__)
/*
* Since we are guaranteed to have set one of the bits, we don't
* have to worry about coming back with a 0x40 out of cttz...
*/
cttz $2, $3 # U0 :
addq $0, $3, $0 # E : All done
nop # E :
ret # L0 : L U L U
#else
/*
* Slow and clunky. It can probably be improved.
* An exercise left for others.
*/
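/*
 * What the cmov sequence below computes, in C: the index of the lowest
 * set bit of the (nonzero) cmpbge mask, found by isolating the bit and
 * binary-searching it with the 0x0f/0x33/0x55 masks.  Invented function
 * name, for illustration only.
 */
static unsigned lowest_bit_index(unsigned mask)
{
        unsigned bit = mask & -mask;    /* isolate the lowest set bit */
        unsigned idx = 0;

        if (!(bit & 0x0f))
                idx += 4;               /* bit is in the high nibble */
        if (!(bit & 0x33))
                idx += 2;               /* bit is in lanes 2,3,6,7 */
        if (!(bit & 0x55))
                idx += 1;               /* bit is in an odd lane */
        return idx;
}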
negq $2, $3 # E :
and $2, $3, $2 # E :
and $2, 0x0f, $1 # E :
addq $0, 4, $3 # E :
cmoveq $1, $3, $0 # E : Latency 2, extra map cycle
nop # E : keep with cmov
and $2, 0x33, $1 # E :
addq $0, 2, $3 # E : U L U L : 2 cycle stall on $0
cmoveq $1, $3, $0 # E : Latency 2, extra map cycle
nop # E : keep with cmov
and $2, 0x55, $1 # E :
addq $0, 1, $3 # E : U L U L : 2 cycle stall on $0
cmoveq $1, $3, $0 # E : Latency 2, extra map cycle
nop
nop
ret # L0 : L U L U
#endif
# Deal with the case where $18 > 8 bytes remain to be
# searched. $16 may not be aligned.
.align 4
$first_quad:
andnot $16, 0x7, $0 # E :
insqh $3, $16, $2 # U : $2 = 0000ffffffffffff ($16<0:2> ff)
xor $1, $17, $1 # E :
or $1, $2, $1 # E : U L U L $1 = ====ffffffffffff
cmpbge $31, $1, $2 # E :
bne $2, $found_it # U :
# At least one byte left to process.
ldq $1, 8($0) # L :
subq $5, 1, $18 # E : U L U L
addq $0, 8, $0 # E :
# Make $18 point to last quad to be accessed (the
# last quad may or may not be partial).
andnot $18, 0x7, $18 # E :
cmpult $0, $18, $2 # E :
beq $2, $final # U : U L U L
# At least two quads remain to be accessed.
subq $18, $0, $4 # E : $4 <- nr quads to be processed
and $4, 8, $4 # E : odd number of quads?
bne $4, $odd_quad_count # U :
# At least three quads remain to be accessed
mov $1, $4 # E : L U L U : move prefetched value to correct reg
.align 4
$unrolled_loop:
ldq $1, 8($0) # L : prefetch $1
xor $17, $4, $2 # E :
cmpbge $31, $2, $2 # E :
bne $2, $found_it # U : U L U L
addq $0, 8, $0 # E :
nop # E :
nop # E :
nop # E :
$odd_quad_count:
xor $17, $1, $2 # E :
ldq $4, 8($0) # L : prefetch $4
cmpbge $31, $2, $2 # E :
addq $0, 8, $6 # E :
bne $2, $found_it # U :
cmpult $6, $18, $6 # E :
addq $0, 8, $0 # E :
nop # E :
bne $6, $unrolled_loop # U :
mov $4, $1 # E : move prefetched value into $1
nop # E :
nop # E :
$final: subq $5, $0, $18 # E : $18 <- number of bytes left to do
nop # E :
nop # E :
bne $18, $last_quad # U :
$not_found:
mov $31, $0 # E :
nop # E :
nop # E :
ret # L0 :
.end memchr

248
arch/alpha/lib/ev6-memcpy.S Normal file
View File

@@ -0,0 +1,248 @@
/*
* arch/alpha/lib/ev6-memcpy.S
* 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
*
* Reasonably optimized memcpy() routine for the Alpha 21264
*
* - memory accessed as aligned quadwords only
* - uses wh64 write hints and a 64-byte unrolled loop for large copies
*
* Much of the information about 21264 scheduling/coding comes from:
* Compiler Writer's Guide for the Alpha 21264
* abbreviated as 'CWG' in other comments here
* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
* Scheduling notation:
* E - either cluster
* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
*
* Temp usage notes:
* $1,$2, - scratch
*/
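/*
 * The overall shape of the routine, as a C sketch of the co-aligned
 * path: byte-copy until the destination is 0mod8, move whole quadwords,
 * then finish with bytes.  memcpy_sketch is a made-up name; the real
 * code adds a 64-byte unrolled loop with wh64 hints for large copies
 * and a separate merging path when src and dst disagree mod 8 (here
 * that case just falls back to byte copies).
 */
#include <stddef.h>
#include <stdint.h>

static void *memcpy_sketch(void *dst, const void *src, size_t n)
{
        unsigned char *d = dst;
        const unsigned char *s = src;

        if (((uintptr_t)d ^ (uintptr_t)s) & 7)
                goto bytewise;                  /* alignments differ mod 8 */

        while (n && ((uintptr_t)d & 7)) {       /* head: reach 0mod8 */
                *d++ = *s++;
                n--;
        }
        while (n >= 8) {                        /* body: whole quadwords */
                *(uint64_t *)d = *(const uint64_t *)s;
                d += 8;
                s += 8;
                n -= 8;
        }
bytewise:
        while (n--)                             /* tail (or misaligned case) */
                *d++ = *s++;
        return dst;
}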
.set noreorder
.set noat
.align 4
.globl memcpy
.ent memcpy
memcpy:
.frame $30,0,$26,0
.prologue 0
mov $16, $0 # E : copy dest to return
ble $18, $nomoredata # U : done with the copy?
xor $16, $17, $1 # E : are source and dest alignments the same?
and $1, 7, $1 # E : are they the same mod 8?
bne $1, $misaligned # U : Nope - gotta do this the slow way
/* source and dest are same mod 8 address */
and $16, 7, $1 # E : Are both 0mod8?
beq $1, $both_0mod8 # U : Yes
nop # E :
/*
* source and dest are same misalignment. move a byte at a time
* until a 0mod8 alignment for both is reached.
* At least one byte more to move
*/
$head_align:
ldbu $1, 0($17) # L : grab a byte
subq $18, 1, $18 # E : count--
addq $17, 1, $17 # E : src++
stb $1, 0($16) # L :
addq $16, 1, $16 # E : dest++
and $16, 7, $1 # E : Are we at 0mod8 yet?
ble $18, $nomoredata # U : done with the copy?
bne $1, $head_align # U :
$both_0mod8:
cmple $18, 127, $1 # E : Can we unroll the loop?
bne $1, $no_unroll # U :
and $16, 63, $1 # E : get mod64 alignment
beq $1, $do_unroll # U : no single quads to fiddle
$single_head_quad:
ldq $1, 0($17) # L : get 8 bytes
subq $18, 8, $18 # E : count -= 8
addq $17, 8, $17 # E : src += 8
nop # E :
stq $1, 0($16) # L : store
addq $16, 8, $16 # E : dest += 8
and $16, 63, $1 # E : get mod64 alignment
bne $1, $single_head_quad # U : still not fully aligned
$do_unroll:
addq $16, 64, $7 # E : Initial (+1 trip) wh64 address
cmple $18, 127, $1 # E : Can we go through the unrolled loop?
bne $1, $tail_quads # U : Nope
nop # E :
$unroll_body:
wh64 ($7) # L1 : memory subsystem hint: 64 bytes at
# ($7) are about to be over-written
ldq $6, 0($17) # L0 : bytes 0..7
nop # E :
nop # E :
ldq $4, 8($17) # L : bytes 8..15
ldq $5, 16($17) # L : bytes 16..23
addq $7, 64, $7 # E : Update next wh64 address
nop # E :
ldq $3, 24($17) # L : bytes 24..31
addq $16, 64, $1 # E : fallback value for wh64
nop # E :
nop # E :
addq $17, 32, $17 # E : src += 32 bytes
stq $6, 0($16) # L : bytes 0..7
nop # E :
nop # E :
stq $4, 8($16) # L : bytes 8..15
stq $5, 16($16) # L : bytes 16..23
subq $18, 192, $2 # E : At least two more trips to go?
nop # E :
stq $3, 24($16) # L : bytes 24..31
addq $16, 32, $16 # E : dest += 32 bytes
nop # E :
nop # E :
ldq $6, 0($17) # L : bytes 0..7
ldq $4, 8($17) # L : bytes 8..15
cmovlt $2, $1, $7 # E : Latency 2, extra map slot - Use
# fallback wh64 address if < 2 more trips
nop # E :
ldq $5, 16($17) # L : bytes 16..23
ldq $3, 24($17) # L : bytes 24..31
addq $16, 32, $16 # E : dest += 32
subq $18, 64, $18 # E : count -= 64
addq $17, 32, $17 # E : src += 32
stq $6, -32($16) # L : bytes 0..7
stq $4, -24($16) # L : bytes 8..15
cmple $18, 63, $1 # E : At least one more trip?
stq $5, -16($16) # L : bytes 16..23
stq $3, -8($16) # L : bytes 24..31
nop # E :
beq $1, $unroll_body
$tail_quads:
$no_unroll:
.align 4
subq $18, 8, $18 # E : At least a quad left?
blt $18, $less_than_8 # U : Nope
nop # E :
nop # E :
$move_a_quad:
ldq $1, 0($17) # L : fetch 8
subq $18, 8, $18 # E : count -= 8
addq $17, 8, $17 # E : src += 8
nop # E :
stq $1, 0($16) # L : store 8
addq $16, 8, $16 # E : dest += 8
bge $18, $move_a_quad # U :
nop # E :
$less_than_8:
.align 4
addq $18, 8, $18 # E : add back for trailing bytes
ble $18, $nomoredata # U : All-done
nop # E :
nop # E :
/* Trailing bytes */
$tail_bytes:
subq $18, 1, $18 # E : count--
ldbu $1, 0($17) # L : fetch a byte
addq $17, 1, $17 # E : src++
nop # E :
stb $1, 0($16) # L : store a byte
addq $16, 1, $16 # E : dest++
bgt $18, $tail_bytes # U : more to be done?
nop # E :
/* branching to exit takes 3 extra cycles, so replicate exit here */
ret $31, ($26), 1 # L0 :
nop # E :
nop # E :
nop # E :
$misaligned:
mov $0, $4 # E : dest temp
and $0, 7, $1 # E : dest alignment mod8
beq $1, $dest_0mod8 # U : life doesn't totally suck
nop
$aligndest:
ble $18, $nomoredata # U :
ldbu $1, 0($17) # L : fetch a byte
subq $18, 1, $18 # E : count--
addq $17, 1, $17 # E : src++
stb $1, 0($4) # L : store it
addq $4, 1, $4 # E : dest++
and $4, 7, $1 # E : dest 0mod8 yet?
bne $1, $aligndest # U : go until we are aligned.
/* Source has unknown alignment, but dest is known to be 0mod8 */
$dest_0mod8:
subq $18, 8, $18 # E : At least a quad left?
blt $18, $misalign_tail # U : Nope
ldq_u $3, 0($17) # L : seed (rotating load) of 8 bytes
nop # E :
$mis_quad:
ldq_u $16, 8($17) # L : Fetch next 8
extql $3, $17, $3 # U : masking
extqh $16, $17, $1 # U : masking
bis $3, $1, $1 # E : merged bytes to store
subq $18, 8, $18 # E : count -= 8
addq $17, 8, $17 # E : src += 8
stq $1, 0($4) # L : store 8 (aligned)
mov $16, $3 # E : "rotate" source data
addq $4, 8, $4 # E : dest += 8
bge $18, $mis_quad # U : More quads to move
nop
nop
$misalign_tail:
addq $18, 8, $18 # E : account for tail stuff
ble $18, $nomoredata # U :
nop
nop
$misalign_byte:
ldbu $1, 0($17) # L : fetch 1
subq $18, 1, $18 # E : count--
addq $17, 1, $17 # E : src++
nop # E :
stb $1, 0($4) # L : store
addq $4, 1, $4 # E : dest++
bgt $18, $misalign_byte # U : more to go?
nop
$nomoredata:
ret $31, ($26), 1 # L0 :
nop # E :
nop # E :
nop # E :
.end memcpy
/* For backwards module compatibility. */
__memcpy = memcpy
.globl __memcpy

597
arch/alpha/lib/ev6-memset.S Normal file
View File

@@ -0,0 +1,597 @@
/*
* arch/alpha/lib/ev6-memset.S
*
* This is an efficient (and relatively small) implementation of the C library
* "memset()" function for the 21264 implementation of Alpha.
*
* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
*
* Much of the information about 21264 scheduling/coding comes from:
* Compiler Writer's Guide for the Alpha 21264
* abbreviated as 'CWG' in other comments here
* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
* Scheduling notation:
* E - either cluster
* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
* The algorithm for the leading and trailing quadwords remains the same,
* however the loop has been unrolled to enable better memory throughput,
* and the code has been replicated for each of the entry points: __memset
* and __memsetw to permit better scheduling to eliminate the stalling
* encountered during the mask replication.
* A future enhancement might be to put in a byte store loop for really
* small (say < 32 bytes) memset()s. Whether or not that change would be
* a win in the kernel would depend upon the contextual usage.
* WARNING: Maintaining this is going to be more work than the above version,
* as fixes will need to be made in multiple places. The performance gain
* is worth it.
*/
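/*
 * The two building blocks, sketched in C: replicate the fill byte into
 * a quadword (what the insbl/inswl/or sequence does) and then store
 * aligned quadwords with byte stores at the edges.  Invented names; the
 * real code keeps the edge handling in masked quadword stores and adds
 * the unrolled wh64 loop described above.
 */
#include <stddef.h>
#include <stdint.h>

static uint64_t spread_fill(unsigned char c)
{
        uint64_t v = c;

        v |= v << 8;            /* 000000000000chch */
        v |= v << 16;           /* 00000000chchchch */
        v |= v << 32;           /* chchchchchchchch */
        return v;
}

static void *memset_sketch(void *dst, int c, size_t n)
{
        unsigned char *p = dst;
        uint64_t fill = spread_fill((unsigned char)c);

        while (n && ((uintptr_t)p & 7)) {       /* head bytes */
                *p++ = (unsigned char)c;
                n--;
        }
        while (n >= 8) {                        /* aligned quadwords */
                *(uint64_t *)p = fill;
                p += 8;
                n -= 8;
        }
        while (n--)                             /* tail bytes */
                *p++ = (unsigned char)c;
        return dst;
}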
.set noat
.set noreorder
.text
.globl __memset
.globl __memsetw
.globl __constant_c_memset
.globl memset
.ent __memset
.align 5
__memset:
.frame $30,0,$26,0
.prologue 0
/*
* Serious stalling happens. The only way to mitigate this is to
* undertake a major re-write to interleave the constant materialization
* with other parts of the fall-through code. This is important, even
* though it makes maintenance tougher.
* Do this later.
*/
and $17,255,$1 # E : 00000000000000ch
insbl $17,1,$2 # U : 000000000000ch00
bis $16,$16,$0 # E : return value
ble $18,end_b # U : zero length requested?
addq $18,$16,$6 # E : max address to write to
bis $1,$2,$17 # E : 000000000000chch
insbl $1,2,$3 # U : 0000000000ch0000
insbl $1,3,$4 # U : 00000000ch000000
or $3,$4,$3 # E : 00000000chch0000
inswl $17,4,$5 # U : 0000chch00000000
xor $16,$6,$1 # E : will complete write be within one quadword?
inswl $17,6,$2 # U : chch000000000000
or $17,$3,$17 # E : 00000000chchchch
or $2,$5,$2 # E : chchchch00000000
bic $1,7,$1 # E : fit within a single quadword?
and $16,7,$3 # E : Target addr misalignment
or $17,$2,$17 # E : chchchchchchchch
beq $1,within_quad_b # U :
nop # E :
beq $3,aligned_b # U : target is 0mod8
/*
* Target address is misaligned, and won't fit within a quadword
*/
ldq_u $4,0($16) # L : Fetch first partial
bis $16,$16,$5 # E : Save the address
insql $17,$16,$2 # U : Insert new bytes
subq $3,8,$3 # E : Invert (for addressing uses)
addq $18,$3,$18 # E : $18 is new count ($3 is negative)
mskql $4,$16,$4 # U : clear relevant parts of the quad
subq $16,$3,$16 # E : $16 is new aligned destination
bis $2,$4,$1 # E : Final bytes
nop
stq_u $1,0($5) # L : Store result
nop
nop
.align 4
aligned_b:
/*
* We are now guaranteed to be quad aligned, with at least
* one partial quad to write.
*/
sra $18,3,$3 # U : Number of remaining quads to write
and $18,7,$18 # E : Number of trailing bytes to write
bis $16,$16,$5 # E : Save dest address
beq $3,no_quad_b # U : tail stuff only
/*
* it's worth the effort to unroll this and use wh64 if possible
* Lifted a bunch of code from clear_user.S
* At this point, entry values are:
* $16 Current destination address
* $5 A copy of $16
* $6 The max quadword address to write to
* $18 Number trailer bytes
* $3 Number quads to write
*/
and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
subq $3, 16, $4 # E : Only try to unroll if > 128 bytes
subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
blt $4, loop_b # U :
/*
* We know we've got at least 16 quads, minimum of one trip
* through unrolled loop. Do a quad at a time to get us 0mod64
* aligned.
*/
nop # E :
nop # E :
nop # E :
beq $1, $bigalign_b # U :
$alignmod64_b:
stq $17, 0($5) # L :
subq $3, 1, $3 # E : For consistency later
addq $1, 8, $1 # E : Increment towards zero for alignment
addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
nop
nop
addq $5, 8, $5 # E : Inc address
blt $1, $alignmod64_b # U :
$bigalign_b:
/*
* $3 - number quads left to go
* $5 - target address (aligned 0mod64)
* $17 - mask of stuff to store
* Scratch registers available: $7, $2, $4, $1
* we know that we'll be taking a minimum of one trip through the loop
* CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
* Assumes the wh64 needs to be for 2 trips through the loop in the future
* The wh64 is issued for the starting destination address of trip +2
* through the loop; if there are fewer than two trips left, the target
* address will be for the current trip.
*/
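/*
 * The wh64 address selection in the loop below, in C: hint two 64-byte
 * blocks ahead while at least 24 quads (three blocks) remain, otherwise
 * fall back to the very next block so the hint never runs past the
 * buffer.  wh64_target is an invented name, for illustration only.
 */
#include <stdint.h>

static uint64_t wh64_target(uint64_t cur_block, long quads_left)
{
        uint64_t speculative = cur_block + 128;         /* trip + 2 */
        uint64_t fallback    = cur_block + 64;          /* trip + 1 */

        return (quads_left - 24 < 0) ? fallback : speculative;
}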
$do_wh64_b:
wh64 ($4) # L1 : memory subsystem write hint
subq $3, 24, $2 # E : For determining future wh64 addresses
stq $17, 0($5) # L :
nop # E :
addq $5, 128, $4 # E : speculative target of next wh64
stq $17, 8($5) # L :
stq $17, 16($5) # L :
addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
stq $17, 24($5) # L :
stq $17, 32($5) # L :
cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle
nop
stq $17, 40($5) # L :
stq $17, 48($5) # L :
subq $3, 16, $2 # E : Repeat the loop at least once more?
nop
stq $17, 56($5) # L :
addq $5, 64, $5 # E :
subq $3, 8, $3 # E :
bge $2, $do_wh64_b # U :
nop
nop
nop
beq $3, no_quad_b # U : Might have finished already
.align 4
/*
* Simple loop for trailing quadwords, or for small amounts
* of data (where we can't use an unrolled loop and wh64)
*/
loop_b:
stq $17,0($5) # L :
subq $3,1,$3 # E : Decrement number quads left
addq $5,8,$5 # E : Inc address
bne $3,loop_b # U : more?
no_quad_b:
/*
* Write 0..7 trailing bytes.
*/
nop # E :
beq $18,end_b # U : All done?
ldq $7,0($5) # L :
mskqh $7,$6,$2 # U : Mask final quad
insqh $17,$6,$4 # U : New bits
bis $2,$4,$1 # E : Put it all together
stq $1,0($5) # L : And back to memory
ret $31,($26),1 # L0 :
within_quad_b:
ldq_u $1,0($16) # L :
insql $17,$16,$2 # U : New bits
mskql $1,$16,$4 # U : Clear old
bis $2,$4,$2 # E : New result
mskql $2,$6,$4 # U :
mskqh $1,$6,$2 # U :
bis $2,$4,$1 # E :
stq_u $1,0($16) # L :
end_b:
nop
nop
nop
ret $31,($26),1 # L0 :
.end __memset
/*
* This is the original body of code, prior to replication and
* rescheduling. Leave it here, as there may be calls to this
* entry point.
*/
.align 4
.ent __constant_c_memset
__constant_c_memset:
.frame $30,0,$26,0
.prologue 0
addq $18,$16,$6 # E : max address to write to
bis $16,$16,$0 # E : return value
xor $16,$6,$1 # E : will complete write be within one quadword?
ble $18,end # U : zero length requested?
bic $1,7,$1 # E : fit within a single quadword
beq $1,within_one_quad # U :
and $16,7,$3 # E : Target addr misalignment
beq $3,aligned # U : target is 0mod8
/*
* Target address is misaligned, and won't fit within a quadword
*/
ldq_u $4,0($16) # L : Fetch first partial
bis $16,$16,$5 # E : Save the address
insql $17,$16,$2 # U : Insert new bytes
subq $3,8,$3 # E : Invert (for addressing uses)
addq $18,$3,$18 # E : $18 is new count ($3 is negative)
mskql $4,$16,$4 # U : clear relevant parts of the quad
subq $16,$3,$16 # E : $16 is new aligned destination
bis $2,$4,$1 # E : Final bytes
nop
stq_u $1,0($5) # L : Store result
nop
nop
.align 4
aligned:
/*
* We are now guaranteed to be quad aligned, with at least
* one partial quad to write.
*/
sra $18,3,$3 # U : Number of remaining quads to write
and $18,7,$18 # E : Number of trailing bytes to write
bis $16,$16,$5 # E : Save dest address
beq $3,no_quad # U : tail stuff only
/*
* it's worth the effort to unroll this and use wh64 if possible
* Lifted a bunch of code from clear_user.S
* At this point, entry values are:
* $16 Current destination address
* $5 A copy of $16
* $6 The max quadword address to write to
* $18 Number trailer bytes
* $3 Number quads to write
*/
and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
subq $3, 16, $4 # E : Only try to unroll if > 128 bytes
subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
blt $4, loop # U :
/*
* We know we've got at least 16 quads, minimum of one trip
* through unrolled loop. Do a quad at a time to get us 0mod64
* aligned.
*/
nop # E :
nop # E :
nop # E :
beq $1, $bigalign # U :
$alignmod64:
stq $17, 0($5) # L :
subq $3, 1, $3 # E : For consistency later
addq $1, 8, $1 # E : Increment towards zero for alignment
addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
nop
nop
addq $5, 8, $5 # E : Inc address
blt $1, $alignmod64 # U :
$bigalign:
/*
* $3 - number quads left to go
* $5 - target address (aligned 0mod64)
* $17 - mask of stuff to store
* Scratch registers available: $7, $2, $4, $1
* we know that we'll be taking a minimum of one trip through the loop
* CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
* Assumes the wh64 needs to be for 2 trips through the loop in the future
* The wh64 is issued for the starting destination address of trip +2
* through the loop; if there are fewer than two trips left, the target
* address will be for the current trip.
*/
$do_wh64:
wh64 ($4) # L1 : memory subsystem write hint
subq $3, 24, $2 # E : For determining future wh64 addresses
stq $17, 0($5) # L :
nop # E :
addq $5, 128, $4 # E : speculative target of next wh64
stq $17, 8($5) # L :
stq $17, 16($5) # L :
addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
stq $17, 24($5) # L :
stq $17, 32($5) # L :
cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle
nop
stq $17, 40($5) # L :
stq $17, 48($5) # L :
subq $3, 16, $2 # E : Repeat the loop at least once more?
nop
stq $17, 56($5) # L :
addq $5, 64, $5 # E :
subq $3, 8, $3 # E :
bge $2, $do_wh64 # U :
nop
nop
nop
beq $3, no_quad # U : Might have finished already
.align 4
/*
* Simple loop for trailing quadwords, or for small amounts
* of data (where we can't use an unrolled loop and wh64)
*/
loop:
stq $17,0($5) # L :
subq $3,1,$3 # E : Decrement number quads left
addq $5,8,$5 # E : Inc address
bne $3,loop # U : more?
no_quad:
/*
* Write 0..7 trailing bytes.
*/
nop # E :
beq $18,end # U : All done?
ldq $7,0($5) # L :
mskqh $7,$6,$2 # U : Mask final quad
insqh $17,$6,$4 # U : New bits
bis $2,$4,$1 # E : Put it all together
stq $1,0($5) # L : And back to memory
ret $31,($26),1 # L0 :
within_one_quad:
ldq_u $1,0($16) # L :
insql $17,$16,$2 # U : New bits
mskql $1,$16,$4 # U : Clear old
bis $2,$4,$2 # E : New result
mskql $2,$6,$4 # U :
mskqh $1,$6,$2 # U :
bis $2,$4,$1 # E :
stq_u $1,0($16) # L :
end:
nop
nop
nop
ret $31,($26),1 # L0 :
.end __constant_c_memset
/*
* This is a replicant of the __constant_c_memset code, rescheduled
* to mask stalls. Note that entry point names also had to change
*/
.align 5
.ent __memsetw
__memsetw:
.frame $30,0,$26,0
.prologue 0
inswl $17,0,$5 # U : 000000000000c1c2
inswl $17,2,$2 # U : 00000000c1c20000
bis $16,$16,$0 # E : return value
addq $18,$16,$6 # E : max address to write to
ble $18, end_w # U : zero length requested?
inswl $17,4,$3 # U : 0000c1c200000000
inswl $17,6,$4 # U : c1c2000000000000
xor $16,$6,$1 # E : will complete write be within one quadword?
or $2,$5,$2 # E : 00000000c1c2c1c2
or $3,$4,$17 # E : c1c2c1c200000000
bic $1,7,$1 # E : fit within a single quadword
and $16,7,$3 # E : Target addr misalignment
or $17,$2,$17 # E : c1c2c1c2c1c2c1c2
beq $1,within_quad_w # U :
nop
beq $3,aligned_w # U : target is 0mod8
/*
* Target address is misaligned, and won't fit within a quadword
*/
ldq_u $4,0($16) # L : Fetch first partial
bis $16,$16,$5 # E : Save the address
insql $17,$16,$2 # U : Insert new bytes
subq $3,8,$3 # E : Invert (for addressing uses)
addq $18,$3,$18 # E : $18 is new count ($3 is negative)
mskql $4,$16,$4 # U : clear relevant parts of the quad
subq $16,$3,$16 # E : $16 is new aligned destination
bis $2,$4,$1 # E : Final bytes
nop
stq_u $1,0($5) # L : Store result
nop
nop
.align 4
aligned_w:
/*
* We are now guaranteed to be quad aligned, with at least
* one partial quad to write.
*/
sra $18,3,$3 # U : Number of remaining quads to write
and $18,7,$18 # E : Number of trailing bytes to write
bis $16,$16,$5 # E : Save dest address
beq $3,no_quad_w # U : tail stuff only
/*
* it's worth the effort to unroll this and use wh64 if possible
* Lifted a bunch of code from clear_user.S
* At this point, entry values are:
* $16 Current destination address
* $5 A copy of $16
* $6 The max quadword address to write to
* $18 Number trailer bytes
* $3 Number quads to write
*/
and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
subq $3, 16, $4 # E : Only try to unroll if > 128 bytes
subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
blt $4, loop_w # U :
/*
* We know we've got at least 16 quads, minimum of one trip
* through unrolled loop. Do a quad at a time to get us 0mod64
* aligned.
*/
nop # E :
nop # E :
nop # E :
beq $1, $bigalign_w # U :
$alignmod64_w:
stq $17, 0($5) # L :
subq $3, 1, $3 # E : For consistency later
addq $1, 8, $1 # E : Increment towards zero for alignment
addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
nop
nop
addq $5, 8, $5 # E : Inc address
blt $1, $alignmod64_w # U :
$bigalign_w:
/*
* $3 - number quads left to go
* $5 - target address (aligned 0mod64)
* $17 - mask of stuff to store
* Scratch registers available: $7, $2, $4, $1
* we know that we'll be taking a minimum of one trip through the loop
* CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
* Assumes the wh64 needs to be for 2 trips through the loop in the future
* The wh64 is issued for the starting destination address of trip +2
* through the loop; if there are fewer than two trips left, the target
* address will be for the current trip.
*/
$do_wh64_w:
wh64 ($4) # L1 : memory subsystem write hint
subq $3, 24, $2 # E : For determining future wh64 addresses
stq $17, 0($5) # L :
nop # E :
addq $5, 128, $4 # E : speculative target of next wh64
stq $17, 8($5) # L :
stq $17, 16($5) # L :
addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
stq $17, 24($5) # L :
stq $17, 32($5) # L :
cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle
nop
stq $17, 40($5) # L :
stq $17, 48($5) # L :
subq $3, 16, $2 # E : Repeat the loop at least once more?
nop
stq $17, 56($5) # L :
addq $5, 64, $5 # E :
subq $3, 8, $3 # E :
bge $2, $do_wh64_w # U :
nop
nop
nop
beq $3, no_quad_w # U : Might have finished already
.align 4
/*
* Simple loop for trailing quadwords, or for small amounts
* of data (where we can't use an unrolled loop and wh64)
*/
loop_w:
stq $17,0($5) # L :
subq $3,1,$3 # E : Decrement number quads left
addq $5,8,$5 # E : Inc address
bne $3,loop_w # U : more?
no_quad_w:
/*
* Write 0..7 trailing bytes.
*/
nop # E :
beq $18,end_w # U : All done?
ldq $7,0($5) # L :
mskqh $7,$6,$2 # U : Mask final quad
insqh $17,$6,$4 # U : New bits
bis $2,$4,$1 # E : Put it all together
stq $1,0($5) # L : And back to memory
ret $31,($26),1 # L0 :
within_quad_w:
ldq_u $1,0($16) # L :
insql $17,$16,$2 # U : New bits
mskql $1,$16,$4 # U : Clear old
bis $2,$4,$2 # E : New result
mskql $2,$6,$4 # U :
mskqh $1,$6,$2 # U :
bis $2,$4,$1 # E :
stq_u $1,0($16) # L :
end_w:
nop
nop
nop
ret $31,($26),1 # L0 :
.end __memsetw
memset = __memset

424
arch/alpha/lib/ev6-strncpy_from_user.S Normal file
View File

@@ -0,0 +1,424 @@
/*
* arch/alpha/lib/ev6-strncpy_from_user.S
* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
*
* Just like strncpy except in the return value:
*
* -EFAULT if an exception occurs before the terminator is copied.
* N if the buffer filled.
*
* Otherwise the length of the string is returned.
*
* Much of the information about 21264 scheduling/coding comes from:
* Compiler Writer's Guide for the Alpha 21264
* abbreviated as 'CWG' in other comments here
* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
* Scheduling notation:
* E - either cluster
* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
* A bunch of instructions got moved and temp registers were changed
* to aid in scheduling. Control flow was also re-arranged to eliminate
* branches, and to provide longer code sequences to enable better scheduling.
* A total rewrite (using byte load/stores for start & tail sequences)
* is desirable, but very difficult to do without a from-scratch rewrite.
* Save that for the future.
*/
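/*
 * The return-value contract described above, written out in C.  This is
 * only a sketch: get_user_byte is an invented stand-in for the
 * EX()-protected loads, and the real routine works a quadword at a time.
 */
#include <errno.h>

/* Pretend single-byte fetch from user space: 0 on success, nonzero on
 * fault.  A real version would trap and recover from the fault. */
static int get_user_byte(unsigned char *dst, const unsigned char *src)
{
        *dst = *src;
        return 0;
}

static long strncpy_from_user_sketch(char *dst, const char *src, long count)
{
        long i;

        for (i = 0; i < count; i++) {
                unsigned char c;

                if (get_user_byte(&c, (const unsigned char *)src + i))
                        return -EFAULT;         /* fault before terminator */
                dst[i] = c;
                if (c == '\0')
                        return i;               /* length, NUL excluded */
        }
        return count;                           /* buffer filled, no NUL */
}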
#include <asm/errno.h>
#include <asm/regdef.h>
/* Allow an exception for an insn; exit if we get one. */
#define EX(x,y...) \
99: x,##y; \
.section __ex_table,"a"; \
.long 99b - .; \
lda $31, $exception-99b($0); \
.previous
.set noat
.set noreorder
.text
.globl __strncpy_from_user
.ent __strncpy_from_user
.frame $30, 0, $26
.prologue 0
.align 4
__strncpy_from_user:
and a0, 7, t3 # E : find dest misalignment
beq a2, $zerolength # U :
/* Are source and destination co-aligned? */
mov a0, v0 # E : save the string start
xor a0, a1, t4 # E :
EX( ldq_u t1, 0(a1) ) # L : Latency=3 load first quadword
ldq_u t0, 0(a0) # L : load first (partial) aligned dest quadword
addq a2, t3, a2 # E : bias count by dest misalignment
subq a2, 1, a3 # E :
addq zero, 1, t10 # E :
and t4, 7, t4 # E : misalignment between the two
and a3, 7, t6 # E : number of tail bytes
sll t10, t6, t10 # E : t10 = bitmask of last count byte
bne t4, $unaligned # U :
lda t2, -1 # E : build a mask against false zero
/*
* We are co-aligned; take care of a partial first word.
* On entry to this basic block:
* t0 == the first destination word for masking back in
* t1 == the first source word.
*/
srl a3, 3, a2 # E : a2 = loop counter = (count - 1)/8
addq a1, 8, a1 # E :
mskqh t2, a1, t2 # U : detection in the src word
nop
/* Create the 1st output word and detect 0's in the 1st input word. */
mskqh t1, a1, t3 # U :
mskql t0, a1, t0 # U : assemble the first output word
ornot t1, t2, t2 # E :
nop
cmpbge zero, t2, t8 # E : bits set iff null found
or t0, t3, t0 # E :
beq a2, $a_eoc # U :
bne t8, $a_eos # U : 2nd branch in a quad. Bad.
/* On entry to this basic block:
* t0 == a source quad not containing a null.
* a0 - current aligned destination address
* a1 - current aligned source address
* a2 - count of quadwords to move.
* NOTE: Loop improvement - unrolling this is going to be
* a huge win, since we're going to stall otherwise.
* Fix this later. For _really_ large copies, look
* at using wh64 on a look-ahead basis. See the code
* in clear_user.S and copy_user.S.
* Presumably, since (a0) and (a1) do not overlap (by C definition),
* loads can safely be issued well ahead of the stores.
* Lots of nops here:
* - Separate loads from stores
* - Keep it to 1 branch/quadpack so the branch predictor
* can train.
*/
$a_loop:
stq_u t0, 0(a0) # L :
addq a0, 8, a0 # E :
nop
subq a2, 1, a2 # E :
EX( ldq_u t0, 0(a1) ) # L :
addq a1, 8, a1 # E :
cmpbge zero, t0, t8 # E : Stall 2 cycles on t0
beq a2, $a_eoc # U :
beq t8, $a_loop # U :
nop
nop
nop
/* Take care of the final (partial) word store. At this point
* the end-of-count bit is set in t8 iff it applies.
*
* On entry to this basic block we have:
* t0 == the source word containing the null
* t8 == the cmpbge mask that found it.
*/
$a_eos:
negq t8, t12 # E : find low bit set
and t8, t12, t12 # E :
/* We're doing a partial word store and so need to combine
our source and original destination words. */
ldq_u t1, 0(a0) # L :
subq t12, 1, t6 # E :
or t12, t6, t8 # E :
zapnot t0, t8, t0 # U : clear src bytes > null
zap t1, t8, t1 # U : clear dst bytes <= null
or t0, t1, t0 # E :
stq_u t0, 0(a0) # L :
br $finish_up # L0 :
nop
nop
/* Add the end-of-count bit to the eos detection bitmask. */
.align 4
$a_eoc:
or t10, t8, t8
br $a_eos
nop
nop
/* The source and destination are not co-aligned. Align the destination
and cope. We have to be very careful about not reading too much and
causing a SEGV. */
.align 4
$u_head:
/* We know just enough now to be able to assemble the first
full source word. We can still find a zero at the end of it
that prevents us from outputting the whole thing.
On entry to this basic block:
t0 == the first dest word, unmasked
t1 == the shifted low bits of the first source word
t6 == bytemask that is -1 in dest word bytes */
EX( ldq_u t2, 8(a1) ) # L : load second src word
addq a1, 8, a1 # E :
mskql t0, a0, t0 # U : mask trailing garbage in dst
extqh t2, a1, t4 # U :
or t1, t4, t1 # E : first aligned src word complete
mskqh t1, a0, t1 # U : mask leading garbage in src
or t0, t1, t0 # E : first output word complete
or t0, t6, t6 # E : mask original data for zero test
cmpbge zero, t6, t8 # E :
beq a2, $u_eocfin # U :
bne t8, $u_final # U : bad news - 2nd branch in a quad
lda t6, -1 # E : mask out the bits we have
mskql t6, a1, t6 # U : already seen
stq_u t0, 0(a0) # L : store first output word
or t6, t2, t2 # E :
cmpbge zero, t2, t8 # E : find nulls in second partial
addq a0, 8, a0 # E :
subq a2, 1, a2 # E :
bne t8, $u_late_head_exit # U :
nop
/* Finally, we've got all the stupid leading edge cases taken care
of and we can set up to enter the main loop. */
extql t2, a1, t1 # U : position hi-bits of lo word
EX( ldq_u t2, 8(a1) ) # L : read next high-order source word
addq a1, 8, a1 # E :
cmpbge zero, t2, t8 # E :
beq a2, $u_eoc # U :
bne t8, $u_eos # U :
nop
nop
/* Unaligned copy main loop. In order to avoid reading too much,
the loop is structured to detect zeros in aligned source words.
This has, unfortunately, effectively pulled half of a loop
iteration out into the head and half into the tail, but it does
prevent nastiness from accumulating in the very thing we want
to run as fast as possible.
On entry to this basic block:
t1 == the shifted high-order bits from the previous source word
t2 == the unshifted current source word
We further know that t2 does not contain a null terminator. */
/*
* Extra nops here:
* separate load quads from store quads
* only one branch/quad to permit predictor training
*/
.align 4
$u_loop:
extqh t2, a1, t0 # U : extract high bits for current word
addq a1, 8, a1 # E :
extql t2, a1, t3 # U : extract low bits for next time
addq a0, 8, a0 # E :
or t0, t1, t0 # E : current dst word now complete
EX( ldq_u t2, 0(a1) ) # L : load high word for next time
subq a2, 1, a2 # E :
nop
stq_u t0, -8(a0) # L : save the current word
mov t3, t1 # E :
cmpbge zero, t2, t8 # E : test new word for eos
beq a2, $u_eoc # U :
beq t8, $u_loop # U :
nop
nop
nop
/* We've found a zero somewhere in the source word we just read.
If it resides in the lower half, we have one (probably partial)
word to write out, and if it resides in the upper half, we
have one full and one partial word left to write out.
On entry to this basic block:
t1 == the shifted high-order bits from the previous source word
t2 == the unshifted current source word. */
.align 4
$u_eos:
extqh t2, a1, t0 # U :
or t0, t1, t0 # E : first (partial) source word complete
cmpbge zero, t0, t8 # E : is the null in this first bit?
nop
bne t8, $u_final # U :
stq_u t0, 0(a0) # L : the null was in the high-order bits
addq a0, 8, a0 # E :
subq a2, 1, a2 # E :
.align 4
$u_late_head_exit:
extql t2, a1, t0 # U :
cmpbge zero, t0, t8 # E :
or t8, t10, t6 # E :
cmoveq a2, t6, t8 # E :
/* Take care of a final (probably partial) result word.
On entry to this basic block:
t0 == assembled source word
t8 == cmpbge mask that found the null. */
.align 4
$u_final:
negq t8, t6 # E : isolate low bit set
and t6, t8, t12 # E :
ldq_u t1, 0(a0) # L :
subq t12, 1, t6 # E :
or t6, t12, t8 # E :
zapnot t0, t8, t0 # U : kill source bytes > null
zap t1, t8, t1 # U : kill dest bytes <= null
or t0, t1, t0 # E :
stq_u t0, 0(a0) # E :
br $finish_up # U :
nop
nop
.align 4
$u_eoc: # end-of-count
extqh t2, a1, t0 # U :
or t0, t1, t0 # E :
cmpbge zero, t0, t8 # E :
nop
.align 4
$u_eocfin: # end-of-count, final word
or t10, t8, t8 # E :
br $u_final # U :
nop
nop
/* Unaligned copy entry point. */
.align 4
$unaligned:
srl a3, 3, a2 # U : a2 = loop counter = (count - 1)/8
and a0, 7, t4 # E : find dest misalignment
and a1, 7, t5 # E : find src misalignment
mov zero, t0 # E :
/* Conditionally load the first destination word and a bytemask
with 0xff indicating that the destination byte is sacrosanct. */
mov zero, t6 # E :
beq t4, 1f # U :
ldq_u t0, 0(a0) # L :
lda t6, -1 # E :
mskql t6, a0, t6 # E :
nop
nop
nop
.align 4
1:
subq a1, t4, a1 # E : sub dest misalignment from src addr
/* If source misalignment is larger than dest misalignment, we need
extra startup checks to avoid SEGV. */
cmplt t4, t5, t12 # E :
extql t1, a1, t1 # U : shift src into place
lda t2, -1 # E : for creating masks later
beq t12, $u_head # U :
mskqh t2, t5, t2 # U : begin src byte validity mask
cmpbge zero, t1, t8 # E : is there a zero?
nop
extql t2, a1, t2 # U :
or t8, t10, t5 # E : test for end-of-count too
cmpbge zero, t2, t3 # E :
cmoveq a2, t5, t8 # E : Latency=2, extra map slot
nop # E : goes with cmov
andnot t8, t3, t8 # E :
beq t8, $u_head # U :
nop
/* At this point we've found a zero in the first partial word of
the source. We need to isolate the valid source data and mask
it into the original destination data. (Incidentally, we know
that we'll need at least one byte of that original dest word.) */
ldq_u t0, 0(a0) # L :
negq t8, t6 # E : build bitmask of bytes <= zero
mskqh t1, t4, t1 # U :
and t6, t8, t12 # E :
subq t12, 1, t6 # E :
or t6, t12, t8 # E :
zapnot t2, t8, t2 # U : prepare source word; mirror changes
zapnot t1, t8, t1 # U : to source validity mask
andnot t0, t2, t0 # E : zero place for source to reside
or t0, t1, t0 # E : and put it there
stq_u t0, 0(a0) # L :
nop
.align 4
$finish_up:
zapnot t0, t12, t4 # U : was last byte written null?
and t12, 0xf0, t3 # E : binary search for the address of the
cmovne t4, 1, t4 # E : Latency=2, extra map slot
nop # E : with cmovne
and t12, 0xcc, t2 # E : last byte written
and t12, 0xaa, t1 # E :
cmovne t3, 4, t3 # E : Latency=2, extra map slot
nop # E : with cmovne
bic a0, 7, t0
cmovne t2, 2, t2 # E : Latency=2, extra map slot
nop # E : with cmovne
nop
cmovne t1, 1, t1 # E : Latency=2, extra map slot
nop # E : with cmovne
addq t0, t3, t0 # E :
addq t1, t2, t1 # E :
addq t0, t1, t0 # E :
addq t0, t4, t0 # add one if we filled the buffer
subq t0, v0, v0 # find string length
ret # L0 :
.align 4
$zerolength:
nop
nop
nop
clr v0
$exception:
nop
nop
nop
ret
.end __strncpy_from_user

321
arch/alpha/lib/ev6-stxcpy.S Normal file
View File

@@ -0,0 +1,321 @@
/*
* arch/alpha/lib/ev6-stxcpy.S
* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
*
* Copy a null-terminated string from SRC to DST.
*
* This is an internal routine used by strcpy, stpcpy, and strcat.
* As such, it uses special linkage conventions to make implementation
* of these public functions more efficient.
*
* On input:
* t9 = return address
* a0 = DST
* a1 = SRC
*
* On output:
* t12 = bitmask (with one bit set) indicating the last byte written
* a0 = unaligned address of the last *word* written
*
* Furthermore, v0, a3-a5, and t11 are untouched.
*
* Much of the information about 21264 scheduling/coding comes from:
* Compiler Writer's Guide for the Alpha 21264
* abbreviated as 'CWG' in other comments here
* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
* Scheduling notation:
* E - either cluster
* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
* Try not to change the actual algorithm if possible for consistency.
*/
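/*
 * At the C level the job is "copy the string and report where it ended".
 * The real routine encodes that end position as a word address in a0
 * plus a one-bit byte mask in t12; this sketch (an invented name, and
 * essentially stpcpy) just returns the address of the terminating null.
 */
static char *stxcpy_sketch(char *dst, const char *src)
{
        while ((*dst = *src) != '\0') {
                dst++;
                src++;
        }
        return dst;             /* points at the NUL just written */
}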
#include <asm/regdef.h>
.set noat
.set noreorder
.text
/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
doesn't like putting the entry point for a procedure somewhere in the
middle of the procedure descriptor. Work around this by putting the
aligned copy in its own procedure descriptor */
.ent stxcpy_aligned
.align 4
stxcpy_aligned:
.frame sp, 0, t9
.prologue 0
/* On entry to this basic block:
t0 == the first destination word for masking back in
t1 == the first source word. */
/* Create the 1st output word and detect 0's in the 1st input word. */
lda t2, -1 # E : build a mask against false zero
mskqh t2, a1, t2 # U : detection in the src word (stall)
mskqh t1, a1, t3 # U :
ornot t1, t2, t2 # E : (stall)
mskql t0, a1, t0 # U : assemble the first output word
cmpbge zero, t2, t8 # E : bits set iff null found
or t0, t3, t1 # E : (stall)
bne t8, $a_eos # U : (stall)
/* On entry to this basic block:
t0 == the first destination word for masking back in
t1 == a source word not containing a null. */
/* Nops here to separate store quads from load quads */
$a_loop:
stq_u t1, 0(a0) # L :
addq a0, 8, a0 # E :
nop
nop
ldq_u t1, 0(a1) # L : Latency=3
addq a1, 8, a1 # E :
cmpbge zero, t1, t8 # E : (3 cycle stall)
beq t8, $a_loop # U : (stall for t8)
/* Take care of the final (partial) word store.
On entry to this basic block we have:
t1 == the source word containing the null
t8 == the cmpbge mask that found it. */
$a_eos:
negq t8, t6 # E : find low bit set
and t8, t6, t12 # E : (stall)
/* For the sake of the cache, don't read a destination word
if we're not going to need it. */
and t12, 0x80, t6 # E : (stall)
bne t6, 1f # U : (stall)
/* We're doing a partial word store and so need to combine
our source and original destination words. */
ldq_u t0, 0(a0) # L : Latency=3
subq t12, 1, t6 # E :
zapnot t1, t6, t1 # U : clear src bytes >= null (stall)
or t12, t6, t8 # E : (stall)
zap t0, t8, t0 # E : clear dst bytes <= null
or t0, t1, t1 # E : (stall)
nop
nop
1: stq_u t1, 0(a0) # L :
ret (t9) # L0 : Latency=3
nop
nop
.end stxcpy_aligned
.align 4
.ent __stxcpy
.globl __stxcpy
__stxcpy:
.frame sp, 0, t9
.prologue 0
/* Are source and destination co-aligned? */
xor a0, a1, t0 # E :
unop # E :
and t0, 7, t0 # E : (stall)
bne t0, $unaligned # U : (stall)
/* We are co-aligned; take care of a partial first word. */
ldq_u t1, 0(a1) # L : load first src word
and a0, 7, t0 # E : take care not to load a word ...
addq a1, 8, a1 # E :
beq t0, stxcpy_aligned # U : ... if we won't need it (stall)
ldq_u t0, 0(a0) # L :
br stxcpy_aligned # L0 : Latency=3
nop
nop
/* The source and destination are not co-aligned. Align the destination
and cope. We have to be very careful about not reading too much and
causing a SEGV. */
.align 4
$u_head:
/* We know just enough now to be able to assemble the first
full source word. We can still find a zero at the end of it
that prevents us from outputting the whole thing.
On entry to this basic block:
t0 == the first dest word, for masking back in, if needed else 0
t1 == the low bits of the first source word
t6 == bytemask that is -1 in dest word bytes */
ldq_u t2, 8(a1) # L :
addq a1, 8, a1 # E :
extql t1, a1, t1 # U : (stall on a1)
extqh t2, a1, t4 # U : (stall on a1)
mskql t0, a0, t0 # U :
or t1, t4, t1 # E :
mskqh t1, a0, t1 # U : (stall on t1)
or t0, t1, t1 # E : (stall on t1)
or t1, t6, t6 # E :
cmpbge zero, t6, t8 # E : (stall)
lda t6, -1 # E : for masking just below
bne t8, $u_final # U : (stall)
mskql t6, a1, t6 # U : mask out the bits we have
or t6, t2, t2 # E : already extracted before (stall)
cmpbge zero, t2, t8 # E : testing eos (stall)
bne t8, $u_late_head_exit # U : (stall)
/* Finally, we've got all the stupid leading edge cases taken care
of and we can set up to enter the main loop. */
stq_u t1, 0(a0) # L : store first output word
addq a0, 8, a0 # E :
extql t2, a1, t0 # U : position hi-bits of lo word
ldq_u t2, 8(a1) # U : read next high-order source word
addq a1, 8, a1 # E :
cmpbge zero, t2, t8 # E : (stall for t2)
nop # E :
bne t8, $u_eos # U : (stall)
/* Unaligned copy main loop. In order to avoid reading too much,
the loop is structured to detect zeros in aligned source words.
This has, unfortunately, effectively pulled half of a loop
iteration out into the head and half into the tail, but it does
prevent nastiness from accumulating in the very thing we want
to run as fast as possible.
On entry to this basic block:
t0 == the shifted high-order bits from the previous source word
t2 == the unshifted current source word
We further know that t2 does not contain a null terminator. */
.align 3
$u_loop:
extqh t2, a1, t1 # U : extract high bits for current word
addq a1, 8, a1 # E : (stall)
extql t2, a1, t3 # U : extract low bits for next time (stall)
addq a0, 8, a0 # E :
or t0, t1, t1 # E : current dst word now complete
ldq_u t2, 0(a1) # L : Latency=3 load high word for next time
stq_u t1, -8(a0) # L : save the current word (stall)
mov t3, t0 # E :
cmpbge zero, t2, t8 # E : test new word for eos
beq t8, $u_loop # U : (stall)
nop
nop
/* We've found a zero somewhere in the source word we just read.
If it resides in the lower half, we have one (probably partial)
word to write out, and if it resides in the upper half, we
have one full and one partial word left to write out.
On entry to this basic block:
t0 == the shifted high-order bits from the previous source word
t2 == the unshifted current source word. */
$u_eos:
extqh t2, a1, t1 # U :
or t0, t1, t1 # E : first (partial) source word complete (stall)
cmpbge zero, t1, t8 # E : is the null in this first bit? (stall)
bne t8, $u_final # U : (stall)
$u_late_head_exit:
stq_u t1, 0(a0) # L : the null was in the high-order bits
addq a0, 8, a0 # E :
extql t2, a1, t1 # U :
cmpbge zero, t1, t8 # E : (stall)
/* Take care of a final (probably partial) result word.
On entry to this basic block:
t1 == assembled source word
t8 == cmpbge mask that found the null. */
$u_final:
negq t8, t6 # E : isolate low bit set
and t6, t8, t12 # E : (stall)
and t12, 0x80, t6 # E : avoid dest word load if we can (stall)
bne t6, 1f # U : (stall)
ldq_u t0, 0(a0) # E :
subq t12, 1, t6 # E :
or t6, t12, t8 # E : (stall)
zapnot t1, t6, t1 # U : kill source bytes >= null (stall)
zap t0, t8, t0 # U : kill dest bytes <= null (2 cycle data stall)
or t0, t1, t1 # E : (stall)
nop
nop
1: stq_u t1, 0(a0) # L :
ret (t9) # L0 : Latency=3
nop
nop
/* Unaligned copy entry point. */
.align 4
$unaligned:
ldq_u t1, 0(a1) # L : load first source word
and a0, 7, t4 # E : find dest misalignment
and a1, 7, t5 # E : find src misalignment
/* Conditionally load the first destination word and a bytemask
with 0xff indicating that the destination byte is sacrosanct. */
mov zero, t0 # E :
mov zero, t6 # E :
beq t4, 1f # U :
ldq_u t0, 0(a0) # L :
lda t6, -1 # E :
mskql t6, a0, t6 # U :
nop
nop
nop
1:
subq a1, t4, a1 # E : sub dest misalignment from src addr
/* If source misalignment is larger than dest misalignment, we need
extra startup checks to avoid SEGV. */
cmplt t4, t5, t12 # E :
beq t12, $u_head # U :
lda t2, -1 # E : mask out leading garbage in source
mskqh t2, t5, t2 # U :
ornot t1, t2, t3 # E : (stall)
cmpbge zero, t3, t8 # E : is there a zero? (stall)
beq t8, $u_head # U : (stall)
/* At this point we've found a zero in the first partial word of
the source. We need to isolate the valid source data and mask
it into the original destination data. (Incidentally, we know
that we'll need at least one byte of that original dest word.) */
ldq_u t0, 0(a0) # L :
negq t8, t6 # E : build bitmask of bytes <= zero
and t6, t8, t12 # E : (stall)
and a1, 7, t5 # E :
subq t12, 1, t6 # E :
or t6, t12, t8 # E : (stall)
srl t12, t5, t12 # U : adjust final null return value
zapnot t2, t8, t2 # U : prepare source word; mirror changes (stall)
and t1, t2, t1 # E : to source validity mask
extql t2, a1, t2 # U :
extql t1, a1, t1 # U : (stall)
andnot t0, t2, t0 # E : zero place for source to reside (stall)
or t0, t1, t1 # E : and put it there
stq_u t1, 0(a0) # L : (stall)
ret (t9) # L0 :
nop
.end __stxcpy

397
arch/alpha/lib/ev6-stxncpy.S Normal file
View File

@@ -0,0 +1,397 @@
/*
* arch/alpha/lib/ev6-stxncpy.S
* 21264 version contributed by Rick Gorton <rick.gorton@api-networks.com>
*
* Copy no more than COUNT bytes of the null-terminated string from
* SRC to DST.
*
* This is an internal routine used by strncpy, stpncpy, and strncat.
* As such, it uses special linkage conventions to make implementation
* of these public functions more efficient.
*
* On input:
* t9 = return address
* a0 = DST
* a1 = SRC
* a2 = COUNT
*
* Furthermore, COUNT may not be zero.
*
* On output:
* t0 = last word written
* t10 = bitmask (with one bit set) indicating the byte position of
* the end of the range specified by COUNT
* t12 = bitmask (with one bit set) indicating the last byte written
* a0 = unaligned address of the last *word* written
* a2 = the number of full words left in COUNT
*
* Furthermore, v0, a3-a5, t11, and $at are untouched.
*
* Much of the information about 21264 scheduling/coding comes from:
* Compiler Writer's Guide for the Alpha 21264
* abbreviated as 'CWG' in other comments here
* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
* Scheduling notation:
* E - either cluster
* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
* Try not to change the actual algorithm if possible for consistency.
*/
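/*
 * How the prologue below derives t10, the end-of-count byte mask: bias
 * COUNT by the destination misalignment, subtract one, and the low
 * three bits give the byte lane of the last countable byte within its
 * quadword.  end_of_count_mask is an invented name; COUNT is assumed
 * nonzero, as the interface above requires.
 */
#include <stdint.h>

static unsigned end_of_count_mask(uint64_t dst, uint64_t count)
{
        uint64_t biased = count + (dst & 7) - 1;        /* the addq/subq pair */

        return 1u << (biased & 7);                      /* sll t10,t2,t10 */
}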
#include <asm/regdef.h>
.set noat
.set noreorder
.text
/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
doesn't like putting the entry point for a procedure somewhere in the
middle of the procedure descriptor. Work around this by putting the
aligned copy in its own procedure descriptor */
.ent stxncpy_aligned
.align 4
stxncpy_aligned:
.frame sp, 0, t9, 0
.prologue 0
/* On entry to this basic block:
t0 == the first destination word for masking back in
t1 == the first source word. */
/* Create the 1st output word and detect 0's in the 1st input word. */
lda t2, -1 # E : build a mask against false zero
mskqh t2, a1, t2 # U : detection in the src word (stall)
mskqh t1, a1, t3 # U :
ornot t1, t2, t2 # E : (stall)
mskql t0, a1, t0 # U : assemble the first output word
cmpbge zero, t2, t8 # E : bits set iff null found
or t0, t3, t0 # E : (stall)
beq a2, $a_eoc # U :
bne t8, $a_eos # U :
nop
nop
nop
/* On entry to this basic block:
t0 == a source word not containing a null. */
/*
* nops here to:
* separate store quads from load quads
* limit of 1 bcond/quad to permit training
*/
$a_loop:
stq_u t0, 0(a0) # L :
addq a0, 8, a0 # E :
subq a2, 1, a2 # E :
nop
ldq_u t0, 0(a1) # L :
addq a1, 8, a1 # E :
cmpbge zero, t0, t8 # E :
beq a2, $a_eoc # U :
beq t8, $a_loop # U :
nop
nop
nop
/* Take care of the final (partial) word store. At this point
the end-of-count bit is set in t8 iff it applies.
On entry to this basic block we have:
t0 == the source word containing the null
t8 == the cmpbge mask that found it. */
$a_eos:
negq t8, t12 # E : find low bit set
and t8, t12, t12 # E : (stall)
/* For the sake of the cache, don't read a destination word
if we're not going to need it. */
and t12, 0x80, t6 # E : (stall)
bne t6, 1f # U : (stall)
/* We're doing a partial word store and so need to combine
our source and original destination words. */
ldq_u t1, 0(a0) # L :
subq t12, 1, t6 # E :
or t12, t6, t8 # E : (stall)
zapnot t0, t8, t0 # U : clear src bytes > null (stall)
zap t1, t8, t1 # U : clear dst bytes <= null
or t0, t1, t0 # E : (stall)
nop
nop
1: stq_u t0, 0(a0) # L :
ret (t9) # L0 : Latency=3
nop
nop
/* Add the end-of-count bit to the eos detection bitmask. */
$a_eoc:
or t10, t8, t8 # E :
br $a_eos # L0 : Latency=3
nop
nop
.end stxncpy_aligned
.align 4
.ent __stxncpy
.globl __stxncpy
__stxncpy:
.frame sp, 0, t9, 0
.prologue 0
/* Are source and destination co-aligned? */
xor a0, a1, t1 # E :
and a0, 7, t0 # E : find dest misalignment
and t1, 7, t1 # E : (stall)
addq a2, t0, a2 # E : bias count by dest misalignment (stall)
subq a2, 1, a2 # E :
and a2, 7, t2 # E : (stall)
srl a2, 3, a2 # U : a2 = loop counter = (count - 1)/8 (stall)
addq zero, 1, t10 # E :
sll t10, t2, t10 # U : t10 = bitmask of last count byte
bne t1, $unaligned # U :
/* We are co-aligned; take care of a partial first word. */
ldq_u t1, 0(a1) # L : load first src word
addq a1, 8, a1 # E :
beq t0, stxncpy_aligned # U : avoid loading dest word if not needed
ldq_u t0, 0(a0) # L :
nop
nop
br stxncpy_aligned # L0 :
nop
nop
nop
/* The source and destination are not co-aligned. Align the destination
and cope. We have to be very careful about not reading too much and
causing a SEGV. */
.align 4
$u_head:
/* We know just enough now to be able to assemble the first
full source word. We can still find a zero at the end of it
that prevents us from outputting the whole thing.
On entry to this basic block:
t0 == the first dest word, unmasked
t1 == the shifted low bits of the first source word
t6 == bytemask that is -1 in dest word bytes */
ldq_u t2, 8(a1) # L : Latency=3 load second src word
addq a1, 8, a1 # E :
mskql t0, a0, t0 # U : mask trailing garbage in dst
extqh t2, a1, t4 # U : (3 cycle stall on t2)
or t1, t4, t1 # E : first aligned src word complete (stall)
mskqh t1, a0, t1 # U : mask leading garbage in src (stall)
or t0, t1, t0 # E : first output word complete (stall)
or t0, t6, t6 # E : mask original data for zero test (stall)
cmpbge zero, t6, t8 # E :
beq a2, $u_eocfin # U :
lda t6, -1 # E :
nop
bne t8, $u_final # U :
mskql t6, a1, t6 # U : mask out bits already seen
stq_u t0, 0(a0) # L : store first output word
or t6, t2, t2 # E : (stall)
cmpbge zero, t2, t8 # E : find nulls in second partial
addq a0, 8, a0 # E :
subq a2, 1, a2 # E :
bne t8, $u_late_head_exit # U :
/* Finally, we've got all the stupid leading edge cases taken care
of and we can set up to enter the main loop. */
extql t2, a1, t1 # U : position hi-bits of lo word
beq a2, $u_eoc # U :
ldq_u t2, 8(a1) # L : read next high-order source word
addq a1, 8, a1 # E :
extqh t2, a1, t0 # U : position lo-bits of hi word (stall)
cmpbge zero, t2, t8 # E :
nop
bne t8, $u_eos # U :
/* Unaligned copy main loop. In order to avoid reading too much,
the loop is structured to detect zeros in aligned source words.
This has, unfortunately, effectively pulled half of a loop
iteration out into the head and half into the tail, but it does
prevent nastiness from accumulating in the very thing we want
to run as fast as possible.
On entry to this basic block:
t0 == the shifted low-order bits from the current source word
t1 == the shifted high-order bits from the previous source word
t2 == the unshifted current source word
We further know that t2 does not contain a null terminator. */
.align 4
$u_loop:
or t0, t1, t0 # E : current dst word now complete
subq a2, 1, a2 # E : decrement word count
extql t2, a1, t1 # U : extract low bits for next time
addq a0, 8, a0 # E :
stq_u t0, -8(a0) # U : save the current word
beq a2, $u_eoc # U :
ldq_u t2, 8(a1) # U : Latency=3 load high word for next time
addq a1, 8, a1 # E :
extqh t2, a1, t0 # U : extract low bits (2 cycle stall)
cmpbge zero, t2, t8 # E : test new word for eos
nop
beq t8, $u_loop # U :
/* We've found a zero somewhere in the source word we just read.
If it resides in the lower half, we have one (probably partial)
word to write out, and if it resides in the upper half, we
have one full and one partial word left to write out.
On entry to this basic block:
t0 == the shifted low-order bits from the current source word
t1 == the shifted high-order bits from the previous source word
t2 == the unshifted current source word. */
$u_eos:
or t0, t1, t0 # E : first (partial) source word complete
nop
cmpbge zero, t0, t8 # E : is the null in this first bit? (stall)
bne t8, $u_final # U : (stall)
stq_u t0, 0(a0) # L : the null was in the high-order bits
addq a0, 8, a0 # E :
subq a2, 1, a2 # E :
nop
$u_late_head_exit:
extql t2, a1, t0 # U :
cmpbge zero, t0, t8 # E :
or t8, t10, t6 # E : (stall)
cmoveq a2, t6, t8 # E : Latency=2, extra map slot (stall)
/* Take care of a final (probably partial) result word.
On entry to this basic block:
t0 == assembled source word
t8 == cmpbge mask that found the null. */
$u_final:
negq t8, t6 # E : isolate low bit set
and t6, t8, t12 # E : (stall)
and t12, 0x80, t6 # E : avoid dest word load if we can (stall)
bne t6, 1f # U : (stall)
ldq_u t1, 0(a0) # L :
subq t12, 1, t6 # E :
or t6, t12, t8 # E : (stall)
zapnot t0, t8, t0 # U : kill source bytes > null
zap t1, t8, t1 # U : kill dest bytes <= null
or t0, t1, t0 # E : (stall)
nop
nop
1: stq_u t0, 0(a0) # L :
ret (t9) # L0 : Latency=3
/* Got to end-of-count before end of string.
On entry to this basic block:
t1 == the shifted high-order bits from the previous source word */
$u_eoc:
and a1, 7, t6 # E : avoid final load if possible
sll t10, t6, t6 # U : (stall)
and t6, 0xff, t6 # E : (stall)
bne t6, 1f # U : (stall)
ldq_u t2, 8(a1) # L : load final src word
nop
extqh t2, a1, t0 # U : extract low bits for last word (stall)
or t1, t0, t1 # E : (stall)
1: cmpbge zero, t1, t8 # E :
mov t1, t0 # E :
$u_eocfin: # end-of-count, final word
or t10, t8, t8 # E :
br $u_final # L0 : Latency=3
/* Unaligned copy entry point. */
.align 4
$unaligned:
ldq_u t1, 0(a1) # L : load first source word
and a0, 7, t4 # E : find dest misalignment
and a1, 7, t5 # E : find src misalignment
/* Conditionally load the first destination word and a bytemask
with 0xff indicating that the destination byte is sacrosanct. */
mov zero, t0 # E :
mov zero, t6 # E :
beq t4, 1f # U :
ldq_u t0, 0(a0) # L :
lda t6, -1 # E :
mskql t6, a0, t6 # U :
nop
nop
subq a1, t4, a1 # E : sub dest misalignment from src addr
/* If source misalignment is larger than dest misalignment, we need
extra startup checks to avoid SEGV. */
1: cmplt t4, t5, t12 # E :
extql t1, a1, t1 # U : shift src into place
lda t2, -1 # E : for creating masks later
beq t12, $u_head # U : (stall)
extql t2, a1, t2 # U :
cmpbge zero, t1, t8 # E : is there a zero?
andnot t2, t6, t12 # E : dest mask for a single word copy
or t8, t10, t5 # E : test for end-of-count too
cmpbge zero, t12, t3 # E :
cmoveq a2, t5, t8 # E : Latency=2, extra map slot
nop # E : keep with cmoveq
andnot t8, t3, t8 # E : (stall)
beq t8, $u_head # U :
/* At this point we've found a zero in the first partial word of
the source. We need to isolate the valid source data and mask
it into the original destination data. (Incidentally, we know
that we'll need at least one byte of that original dest word.) */
ldq_u t0, 0(a0) # L :
negq t8, t6 # E : build bitmask of bytes <= zero
mskqh t1, t4, t1 # U :
and t6, t8, t2 # E :
subq t2, 1, t6 # E : (stall)
or t6, t2, t8 # E : (stall)
zapnot t12, t8, t12 # U : prepare source word; mirror changes (stall)
zapnot t1, t8, t1 # U : to source validity mask
andnot t0, t12, t0 # E : zero place for source to reside
or t0, t1, t0 # E : and put it there (stall both t0, t1)
stq_u t0, 0(a0) # L : (stall)
ret (t9) # L0 : Latency=3
nop
nop
nop
.end __stxncpy
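
The $a_eos and $u_final tails above merge the last source bytes into the existing destination word using the cmpbge mask. Below is a minimal little-endian C model of that masking step; cmpbge_zero and zapnot are helper names invented here to mirror the instructions, not real intrinsics or kernel interfaces.

#include <stdint.h>
#include <stdio.h>

/* cmpbge zero, x: bit i is set iff byte i of x is zero */
static uint64_t cmpbge_zero(uint64_t x)
{
	uint64_t m = 0;
	for (int i = 0; i < 8; i++)
		if (((x >> (8 * i)) & 0xff) == 0)
			m |= 1u << i;
	return m;
}

/* zapnot: keep only the bytes whose bit is set in the 8-bit mask */
static uint64_t zapnot(uint64_t x, unsigned mask)
{
	uint64_t r = 0;
	for (int i = 0; i < 8; i++)
		if (mask & (1u << i))
			r |= x & (0xffULL << (8 * i));
	return r;
}

int main(void)
{
	uint64_t src = 0x00000000006f6c6cULL;	/* "llo\0..." (little-endian) */
	uint64_t dst = 0x4141414141414141ULL;	/* original destination bytes */

	uint64_t t8  = cmpbge_zero(src);	/* locate the null            */
	uint64_t t12 = t8 & -t8;		/* lowest set bit             */
	t8 = t12 | (t12 - 1);			/* bytes up to and incl. null */

	uint64_t out = zapnot(src, t8)		   /* keep src bytes <= null  */
		     | zapnot(dst, ~t8 & 0xff);	   /* keep dst bytes  > null  */
	printf("%016llx\n", (unsigned long long)out);	/* 41414141006f6c6c   */
	return 0;
}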


@@ -0,0 +1,54 @@
/*
* arch/alpha/lib/ev67-strcat.S
* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
*
* Append a null-terminated string from SRC to DST.
*
* Much of the information about 21264 scheduling/coding comes from:
* Compiler Writer's Guide for the Alpha 21264
* abbreviated as 'CWG' in other comments here
* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
* Scheduling notation:
* E - either cluster
* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
* Try not to change the actual algorithm if possible for consistency.
* Commentary: It seems bogus to walk the input string twice - once
* to determine the length, and then again while doing the copy.
* A significant (future) enhancement would be to only read the input
* string once.
*/
.text
.align 4
.globl strcat
.ent strcat
strcat:
.frame $30, 0, $26
.prologue 0
mov $16, $0 # E : set up return value
/* Find the end of the string. */
ldq_u $1, 0($16) # L : load first quadword (a0 may be misaligned)
lda $2, -1 # E :
insqh $2, $16, $2 # U :
andnot $16, 7, $16 # E :
or $2, $1, $1 # E :
cmpbge $31, $1, $2 # E : bits set iff byte == 0
bne $2, $found # U :
$loop: ldq $1, 8($16) # L :
addq $16, 8, $16 # E :
cmpbge $31, $1, $2 # E :
beq $2, $loop # U :
$found: cttz $2, $3 # U0 :
addq $16, $3, $16 # E :
/* Now do the append. */
mov $26, $23 # E :
br __stxcpy # L0 :
.end strcat
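
The find-the-end loop above is the usual cmpbge word scan, with ev67's cttz replacing the older binary-search tail used in strcat.S later in this series. A rough C model, assuming GCC builtins and an 8-byte-aligned string; zero_byte_mask and find_end are names made up for this sketch.

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

static unsigned zero_byte_mask(uint64_t x)	/* emulates cmpbge $31, x */
{
	unsigned m = 0;
	for (int i = 0; i < 8; i++)
		if (((x >> (8 * i)) & 0xff) == 0)
			m |= 1u << i;
	return m;
}

static size_t find_end(const char *s)		/* word-at-a-time scan */
{
	const char *p = s;
	uint64_t q;
	unsigned m;

	/* the real code masks the bytes before s in the first quadword;
	   the sketch simply assumes s is 8-byte aligned */
	for (;; p += 8) {
		memcpy(&q, p, 8);
		m = zero_byte_mask(q);
		if (m)
			break;
	}
	return (size_t)(p - s) + (size_t)__builtin_ctz(m);	/* cttz */
}

int main(void)
{
	static const char buf[16] __attribute__((aligned(8))) = "hello, ev67";
	printf("%zu\n", find_end(buf));		/* prints 11 */
	return 0;
}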


@@ -0,0 +1,88 @@
/*
* arch/alpha/lib/ev67-strchr.S
* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
*
* Return the address of a given character within a null-terminated
* string, or null if it is not found.
*
* Much of the information about 21264 scheduling/coding comes from:
* Compiler Writer's Guide for the Alpha 21264
* abbreviated as 'CWG' in other comments here
* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
* Scheduling notation:
* E - either cluster
* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
* Try not to change the actual algorithm if possible for consistency.
*/
#include <asm/regdef.h>
.set noreorder
.set noat
.align 4
.globl strchr
.ent strchr
strchr:
.frame sp, 0, ra
.prologue 0
ldq_u t0, 0(a0) # L : load first quadword Latency=3
and a1, 0xff, t3 # E : 00000000000000ch
insbl a1, 1, t5 # U : 000000000000ch00
insbl a1, 7, a2 # U : ch00000000000000
insbl t3, 6, a3 # U : 00ch000000000000
or t5, t3, a1 # E : 000000000000chch
andnot a0, 7, v0 # E : align our loop pointer
lda t4, -1 # E : build garbage mask
mskqh t4, a0, t4 # U : only want relevant part of first quad
or a2, a3, a2 # E : chch000000000000
inswl a1, 2, t5 # E : 00000000chch0000
inswl a1, 4, a3 # E : 0000chch00000000
or a1, a2, a1 # E : chch00000000chch
or a3, t5, t5 # E : 0000chchchch0000
cmpbge zero, t0, t2 # E : bits set iff byte == zero
cmpbge zero, t4, t4 # E : bits set iff byte is garbage
/* This quad is _very_ serialized. Lots of stalling happens */
or t5, a1, a1 # E : chchchchchchchch
xor t0, a1, t1 # E : make bytes == c zero
cmpbge zero, t1, t3 # E : bits set iff byte == c
or t2, t3, t0 # E : bits set iff char match or zero match
andnot t0, t4, t0 # E : clear garbage bits
cttz t0, a2 # U0 : speculative (in case we get a match)
nop # E :
bne t0, $found # U :
/*
* Yuk. This loop is going to stall like crazy waiting for the
* data to be loaded. Not much can be done about it unless it's
* unrolled multiple times - is that safe to do in kernel space?
* Or would exception handling recovery code do the trick here?
*/
$loop: ldq t0, 8(v0) # L : Latency=3
addq v0, 8, v0 # E :
xor t0, a1, t1 # E :
cmpbge zero, t0, t2 # E : bits set iff byte == 0
cmpbge zero, t1, t3 # E : bits set iff byte == c
or t2, t3, t0 # E :
cttz t3, a2 # U0 : speculative (in case we get a match)
beq t0, $loop # U :
$found: negq t0, t1 # E : clear all but least set bit
and t0, t1, t0 # E :
and t0, t3, t1 # E : bit set iff byte was the char
addq v0, a2, v0 # E : Add in the bit number from above
cmoveq t1, $31, v0 # E : Two mapping slots, latency = 2
nop
nop
ret # L0 :
.end strchr
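
The entry sequence above spreads the search character across all eight byte lanes so that one XOR plus a cmpbge-style zero test finds both character matches and the terminator in the same quadword. A small little-endian C model; the multiply-by-0x0101... replication stands in for the insbl/inswl shuffling, and zero_bytes is an invented helper.

#include <stdint.h>
#include <stdio.h>

static unsigned zero_bytes(uint64_t x)		/* cmpbge zero, x */
{
	unsigned m = 0;
	for (int i = 0; i < 8; i++)
		if (((x >> (8 * i)) & 0xff) == 0)
			m |= 1u << i;
	return m;
}

int main(void)
{
	/* bytes, low to high: 0 's' 'e' 'a' 'r' 'c' 'h' 0 */
	uint64_t quad = 0x0068637261657300ULL;
	unsigned char c = 'r';

	uint64_t rep = c * 0x0101010101010101ULL;	/* chchchchchchchch */
	unsigned zero_hit = zero_bytes(quad);		/* terminator bytes */
	unsigned char_hit = zero_bytes(quad ^ rep);	/* bytes equal to c */

	printf("zero mask 0x%02x, char mask 0x%02x\n", zero_hit, char_hit);
	/* prints: zero mask 0x81, char mask 0x10 */
	return 0;
}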


@@ -0,0 +1,49 @@
/*
* arch/alpha/lib/ev67-strlen.S
* 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
*
* Finds length of a 0-terminated string. Optimized for the
* Alpha architecture:
*
* - memory accessed as aligned quadwords only
 * - uses cmpbge to compare 8 bytes in parallel
*
* Much of the information about 21264 scheduling/coding comes from:
* Compiler Writer's Guide for the Alpha 21264
* abbreviated as 'CWG' in other comments here
* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
* Scheduling notation:
* E - either cluster
* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
*/
.set noreorder
.set noat
.globl strlen
.ent strlen
.align 4
strlen:
ldq_u $1, 0($16) # L : load first quadword ($16 may be misaligned)
lda $2, -1($31) # E :
insqh $2, $16, $2 # U :
andnot $16, 7, $0 # E :
or $2, $1, $1 # E :
cmpbge $31, $1, $2 # E : $2 <- bitmask: bit i == 1 <==> i-th byte == 0
nop # E :
bne $2, $found # U :
$loop: ldq $1, 8($0) # L :
addq $0, 8, $0 # E : addr += 8
cmpbge $31, $1, $2 # E :
beq $2, $loop # U :
$found:
cttz $2, $3 # U0 :
addq $0, $3, $0 # E :
subq $0, $16, $0 # E :
ret $31, ($26) # L0 :
.end strlen


@@ -0,0 +1,107 @@
/*
* arch/alpha/lib/ev67-strlen_user.S
* 21264 version contributed by Rick Gorton <rick.gorton@api-networks.com>
*
 * Return the length of the string including the NUL terminator
* (strlen+1) or zero if an error occurred.
*
* In places where it is critical to limit the processing time,
* and the data is not trusted, strnlen_user() should be used.
* It will return a value greater than its second argument if
* that limit would be exceeded. This implementation is allowed
* to access memory beyond the limit, but will not cross a page
* boundary when doing so.
*
* Much of the information about 21264 scheduling/coding comes from:
* Compiler Writer's Guide for the Alpha 21264
* abbreviated as 'CWG' in other comments here
* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
* Scheduling notation:
* E - either cluster
* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
* Try not to change the actual algorithm if possible for consistency.
*/
#include <asm/regdef.h>
/* Allow an exception for an insn; exit if we get one. */
#define EX(x,y...) \
99: x,##y; \
.section __ex_table,"a"; \
.long 99b - .; \
lda v0, $exception-99b(zero); \
.previous
.set noreorder
.set noat
.text
.globl __strlen_user
.ent __strlen_user
.frame sp, 0, ra
.align 4
__strlen_user:
ldah a1, 32767(zero) # do not use plain strlen_user() for strings
# that might be almost 2 GB long; you should
# be using strnlen_user() instead
nop
nop
nop
.globl __strnlen_user
.align 4
__strnlen_user:
.prologue 0
EX( ldq_u t0, 0(a0) ) # L : load first quadword (a0 may be misaligned)
lda t1, -1(zero) # E :
insqh t1, a0, t1 # U :
andnot a0, 7, v0 # E :
or t1, t0, t0 # E :
subq a0, 1, a0 # E : get our +1 for the return
cmpbge zero, t0, t1 # E : t1 <- bitmask: bit i == 1 <==> i-th byte == 0
subq a1, 7, t2 # E :
subq a0, v0, t0 # E :
bne t1, $found # U :
addq t2, t0, t2 # E :
addq a1, 1, a1 # E :
nop # E :
nop # E :
.align 4
$loop: ble t2, $limit # U :
EX( ldq t0, 8(v0) ) # L :
nop # E :
nop # E :
cmpbge zero, t0, t1 # E :
subq t2, 8, t2 # E :
addq v0, 8, v0 # E : addr += 8
beq t1, $loop # U :
$found: cttz t1, t2 # U0 :
addq v0, t2, v0 # E :
subq v0, a0, v0 # E :
ret # L0 :
$exception:
nop
nop
nop
ret
.align 4 # currently redundant
$limit:
nop
nop
subq a1, t2, v0
ret
.end __strlen_user


@@ -0,0 +1,94 @@
/*
* arch/alpha/lib/ev67-strncat.S
* 21264 version contributed by Rick Gorton <rick.gorton@api-networks.com>
*
* Append no more than COUNT characters from the null-terminated string SRC
* to the null-terminated string DST. Always null-terminate the new DST.
*
* This differs slightly from the semantics in libc in that we never write
* past count, whereas libc may write to count+1. This follows the generic
* implementation in lib/string.c and is, IMHO, more sensible.
*
* Much of the information about 21264 scheduling/coding comes from:
* Compiler Writer's Guide for the Alpha 21264
* abbreviated as 'CWG' in other comments here
* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
* Scheduling notation:
* E - either cluster
* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
* Try not to change the actual algorithm if possible for consistency.
*/
.text
.align 4
.globl strncat
.ent strncat
strncat:
.frame $30, 0, $26
.prologue 0
mov $16, $0 # set up return value
beq $18, $zerocount # U :
/* Find the end of the string. */
ldq_u $1, 0($16) # L : load first quadword ($16 may be misaligned)
lda $2, -1($31) # E :
insqh $2, $0, $2 # U :
andnot $16, 7, $16 # E :
nop # E :
or $2, $1, $1 # E :
nop # E :
nop # E :
cmpbge $31, $1, $2 # E : bits set iff byte == 0
bne $2, $found # U :
$loop: ldq $1, 8($16) # L :
addq $16, 8, $16 # E :
cmpbge $31, $1, $2 # E :
beq $2, $loop # U :
$found: cttz $2, $3 # U0 :
addq $16, $3, $16 # E :
nop # E :
	/* Now do the append. */
	bsr	$23, __stxncpy	# L0 :
/* Worry about the null termination. */
zapnot $1, $27, $2 # U : was last byte a null?
cmplt $27, $24, $5 # E : did we fill the buffer completely?
bne $2, 0f # U :
ret # L0 :
0: or $5, $18, $2 # E :
nop
bne $2, 2f # U :
and $24, 0x80, $3 # E : no zero next byte
nop # E :
bne $3, 1f # U :
/* Here there are bytes left in the current word. Clear one. */
addq $24, $24, $24 # E : end-of-count bit <<= 1
nop # E :
2: zap $1, $24, $1 # U :
nop # E :
stq_u $1, 0($16) # L :
ret # L0 :
1: /* Here we must clear the first byte of the next DST word */
stb $31, 8($16) # L :
nop # E :
nop # E :
ret # L0 :
$zerocount:
nop # E :
nop # E :
nop # E :
ret # L0 :
.end strncat


@@ -0,0 +1,109 @@
/*
* arch/alpha/lib/ev67-strrchr.S
* 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
*
 * Return the address of the last occurrence of a given character
 * within a null-terminated string, or null if it is not found.
 * Optimized for the Alpha architecture:
 *
 * - memory accessed as aligned quadwords only
 * - uses cmpbge to compare 8 bytes in parallel
*
* Much of the information about 21264 scheduling/coding comes from:
* Compiler Writer's Guide for the Alpha 21264
* abbreviated as 'CWG' in other comments here
* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
* Scheduling notation:
* E - either cluster
* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
*/
#include <asm/regdef.h>
.set noreorder
.set noat
.align 4
.ent strrchr
.globl strrchr
strrchr:
.frame sp, 0, ra
.prologue 0
and a1, 0xff, t2 # E : 00000000000000ch
insbl a1, 1, t4 # U : 000000000000ch00
insbl a1, 2, t5 # U : 0000000000ch0000
ldq_u t0, 0(a0) # L : load first quadword Latency=3
mov zero, t6 # E : t6 is last match aligned addr
or t2, t4, a1 # E : 000000000000chch
sll t5, 8, t3 # U : 00000000ch000000
mov zero, t8 # E : t8 is last match byte compare mask
andnot a0, 7, v0 # E : align source addr
or t5, t3, t3 # E : 00000000chch0000
sll a1, 32, t2 # U : 0000chch00000000
sll a1, 48, t4 # U : chch000000000000
or t4, a1, a1 # E : chch00000000chch
or t2, t3, t2 # E : 0000chchchch0000
or a1, t2, a1 # E : chchchchchchchch
lda t5, -1 # E : build garbage mask
cmpbge zero, t0, t1 # E : bits set iff byte == zero
mskqh t5, a0, t4 # E : Complete garbage mask
xor t0, a1, t2 # E : make bytes == c zero
cmpbge zero, t4, t4 # E : bits set iff byte is garbage
cmpbge zero, t2, t3 # E : bits set iff byte == c
andnot t1, t4, t1 # E : clear garbage from null test
andnot t3, t4, t3 # E : clear garbage from char test
bne t1, $eos # U : did we already hit the terminator?
/* Character search main loop */
$loop:
ldq t0, 8(v0) # L : load next quadword
cmovne t3, v0, t6 # E : save previous comparisons match
nop # : Latency=2, extra map slot (keep nop with cmov)
nop
cmovne t3, t3, t8 # E : Latency=2, extra map slot
nop # : keep with cmovne
addq v0, 8, v0 # E :
xor t0, a1, t2 # E :
cmpbge zero, t0, t1 # E : bits set iff byte == zero
cmpbge zero, t2, t3 # E : bits set iff byte == c
	beq	t1, $loop	# U : if we haven't seen a null, loop
nop
/* Mask out character matches after terminator */
$eos:
negq t1, t4 # E : isolate first null byte match
and t1, t4, t4 # E :
	subq	t4, 1, t5	# E : build a mask of the bytes up to...
or t4, t5, t4 # E : ... and including the null
and t3, t4, t3 # E : mask out char matches after null
cmovne t3, t3, t8 # E : save it, if match found Latency=2, extra map slot
nop # : Keep with cmovne
nop
cmovne t3, v0, t6 # E :
nop # : Keep with cmovne
/* Locate the address of the last matched character */
ctlz t8, t2 # U0 : Latency=3 (0x40 for t8=0)
nop
cmoveq t8, 0x3f, t2 # E : Compensate for case when no match is seen
nop # E : hide the cmov latency (2) behind ctlz latency
lda t5, 0x3f($31) # E :
subq t5, t2, t5 # E : Normalize leading zero count
addq t6, t5, v0 # E : and add to quadword address
ret # L0 : Latency=3
nop
nop
.end strrchr
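
The tail above turns the accumulated match mask into a byte offset with ctlz: the highest set bit marks the last occurrence within the quadword where a match was seen. A sketch of just that arithmetic, assuming GCC's __builtin_clzll in place of ctlz and that at least one match was found (the real code covers the no-match case with cmoveq); the variable names and offsets are illustrative.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* suppose the last quadword containing a match sat at aligned
	   offset 24 within the string, and bytes 1 and 6 of it matched */
	uint64_t last_match_quad_off = 24;
	uint64_t match_mask = (1u << 1) | (1u << 6);	/* t8 in the asm */

	/* 63 - ctlz(mask) == index of the highest set bit */
	unsigned last_byte = 63 - __builtin_clzll(match_mask);

	printf("last match at byte %llu\n",
	       (unsigned long long)(last_match_quad_off + last_byte)); /* 30 */
	return 0;
}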

193
arch/alpha/lib/fpreg.c Normal file

@@ -0,0 +1,193 @@
/*
* arch/alpha/lib/fpreg.c
*
* (C) Copyright 1998 Linus Torvalds
*/
#if defined(__alpha_cix__) || defined(__alpha_fix__)
#define STT(reg,val) asm volatile ("ftoit $f"#reg",%0" : "=r"(val));
#else
#define STT(reg,val) asm volatile ("stt $f"#reg",%0" : "=m"(val));
#endif
unsigned long
alpha_read_fp_reg (unsigned long reg)
{
unsigned long val;
switch (reg) {
case 0: STT( 0, val); break;
case 1: STT( 1, val); break;
case 2: STT( 2, val); break;
case 3: STT( 3, val); break;
case 4: STT( 4, val); break;
case 5: STT( 5, val); break;
case 6: STT( 6, val); break;
case 7: STT( 7, val); break;
case 8: STT( 8, val); break;
case 9: STT( 9, val); break;
case 10: STT(10, val); break;
case 11: STT(11, val); break;
case 12: STT(12, val); break;
case 13: STT(13, val); break;
case 14: STT(14, val); break;
case 15: STT(15, val); break;
case 16: STT(16, val); break;
case 17: STT(17, val); break;
case 18: STT(18, val); break;
case 19: STT(19, val); break;
case 20: STT(20, val); break;
case 21: STT(21, val); break;
case 22: STT(22, val); break;
case 23: STT(23, val); break;
case 24: STT(24, val); break;
case 25: STT(25, val); break;
case 26: STT(26, val); break;
case 27: STT(27, val); break;
case 28: STT(28, val); break;
case 29: STT(29, val); break;
case 30: STT(30, val); break;
case 31: STT(31, val); break;
default: return 0;
}
return val;
}
#if defined(__alpha_cix__) || defined(__alpha_fix__)
#define LDT(reg,val) asm volatile ("itoft %0,$f"#reg : : "r"(val));
#else
#define LDT(reg,val) asm volatile ("ldt $f"#reg",%0" : : "m"(val));
#endif
void
alpha_write_fp_reg (unsigned long reg, unsigned long val)
{
switch (reg) {
case 0: LDT( 0, val); break;
case 1: LDT( 1, val); break;
case 2: LDT( 2, val); break;
case 3: LDT( 3, val); break;
case 4: LDT( 4, val); break;
case 5: LDT( 5, val); break;
case 6: LDT( 6, val); break;
case 7: LDT( 7, val); break;
case 8: LDT( 8, val); break;
case 9: LDT( 9, val); break;
case 10: LDT(10, val); break;
case 11: LDT(11, val); break;
case 12: LDT(12, val); break;
case 13: LDT(13, val); break;
case 14: LDT(14, val); break;
case 15: LDT(15, val); break;
case 16: LDT(16, val); break;
case 17: LDT(17, val); break;
case 18: LDT(18, val); break;
case 19: LDT(19, val); break;
case 20: LDT(20, val); break;
case 21: LDT(21, val); break;
case 22: LDT(22, val); break;
case 23: LDT(23, val); break;
case 24: LDT(24, val); break;
case 25: LDT(25, val); break;
case 26: LDT(26, val); break;
case 27: LDT(27, val); break;
case 28: LDT(28, val); break;
case 29: LDT(29, val); break;
case 30: LDT(30, val); break;
case 31: LDT(31, val); break;
}
}
#if defined(__alpha_cix__) || defined(__alpha_fix__)
#define STS(reg,val) asm volatile ("ftois $f"#reg",%0" : "=r"(val));
#else
#define STS(reg,val) asm volatile ("sts $f"#reg",%0" : "=m"(val));
#endif
unsigned long
alpha_read_fp_reg_s (unsigned long reg)
{
unsigned long val;
switch (reg) {
case 0: STS( 0, val); break;
case 1: STS( 1, val); break;
case 2: STS( 2, val); break;
case 3: STS( 3, val); break;
case 4: STS( 4, val); break;
case 5: STS( 5, val); break;
case 6: STS( 6, val); break;
case 7: STS( 7, val); break;
case 8: STS( 8, val); break;
case 9: STS( 9, val); break;
case 10: STS(10, val); break;
case 11: STS(11, val); break;
case 12: STS(12, val); break;
case 13: STS(13, val); break;
case 14: STS(14, val); break;
case 15: STS(15, val); break;
case 16: STS(16, val); break;
case 17: STS(17, val); break;
case 18: STS(18, val); break;
case 19: STS(19, val); break;
case 20: STS(20, val); break;
case 21: STS(21, val); break;
case 22: STS(22, val); break;
case 23: STS(23, val); break;
case 24: STS(24, val); break;
case 25: STS(25, val); break;
case 26: STS(26, val); break;
case 27: STS(27, val); break;
case 28: STS(28, val); break;
case 29: STS(29, val); break;
case 30: STS(30, val); break;
case 31: STS(31, val); break;
default: return 0;
}
return val;
}
#if defined(__alpha_cix__) || defined(__alpha_fix__)
#define LDS(reg,val) asm volatile ("itofs %0,$f"#reg : : "r"(val));
#else
#define LDS(reg,val) asm volatile ("lds $f"#reg",%0" : : "m"(val));
#endif
void
alpha_write_fp_reg_s (unsigned long reg, unsigned long val)
{
switch (reg) {
case 0: LDS( 0, val); break;
case 1: LDS( 1, val); break;
case 2: LDS( 2, val); break;
case 3: LDS( 3, val); break;
case 4: LDS( 4, val); break;
case 5: LDS( 5, val); break;
case 6: LDS( 6, val); break;
case 7: LDS( 7, val); break;
case 8: LDS( 8, val); break;
case 9: LDS( 9, val); break;
case 10: LDS(10, val); break;
case 11: LDS(11, val); break;
case 12: LDS(12, val); break;
case 13: LDS(13, val); break;
case 14: LDS(14, val); break;
case 15: LDS(15, val); break;
case 16: LDS(16, val); break;
case 17: LDS(17, val); break;
case 18: LDS(18, val); break;
case 19: LDS(19, val); break;
case 20: LDS(20, val); break;
case 21: LDS(21, val); break;
case 22: LDS(22, val); break;
case 23: LDS(23, val); break;
case 24: LDS(24, val); break;
case 25: LDS(25, val); break;
case 26: LDS(26, val); break;
case 27: LDS(27, val); break;
case 28: LDS(28, val); break;
case 29: LDS(29, val); break;
case 30: LDS(30, val); break;
case 31: LDS(31, val); break;
}
}

164
arch/alpha/lib/memchr.S Normal file

@@ -0,0 +1,164 @@
/* Copyright (C) 1996 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by David Mosberger (davidm@cs.arizona.edu).
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Finds characters in a memory area. Optimized for the Alpha:
- memory accessed as aligned quadwords only
- uses cmpbge to compare 8 bytes in parallel
- does binary search to find 0 byte in last
quadword (HAKMEM needed 12 instructions to
do this instead of the 9 instructions that
binary search needs).
For correctness consider that:
- only minimum number of quadwords may be accessed
- the third argument is an unsigned long
*/
.set noreorder
.set noat
.globl memchr
.ent memchr
memchr:
.frame $30,0,$26,0
.prologue 0
# Hack -- if someone passes in (size_t)-1, hoping to just
# search til the end of the address space, we will overflow
# below when we find the address of the last byte. Given
# that we will never have a 56-bit address space, cropping
# the length is the easiest way to avoid trouble.
zap $18, 0x80, $5 #-e0 :
beq $18, $not_found # .. e1 :
ldq_u $1, 0($16) # e1 : load first quadword
insbl $17, 1, $2 # .. e0 : $2 = 000000000000ch00
and $17, 0xff, $17 #-e0 : $17 = 00000000000000ch
cmpult $18, 9, $4 # .. e1 :
or $2, $17, $17 # e0 : $17 = 000000000000chch
lda $3, -1($31) # .. e1 :
sll $17, 16, $2 #-e0 : $2 = 00000000chch0000
addq $16, $5, $5 # .. e1 :
or $2, $17, $17 # e1 : $17 = 00000000chchchch
unop # :
sll $17, 32, $2 #-e0 : $2 = chchchch00000000
or $2, $17, $17 # e1 : $17 = chchchchchchchch
extql $1, $16, $7 # e0 :
beq $4, $first_quad # .. e1 :
ldq_u $6, -1($5) #-e1 : eight or less bytes to search
extqh $6, $16, $6 # .. e0 :
mov $16, $0 # e0 :
or $7, $6, $1 # .. e1 : $1 = quadword starting at $16
# Deal with the case where at most 8 bytes remain to be searched
# in $1. E.g.:
# $18 = 6
# $1 = ????c6c5c4c3c2c1
$last_quad:
negq $18, $6 #-e0 :
xor $17, $1, $1 # .. e1 :
srl $3, $6, $6 # e0 : $6 = mask of $18 bits set
cmpbge $31, $1, $2 # .. e1 :
and $2, $6, $2 #-e0 :
beq $2, $not_found # .. e1 :
$found_it:
# Now, determine which byte matched:
negq $2, $3 # e0 :
and $2, $3, $2 # e1 :
and $2, 0x0f, $1 #-e0 :
addq $0, 4, $3 # .. e1 :
cmoveq $1, $3, $0 # e0 :
addq $0, 2, $3 # .. e1 :
and $2, 0x33, $1 #-e0 :
cmoveq $1, $3, $0 # .. e1 :
and $2, 0x55, $1 # e0 :
addq $0, 1, $3 # .. e1 :
cmoveq $1, $3, $0 #-e0 :
$done: ret # .. e1 :
# Deal with the case where $18 > 8 bytes remain to be
# searched. $16 may not be aligned.
.align 4
$first_quad:
andnot $16, 0x7, $0 #-e1 :
insqh $3, $16, $2 # .. e0 : $2 = 0000ffffffffffff ($16<0:2> ff)
xor $1, $17, $1 # e0 :
or $1, $2, $1 # e1 : $1 = ====ffffffffffff
cmpbge $31, $1, $2 #-e0 :
bne $2, $found_it # .. e1 :
# At least one byte left to process.
ldq $1, 8($0) # e0 :
subq $5, 1, $18 # .. e1 :
addq $0, 8, $0 #-e0 :
# Make $18 point to last quad to be accessed (the
# last quad may or may not be partial).
andnot $18, 0x7, $18 # .. e1 :
cmpult $0, $18, $2 # e0 :
beq $2, $final # .. e1 :
# At least two quads remain to be accessed.
subq $18, $0, $4 #-e0 : $4 <- nr quads to be processed
and $4, 8, $4 # e1 : odd number of quads?
bne $4, $odd_quad_count # e1 :
# At least three quads remain to be accessed
mov $1, $4 # e0 : move prefetched value to correct reg
.align 4
$unrolled_loop:
ldq $1, 8($0) #-e0 : prefetch $1
xor $17, $4, $2 # .. e1 :
cmpbge $31, $2, $2 # e0 :
bne $2, $found_it # .. e1 :
addq $0, 8, $0 #-e0 :
$odd_quad_count:
xor $17, $1, $2 # .. e1 :
ldq $4, 8($0) # e0 : prefetch $4
cmpbge $31, $2, $2 # .. e1 :
addq $0, 8, $6 #-e0 :
bne $2, $found_it # .. e1 :
cmpult $6, $18, $6 # e0 :
addq $0, 8, $0 # .. e1 :
bne $6, $unrolled_loop #-e1 :
mov $4, $1 # e0 : move prefetched value into $1
$final: subq $5, $0, $18 # .. e1 : $18 <- number of bytes left to do
bne $18, $last_quad # e1 :
$not_found:
mov $31, $0 #-e0 :
ret # .. e1 :
.end memchr
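
The $found_it sequence above recovers the byte index from the cmpbge result: isolate the lowest set bit, then three mask-and-conditionally-add steps perform the "binary search" the header comment credits with beating the HAKMEM sequence. A C restatement of just that arithmetic; bit_index is a name made up for this sketch.

#include <stdio.h>

static unsigned bit_index(unsigned mask)	/* mask has exactly one bit set */
{
	unsigned idx = 0;
	if (!(mask & 0x0f)) idx += 4;		/* not in the low nibble */
	if (!(mask & 0x33)) idx += 2;		/* not in bits 0,1,4,5   */
	if (!(mask & 0x55)) idx += 1;		/* not in an even bit    */
	return idx;
}

int main(void)
{
	unsigned cmpbge_mask = 0x60;			/* bytes 5 and 6 matched */
	unsigned lowest = cmpbge_mask & -cmpbge_mask;	/* negq/and gives 0x20   */
	printf("first matching byte: %u\n", bit_index(lowest));	/* 5 */
	return 0;
}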

163
arch/alpha/lib/memcpy.c Normal file

@@ -0,0 +1,163 @@
/*
* linux/arch/alpha/lib/memcpy.c
*
* Copyright (C) 1995 Linus Torvalds
*/
/*
* This is a reasonably optimized memcpy() routine.
*/
/*
* Note that the C code is written to be optimized into good assembly. However,
 * at this point gcc is unable to sanely compile "if (n >= 0)", resulting in an
* explicit compare against 0 (instead of just using the proper "blt reg, xx" or
* "bge reg, xx"). I hope alpha-gcc will be fixed to notice this eventually..
*/
#include <linux/types.h>
/*
* This should be done in one go with ldq_u*2/mask/stq_u. Do it
* with a macro so that we can fix it up later..
*/
#define ALIGN_DEST_TO8_UP(d,s,n) \
while (d & 7) { \
if (n <= 0) return; \
n--; \
*(char *) d = *(char *) s; \
d++; s++; \
}
#define ALIGN_DEST_TO8_DN(d,s,n) \
while (d & 7) { \
if (n <= 0) return; \
n--; \
d--; s--; \
*(char *) d = *(char *) s; \
}
/*
* This should similarly be done with ldq_u*2/mask/stq. The destination
* is aligned, but we don't fill in a full quad-word
*/
#define DO_REST_UP(d,s,n) \
while (n > 0) { \
n--; \
*(char *) d = *(char *) s; \
d++; s++; \
}
#define DO_REST_DN(d,s,n) \
while (n > 0) { \
n--; \
d--; s--; \
*(char *) d = *(char *) s; \
}
/*
* This should be done with ldq/mask/stq. The source and destination are
* aligned, but we don't fill in a full quad-word
*/
#define DO_REST_ALIGNED_UP(d,s,n) DO_REST_UP(d,s,n)
#define DO_REST_ALIGNED_DN(d,s,n) DO_REST_DN(d,s,n)
/*
* This does unaligned memory copies. We want to avoid storing to
* an unaligned address, as that would do a read-modify-write cycle.
* We also want to avoid double-reading the unaligned reads.
*
* Note the ordering to try to avoid load (and address generation) latencies.
*/
static inline void __memcpy_unaligned_up (unsigned long d, unsigned long s,
long n)
{
ALIGN_DEST_TO8_UP(d,s,n);
n -= 8; /* to avoid compare against 8 in the loop */
if (n >= 0) {
unsigned long low_word, high_word;
__asm__("ldq_u %0,%1":"=r" (low_word):"m" (*(unsigned long *) s));
do {
unsigned long tmp;
__asm__("ldq_u %0,%1":"=r" (high_word):"m" (*(unsigned long *)(s+8)));
n -= 8;
__asm__("extql %1,%2,%0"
:"=r" (low_word)
:"r" (low_word), "r" (s));
__asm__("extqh %1,%2,%0"
:"=r" (tmp)
:"r" (high_word), "r" (s));
s += 8;
*(unsigned long *) d = low_word | tmp;
d += 8;
low_word = high_word;
} while (n >= 0);
}
n += 8;
DO_REST_UP(d,s,n);
}
static inline void __memcpy_unaligned_dn (unsigned long d, unsigned long s,
long n)
{
/* I don't understand AXP assembler well enough for this. -Tim */
s += n;
d += n;
while (n--)
* (char *) --d = * (char *) --s;
}
/*
* Hmm.. Strange. The __asm__ here is there to make gcc use an integer register
* for the load-store. I don't know why, but it would seem that using a floating
* point register for the move seems to slow things down (very small difference,
* though).
*
* Note the ordering to try to avoid load (and address generation) latencies.
*/
static inline void __memcpy_aligned_up (unsigned long d, unsigned long s,
long n)
{
ALIGN_DEST_TO8_UP(d,s,n);
n -= 8;
while (n >= 0) {
unsigned long tmp;
__asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
n -= 8;
s += 8;
*(unsigned long *) d = tmp;
d += 8;
}
n += 8;
DO_REST_ALIGNED_UP(d,s,n);
}
static inline void __memcpy_aligned_dn (unsigned long d, unsigned long s,
long n)
{
s += n;
d += n;
ALIGN_DEST_TO8_DN(d,s,n);
n -= 8;
while (n >= 0) {
unsigned long tmp;
s -= 8;
__asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
n -= 8;
d -= 8;
*(unsigned long *) d = tmp;
}
n += 8;
DO_REST_ALIGNED_DN(d,s,n);
}
void * memcpy(void * dest, const void *src, size_t n)
{
if (!(((unsigned long) dest ^ (unsigned long) src) & 7)) {
__memcpy_aligned_up ((unsigned long) dest, (unsigned long) src,
n);
return dest;
}
__memcpy_unaligned_up ((unsigned long) dest, (unsigned long) src, n);
return dest;
}
/* For backward modules compatibility, define __memcpy. */
asm("__memcpy = memcpy; .globl __memcpy");

181
arch/alpha/lib/memmove.S Normal file

@@ -0,0 +1,181 @@
/*
* arch/alpha/lib/memmove.S
*
* Barely optimized memmove routine for Alpha EV5.
*
* This is hand-massaged output from the original memcpy.c. We defer to
* memcpy whenever possible; the backwards copy loops are not unrolled.
*/
.set noat
.set noreorder
.text
.align 4
.globl memmove
.ent memmove
memmove:
ldgp $29, 0($27)
unop
nop
.prologue 1
addq $16,$18,$4
addq $17,$18,$5
cmpule $4,$17,$1 /* dest + n <= src */
cmpule $5,$16,$2 /* dest >= src + n */
bis $1,$2,$1
mov $16,$0
xor $16,$17,$2
bne $1,memcpy !samegp
and $2,7,$2 /* Test for src/dest co-alignment. */
and $16,7,$1
cmpule $16,$17,$3
bne $3,$memmove_up /* dest < src */
and $4,7,$1
bne $2,$misaligned_dn
unop
beq $1,$skip_aligned_byte_loop_head_dn
$aligned_byte_loop_head_dn:
lda $4,-1($4)
lda $5,-1($5)
unop
ble $18,$egress
ldq_u $3,0($5)
ldq_u $2,0($4)
lda $18,-1($18)
extbl $3,$5,$1
insbl $1,$4,$1
mskbl $2,$4,$2
bis $1,$2,$1
and $4,7,$6
stq_u $1,0($4)
bne $6,$aligned_byte_loop_head_dn
$skip_aligned_byte_loop_head_dn:
lda $18,-8($18)
blt $18,$skip_aligned_word_loop_dn
$aligned_word_loop_dn:
ldq $1,-8($5)
nop
lda $5,-8($5)
lda $18,-8($18)
stq $1,-8($4)
nop
lda $4,-8($4)
bge $18,$aligned_word_loop_dn
$skip_aligned_word_loop_dn:
lda $18,8($18)
bgt $18,$byte_loop_tail_dn
unop
ret $31,($26),1
.align 4
$misaligned_dn:
nop
fnop
unop
beq $18,$egress
$byte_loop_tail_dn:
ldq_u $3,-1($5)
ldq_u $2,-1($4)
lda $5,-1($5)
lda $4,-1($4)
lda $18,-1($18)
extbl $3,$5,$1
insbl $1,$4,$1
mskbl $2,$4,$2
bis $1,$2,$1
stq_u $1,0($4)
bgt $18,$byte_loop_tail_dn
br $egress
$memmove_up:
mov $16,$4
mov $17,$5
bne $2,$misaligned_up
beq $1,$skip_aligned_byte_loop_head_up
$aligned_byte_loop_head_up:
unop
ble $18,$egress
ldq_u $3,0($5)
ldq_u $2,0($4)
lda $18,-1($18)
extbl $3,$5,$1
insbl $1,$4,$1
mskbl $2,$4,$2
bis $1,$2,$1
lda $5,1($5)
stq_u $1,0($4)
lda $4,1($4)
and $4,7,$6
bne $6,$aligned_byte_loop_head_up
$skip_aligned_byte_loop_head_up:
lda $18,-8($18)
blt $18,$skip_aligned_word_loop_up
$aligned_word_loop_up:
ldq $1,0($5)
nop
lda $5,8($5)
lda $18,-8($18)
stq $1,0($4)
nop
lda $4,8($4)
bge $18,$aligned_word_loop_up
$skip_aligned_word_loop_up:
lda $18,8($18)
bgt $18,$byte_loop_tail_up
unop
ret $31,($26),1
.align 4
$misaligned_up:
nop
fnop
unop
beq $18,$egress
$byte_loop_tail_up:
ldq_u $3,0($5)
ldq_u $2,0($4)
lda $18,-1($18)
extbl $3,$5,$1
insbl $1,$4,$1
mskbl $2,$4,$2
bis $1,$2,$1
stq_u $1,0($4)
lda $5,1($5)
lda $4,1($4)
nop
bgt $18,$byte_loop_tail_up
$egress:
ret $31,($26),1
nop
nop
nop
.end memmove
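
The dispatch at the top of memmove above branches to memcpy only when the regions cannot overlap, copies forward when the destination sits below the source, and only otherwise falls back to the backward byte/word loops. A compact C restatement of that decision; the pointer comparisons mirror the assembly's address arithmetic, and move is a name invented for this sketch.

#include <stddef.h>
#include <string.h>
#include <stdio.h>

static void *move(void *dst, const void *src, size_t n)
{
	char *d = dst;
	const char *s = src;

	if (d + n <= s || s + n <= d) {
		memcpy(d, s, n);		/* disjoint: defer to memcpy */
	} else if (d < s) {
		for (size_t i = 0; i < n; i++)	/* overlap, dest below src:  */
			d[i] = s[i];		/* copy forward              */
	} else {
		while (n--)			/* overlap, dest above src:  */
			d[n] = s[n];		/* copy backward             */
	}
	return dst;
}

int main(void)
{
	char buf[16] = "abcdefgh";
	move(buf + 2, buf, 6);			/* overlapping, dest above src */
	printf("%s\n", buf);			/* prints "ababcdef"           */
	return 0;
}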

124
arch/alpha/lib/memset.S Normal file

@@ -0,0 +1,124 @@
/*
 * linux/arch/alpha/lib/memset.S
*
* This is an efficient (and small) implementation of the C library "memset()"
* function for the alpha.
*
* (C) Copyright 1996 Linus Torvalds
*
* This routine is "moral-ware": you are free to use it any way you wish, and
* the only obligation I put on you is a moral one: if you make any improvements
* to the routine, please send me your improvements for me to use similarly.
*
* The scheduling comments are according to the EV5 documentation (and done by
* hand, so they might well be incorrect, please do tell me about it..)
*/
.set noat
.set noreorder
.text
.globl memset
.globl __memset
.globl __memsetw
.globl __constant_c_memset
.ent __memset
.align 5
__memset:
.frame $30,0,$26,0
.prologue 0
and $17,255,$1 /* E1 */
insbl $17,1,$17 /* .. E0 */
bis $17,$1,$17 /* E0 (p-c latency, next cycle) */
sll $17,16,$1 /* E1 (p-c latency, next cycle) */
bis $17,$1,$17 /* E0 (p-c latency, next cycle) */
sll $17,32,$1 /* E1 (p-c latency, next cycle) */
bis $17,$1,$17 /* E0 (p-c latency, next cycle) */
ldq_u $31,0($30) /* .. E1 */
.align 5
__constant_c_memset:
addq $18,$16,$6 /* E0 */
bis $16,$16,$0 /* .. E1 */
xor $16,$6,$1 /* E0 */
ble $18,end /* .. E1 */
bic $1,7,$1 /* E0 */
beq $1,within_one_quad /* .. E1 (note EV5 zero-latency forwarding) */
and $16,7,$3 /* E0 */
beq $3,aligned /* .. E1 (note EV5 zero-latency forwarding) */
ldq_u $4,0($16) /* E0 */
bis $16,$16,$5 /* .. E1 */
insql $17,$16,$2 /* E0 */
subq $3,8,$3 /* .. E1 */
addq $18,$3,$18 /* E0 $18 is new count ($3 is negative) */
mskql $4,$16,$4 /* .. E1 (and possible load stall) */
subq $16,$3,$16 /* E0 $16 is new aligned destination */
bis $2,$4,$1 /* .. E1 */
bis $31,$31,$31 /* E0 */
ldq_u $31,0($30) /* .. E1 */
stq_u $1,0($5) /* E0 */
bis $31,$31,$31 /* .. E1 */
.align 4
aligned:
sra $18,3,$3 /* E0 */
and $18,7,$18 /* .. E1 */
bis $16,$16,$5 /* E0 */
beq $3,no_quad /* .. E1 */
.align 3
loop:
stq $17,0($5) /* E0 */
subq $3,1,$3 /* .. E1 */
addq $5,8,$5 /* E0 */
bne $3,loop /* .. E1 */
no_quad:
bis $31,$31,$31 /* E0 */
beq $18,end /* .. E1 */
ldq $7,0($5) /* E0 */
mskqh $7,$6,$2 /* .. E1 (and load stall) */
insqh $17,$6,$4 /* E0 */
bis $2,$4,$1 /* .. E1 */
stq $1,0($5) /* E0 */
ret $31,($26),1 /* .. E1 */
.align 3
within_one_quad:
ldq_u $1,0($16) /* E0 */
insql $17,$16,$2 /* E1 */
mskql $1,$16,$4 /* E0 (after load stall) */
bis $2,$4,$2 /* E0 */
mskql $2,$6,$4 /* E0 */
mskqh $1,$6,$2 /* .. E1 */
bis $2,$4,$1 /* E0 */
stq_u $1,0($16) /* E0 */
end:
ret $31,($26),1 /* E1 */
.end __memset
.align 5
.ent __memsetw
__memsetw:
.prologue 0
inswl $17,0,$1 /* E0 */
inswl $17,2,$2 /* E0 */
inswl $17,4,$3 /* E0 */
or $1,$2,$1 /* .. E1 */
inswl $17,6,$4 /* E0 */
or $1,$3,$1 /* .. E1 */
or $1,$4,$17 /* E0 */
br __constant_c_memset /* .. E1 */
.end __memsetw
memset = __memset
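
The first eight instructions of __memset above widen the fill byte into a full quadword so the main loop can store eight bytes per iteration. The same replication written out in C:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t c = 0x5a;		/* the fill character arriving in $17   */

	c &= 0xff;			/* and  $17,255,$1 : keep the low byte  */
	c |= c << 8;			/* insbl/bis       : 0x5a5a             */
	c |= c << 16;			/* sll/bis         : 0x5a5a5a5a         */
	c |= c << 32;			/* sll/bis         : full quadword      */

	printf("%016llx\n", (unsigned long long)c);	/* 5a5a5a5a5a5a5a5a */
	return 0;
}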


@@ -0,0 +1,41 @@
/*
* arch/alpha/lib/srm_printk.c
*/
#include <linux/kernel.h>
#include <asm/console.h>
long
srm_printk(const char *fmt, ...)
{
static char buf[1024];
va_list args;
long len, num_lf;
char *src, *dst;
va_start(args, fmt);
len = vsprintf(buf, fmt, args);
va_end(args);
/* count number of linefeeds in string: */
num_lf = 0;
for (src = buf; *src; ++src) {
if (*src == '\n') {
++num_lf;
}
}
if (num_lf) {
/* expand each linefeed into carriage-return/linefeed: */
for (dst = src + num_lf; src >= buf; ) {
if (*src == '\n') {
*dst-- = '\r';
}
*dst-- = *src--;
}
}
srm_puts(buf, num_lf+len);
return len;
}
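
The expansion in srm_printk above works in place by walking backwards from the precomputed new end of the buffer, so no byte is overwritten before it has been read. A standalone model of that backward, in-place expansion technique; the ordering and termination details differ slightly from the routine above, which passes an explicit length to srm_puts instead of relying on the terminator.

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[32] = "one\ntwo\n";
	size_t len = strlen(buf), extra = 0;

	for (size_t i = 0; i < len; i++)	/* count how much the text grows */
		if (buf[i] == '\n')
			extra++;

	/* copy backwards, terminator included, so nothing is overwritten
	   before it has been read; the write index never drops below the
	   read index */
	for (size_t i = len + 1, j = len + extra + 1; i-- > 0; ) {
		buf[--j] = buf[i];
		if (buf[i] == '\n')
			buf[--j] = '\r';
	}

	fwrite(buf, 1, len + extra, stdout);	/* "one\r\ntwo\r\n" */
	return 0;
}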

23
arch/alpha/lib/srm_puts.c Normal file

@@ -0,0 +1,23 @@
/*
* arch/alpha/lib/srm_puts.c
*/
#include <linux/string.h>
#include <asm/console.h>
long
srm_puts(const char *str, long len)
{
long remaining, written;
if (!callback_init_done)
return len;
for (remaining = len; remaining > 0; remaining -= written)
{
written = callback_puts(0, str, remaining);
written &= 0xffffffff;
str += written;
}
return len;
}

103
arch/alpha/lib/stacktrace.c Normal file

@@ -0,0 +1,103 @@
#include <linux/kernel.h>
#include <asm/system.h>
typedef unsigned int instr;
#define MAJOR_OP 0xfc000000
#define LDA_OP 0x20000000
#define STQ_OP 0xb4000000
#define BR_OP 0xc0000000
#define STK_ALLOC_1 0x23de8000 /* lda $30,-X($30) */
#define STK_ALLOC_1M 0xffff8000
#define STK_ALLOC_2 0x43c0153e /* subq $30,X,$30 */
#define STK_ALLOC_2M 0xffe01fff
#define MEM_REG 0x03e00000
#define MEM_BASE 0x001f0000
#define MEM_OFF 0x0000ffff
#define MEM_OFF_SIGN 0x00008000
#define BASE_SP 0x001e0000
#define STK_ALLOC_MATCH(INSTR) \
(((INSTR) & STK_ALLOC_1M) == STK_ALLOC_1 \
|| ((INSTR) & STK_ALLOC_2M) == STK_ALLOC_2)
#define STK_PUSH_MATCH(INSTR) \
(((INSTR) & (MAJOR_OP | MEM_BASE | MEM_OFF_SIGN)) == (STQ_OP | BASE_SP))
#define MEM_OP_OFFSET(INSTR) \
(((long)((INSTR) & MEM_OFF) << 48) >> 48)
#define MEM_OP_REG(INSTR) \
(((INSTR) & MEM_REG) >> 22)
/* Branches, jumps, PAL calls, and illegal opcodes end a basic block. */
#define BB_END(INSTR) \
(((instr)(INSTR) >= BR_OP) | ((instr)(INSTR) < LDA_OP) | \
((((instr)(INSTR) ^ 0x60000000) < 0x20000000) & \
(((instr)(INSTR) & 0x0c000000) != 0)))
#define IS_KERNEL_TEXT(PC) ((unsigned long)(PC) > START_ADDR)
static char reg_name[][4] = {
"v0 ", "t0 ", "t1 ", "t2 ", "t3 ", "t4 ", "t5 ", "t6 ", "t7 ",
"s0 ", "s1 ", "s2 ", "s3 ", "s4 ", "s5 ", "s6 ", "a0 ", "a1 ",
"a2 ", "a3 ", "a4 ", "a5 ", "t8 ", "t9 ", "t10", "t11", "ra ",
"pv ", "at ", "gp ", "sp ", "0"
};
static instr *
display_stored_regs(instr * pro_pc, unsigned char * sp)
{
instr * ret_pc = 0;
int reg;
unsigned long value;
printk("Prologue [<%p>], Frame %p:\n", pro_pc, sp);
while (!BB_END(*pro_pc))
if (STK_PUSH_MATCH(*pro_pc)) {
reg = (*pro_pc & MEM_REG) >> 21;
value = *(unsigned long *)(sp + (*pro_pc & MEM_OFF));
if (reg == 26)
ret_pc = (instr *)value;
printk("\t\t%s / 0x%016lx\n", reg_name[reg], value);
}
return ret_pc;
}
static instr *
seek_prologue(instr * pc)
{
while (!STK_ALLOC_MATCH(*pc))
--pc;
while (!BB_END(*(pc - 1)))
--pc;
return pc;
}
static long
stack_increment(instr * prologue_pc)
{
while (!STK_ALLOC_MATCH(*prologue_pc))
++prologue_pc;
/* Count the bytes allocated. */
if ((*prologue_pc & STK_ALLOC_1M) == STK_ALLOC_1M)
return -(((long)(*prologue_pc) << 48) >> 48);
else
return (*prologue_pc >> 13) & 0xff;
}
void
stacktrace(void)
{
instr * ret_pc;
instr * prologue = (instr *)stacktrace;
register unsigned char * sp __asm__ ("$30");
printk("\tstack trace:\n");
do {
ret_pc = display_stored_regs(prologue, sp);
sp += stack_increment(prologue);
prologue = seek_prologue(ret_pc);
} while (IS_KERNEL_TEXT(ret_pc));
}
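
The matcher macros above recognize the two stack-allocation forms by mask-and-compare on the raw instruction encoding. A quick standalone check, using 0x23defe00 as the encoding of "lda $30,-512($30)" (opcode 0x08, ra = rb = $30, displacement 0xfe00); the example instruction and the sign-extension shortcut are constructed here for illustration.

#include <stdio.h>

#define STK_ALLOC_1	0x23de8000u	/* lda  $30,-X($30) */
#define STK_ALLOC_1M	0xffff8000u
#define STK_ALLOC_2	0x43c0153eu	/* subq $30,X,$30   */
#define STK_ALLOC_2M	0xffe01fffu

#define STK_ALLOC_MATCH(i) \
	((((i) & STK_ALLOC_1M) == STK_ALLOC_1) || \
	 (((i) & STK_ALLOC_2M) == STK_ALLOC_2))

int main(void)
{
	unsigned int insn = 0x23defe00u;	/* lda $30,-512($30) */

	/* the match above guarantees bit 15 of the displacement is set,
	   so subtracting 0x10000 sign-extends the 16-bit field */
	int disp = (int)(insn & 0xffff) - 0x10000;

	printf("match: %d, stack adjustment: %d bytes\n",
	       STK_ALLOC_MATCH(insn), disp);	/* match: 1, adjustment: -512 */
	return 0;
}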


@@ -0,0 +1,26 @@
/*
* linux/arch/alpha/lib/strcasecmp.c
*/
#include <linux/string.h>
/* We handle nothing here except the C locale. Since this is used in
only one place, on strings known to contain only 7 bit ASCII, this
is ok. */
int strcasecmp(const char *a, const char *b)
{
int ca, cb;
do {
ca = *a++ & 0xff;
cb = *b++ & 0xff;
if (ca >= 'A' && ca <= 'Z')
ca += 'a' - 'A';
if (cb >= 'A' && cb <= 'Z')
cb += 'a' - 'A';
} while (ca == cb && ca != '\0');
return ca - cb;
}

52
arch/alpha/lib/strcat.S Normal file

@@ -0,0 +1,52 @@
/*
* arch/alpha/lib/strcat.S
* Contributed by Richard Henderson (rth@tamu.edu)
*
* Append a null-terminated string from SRC to DST.
*/
.text
.align 3
.globl strcat
.ent strcat
strcat:
.frame $30, 0, $26
.prologue 0
mov $16, $0 # set up return value
/* Find the end of the string. */
ldq_u $1, 0($16) # load first quadword (a0 may be misaligned)
lda $2, -1
insqh $2, $16, $2
andnot $16, 7, $16
or $2, $1, $1
cmpbge $31, $1, $2 # bits set iff byte == 0
bne $2, $found
$loop: ldq $1, 8($16)
addq $16, 8, $16
cmpbge $31, $1, $2
beq $2, $loop
$found: negq $2, $3 # clear all but least set bit
and $2, $3, $2
and $2, 0xf0, $3 # binary search for that set bit
and $2, 0xcc, $4
and $2, 0xaa, $5
cmovne $3, 4, $3
cmovne $4, 2, $4
cmovne $5, 1, $5
addq $3, $4, $3
addq $16, $5, $16
addq $16, $3, $16
/* Now do the append. */
mov $26, $23
br __stxcpy
.end strcat

70
arch/alpha/lib/strchr.S Normal file

@@ -0,0 +1,70 @@
/*
* arch/alpha/lib/strchr.S
* Contributed by Richard Henderson (rth@tamu.edu)
*
* Return the address of a given character within a null-terminated
* string, or null if it is not found.
*/
#include <asm/regdef.h>
.set noreorder
.set noat
.align 3
.globl strchr
.ent strchr
strchr:
.frame sp, 0, ra
.prologue 0
zapnot a1, 1, a1 # e0 : zero extend the search character
ldq_u t0, 0(a0) # .. e1 : load first quadword
sll a1, 8, t5 # e0 : replicate the search character
andnot a0, 7, v0 # .. e1 : align our loop pointer
or t5, a1, a1 # e0 :
lda t4, -1 # .. e1 : build garbage mask
sll a1, 16, t5 # e0 :
cmpbge zero, t0, t2 # .. e1 : bits set iff byte == zero
mskqh t4, a0, t4 # e0 :
or t5, a1, a1 # .. e1 :
sll a1, 32, t5 # e0 :
cmpbge zero, t4, t4 # .. e1 : bits set iff byte is garbage
or t5, a1, a1 # e0 :
xor t0, a1, t1 # .. e1 : make bytes == c zero
cmpbge zero, t1, t3 # e0 : bits set iff byte == c
or t2, t3, t0 # e1 : bits set iff char match or zero match
andnot t0, t4, t0 # e0 : clear garbage bits
bne t0, $found # .. e1 (zdb)
$loop: ldq t0, 8(v0) # e0 :
addq v0, 8, v0 # .. e1 :
nop # e0 :
xor t0, a1, t1 # .. e1 (ev5 data stall)
cmpbge zero, t0, t2 # e0 : bits set iff byte == 0
cmpbge zero, t1, t3 # .. e1 : bits set iff byte == c
or t2, t3, t0 # e0 :
beq t0, $loop # .. e1 (zdb)
$found: negq t0, t1 # e0 : clear all but least set bit
and t0, t1, t0 # e1 (stall)
and t0, t3, t1 # e0 : bit set iff byte was the char
beq t1, $retnull # .. e1 (zdb)
and t0, 0xf0, t2 # e0 : binary search for that set bit
and t0, 0xcc, t3 # .. e1 :
and t0, 0xaa, t4 # e0 :
cmovne t2, 4, t2 # .. e1 :
cmovne t3, 2, t3 # e0 :
cmovne t4, 1, t4 # .. e1 :
addq t2, t3, t2 # e0 :
addq v0, t4, v0 # .. e1 :
addq v0, t2, v0 # e0 :
ret # .. e1 :
$retnull:
mov zero, v0 # e0 :
ret # .. e1 :
.end strchr

23
arch/alpha/lib/strcpy.S Normal file

@@ -0,0 +1,23 @@
/*
* arch/alpha/lib/strcpy.S
* Contributed by Richard Henderson (rth@tamu.edu)
*
* Copy a null-terminated string from SRC to DST. Return a pointer
* to the null-terminator in the source.
*/
.text
.align 3
.globl strcpy
.ent strcpy
strcpy:
.frame $30, 0, $26
.prologue 0
mov $16, $0 # set up return value
mov $26, $23 # set up return address
unop
br __stxcpy # do the copy
.end strcpy

57
arch/alpha/lib/strlen.S Normal file

@@ -0,0 +1,57 @@
/*
* strlen.S (c) 1995 David Mosberger (davidm@cs.arizona.edu)
*
* Finds length of a 0-terminated string. Optimized for the
* Alpha architecture:
*
* - memory accessed as aligned quadwords only
 * - uses cmpbge to compare 8 bytes in parallel
* - does binary search to find 0 byte in last
* quadword (HAKMEM needed 12 instructions to
* do this instead of the 9 instructions that
* binary search needs).
*/
.set noreorder
.set noat
.align 3
.globl strlen
.ent strlen
strlen:
ldq_u $1, 0($16) # load first quadword ($16 may be misaligned)
lda $2, -1($31)
insqh $2, $16, $2
andnot $16, 7, $0
or $2, $1, $1
cmpbge $31, $1, $2 # $2 <- bitmask: bit i == 1 <==> i-th byte == 0
bne $2, found
loop: ldq $1, 8($0)
addq $0, 8, $0 # addr += 8
nop # helps dual issue last two insns
cmpbge $31, $1, $2
beq $2, loop
found: blbs $2, done # make aligned case fast
negq $2, $3
and $2, $3, $2
and $2, 0x0f, $1
addq $0, 4, $3
cmoveq $1, $3, $0
and $2, 0x33, $1
addq $0, 2, $3
cmoveq $1, $3, $0
and $2, 0x55, $1
addq $0, 1, $3
cmoveq $1, $3, $0
done: subq $0, $16, $0
ret $31, ($26)
.end strlen


@@ -0,0 +1,91 @@
/*
* arch/alpha/lib/strlen_user.S
*
* Return the length of the string including the NUL terminator
* (strlen+1) or zero if an error occurred.
*
* In places where it is critical to limit the processing time,
* and the data is not trusted, strnlen_user() should be used.
* It will return a value greater than its second argument if
* that limit would be exceeded. This implementation is allowed
* to access memory beyond the limit, but will not cross a page
* boundary when doing so.
*/
#include <asm/regdef.h>
/* Allow an exception for an insn; exit if we get one. */
#define EX(x,y...) \
99: x,##y; \
.section __ex_table,"a"; \
.long 99b - .; \
lda v0, $exception-99b(zero); \
.previous
.set noreorder
.set noat
.text
.globl __strlen_user
.ent __strlen_user
.frame sp, 0, ra
.align 3
__strlen_user:
ldah a1, 32767(zero) # do not use plain strlen_user() for strings
# that might be almost 2 GB long; you should
# be using strnlen_user() instead
.globl __strnlen_user
.align 3
__strnlen_user:
.prologue 0
EX( ldq_u t0, 0(a0) ) # load first quadword (a0 may be misaligned)
lda t1, -1(zero)
insqh t1, a0, t1
andnot a0, 7, v0
or t1, t0, t0
subq a0, 1, a0 # get our +1 for the return
cmpbge zero, t0, t1 # t1 <- bitmask: bit i == 1 <==> i-th byte == 0
subq a1, 7, t2
subq a0, v0, t0
bne t1, $found
addq t2, t0, t2
addq a1, 1, a1
.align 3
$loop: ble t2, $limit
EX( ldq t0, 8(v0) )
subq t2, 8, t2
addq v0, 8, v0 # addr += 8
cmpbge zero, t0, t1
beq t1, $loop
$found: negq t1, t2 # clear all but least set bit
and t1, t2, t1
and t1, 0xf0, t2 # binary search for that set bit
and t1, 0xcc, t3
and t1, 0xaa, t4
cmovne t2, 4, t2
cmovne t3, 2, t3
cmovne t4, 1, t4
addq t2, t3, t2
addq v0, t4, v0
addq v0, t2, v0
nop # dual issue next two on ev4 and ev5
subq v0, a0, v0
$exception:
ret
.align 3 # currently redundant
$limit:
subq a1, t2, v0
ret
.end __strlen_user

84
arch/alpha/lib/strncat.S Normal file

@@ -0,0 +1,84 @@
/*
* arch/alpha/lib/strncat.S
* Contributed by Richard Henderson (rth@tamu.edu)
*
* Append no more than COUNT characters from the null-terminated string SRC
* to the null-terminated string DST. Always null-terminate the new DST.
*
* This differs slightly from the semantics in libc in that we never write
* past count, whereas libc may write to count+1. This follows the generic
* implementation in lib/string.c and is, IMHO, more sensible.
*/
.text
.align 3
.globl strncat
.ent strncat
strncat:
.frame $30, 0, $26
.prologue 0
mov $16, $0 # set up return value
beq $18, $zerocount
/* Find the end of the string. */
ldq_u $1, 0($16) # load first quadword ($16 may be misaligned)
lda $2, -1($31)
insqh $2, $16, $2
andnot $16, 7, $16
or $2, $1, $1
cmpbge $31, $1, $2 # bits set iff byte == 0
bne $2, $found
$loop: ldq $1, 8($16)
addq $16, 8, $16
cmpbge $31, $1, $2
beq $2, $loop
$found: negq $2, $3 # clear all but least set bit
and $2, $3, $2
and $2, 0xf0, $3 # binary search for that set bit
and $2, 0xcc, $4
and $2, 0xaa, $5
cmovne $3, 4, $3
cmovne $4, 2, $4
cmovne $5, 1, $5
addq $3, $4, $3
addq $16, $5, $16
addq $16, $3, $16
/* Now do the append. */
bsr $23, __stxncpy
/* Worry about the null termination. */
zapnot $1, $27, $2 # was last byte a null?
bne $2, 0f
ret
0: cmplt $27, $24, $2 # did we fill the buffer completely?
or $2, $18, $2
bne $2, 2f
and $24, 0x80, $2 # no zero next byte
bne $2, 1f
/* Here there are bytes left in the current word. Clear one. */
addq $24, $24, $24 # end-of-count bit <<= 1
2: zap $1, $24, $1
stq_u $1, 0($16)
ret
1: /* Here we must read the next DST word and clear the first byte. */
ldq_u $1, 8($16)
zap $1, 1, $1
stq_u $1, 8($16)
$zerocount:
ret
.end strncat

81
arch/alpha/lib/strncpy.S Normal file

@@ -0,0 +1,81 @@
/*
* arch/alpha/lib/strncpy.S
* Contributed by Richard Henderson (rth@tamu.edu)
*
* Copy no more than COUNT bytes of the null-terminated string from
* SRC to DST. If SRC does not cover all of COUNT, the balance is
* zeroed.
*
* Or, rather, if the kernel cared about that weird ANSI quirk. This
* version has cropped that bit o' nastiness as well as assuming that
* __stxncpy is in range of a branch.
*/
.set noat
.set noreorder
.text
.align 4
.globl strncpy
.ent strncpy
strncpy:
.frame $30, 0, $26
.prologue 0
mov $16, $0 # set return value now
beq $18, $zerolen
unop
bsr $23, __stxncpy # do the work of the copy
unop
bne $18, $multiword # do we have full words left?
subq $24, 1, $3 # nope
subq $27, 1, $4
or $3, $24, $3 # clear the bits between the last
or $4, $27, $4 # written byte and the last byte in COUNT
andnot $4, $3, $4
zap $1, $4, $1
stq_u $1, 0($16)
ret
.align 4
$multiword:
subq $24, 1, $2 # clear the final bits in the prev word
or $2, $24, $2
zapnot $1, $2, $1
subq $18, 1, $18
stq_u $1, 0($16)
addq $16, 8, $16
unop
beq $18, 1f
nop
unop
nop
blbc $18, 0f
stq_u $31, 0($16) # zero one word
subq $18, 1, $18
addq $16, 8, $16
beq $18, 1f
0: stq_u $31, 0($16) # zero two words
subq $18, 2, $18
stq_u $31, 8($16)
addq $16, 16, $16
bne $18, 0b
1: ldq_u $1, 0($16) # clear the leading bits in the final word
subq $27, 1, $2
or $2, $27, $2
zap $1, $2, $1
stq_u $1, 0($16)
$zerolen:
ret
.end strncpy


@@ -0,0 +1,339 @@
/*
* arch/alpha/lib/strncpy_from_user.S
* Contributed by Richard Henderson (rth@tamu.edu)
*
* Just like strncpy except in the return value:
*
* -EFAULT if an exception occurs before the terminator is copied.
* N if the buffer filled.
*
* Otherwise the length of the string is returned.
*/
#include <asm/errno.h>
#include <asm/regdef.h>
/* Allow an exception for an insn; exit if we get one. */
#define EX(x,y...) \
99: x,##y; \
.section __ex_table,"a"; \
.long 99b - .; \
lda $31, $exception-99b($0); \
.previous
.set noat
.set noreorder
.text
.globl __strncpy_from_user
.ent __strncpy_from_user
.frame $30, 0, $26
.prologue 0
.align 3
$aligned:
/* On entry to this basic block:
t0 == the first destination word for masking back in
t1 == the first source word. */
/* Create the 1st output word and detect 0's in the 1st input word. */
lda t2, -1 # e1 : build a mask against false zero
mskqh t2, a1, t2 # e0 : detection in the src word
mskqh t1, a1, t3 # e0 :
ornot t1, t2, t2 # .. e1 :
mskql t0, a1, t0 # e0 : assemble the first output word
cmpbge zero, t2, t8 # .. e1 : bits set iff null found
or t0, t3, t0 # e0 :
beq a2, $a_eoc # .. e1 :
bne t8, $a_eos # .. e1 :
/* On entry to this basic block:
t0 == a source word not containing a null. */
$a_loop:
stq_u t0, 0(a0) # e0 :
addq a0, 8, a0 # .. e1 :
EX( ldq_u t0, 0(a1) ) # e0 :
addq a1, 8, a1 # .. e1 :
subq a2, 1, a2 # e0 :
cmpbge zero, t0, t8 # .. e1 (stall)
beq a2, $a_eoc # e1 :
beq t8, $a_loop # e1 :
/* Take care of the final (partial) word store. At this point
the end-of-count bit is set in t8 iff it applies.
On entry to this basic block we have:
t0 == the source word containing the null
t8 == the cmpbge mask that found it. */
$a_eos:
negq t8, t12 # e0 : find low bit set
and t8, t12, t12 # e1 (stall)
/* For the sake of the cache, don't read a destination word
if we're not going to need it. */
and t12, 0x80, t6 # e0 :
bne t6, 1f # .. e1 (zdb)
/* We're doing a partial word store and so need to combine
our source and original destination words. */
ldq_u t1, 0(a0) # e0 :
subq t12, 1, t6 # .. e1 :
or t12, t6, t8 # e0 :
unop #
zapnot t0, t8, t0 # e0 : clear src bytes > null
zap t1, t8, t1 # .. e1 : clear dst bytes <= null
or t0, t1, t0 # e1 :
1: stq_u t0, 0(a0)
br $finish_up
/* Add the end-of-count bit to the eos detection bitmask. */
$a_eoc:
or t10, t8, t8
br $a_eos
/*** The Function Entry Point ***/
.align 3
__strncpy_from_user:
mov a0, v0 # save the string start
beq a2, $zerolength
/* Are source and destination co-aligned? */
xor a0, a1, t1 # e0 :
and a0, 7, t0 # .. e1 : find dest misalignment
and t1, 7, t1 # e0 :
addq a2, t0, a2 # .. e1 : bias count by dest misalignment
subq a2, 1, a2 # e0 :
and a2, 7, t2 # e1 :
srl a2, 3, a2 # e0 : a2 = loop counter = (count - 1)/8
addq zero, 1, t10 # .. e1 :
sll t10, t2, t10 # e0 : t10 = bitmask of last count byte
bne t1, $unaligned # .. e1 :
/* We are co-aligned; take care of a partial first word. */
EX( ldq_u t1, 0(a1) ) # e0 : load first src word
addq a1, 8, a1 # .. e1 :
beq t0, $aligned # avoid loading dest word if not needed
ldq_u t0, 0(a0) # e0 :
br $aligned # .. e1 :
/* The source and destination are not co-aligned. Align the destination
and cope. We have to be very careful about not reading too much and
causing a SEGV. */
.align 3
$u_head:
/* We know just enough now to be able to assemble the first
full source word. We can still find a zero at the end of it
that prevents us from outputting the whole thing.
On entry to this basic block:
t0 == the first dest word, unmasked
t1 == the shifted low bits of the first source word
t6 == bytemask that is -1 in dest word bytes */
EX( ldq_u t2, 8(a1) ) # e0 : load second src word
addq a1, 8, a1 # .. e1 :
mskql t0, a0, t0 # e0 : mask trailing garbage in dst
extqh t2, a1, t4 # e0 :
or t1, t4, t1 # e1 : first aligned src word complete
mskqh t1, a0, t1 # e0 : mask leading garbage in src
or t0, t1, t0 # e0 : first output word complete
or t0, t6, t6 # e1 : mask original data for zero test
cmpbge zero, t6, t8 # e0 :
beq a2, $u_eocfin # .. e1 :
bne t8, $u_final # e1 :
lda t6, -1 # e1 : mask out the bits we have
mskql t6, a1, t6 # e0 : already seen
stq_u t0, 0(a0) # e0 : store first output word
or t6, t2, t2 # .. e1 :
cmpbge zero, t2, t8 # e0 : find nulls in second partial
addq a0, 8, a0 # .. e1 :
subq a2, 1, a2 # e0 :
bne t8, $u_late_head_exit # .. e1 :
/* Finally, we've got all the stupid leading edge cases taken care
of and we can set up to enter the main loop. */
extql t2, a1, t1 # e0 : position hi-bits of lo word
EX( ldq_u t2, 8(a1) ) # .. e1 : read next high-order source word
addq a1, 8, a1 # e0 :
cmpbge zero, t2, t8 # e1 (stall)
beq a2, $u_eoc # e1 :
bne t8, $u_eos # e1 :
/* Unaligned copy main loop. In order to avoid reading too much,
the loop is structured to detect zeros in aligned source words.
This has, unfortunately, effectively pulled half of a loop
iteration out into the head and half into the tail, but it does
prevent nastiness from accumulating in the very thing we want
to run as fast as possible.
On entry to this basic block:
t1 == the shifted high-order bits from the previous source word
t2 == the unshifted current source word
We further know that t2 does not contain a null terminator. */
.align 3
$u_loop:
extqh t2, a1, t0 # e0 : extract high bits for current word
addq a1, 8, a1 # .. e1 :
extql t2, a1, t3 # e0 : extract low bits for next time
addq a0, 8, a0 # .. e1 :
or t0, t1, t0 # e0 : current dst word now complete
EX( ldq_u t2, 0(a1) ) # .. e1 : load high word for next time
stq_u t0, -8(a0) # e0 : save the current word
mov t3, t1 # .. e1 :
subq a2, 1, a2 # e0 :
cmpbge zero, t2, t8 # .. e1 : test new word for eos
beq a2, $u_eoc # e1 :
beq t8, $u_loop # e1 :
/* We've found a zero somewhere in the source word we just read.
If it resides in the lower half, we have one (probably partial)
word to write out, and if it resides in the upper half, we
have one full and one partial word left to write out.
On entry to this basic block:
t1 == the shifted high-order bits from the previous source word
t2 == the unshifted current source word. */
$u_eos:
extqh t2, a1, t0 # e0 :
or t0, t1, t0 # e1 : first (partial) source word complete
cmpbge zero, t0, t8 # e0 : is the null in this first bit?
bne t8, $u_final # .. e1 (zdb)
stq_u t0, 0(a0) # e0 : the null was in the high-order bits
addq a0, 8, a0 # .. e1 :
subq a2, 1, a2 # e1 :
$u_late_head_exit:
extql t2, a1, t0 # .. e0 :
cmpbge zero, t0, t8 # e0 :
or t8, t10, t6 # e1 :
cmoveq a2, t6, t8 # e0 :
nop # .. e1 :
/* Take care of a final (probably partial) result word.
On entry to this basic block:
t0 == assembled source word
t8 == cmpbge mask that found the null. */
$u_final:
negq t8, t6 # e0 : isolate low bit set
and t6, t8, t12 # e1 :
and t12, 0x80, t6 # e0 : avoid dest word load if we can
bne t6, 1f # .. e1 (zdb)
ldq_u t1, 0(a0) # e0 :
subq t12, 1, t6 # .. e1 :
or t6, t12, t8 # e0 :
zapnot t0, t8, t0 # .. e1 : kill source bytes > null
zap t1, t8, t1 # e0 : kill dest bytes <= null
or t0, t1, t0 # e1 :
1: stq_u t0, 0(a0) # e0 :
br $finish_up
$u_eoc: # end-of-count
extqh t2, a1, t0
or t0, t1, t0
cmpbge zero, t0, t8
$u_eocfin: # end-of-count, final word
or t10, t8, t8
br $u_final
/* Unaligned copy entry point. */
.align 3
$unaligned:
EX( ldq_u t1, 0(a1) ) # e0 : load first source word
and a0, 7, t4 # .. e1 : find dest misalignment
and a1, 7, t5 # e0 : find src misalignment
/* Conditionally load the first destination word and a bytemask
with 0xff indicating that the destination byte is sacrosanct. */
mov zero, t0 # .. e1 :
mov zero, t6 # e0 :
beq t4, 1f # .. e1 :
ldq_u t0, 0(a0) # e0 :
lda t6, -1 # .. e1 :
mskql t6, a0, t6 # e0 :
1:
subq a1, t4, a1 # .. e1 : sub dest misalignment from src addr
/* If source misalignment is larger than dest misalignment, we need
extra startup checks to avoid SEGV. */
cmplt t4, t5, t12 # e1 :
extql t1, a1, t1 # .. e0 : shift src into place
lda t2, -1 # e0 : for creating masks later
beq t12, $u_head # e1 :
mskqh t2, t5, t2 # e0 : begin src byte validity mask
cmpbge zero, t1, t8 # .. e1 : is there a zero?
extql t2, a1, t2 # e0 :
or t8, t10, t5 # .. e1 : test for end-of-count too
cmpbge zero, t2, t3 # e0 :
cmoveq a2, t5, t8 # .. e1 :
andnot t8, t3, t8 # e0 :
beq t8, $u_head # .. e1 (zdb)
/* At this point we've found a zero in the first partial word of
the source. We need to isolate the valid source data and mask
it into the original destination data. (Incidentally, we know
that we'll need at least one byte of that original dest word.) */
ldq_u t0, 0(a0) # e0 :
negq t8, t6 # .. e1 : build bitmask of bytes <= zero
mskqh t1, t4, t1 # e0 :
and t6, t8, t12 # .. e1 :
subq t12, 1, t6 # e0 :
or t6, t12, t8 # e1 :
zapnot t2, t8, t2 # e0 : prepare source word; mirror changes
zapnot t1, t8, t1 # .. e1 : to source validity mask
andnot t0, t2, t0 # e0 : zero place for source to reside
or t0, t1, t0 # e1 : and put it there
stq_u t0, 0(a0) # e0 :
$finish_up:
zapnot t0, t12, t4 # was last byte written null?
cmovne t4, 1, t4
and t12, 0xf0, t3 # binary search for the address of the
and t12, 0xcc, t2 # last byte written
and t12, 0xaa, t1
bic a0, 7, t0
cmovne t3, 4, t3
cmovne t2, 2, t2
cmovne t1, 1, t1
addq t0, t3, t0
addq t1, t2, t1
addq t0, t1, t0
addq t0, t4, t0 # add one if we filled the buffer
subq t0, v0, v0 # find string length
ret
$zerolength:
clr v0
$exception:
ret
.end __strncpy_from_user
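The $finish_up sequence above turns the one-hot byte mask in t12 into the number of bytes copied: the 0xf0/0xcc/0xaa tests are a three-step binary search for the position of the set bit, which is added to the word-aligned destination address and, if the last byte written was not the terminator, bumped by one before subtracting the saved start address. A rough C sketch of that arithmetic only (names are mine; dest_start stands for v0, which is assumed to hold the destination start saved earlier in the routine, outside this hunk):

static long finish_up(unsigned long t12_mask, unsigned long last_word_addr,
                      unsigned long dest_start, int last_byte_nonnull)
{
    long idx = 0;
    if (t12_mask & 0xf0) idx += 4;   /* binary search for the single set bit: */
    if (t12_mask & 0xcc) idx += 2;   /* upper half, then pairs,               */
    if (t12_mask & 0xaa) idx += 1;   /* then odd positions                    */
    /* address of the last byte written, plus one if it was non-null */
    return (long)((last_word_addr & ~7UL) + idx
                  + (last_byte_nonnull ? 1 : 0) - dest_start);
}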

87
arch/alpha/lib/strrchr.S Normal file

@@ -0,0 +1,87 @@
/*
* arch/alpha/lib/strrchr.S
* Contributed by Richard Henderson (rth@tamu.edu)
*
* Return the address of the last occurrence of a given character
* within a null-terminated string, or null if it is not found.
*/
#include <asm/regdef.h>
.set noreorder
.set noat
.align 3
.ent strrchr
.globl strrchr
strrchr:
.frame sp, 0, ra
.prologue 0
zapnot a1, 1, a1 # e0 : zero extend our test character
mov zero, t6 # .. e1 : t6 is last match aligned addr
sll a1, 8, t5 # e0 : replicate our test character
mov zero, t8 # .. e1 : t8 is last match byte compare mask
or t5, a1, a1 # e0 :
ldq_u t0, 0(a0) # .. e1 : load first quadword
sll a1, 16, t5 # e0 :
andnot a0, 7, v0 # .. e1 : align source addr
or t5, a1, a1 # e0 :
lda t4, -1 # .. e1 : build garbage mask
sll a1, 32, t5 # e0 :
cmpbge zero, t0, t1 # .. e1 : bits set iff byte == zero
mskqh t4, a0, t4 # e0 :
or t5, a1, a1 # .. e1 : character replication complete
xor t0, a1, t2 # e0 : make bytes == c zero
cmpbge zero, t4, t4 # .. e1 : bits set iff byte is garbage
cmpbge zero, t2, t3 # e0 : bits set iff byte == c
andnot t1, t4, t1 # .. e1 : clear garbage from null test
andnot t3, t4, t3 # e0 : clear garbage from char test
bne t1, $eos # .. e1 : did we already hit the terminator?
/* Character search main loop */
$loop:
ldq t0, 8(v0) # e0 : load next quadword
cmovne t3, v0, t6 # .. e1 : save previous comparison's match
cmovne t3, t3, t8 # e0 :
addq v0, 8, v0 # .. e1 :
xor t0, a1, t2 # e0 :
cmpbge zero, t0, t1 # .. e1 : bits set iff byte == zero
cmpbge zero, t2, t3 # e0 : bits set iff byte == c
beq t1, $loop # .. e1 : if we haven't seen a null, loop
/* Mask out character matches after terminator */
$eos:
negq t1, t4 # e0 : isolate first null byte match
and t1, t4, t4 # e1 :
subq t4, 1, t5 # e0 : build a mask of the bytes up to...
or t4, t5, t4 # e1 : ... and including the null
and t3, t4, t3 # e0 : mask out char matches after null
cmovne t3, t3, t8 # .. e1 : save it, if match found
cmovne t3, v0, t6 # e0 :
/* Locate the address of the last matched character */
/* Retain the early exit for the ev4 -- the ev5 mispredict penalty
is 5 cycles -- the same as just falling through. */
beq t8, $retnull # .. e1 :
and t8, 0xf0, t2 # e0 : binary search for the high bit set
cmovne t2, t2, t8 # .. e1 (zdb)
cmovne t2, 4, t2 # e0 :
and t8, 0xcc, t1 # .. e1 :
cmovne t1, t1, t8 # e0 :
cmovne t1, 2, t1 # .. e1 :
and t8, 0xaa, t0 # e0 :
cmovne t0, 1, t0 # .. e1 (zdb)
addq t2, t1, t1 # e0 :
addq t6, t0, v0 # .. e1 : add our aligned base ptr to the mix
addq v0, t1, v0 # e0 :
ret # .. e1 :
$retnull:
mov zero, v0 # e0 :
ret # .. e1 :
.end strrchr
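strrchr above leans on two Alpha idioms: the test character is replicated into every byte of a quadword with a shift/or ladder, and cmpbge against zero turns "which bytes are zero" into a compact 8-bit mask, so XORing a source word with the replicated character marks the bytes equal to it. The following C sketch only illustrates those two masks; cmpbge_zero is a made-up helper, and the replication uses a multiply where the assembly uses sll/or:

#include <stdint.h>

static unsigned cmpbge_zero(uint64_t x)       /* bit i set iff byte i of x is 0 */
{
    unsigned mask = 0;
    for (int i = 0; i < 8; i++)
        if (((x >> (8 * i)) & 0xff) == 0)
            mask |= 1u << i;
    return mask;
}

static void strrchr_masks(uint64_t word, unsigned char c,
                          unsigned *null_mask, unsigned *char_mask)
{
    uint64_t rep = 0x0101010101010101ULL * c;  /* asm builds this with sll/or */
    *null_mask = cmpbge_zero(word);            /* bytes equal to '\0' */
    *char_mask = cmpbge_zero(word ^ rep);      /* bytes equal to c    */
}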

289
arch/alpha/lib/stxcpy.S Normal file

@@ -0,0 +1,289 @@
/*
* arch/alpha/lib/stxcpy.S
* Contributed by Richard Henderson (rth@tamu.edu)
*
* Copy a null-terminated string from SRC to DST.
*
* This is an internal routine used by strcpy, stpcpy, and strcat.
* As such, it uses special linkage conventions to make implementation
* of these public functions more efficient.
*
* On input:
* t9 = return address
* a0 = DST
* a1 = SRC
*
* On output:
* t12 = bitmask (with one bit set) indicating the last byte written
* a0 = unaligned address of the last *word* written
*
* Furthermore, v0, a3-a5, and t11 are untouched.
*/
#include <asm/regdef.h>
.set noat
.set noreorder
.text
/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
doesn't like putting the entry point for a procedure somewhere in the
middle of the procedure descriptor. Work around this by putting the
aligned copy in its own procedure descriptor */
.ent stxcpy_aligned
.align 3
stxcpy_aligned:
.frame sp, 0, t9
.prologue 0
/* On entry to this basic block:
t0 == the first destination word for masking back in
t1 == the first source word. */
/* Create the 1st output word and detect 0's in the 1st input word. */
lda t2, -1 # e1 : build a mask against false zero
mskqh t2, a1, t2 # e0 : detection in the src word
mskqh t1, a1, t3 # e0 :
ornot t1, t2, t2 # .. e1 :
mskql t0, a1, t0 # e0 : assemble the first output word
cmpbge zero, t2, t8 # .. e1 : bits set iff null found
or t0, t3, t1 # e0 :
bne t8, $a_eos # .. e1 :
/* On entry to this basic block:
t0 == the first destination word for masking back in
t1 == a source word not containing a null. */
$a_loop:
stq_u t1, 0(a0) # e0 :
addq a0, 8, a0 # .. e1 :
ldq_u t1, 0(a1) # e0 :
addq a1, 8, a1 # .. e1 :
cmpbge zero, t1, t8 # e0 (stall)
beq t8, $a_loop # .. e1 (zdb)
/* Take care of the final (partial) word store.
On entry to this basic block we have:
t1 == the source word containing the null
t8 == the cmpbge mask that found it. */
$a_eos:
negq t8, t6 # e0 : find low bit set
and t8, t6, t12 # e1 (stall)
/* For the sake of the cache, don't read a destination word
if we're not going to need it. */
and t12, 0x80, t6 # e0 :
bne t6, 1f # .. e1 (zdb)
/* We're doing a partial word store and so need to combine
our source and original destination words. */
ldq_u t0, 0(a0) # e0 :
subq t12, 1, t6 # .. e1 :
zapnot t1, t6, t1 # e0 : clear src bytes >= null
or t12, t6, t8 # .. e1 :
zap t0, t8, t0 # e0 : clear dst bytes <= null
or t0, t1, t1 # e1 :
1: stq_u t1, 0(a0) # e0 :
ret (t9) # .. e1 :
.end stxcpy_aligned
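The $a_eos tail above finishes a partial word without disturbing bytes beyond the string: negq/and isolate the first zero byte found by cmpbge, subq/or turn that bit into byte masks, and zapnot/zap splice source bytes before the null together with destination bytes after it. A hedged C equivalent of that splice (helper name and types are mine; it assumes null_mask is nonzero, i.e. the word really does contain the terminator):

#include <stdint.h>

static uint64_t merge_final_word(uint64_t src, uint64_t dst, unsigned null_mask)
{
    unsigned nullbit = null_mask & -null_mask;  /* negq/and: first zero byte   */
    unsigned below   = nullbit - 1;             /* bytes before the null       */
    unsigned upto    = nullbit | below;         /* bytes up to incl. the null  */
    uint64_t out = 0;
    for (int i = 0; i < 8; i++) {
        uint64_t b;
        if (below & (1u << i))
            b = (src >> (8 * i)) & 0xff;   /* zapnot src: keep bytes before the null */
        else if (upto & (1u << i))
            b = 0;                         /* the terminator itself                  */
        else
            b = (dst >> (8 * i)) & 0xff;   /* zap dst: keep bytes after the null     */
        out |= b << (8 * i);
    }
    return out;
}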
.align 3
.ent __stxcpy
.globl __stxcpy
__stxcpy:
.frame sp, 0, t9
.prologue 0
/* Are source and destination co-aligned? */
xor a0, a1, t0 # e0 :
unop # :
and t0, 7, t0 # e0 :
bne t0, $unaligned # .. e1 :
/* We are co-aligned; take care of a partial first word. */
ldq_u t1, 0(a1) # e0 : load first src word
and a0, 7, t0 # .. e1 : take care not to load a word ...
addq a1, 8, a1 # e0 :
beq t0, stxcpy_aligned # .. e1 : ... if we won't need it
ldq_u t0, 0(a0) # e0 :
br stxcpy_aligned # .. e1 :
/* The source and destination are not co-aligned. Align the destination
and cope. We have to be very careful about not reading too much and
causing a SEGV. */
.align 3
$u_head:
/* We know just enough now to be able to assemble the first
full source word. We can still find a zero at the end of it
that prevents us from outputting the whole thing.
On entry to this basic block:
t0 == the first dest word, for masking back in, if needed else 0
t1 == the low bits of the first source word
t6 == bytemask that is -1 in dest word bytes */
ldq_u t2, 8(a1) # e0 :
addq a1, 8, a1 # .. e1 :
extql t1, a1, t1 # e0 :
extqh t2, a1, t4 # e0 :
mskql t0, a0, t0 # e0 :
or t1, t4, t1 # .. e1 :
mskqh t1, a0, t1 # e0 :
or t0, t1, t1 # e1 :
or t1, t6, t6 # e0 :
cmpbge zero, t6, t8 # .. e1 :
lda t6, -1 # e0 : for masking just below
bne t8, $u_final # .. e1 :
mskql t6, a1, t6 # e0 : mask out the bits we have
or t6, t2, t2 # e1 : already extracted before
cmpbge zero, t2, t8 # e0 : testing eos
bne t8, $u_late_head_exit # .. e1 (zdb)
/* Finally, we've got all the stupid leading edge cases taken care
of and we can set up to enter the main loop. */
stq_u t1, 0(a0) # e0 : store first output word
addq a0, 8, a0 # .. e1 :
extql t2, a1, t0 # e0 : position hi-bits of lo word
ldq_u t2, 8(a1) # .. e1 : read next high-order source word
addq a1, 8, a1 # e0 :
cmpbge zero, t2, t8 # .. e1 :
nop # e0 :
bne t8, $u_eos # .. e1 :
/* Unaligned copy main loop. In order to avoid reading too much,
the loop is structured to detect zeros in aligned source words.
This has, unfortunately, effectively pulled half of a loop
iteration out into the head and half into the tail, but it does
prevent nastiness from accumulating in the very thing we want
to run as fast as possible.
On entry to this basic block:
t0 == the shifted high-order bits from the previous source word
t2 == the unshifted current source word
We further know that t2 does not contain a null terminator. */
.align 3
$u_loop:
extqh t2, a1, t1 # e0 : extract high bits for current word
addq a1, 8, a1 # .. e1 :
extql t2, a1, t3 # e0 : extract low bits for next time
addq a0, 8, a0 # .. e1 :
or t0, t1, t1 # e0 : current dst word now complete
ldq_u t2, 0(a1) # .. e1 : load high word for next time
stq_u t1, -8(a0) # e0 : save the current word
mov t3, t0 # .. e1 :
cmpbge zero, t2, t8 # e0 : test new word for eos
beq t8, $u_loop # .. e1 :
/* We've found a zero somewhere in the source word we just read.
If it resides in the lower half, we have one (probably partial)
word to write out, and if it resides in the upper half, we
have one full and one partial word left to write out.
On entry to this basic block:
t0 == the shifted high-order bits from the previous source word
t2 == the unshifted current source word. */
$u_eos:
extqh t2, a1, t1 # e0 :
or t0, t1, t1 # e1 : first (partial) source word complete
cmpbge zero, t1, t8 # e0 : is the null in this first bit?
bne t8, $u_final # .. e1 (zdb)
$u_late_head_exit:
stq_u t1, 0(a0) # e0 : the null was in the high-order bits
addq a0, 8, a0 # .. e1 :
extql t2, a1, t1 # e0 :
cmpbge zero, t1, t8 # .. e1 :
/* Take care of a final (probably partial) result word.
On entry to this basic block:
t1 == assembled source word
t8 == cmpbge mask that found the null. */
$u_final:
negq t8, t6 # e0 : isolate low bit set
and t6, t8, t12 # e1 :
and t12, 0x80, t6 # e0 : avoid dest word load if we can
bne t6, 1f # .. e1 (zdb)
ldq_u t0, 0(a0) # e0 :
subq t12, 1, t6 # .. e1 :
or t6, t12, t8 # e0 :
zapnot t1, t6, t1 # .. e1 : kill source bytes >= null
zap t0, t8, t0 # e0 : kill dest bytes <= null
or t0, t1, t1 # e1 :
1: stq_u t1, 0(a0) # e0 :
ret (t9) # .. e1 :
/* Unaligned copy entry point. */
.align 3
$unaligned:
ldq_u t1, 0(a1) # e0 : load first source word
and a0, 7, t4 # .. e1 : find dest misalignment
and a1, 7, t5 # e0 : find src misalignment
/* Conditionally load the first destination word and a bytemask
with 0xff indicating that the destination byte is sacrosanct. */
mov zero, t0 # .. e1 :
mov zero, t6 # e0 :
beq t4, 1f # .. e1 :
ldq_u t0, 0(a0) # e0 :
lda t6, -1 # .. e1 :
mskql t6, a0, t6 # e0 :
1:
subq a1, t4, a1 # .. e1 : sub dest misalignment from src addr
/* If source misalignment is larger than dest misalignment, we need
extra startup checks to avoid SEGV. */
cmplt t4, t5, t12 # e0 :
beq t12, $u_head # .. e1 (zdb)
lda t2, -1 # e1 : mask out leading garbage in source
mskqh t2, t5, t2 # e0 :
nop # e0 :
ornot t1, t2, t3 # .. e1 :
cmpbge zero, t3, t8 # e0 : is there a zero?
beq t8, $u_head # .. e1 (zdb)
/* At this point we've found a zero in the first partial word of
the source. We need to isolate the valid source data and mask
it into the original destination data. (Incidentally, we know
that we'll need at least one byte of that original dest word.) */
ldq_u t0, 0(a0) # e0 :
negq t8, t6 # .. e1 : build bitmask of bytes <= zero
and t6, t8, t12 # e0 :
and a1, 7, t5 # .. e1 :
subq t12, 1, t6 # e0 :
or t6, t12, t8 # e1 :
srl t12, t5, t12 # e0 : adjust final null return value
zapnot t2, t8, t2 # .. e1 : prepare source word; mirror changes
and t1, t2, t1 # e1 : to source validity mask
extql t2, a1, t2 # .. e0 :
extql t1, a1, t1 # e0 :
andnot t0, t2, t0 # .. e1 : zero place for source to reside
or t0, t1, t1 # e1 : and put it there
stq_u t1, 0(a0) # .. e0 :
ret (t9) # e1 :
.end __stxcpy
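The $u_loop above is the heart of the unaligned case: two consecutive aligned source words are kept in flight, extql/extqh carve out the halves that belong to the current aligned destination word, and each newly loaded word is screened for a zero byte before anything derived from it is stored, so the routine never reads or writes past the word holding the terminator. A schematic C rendering under those assumptions (names are mine; shift is 8 times the relative misalignment and is taken to be nonzero, since the co-aligned case goes through stxcpy_aligned instead, and the tail handling of $u_eos/$u_final is left out):

#include <stdint.h>

static int has_zero_byte(uint64_t w)     /* stand-in for cmpbge zero, w */
{
    return ((w - 0x0101010101010101ULL) & ~w & 0x8080808080808080ULL) != 0;
}

static void unaligned_copy_words(uint64_t *dst, const uint64_t *src_aligned,
                                 unsigned shift, uint64_t prev)
{
    /* 'prev' is the last aligned source word, already known zero-free */
    for (;;) {
        uint64_t cur = *src_aligned++;            /* ldq_u: aligned load      */
        if (has_zero_byte(cur))                   /* cmpbge zero, t2, t8      */
            break;                                /* tail code finishes up    */
        *dst++ = (prev >> shift)                  /* extql of previous word   */
               | (cur << (64 - shift));           /* extqh of current word    */
        prev = cur;
    }
}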

345
arch/alpha/lib/stxncpy.S Normal file

@@ -0,0 +1,345 @@
/*
* arch/alpha/lib/stxncpy.S
* Contributed by Richard Henderson (rth@tamu.edu)
*
* Copy no more than COUNT bytes of the null-terminated string from
* SRC to DST.
*
* This is an internal routine used by strncpy, stpncpy, and strncat.
* As such, it uses special linkage conventions to make implementation
* of these public functions more efficient.
*
* On input:
* t9 = return address
* a0 = DST
* a1 = SRC
* a2 = COUNT
*
* Furthermore, COUNT may not be zero.
*
* On output:
* t0 = last word written
* t10 = bitmask (with one bit set) indicating the byte position of
* the end of the range specified by COUNT
* t12 = bitmask (with one bit set) indicating the last byte written
* a0 = unaligned address of the last *word* written
* a2 = the number of full words left in COUNT
*
* Furthermore, v0, a3-a5, t11, and $at are untouched.
*/
#include <asm/regdef.h>
.set noat
.set noreorder
.text
/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
doesn't like putting the entry point for a procedure somewhere in the
middle of the procedure descriptor. Work around this by putting the
aligned copy in its own procedure descriptor */
.ent stxncpy_aligned
.align 3
stxncpy_aligned:
.frame sp, 0, t9, 0
.prologue 0
/* On entry to this basic block:
t0 == the first destination word for masking back in
t1 == the first source word. */
/* Create the 1st output word and detect 0's in the 1st input word. */
lda t2, -1 # e1 : build a mask against false zero
mskqh t2, a1, t2 # e0 : detection in the src word
mskqh t1, a1, t3 # e0 :
ornot t1, t2, t2 # .. e1 :
mskql t0, a1, t0 # e0 : assemble the first output word
cmpbge zero, t2, t8 # .. e1 : bits set iff null found
or t0, t3, t0 # e0 :
beq a2, $a_eoc # .. e1 :
bne t8, $a_eos # .. e1 :
/* On entry to this basic block:
t0 == a source word not containing a null. */
$a_loop:
stq_u t0, 0(a0) # e0 :
addq a0, 8, a0 # .. e1 :
ldq_u t0, 0(a1) # e0 :
addq a1, 8, a1 # .. e1 :
subq a2, 1, a2 # e0 :
cmpbge zero, t0, t8 # .. e1 (stall)
beq a2, $a_eoc # e1 :
beq t8, $a_loop # e1 :
/* Take care of the final (partial) word store. At this point
the end-of-count bit is set in t8 iff it applies.
On entry to this basic block we have:
t0 == the source word containing the null
t8 == the cmpbge mask that found it. */
$a_eos:
negq t8, t12 # e0 : find low bit set
and t8, t12, t12 # e1 (stall)
/* For the sake of the cache, don't read a destination word
if we're not going to need it. */
and t12, 0x80, t6 # e0 :
bne t6, 1f # .. e1 (zdb)
/* We're doing a partial word store and so need to combine
our source and original destination words. */
ldq_u t1, 0(a0) # e0 :
subq t12, 1, t6 # .. e1 :
or t12, t6, t8 # e0 :
unop #
zapnot t0, t8, t0 # e0 : clear src bytes > null
zap t1, t8, t1 # .. e1 : clear dst bytes <= null
or t0, t1, t0 # e1 :
1: stq_u t0, 0(a0) # e0 :
ret (t9) # e1 :
/* Add the end-of-count bit to the eos detection bitmask. */
$a_eoc:
or t10, t8, t8
br $a_eos
.end stxncpy_aligned
.align 3
.ent __stxncpy
.globl __stxncpy
__stxncpy:
.frame sp, 0, t9, 0
.prologue 0
/* Are source and destination co-aligned? */
xor a0, a1, t1 # e0 :
and a0, 7, t0 # .. e1 : find dest misalignment
and t1, 7, t1 # e0 :
addq a2, t0, a2 # .. e1 : bias count by dest misalignment
subq a2, 1, a2 # e0 :
and a2, 7, t2 # e1 :
srl a2, 3, a2 # e0 : a2 = loop counter = (count - 1)/8
addq zero, 1, t10 # .. e1 :
sll t10, t2, t10 # e0 : t10 = bitmask of last count byte
bne t1, $unaligned # .. e1 :
/* We are co-aligned; take care of a partial first word. */
ldq_u t1, 0(a1) # e0 : load first src word
addq a1, 8, a1 # .. e1 :
beq t0, stxncpy_aligned # avoid loading dest word if not needed
ldq_u t0, 0(a0) # e0 :
br stxncpy_aligned # .. e1 :
/* The source and destination are not co-aligned. Align the destination
and cope. We have to be very careful about not reading too much and
causing a SEGV. */
.align 3
$u_head:
/* We know just enough now to be able to assemble the first
full source word. We can still find a zero at the end of it
that prevents us from outputting the whole thing.
On entry to this basic block:
t0 == the first dest word, unmasked
t1 == the shifted low bits of the first source word
t6 == bytemask that is -1 in dest word bytes */
ldq_u t2, 8(a1) # e0 : load second src word
addq a1, 8, a1 # .. e1 :
mskql t0, a0, t0 # e0 : mask trailing garbage in dst
extqh t2, a1, t4 # e0 :
or t1, t4, t1 # e1 : first aligned src word complete
mskqh t1, a0, t1 # e0 : mask leading garbage in src
or t0, t1, t0 # e0 : first output word complete
or t0, t6, t6 # e1 : mask original data for zero test
cmpbge zero, t6, t8 # e0 :
beq a2, $u_eocfin # .. e1 :
lda t6, -1 # e0 :
bne t8, $u_final # .. e1 :
mskql t6, a1, t6 # e0 : mask out bits already seen
nop # .. e1 :
stq_u t0, 0(a0) # e0 : store first output word
or t6, t2, t2 # .. e1 :
cmpbge zero, t2, t8 # e0 : find nulls in second partial
addq a0, 8, a0 # .. e1 :
subq a2, 1, a2 # e0 :
bne t8, $u_late_head_exit # .. e1 :
/* Finally, we've got all the stupid leading edge cases taken care
of and we can set up to enter the main loop. */
extql t2, a1, t1 # e0 : position hi-bits of lo word
beq a2, $u_eoc # .. e1 :
ldq_u t2, 8(a1) # e0 : read next high-order source word
addq a1, 8, a1 # .. e1 :
extqh t2, a1, t0 # e0 : position lo-bits of hi word (stall)
cmpbge zero, t2, t8 # .. e1 :
nop # e0 :
bne t8, $u_eos # .. e1 :
/* Unaligned copy main loop. In order to avoid reading too much,
the loop is structured to detect zeros in aligned source words.
This has, unfortunately, effectively pulled half of a loop
iteration out into the head and half into the tail, but it does
prevent nastiness from accumulating in the very thing we want
to run as fast as possible.
On entry to this basic block:
t0 == the shifted low-order bits from the current source word
t1 == the shifted high-order bits from the previous source word
t2 == the unshifted current source word
We further know that t2 does not contain a null terminator. */
.align 3
$u_loop:
or t0, t1, t0 # e0 : current dst word now complete
subq a2, 1, a2 # .. e1 : decrement word count
stq_u t0, 0(a0) # e0 : save the current word
addq a0, 8, a0 # .. e1 :
extql t2, a1, t1 # e0 : extract high bits for next time
beq a2, $u_eoc # .. e1 :
ldq_u t2, 8(a1) # e0 : load high word for next time
addq a1, 8, a1 # .. e1 :
nop # e0 :
cmpbge zero, t2, t8 # e1 : test new word for eos (stall)
extqh t2, a1, t0 # e0 : extract low bits for current word
beq t8, $u_loop # .. e1 :
/* We've found a zero somewhere in the source word we just read.
If it resides in the lower half, we have one (probably partial)
word to write out, and if it resides in the upper half, we
have one full and one partial word left to write out.
On entry to this basic block:
t0 == the shifted low-order bits from the current source word
t1 == the shifted high-order bits from the previous source word
t2 == the unshifted current source word. */
$u_eos:
or t0, t1, t0 # e0 : first (partial) source word complete
nop # .. e1 :
cmpbge zero, t0, t8 # e0 : is the null in this first bit?
bne t8, $u_final # .. e1 (zdb)
stq_u t0, 0(a0) # e0 : the null was in the high-order bits
addq a0, 8, a0 # .. e1 :
subq a2, 1, a2 # e1 :
$u_late_head_exit:
extql t2, a1, t0 # .. e0 :
cmpbge zero, t0, t8 # e0 :
or t8, t10, t6 # e1 :
cmoveq a2, t6, t8 # e0 :
nop # .. e1 :
/* Take care of a final (probably partial) result word.
On entry to this basic block:
t0 == assembled source word
t8 == cmpbge mask that found the null. */
$u_final:
negq t8, t6 # e0 : isolate low bit set
and t6, t8, t12 # e1 :
and t12, 0x80, t6 # e0 : avoid dest word load if we can
bne t6, 1f # .. e1 (zdb)
ldq_u t1, 0(a0) # e0 :
subq t12, 1, t6 # .. e1 :
or t6, t12, t8 # e0 :
zapnot t0, t8, t0 # .. e1 : kill source bytes > null
zap t1, t8, t1 # e0 : kill dest bytes <= null
or t0, t1, t0 # e1 :
1: stq_u t0, 0(a0) # e0 :
ret (t9) # .. e1 :
/* Got to end-of-count before end of string.
On entry to this basic block:
t1 == the shifted high-order bits from the previous source word */
$u_eoc:
and a1, 7, t6 # e1 :
sll t10, t6, t6 # e0 :
and t6, 0xff, t6 # e0 :
bne t6, 1f # .. e1 :
ldq_u t2, 8(a1) # e0 : load final src word
nop # .. e1 :
extqh t2, a1, t0 # e0 : extract low bits for last word
or t1, t0, t1 # e1 :
1: cmpbge zero, t1, t8
mov t1, t0
$u_eocfin: # end-of-count, final word
or t10, t8, t8
br $u_final
/* Unaligned copy entry point. */
.align 3
$unaligned:
ldq_u t1, 0(a1) # e0 : load first source word
and a0, 7, t4 # .. e1 : find dest misalignment
and a1, 7, t5 # e0 : find src misalignment
/* Conditionally load the first destination word and a bytemask
with 0xff indicating that the destination byte is sacrosanct. */
mov zero, t0 # .. e1 :
mov zero, t6 # e0 :
beq t4, 1f # .. e1 :
ldq_u t0, 0(a0) # e0 :
lda t6, -1 # .. e1 :
mskql t6, a0, t6 # e0 :
subq a1, t4, a1 # .. e1 : sub dest misalignment from src addr
/* If source misalignment is larger than dest misalignment, we need
extra startup checks to avoid SEGV. */
1: cmplt t4, t5, t12 # e1 :
extql t1, a1, t1 # .. e0 : shift src into place
lda t2, -1 # e0 : for creating masks later
beq t12, $u_head # .. e1 :
extql t2, a1, t2 # e0 :
cmpbge zero, t1, t8 # .. e1 : is there a zero?
andnot t2, t6, t12 # e0 : dest mask for a single word copy
or t8, t10, t5 # .. e1 : test for end-of-count too
cmpbge zero, t12, t3 # e0 :
cmoveq a2, t5, t8 # .. e1 :
andnot t8, t3, t8 # e0 :
beq t8, $u_head # .. e1 (zdb)
/* At this point we've found a zero in the first partial word of
the source. We need to isolate the valid source data and mask
it into the original destination data. (Incidentally, we know
that we'll need at least one byte of that original dest word.) */
ldq_u t0, 0(a0) # e0 :
negq t8, t6 # .. e1 : build bitmask of bytes <= zero
mskqh t1, t4, t1 # e0 :
and t6, t8, t2 # .. e1 :
subq t2, 1, t6 # e0 :
or t6, t2, t8 # e1 :
zapnot t12, t8, t12 # e0 : prepare source word; mirror changes
zapnot t1, t8, t1 # .. e1 : to source validity mask
andnot t0, t12, t0 # e0 : zero place for source to reside
or t0, t1, t0 # e1 : and put it there
stq_u t0, 0(a0) # e0 :
ret (t9) # .. e1 :
.end __stxncpy
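The entry sequence of __stxncpy above converts the byte count into two things: the loop counter described by its own comment as (count - 1)/8 after biasing by the destination misalignment, and the one-hot mask t10 marking where the count runs out inside the final word. t10 is later OR'd into the cmpbge null mask when the counter reaches zero (the cmoveq a2 steps), so end-of-count is funneled into the same final-word path as end-of-string. A small C sketch of just that setup arithmetic (function and parameter names are mine; count is assumed nonzero, as the header requires):

static void stxncpy_setup(unsigned long dst, unsigned long count,
                          unsigned long *loop_words, unsigned long *eoc_mask)
{
    unsigned long mis    = dst & 7;           /* destination misalignment      */
    unsigned long biased = count + mis - 1;   /* addq a2, t0, a2 ; subq 1      */
    *loop_words = biased >> 3;                /* srl a2, 3 : loop counter      */
    *eoc_mask   = 1UL << (biased & 7);        /* sll : t10, last count byte    */
}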

55
arch/alpha/lib/udelay.c Normal file

@@ -0,0 +1,55 @@
/*
* Copyright (C) 1993, 2000 Linus Torvalds
*
* Delay routines, using a pre-computed "loops_per_jiffy" value.
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/sched.h> /* for udelay's use of smp_processor_id */
#include <asm/param.h>
#include <asm/smp.h>
#include <linux/delay.h>
/*
* Use only for very small delays (< 1 msec).
*
* The active part of our cycle counter is only 32-bits wide, and
* we're treating the difference between two marks as signed. On
* a 1GHz box, that's about 2 seconds.
*/
void
__delay(int loops)
{
int tmp;
__asm__ __volatile__(
" rpcc %0\n"
" addl %1,%0,%1\n"
"1: rpcc %0\n"
" subl %1,%0,%0\n"
" bgt %0,1b"
: "=&r" (tmp), "=r" (loops) : "1"(loops));
}
#ifdef CONFIG_SMP
#define LPJ cpu_data[smp_processor_id()].loops_per_jiffy
#else
#define LPJ loops_per_jiffy
#endif
void
udelay(unsigned long usecs)
{
usecs *= (((unsigned long)HZ << 32) / 1000000) * LPJ;
__delay((long)usecs >> 32);
}
EXPORT_SYMBOL(udelay);
void
ndelay(unsigned long nsecs)
{
nsecs *= (((unsigned long)HZ << 32) / 1000000000) * LPJ;
__delay((long)nsecs >> 32);
}
EXPORT_SYMBOL(ndelay);
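udelay() above avoids a run-time division by folding HZ/1000000 into a 32.32 fixed-point scale factor: the microsecond count is multiplied by that scale and by loops_per_jiffy, and the top 32 bits of the product are the loop count handed to __delay(). Written out plainly, assuming a 64-bit unsigned long as on Alpha (names are illustrative, and as the comment above warns, the product overflows for anything but small delays):

static unsigned long usecs_to_loops(unsigned long usecs,
                                    unsigned long loops_per_jiffy,
                                    unsigned long hz)
{
    unsigned long scale = (hz << 32) / 1000000;      /* 32.32 fixed point    */
    return (usecs * scale * loops_per_jiffy) >> 32;  /* ~ usecs*hz*lpj/1e6   */
}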