#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
# project.
# ====================================================================
# Poly1305 hash for MIPS.
#
# May 2016
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#		IALU/gcc
# R1x000	~5.5/+130%	(big-endian)
# Octeon II	2.50/+70%	(little-endian)
#
# March 2019
#
# Add 32-bit code path.
#
# October 2019
#
# Modulo-scheduling reduction allows to omit dependency chain at the
# end of inner loop and improve performance. Also optimize MIPS32R2
# code path for MIPS 1004K core. Per René von Dorst's suggestions.
#
#		IALU/gcc
# R1x000	~9.8/?		(big-endian)
# Octeon II	3.65/+140%	(little-endian)
# MT7621/1004K	4.75/?		(little-endian)
#
######################################################################
# There is a number of MIPS ABI in use, O32 and N32/64 are most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#

use strict;
use warnings;

# NUBI register layout; these scalars interpolate into the assembly
# heredocs below as literal "$N" register names.
my ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
my ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp [o32 can be
#   excluded from the rule, because it's specified volatile];
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
# <dot-asm@dot-asm.net>
#
######################################################################

my $flavour = shift || "64";	# supported flavours are o32,n32,64,nubi32,nubi64

my $v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;

my $code = "";			# accumulates the generated assembly

if ($flavour =~ /64|n32/i) {{{
######################################################################
# 64-bit code path
#

my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);

$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
     defined(_MIPS_ARCH_MIPS64R6)) \\
     && !defined(_MIPS_ARCH_MIPS64R2)
# define _MIPS_ARCH_MIPS64R2
#endif

#if defined(_MIPS_ARCH_MIPS64R6)
# define dmultu(rs,rt)
# define mflo(rd,rs,rt)	dmulu	rd,rs,rt
# define mfhi(rd,rs,rt)	dmuhu	rd,rs,rt
#else
# define dmultu(rs,rt)	dmultu	rs,rt
# define mflo(rd,rs,rt)	mflo	rd
# define mfhi(rd,rs,rt)	mfhi	rd
#endif

#ifdef	__KERNEL__
# define poly1305_init   poly1305_init_mips
# define poly1305_blocks poly1305_blocks_mips
# define poly1305_emit   poly1305_emit_mips
#endif

#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif

#ifdef MIPSEB
# define MSB 0
# define LSB 7
#else
# define MSB 7
# define LSB 0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sd	$zero,0($ctx)
	sd	$zero,8($ctx)
	sd	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS64R6)
	andi	$tmp0,$inp,7		# $inp % 8
	dsubu	$inp,$inp,$tmp0		# align $inp
	sll	$tmp0,$tmp0,3		# byte to bit offset
	ld	$in0,0($inp)
	ld	$in1,8($inp)
	beqz	$tmp0,.Laligned_key
	ld	$tmp2,16($inp)

	subu	$tmp1,$zero,$tmp0
# ifdef	MIPSEB
	dsllv	$in0,$in0,$tmp0
	dsrlv	$tmp3,$in1,$tmp1
	dsllv	$in1,$in1,$tmp0
	dsrlv	$tmp2,$tmp2,$tmp1
# else
	dsrlv	$in0,$in0,$tmp0
	dsllv	$tmp3,$in1,$tmp1
	dsrlv	$in1,$in1,$tmp0
	dsllv	$tmp2,$tmp2,$tmp1
# endif
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2
.Laligned_key:
#else
	ldl	$in0,0+MSB($inp)
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	dsbh	$in1,$in1
	dshd	$in0,$in0
	dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	dsll	$tmp3,24
	and	$tmp2,$tmp0
	and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	dsrl	$in1,8
	dsll	$tmp2,8
	dsll	$tmp4,8
	and	$in0,$tmp0
	and	$in1,$tmp0
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	or	$in0,$tmp1
	or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	dsrl	$tmp3,$in1,32
	dsll	$in0,32
	dsll	$in1,32
	or	$in0,$tmp1
	or	$in1,$tmp3
# endif
#endif
	li	$tmp0,1
	dsll	$tmp0,32		# 0x0000000100000000
	daddiu	$tmp0,-63		# 0x00000000ffffffc1
	dsll	$tmp0,28		# 0x0ffffffc10000000
	daddiu	$tmp0,-1		# 0x0ffffffc0fffffff

	and	$in0,$tmp0
	daddiu	$tmp0,-3		# 0x0ffffffc0ffffffc
	and	$in1,$tmp0

	sd	$in0,24($ctx)
	dsrl	$tmp0,$in1,2
	sd	$in1,32($ctx)
	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2)
	sd	$tmp0,40($ctx)

.Lno_key:
	li	$v0,0			# return 0
	jr	$ra
.end	poly1305_init
___
{
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";

my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
my ($shr,$shl) = ($s6,$s7);		# used on R6

$code.=<<___;
.align	5
.globl	poly1305_blocks
.ent	poly1305_blocks
poly1305_blocks:
	.set	noreorder
	dsrl	$len,4			# number of complete blocks
	bnez	$len,poly1305_blocks_internal
	nop
	jr	$ra
	nop
.end	poly1305_blocks

.align	5
.ent	poly1305_blocks_internal
poly1305_blocks_internal:
	.set	noreorder
#if defined(_MIPS_ARCH_MIPS64R6)
	.frame	$sp,8*8,$ra
	.mask	$SAVED_REGS_MASK|0x000c0000,-8
	dsubu	$sp,8*8
	sd	$s7,56($sp)
	sd	$s6,48($sp)
#else
	.frame	$sp,6*8,$ra
	.mask	$SAVED_REGS_MASK,-8
	dsubu	$sp,6*8
#endif
	sd	$s5,40($sp)
	sd	$s4,32($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	sd	$s3,24($sp)
	sd	$s2,16($sp)
	sd	$s1,8($sp)
	sd	$s0,0($sp)
___
$code.=<<___;
	.set	reorder
#if defined(_MIPS_ARCH_MIPS64R6)
	andi	$shr,$inp,7
	dsubu	$inp,$inp,$shr		# align $inp
	sll	$shr,$shr,3		# byte to bit offset
	subu	$shl,$zero,$shr
#endif

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	ld	$r0,24($ctx)		# load key
	ld	$r1,32($ctx)
	ld	$rs1,40($ctx)

	dsll	$len,4
	daddu	$len,$inp		# end of buffer
	b	.Loop

.align	4
.Loop:
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$in0,0($inp)		# load input
	ld	$in1,8($inp)
	beqz	$shr,.Laligned_inp

	ld	$tmp2,16($inp)
# ifdef	MIPSEB
	dsllv	$in0,$in0,$shr
	dsrlv	$tmp3,$in1,$shl
	dsllv	$in1,$in1,$shr
	dsrlv	$tmp2,$tmp2,$shl
# else
	dsrlv	$in0,$in0,$shr
	dsllv	$tmp3,$in1,$shl
	dsrlv	$in1,$in1,$shr
	dsllv	$tmp2,$tmp2,$shl
# endif
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2
.Laligned_inp:
#else
	ldl	$in0,0+MSB($inp)	# load input
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
	daddiu	$inp,16
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	dsbh	$in1,$in1
	dshd	$in0,$in0
	dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	dsll	$tmp3,24
	and	$tmp2,$tmp0
	and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	dsrl	$in1,8
	dsll	$tmp2,8
	dsll	$tmp4,8
	and	$in0,$tmp0
	and	$in1,$tmp0
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	or	$in0,$tmp1
	or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	dsrl	$tmp3,$in1,32
	dsll	$in0,32
	dsll	$in1,32
	or	$in0,$tmp1
	or	$in1,$tmp3
# endif
#endif
	dsrl	$tmp1,$h2,2		# modulo-scheduled reduction
	andi	$h2,$h2,3
	dsll	$tmp0,$tmp1,2

	daddu	$d0,$h0,$in0		# accumulate input
	daddu	$tmp1,$tmp0
	sltu	$tmp0,$d0,$h0
	daddu	$d0,$d0,$tmp1		# ... and residue
	sltu	$tmp1,$d0,$tmp1
	daddu	$d1,$h1,$in1
	daddu	$tmp0,$tmp1
	sltu	$tmp1,$d1,$h1
	daddu	$d1,$tmp0

	dmultu	($r0,$d0)		# h0*r0
	 daddu	$d2,$h2,$padbit
	 sltu	$tmp0,$d1,$tmp0
	mflo	($h0,$r0,$d0)
	mfhi	($h1,$r0,$d0)

	dmultu	($rs1,$d1)		# h1*5*r1
	 daddu	$d2,$tmp1
	 daddu	$d2,$tmp0
	mflo	($tmp0,$rs1,$d1)
	mfhi	($tmp1,$rs1,$d1)

	dmultu	($r1,$d0)		# h0*r1
	mflo	($tmp2,$r1,$d0)
	mfhi	($h2,$r1,$d0)
	 daddu	$h0,$tmp0
	 daddu	$h1,$tmp1
	 sltu	$tmp0,$h0,$tmp0

	dmultu	($r0,$d1)		# h1*r0
	 daddu	$h1,$tmp0
	 daddu	$h1,$tmp2
	mflo	($tmp0,$r0,$d1)
	mfhi	($tmp1,$r0,$d1)

	dmultu	($rs1,$d2)		# h2*5*r1
	 sltu	$tmp2,$h1,$tmp2
	 daddu	$h2,$tmp2
	mflo	($tmp2,$rs1,$d2)

	dmultu	($r0,$d2)		# h2*r0
	 daddu	$h1,$tmp0
	 daddu	$h2,$tmp1
	mflo	($tmp3,$r0,$d2)
	 sltu	$tmp0,$h1,$tmp0
	 daddu	$h2,$tmp0

	daddu	$h1,$tmp2
	sltu	$tmp2,$h1,$tmp2
	daddu	$h2,$tmp2
	daddu	$h2,$tmp3

	bne	$inp,$len,.Loop

	sd	$h0,0($ctx)		# store hash value
	sd	$h1,8($ctx)
	sd	$h2,16($ctx)

	.set	noreorder
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$s7,56($sp)
	ld	$s6,48($sp)
#endif
	ld	$s5,40($sp)		# epilogue
	ld	$s4,32($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
	ld	$s3,24($sp)
	ld	$s2,16($sp)
	ld	$s1,8($sp)
	ld	$s0,0($sp)
___
$code.=<<___;
	jr	$ra
#if defined(_MIPS_ARCH_MIPS64R6)
	daddu	$sp,8*8
#else
	daddu	$sp,6*8
#endif
.end	poly1305_blocks_internal
___
}
{
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);

$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	ld	$tmp2,16($ctx)
	ld	$tmp0,0($ctx)
	ld	$tmp1,8($ctx)

	li	$in0,-4			# final reduction
	dsrl	$in1,$tmp2,2
	and	$in0,$tmp2
	andi	$tmp2,$tmp2,3
	daddu	$in0,$in1

	daddu	$tmp0,$tmp0,$in0
	sltu	$in1,$tmp0,$in0
	 daddiu	$in0,$tmp0,5		# compare to modulus
	daddu	$tmp1,$tmp1,$in1
	 sltiu	$tmp3,$in0,5
	sltu	$tmp4,$tmp1,$in1
	 daddu	$in1,$tmp1,$tmp3
	daddu	$tmp2,$tmp2,$tmp4
	 sltu	$tmp3,$in1,$tmp3
	 daddu	$tmp2,$tmp2,$tmp3

	dsrl	$tmp2,2			# see if it carried/borrowed
	dsubu	$tmp2,$zero,$tmp2

	xor	$in0,$tmp0
	xor	$in1,$tmp1
	and	$in0,$tmp2
	and	$in1,$tmp2
	xor	$in0,$tmp0
	xor	$in1,$tmp1

	lwu	$tmp0,0($nonce)		# load nonce
	lwu	$tmp1,4($nonce)
	lwu	$tmp2,8($nonce)
	lwu	$tmp3,12($nonce)
	dsll	$tmp1,32
	dsll	$tmp3,32
	or	$tmp0,$tmp1
	or	$tmp2,$tmp3

	daddu	$in0,$tmp0		# accumulate nonce
	daddu	$in1,$tmp2
	sltu	$tmp0,$in0,$tmp0
	daddu	$in1,$tmp0

	dsrl	$tmp0,$in0,8		# write mac value
	dsrl	$tmp1,$in0,16
	dsrl	$tmp2,$in0,24
	sb	$in0,0($mac)
	dsrl	$tmp3,$in0,32
	sb	$tmp0,1($mac)
	dsrl	$tmp0,$in0,40
	sb	$tmp1,2($mac)
	dsrl	$tmp1,$in0,48
	sb	$tmp2,3($mac)
	dsrl	$tmp2,$in0,56
	sb	$tmp3,4($mac)
	dsrl	$tmp3,$in1,8
	sb	$tmp0,5($mac)
	dsrl	$tmp0,$in1,16
	sb	$tmp1,6($mac)
	dsrl	$tmp1,$in1,24
	sb	$tmp2,7($mac)

	sb	$in1,8($mac)
	dsrl	$tmp2,$in1,32
	sb	$tmp3,9($mac)
	dsrl	$tmp3,$in1,40
	sb	$tmp0,10($mac)
	dsrl	$tmp0,$in1,48
	sb	$tmp1,11($mac)
	dsrl	$tmp1,$in1,56
	sb	$tmp2,12($mac)
	sb	$tmp3,13($mac)
	sb	$tmp0,14($mac)
	sb	$tmp1,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
.align	2
___
}
}}} else {{{
######################################################################
# 32-bit code path
#

my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
   ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);

$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
     defined(_MIPS_ARCH_MIPS32R6)) \\
     && !defined(_MIPS_ARCH_MIPS32R2)
# define _MIPS_ARCH_MIPS32R2
#endif

#if defined(_MIPS_ARCH_MIPS32R6)
# define multu(rs,rt)
# define mflo(rd,rs,rt)	mulu	rd,rs,rt
# define mfhi(rd,rs,rt)	muhu	rd,rs,rt
#else
# define multu(rs,rt)	multu	rs,rt
# define mflo(rd,rs,rt)	mflo	rd
# define mfhi(rd,rs,rt)	mfhi	rd
#endif

#ifdef	__KERNEL__
# define poly1305_init   poly1305_init_mips
# define poly1305_blocks poly1305_blocks_mips
# define poly1305_emit   poly1305_emit_mips
#endif

#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif

#ifdef MIPSEB
# define MSB 0
# define LSB 3
#else
# define MSB 3
# define LSB 0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sw	$zero,0($ctx)
	sw	$zero,4($ctx)
	sw	$zero,8($ctx)
	sw	$zero,12($ctx)
	sw	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS32R6)
	andi	$tmp0,$inp,3		# $inp % 4
	subu	$inp,$inp,$tmp0		# align $inp
	sll	$tmp0,$tmp0,3		# byte to bit offset
	lw	$in0,0($inp)
	lw	$in1,4($inp)
	lw	$in2,8($inp)
	lw	$in3,12($inp)
	beqz	$tmp0,.Laligned_key
	lw	$tmp2,16($inp)

	subu	$tmp1,$zero,$tmp0
# ifdef	MIPSEB
	sllv	$in0,$in0,$tmp0
	srlv	$tmp3,$in1,$tmp1
	sllv	$in1,$in1,$tmp0
	or	$in0,$in0,$tmp3
	srlv	$tmp3,$in2,$tmp1
	sllv	$in2,$in2,$tmp0
	or	$in1,$in1,$tmp3
	srlv	$tmp3,$in3,$tmp1
	sllv	$in3,$in3,$tmp0
	or	$in2,$in2,$tmp3
	srlv	$tmp2,$tmp2,$tmp1
	or	$in3,$in3,$tmp2
# else
	srlv	$in0,$in0,$tmp0
	sllv	$tmp3,$in1,$tmp1
	srlv	$in1,$in1,$tmp0
	or	$in0,$in0,$tmp3
	sllv	$tmp3,$in2,$tmp1
	srlv	$in2,$in2,$tmp0
	or	$in1,$in1,$tmp3
	sllv	$tmp3,$in3,$tmp1
	srlv	$in3,$in3,$tmp0
	or	$in2,$in2,$tmp3
	sllv	$tmp2,$tmp2,$tmp1
	or	$in3,$in3,$tmp2
# endif
.Laligned_key:
#else
	lwl	$in0,0+MSB($inp)
	lwl	$in1,4+MSB($inp)
	lwl	$in2,8+MSB($inp)
	lwl	$in3,12+MSB($inp)
	lwr	$in0,0+LSB($inp)
	lwr	$in1,4+LSB($inp)
	lwr	$in2,8+LSB($inp)
	lwr	$in3,12+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS32R2)
	wsbh	$in0,$in0		# byte swap
	wsbh	$in1,$in1
	wsbh	$in2,$in2
	wsbh	$in3,$in3
	rotr	$in0,$in0,16
	rotr	$in1,$in1,16
	rotr	$in2,$in2,16
	rotr	$in3,$in3,16
# else
	srl	$tmp0,$in0,24		# byte swap
	srl	$tmp1,$in0,8
	andi	$tmp2,$in0,0xFF00
	sll	$in0,$in0,24
	andi	$tmp1,0xFF00
	sll	$tmp2,$tmp2,8
	or	$in0,$tmp0
	 srl	$tmp0,$in1,24
	or	$tmp1,$tmp2
	 srl	$tmp2,$in1,8
	or	$in0,$tmp1
	 andi	$tmp1,$in1,0xFF00
	 sll	$in1,$in1,24
	 andi	$tmp2,0xFF00
	 sll	$tmp1,$tmp1,8
	 or	$in1,$tmp0
	srl	$tmp0,$in2,24
	 or	$tmp2,$tmp1
	srl	$tmp1,$in2,8
	 or	$in1,$tmp2
	andi	$tmp2,$in2,0xFF00
	sll	$in2,$in2,24
	andi	$tmp1,0xFF00
	sll	$tmp2,$tmp2,8
	or	$in2,$tmp0
	 srl	$tmp0,$in3,24
	or	$tmp1,$tmp2
	 srl	$tmp2,$in3,8
	or	$in2,$tmp1
	 andi	$tmp1,$in3,0xFF00
	 sll	$in3,$in3,24
	 andi	$tmp2,0xFF00
	 sll	$tmp1,$tmp1,8
	 or	$in3,$tmp0
	 or	$tmp2,$tmp1
	 or	$in3,$tmp2
# endif
#endif
	lui	$tmp0,0x0fff
	ori	$tmp0,0xffff		# 0x0fffffff
	and	$in0,$in0,$tmp0
	subu	$tmp0,3			# 0x0ffffffc
	and	$in1,$in1,$tmp0
	and	$in2,$in2,$tmp0
	and	$in3,$in3,$tmp0

	sw	$in0,20($ctx)
	sw	$in1,24($ctx)
	sw	$in2,28($ctx)
	sw	$in3,32($ctx)

	srl	$tmp1,$in1,2
	srl	$tmp2,$in2,2
	srl	$tmp3,$in3,2
	addu	$in1,$in1,$tmp1		# s1 = r1 + (r1 >> 2)
	addu	$in2,$in2,$tmp2
	addu	$in3,$in3,$tmp3
	sw	$in1,36($ctx)
	sw	$in2,40($ctx)
	sw	$in3,44($ctx)
.Lno_key:
	li	$v0,0
	jr	$ra
.end	poly1305_init
___
{
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";

my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
my ($d0,$d1,$d2,$d3) =
   ($a4,$a5,$a6,$a7);
my $shr = $t2;		# used on R6
my $one = $t2;		# used on R2

$code.=<<___;
.globl	poly1305_blocks
.align	5
.ent	poly1305_blocks
poly1305_blocks:
	.frame	$sp,16*4,$ra
	.mask	$SAVED_REGS_MASK,-4
	.set	noreorder
	subu	$sp, $sp,4*12
	sw	$s11,4*11($sp)
	sw	$s10,4*10($sp)
	sw	$s9, 4*9($sp)
	sw	$s8, 4*8($sp)
	sw	$s7, 4*7($sp)
	sw	$s6, 4*6($sp)
	sw	$s5, 4*5($sp)
	sw	$s4, 4*4($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	sw	$s3, 4*3($sp)
	sw	$s2, 4*2($sp)
	sw	$s1, 4*1($sp)
	sw	$s0, 4*0($sp)
___
$code.=<<___;
	.set	reorder
	srl	$len,4			# number of complete blocks
	li	$one,1
	beqz	$len,.Labort

#if defined(_MIPS_ARCH_MIPS32R6)
	andi	$shr,$inp,3
	subu	$inp,$inp,$shr		# align $inp
	sll	$shr,$shr,3		# byte to bit offset
#endif

	lw	$h0,0($ctx)		# load hash value
	lw	$h1,4($ctx)
	lw	$h2,8($ctx)
	lw	$h3,12($ctx)
	lw	$h4,16($ctx)

	lw	$r0,20($ctx)		# load key
	lw	$r1,24($ctx)
	lw	$r2,28($ctx)
	lw	$r3,32($ctx)
	lw	$rs1,36($ctx)
	lw	$rs2,40($ctx)
	lw	$rs3,44($ctx)

	sll	$len,4
	addu	$len,$len,$inp		# end of buffer
	b	.Loop

.align	4
.Loop:
#if defined(_MIPS_ARCH_MIPS32R6)
	lw	$d0,0($inp)		# load input
	lw	$d1,4($inp)
	lw	$d2,8($inp)
	lw	$d3,12($inp)
	beqz	$shr,.Laligned_inp

	lw	$t0,16($inp)
	subu	$t1,$zero,$shr
# ifdef	MIPSEB
	sllv	$d0,$d0,$shr
	srlv	$at,$d1,$t1
	sllv	$d1,$d1,$shr
	or	$d0,$d0,$at
	srlv	$at,$d2,$t1
	sllv	$d2,$d2,$shr
	or	$d1,$d1,$at
	srlv	$at,$d3,$t1
	sllv	$d3,$d3,$shr
	or	$d2,$d2,$at
	srlv	$t0,$t0,$t1
	or	$d3,$d3,$t0
# else
	srlv	$d0,$d0,$shr
	sllv	$at,$d1,$t1
	srlv	$d1,$d1,$shr
	or	$d0,$d0,$at
	sllv	$at,$d2,$t1
	srlv	$d2,$d2,$shr
	or	$d1,$d1,$at
	sllv	$at,$d3,$t1
	srlv	$d3,$d3,$shr
	or	$d2,$d2,$at
	sllv	$t0,$t0,$t1
	or	$d3,$d3,$t0
# endif
.Laligned_inp:
#else
	lwl	$d0,0+MSB($inp)		# load input
	lwl	$d1,4+MSB($inp)
	lwl	$d2,8+MSB($inp)
	lwl	$d3,12+MSB($inp)
	lwr	$d0,0+LSB($inp)
	lwr	$d1,4+LSB($inp)
	lwr	$d2,8+LSB($inp)
	lwr	$d3,12+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS32R2)
	wsbh	$d0,$d0			# byte swap
	wsbh	$d1,$d1
	wsbh	$d2,$d2
	wsbh	$d3,$d3
	rotr	$d0,$d0,16
	rotr	$d1,$d1,16
	rotr	$d2,$d2,16
	rotr	$d3,$d3,16
# else
	srl	$at,$d0,24		# byte swap
	srl	$t0,$d0,8
	andi	$t1,$d0,0xFF00
	sll	$d0,$d0,24
	andi	$t0,0xFF00
	sll	$t1,$t1,8
	or	$d0,$at
	 srl	$at,$d1,24
	or	$t0,$t1
	 srl	$t1,$d1,8
	or	$d0,$t0
	 andi	$t0,$d1,0xFF00
	 sll	$d1,$d1,24
	 andi	$t1,0xFF00
	 sll	$t0,$t0,8
	 or	$d1,$at
	srl	$at,$d2,24
	 or	$t1,$t0
	srl	$t0,$d2,8
	 or	$d1,$t1
	andi	$t1,$d2,0xFF00
	sll	$d2,$d2,24
	andi	$t0,0xFF00
	sll	$t1,$t1,8
	or	$d2,$at
	 srl	$at,$d3,24
	or	$t0,$t1
	 srl	$t1,$d3,8
	or	$d2,$t0
	 andi	$t0,$d3,0xFF00
	 sll	$d3,$d3,24
	 andi	$t1,0xFF00
	 sll	$t0,$t0,8
	 or	$d3,$at
	 or	$t1,$t0
	 or	$d3,$t1
# endif
#endif
	srl	$t0,$h4,2		# modulo-scheduled reduction
	andi	$h4,$h4,3
	sll	$at,$t0,2

	addu	$d0,$d0,$h0		# accumulate input
	 addu	$t0,$t0,$at
	sltu	$h0,$d0,$h0
	addu	$d0,$d0,$t0		# ... and residue
	sltu	$at,$d0,$t0

	addu	$d1,$d1,$h1
	 addu	$h0,$h0,$at		# carry
	sltu	$h1,$d1,$h1
	addu	$d1,$d1,$h0
	sltu	$h0,$d1,$h0

	addu	$d2,$d2,$h2
	 addu	$h1,$h1,$h0		# carry
	sltu	$h2,$d2,$h2
	addu	$d2,$d2,$h1
	sltu	$h1,$d2,$h1

	addu	$d3,$d3,$h3
	 addu	$h2,$h2,$h1		# carry
	sltu	$h3,$d3,$h3
	addu	$d3,$d3,$h2

#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
	multu	$r0,$d0			# d0*r0
	 sltu	$h2,$d3,$h2
	maddu	$rs3,$d1		# d1*s3
	 addu	$h3,$h3,$h2		# carry
	maddu	$rs2,$d2		# d2*s2
	 addu	$h4,$h4,$padbit
	maddu	$rs1,$d3		# d3*s1
	 addu	$h4,$h4,$h3
	mfhi	$at
	mflo	$h0

	multu	$r1,$d0			# d0*r1
	maddu	$r0,$d1			# d1*r0
	maddu	$rs3,$d2		# d2*s3
	maddu	$rs2,$d3		# d3*s2
	maddu	$rs1,$h4		# h4*s1
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h1

	multu	$r2,$d0			# d0*r2
	maddu	$r1,$d1			# d1*r1
	maddu	$r0,$d2			# d2*r0
	maddu	$rs3,$d3		# d3*s3
	maddu	$rs2,$h4		# h4*s2
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h2

	mul	$t0,$r0,$h4		# h4*r0

	multu	$r3,$d0			# d0*r3
	maddu	$r2,$d1			# d1*r2
	maddu	$r1,$d2			# d2*r1
	maddu	$r0,$d3			# d3*r0
	maddu	$rs3,$h4		# h4*s3
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h3

	 addiu	$inp,$inp,16

	addu	$h4,$t0,$at
#else
	multu	($r0,$d0)		# d0*r0
	mflo	($h0,$r0,$d0)
	mfhi	($h1,$r0,$d0)

	 sltu	$h2,$d3,$h2
	 addu	$h3,$h3,$h2		# carry

	multu	($rs3,$d1)		# d1*s3
	mflo	($at,$rs3,$d1)
	mfhi	($t0,$rs3,$d1)

	 addu	$h4,$h4,$padbit
	 addiu	$inp,$inp,16
	 addu	$h4,$h4,$h3

	multu	($rs2,$d2)		# d2*s2
	mflo	($a3,$rs2,$d2)
	mfhi	($t1,$rs2,$d2)
	 addu	$h0,$h0,$at
	 addu	$h1,$h1,$t0
	multu	($rs1,$d3)		# d3*s1
	 sltu	$at,$h0,$at
	 addu	$h1,$h1,$at

	mflo	($at,$rs1,$d3)
	mfhi	($t0,$rs1,$d3)
	 addu	$h0,$h0,$a3
	 addu	$h1,$h1,$t1
	multu	($r1,$d0)		# d0*r1
	 sltu	$a3,$h0,$a3
	 addu	$h1,$h1,$a3


	mflo	($a3,$r1,$d0)
	mfhi	($h2,$r1,$d0)
	 addu	$h0,$h0,$at
	 addu	$h1,$h1,$t0
	multu	($r0,$d1)		# d1*r0
	 sltu	$at,$h0,$at
	 addu	$h1,$h1,$at

	mflo	($at,$r0,$d1)
	mfhi	($t0,$r0,$d1)
	 addu	$h1,$h1,$a3
	 sltu	$a3,$h1,$a3
	multu	($rs3,$d2)		# d2*s3
	 addu	$h2,$h2,$a3

	mflo	($a3,$rs3,$d2)
	mfhi	($t1,$rs3,$d2)
	 addu	$h1,$h1,$at
	 addu	$h2,$h2,$t0
	multu	($rs2,$d3)		# d3*s2
	 sltu	$at,$h1,$at
	 addu	$h2,$h2,$at

	mflo	($at,$rs2,$d3)
	mfhi	($t0,$rs2,$d3)
	 addu	$h1,$h1,$a3
	 addu	$h2,$h2,$t1
	multu	($rs1,$h4)		# h4*s1
	 sltu	$a3,$h1,$a3
	 addu	$h2,$h2,$a3

	mflo	($a3,$rs1,$h4)
	 addu	$h1,$h1,$at
	 addu	$h2,$h2,$t0
	multu	($r2,$d0)		# d0*r2
	 sltu	$at,$h1,$at
	 addu	$h2,$h2,$at

	mflo	($at,$r2,$d0)
	mfhi	($h3,$r2,$d0)
	 addu	$h1,$h1,$a3
	 sltu	$a3,$h1,$a3
	multu	($r1,$d1)		# d1*r1
	 addu	$h2,$h2,$a3

	mflo	($a3,$r1,$d1)
	mfhi	($t1,$r1,$d1)
	 addu	$h2,$h2,$at
	 sltu	$at,$h2,$at
	multu	($r0,$d2)		# d2*r0
	 addu	$h3,$h3,$at

	mflo	($at,$r0,$d2)
	mfhi	($t0,$r0,$d2)
	 addu	$h2,$h2,$a3
	 addu	$h3,$h3,$t1
	multu	($rs3,$d3)		# d3*s3
	 sltu	$a3,$h2,$a3
	 addu	$h3,$h3,$a3

	mflo	($a3,$rs3,$d3)
	mfhi	($t1,$rs3,$d3)
	 addu	$h2,$h2,$at
	 addu	$h3,$h3,$t0
	multu	($rs2,$h4)		# h4*s2
	 sltu	$at,$h2,$at
	 addu	$h3,$h3,$at

	mflo	($at,$rs2,$h4)
	 addu	$h2,$h2,$a3
	 addu	$h3,$h3,$t1
	multu	($r3,$d0)		# d0*r3
	 sltu	$a3,$h2,$a3
	 addu	$h3,$h3,$a3

	mflo	($a3,$r3,$d0)
	mfhi	($t1,$r3,$d0)
	 addu	$h2,$h2,$at
	 sltu	$at,$h2,$at
	multu	($r2,$d1)		# d1*r2
	 addu	$h3,$h3,$at

	mflo	($at,$r2,$d1)
	mfhi	($t0,$r2,$d1)
	 addu	$h3,$h3,$a3
	 sltu	$a3,$h3,$a3
	multu	($r0,$d3)		# d3*r0
	 addu	$t1,$t1,$a3

	mflo	($a3,$r0,$d3)
	mfhi	($d3,$r0,$d3)
	 addu	$h3,$h3,$at
	 addu	$t1,$t1,$t0
	multu	($r1,$d2)		# d2*r1
	 sltu	$at,$h3,$at
	 addu	$t1,$t1,$at

	mflo	($at,$r1,$d2)
	mfhi	($t0,$r1,$d2)
	 addu	$h3,$h3,$a3
	 addu	$t1,$t1,$d3
	multu	($rs3,$h4)		# h4*s3
	 sltu	$a3,$h3,$a3
	 addu	$t1,$t1,$a3

	mflo	($a3,$rs3,$h4)
	 addu	$h3,$h3,$at
	 addu	$t1,$t1,$t0
	multu	($r0,$h4)		# h4*r0
	 sltu	$at,$h3,$at
	 addu	$t1,$t1,$at

	mflo	($h4,$r0,$h4)
	 addu	$h3,$h3,$a3
	 sltu	$a3,$h3,$a3
	 addu	$t1,$t1,$a3
	addu	$h4,$h4,$t1

	li	$padbit,1		# if we loop, padbit is 1
#endif
	bne	$inp,$len,.Loop

	sw	$h0,0($ctx)		# store hash value
	sw	$h1,4($ctx)
	sw	$h2,8($ctx)
	sw	$h3,12($ctx)
	sw	$h4,16($ctx)

	.set	noreorder
.Labort:
	lw	$s11,4*11($sp)
	lw	$s10,4*10($sp)
	lw	$s9, 4*9($sp)
	lw	$s8, 4*8($sp)
	lw	$s7, 4*7($sp)
	lw	$s6, 4*6($sp)
	lw	$s5, 4*5($sp)
	lw	$s4, 4*4($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
	lw	$s3, 4*3($sp)
	lw	$s2, 4*2($sp)
	lw	$s1, 4*1($sp)
	lw	$s0, 4*0($sp)
___
$code.=<<___;
	jr	$ra
	addu	$sp,$sp,4*12
.end	poly1305_blocks
___
}
{
my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);

$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	lw	$tmp4,16($ctx)
	lw	$tmp0,0($ctx)
	lw	$tmp1,4($ctx)
	lw	$tmp2,8($ctx)
	lw	$tmp3,12($ctx)

	li	$in0,-4			# final reduction
	srl	$ctx,$tmp4,2
	and	$in0,$in0,$tmp4
	andi	$tmp4,$tmp4,3
	addu	$ctx,$ctx,$in0

	addu	$tmp0,$tmp0,$ctx
	sltu	$ctx,$tmp0,$ctx
	 addiu	$in0,$tmp0,5		# compare to modulus
	addu	$tmp1,$tmp1,$ctx
	 sltiu	$in1,$in0,5
	sltu	$ctx,$tmp1,$ctx
	 addu	$in1,$in1,$tmp1
	addu	$tmp2,$tmp2,$ctx
	 sltu	$in2,$in1,$tmp1
	sltu	$ctx,$tmp2,$ctx
	 addu	$in2,$in2,$tmp2
	addu	$tmp3,$tmp3,$ctx
	 sltu	$in3,$in2,$tmp2
	sltu	$ctx,$tmp3,$ctx
	 addu	$in3,$in3,$tmp3
	addu	$tmp4,$tmp4,$ctx
	 sltu	$ctx,$in3,$tmp3
	 addu	$ctx,$tmp4

	srl	$ctx,2			# see if it carried/borrowed
	subu	$ctx,$zero,$ctx

	xor	$in0,$tmp0
	xor	$in1,$tmp1
	xor	$in2,$tmp2
	xor	$in3,$tmp3
	and	$in0,$ctx
	and	$in1,$ctx
	and	$in2,$ctx
	and	$in3,$ctx
	xor	$in0,$tmp0
	xor	$in1,$tmp1
	xor	$in2,$tmp2
	xor	$in3,$tmp3

	lw	$tmp0,0($nonce)		# load nonce
	lw	$tmp1,4($nonce)
	lw	$tmp2,8($nonce)
	lw	$tmp3,12($nonce)

	addu	$in0,$tmp0		# accumulate nonce
	sltu	$ctx,$in0,$tmp0

	addu	$in1,$tmp1
	sltu	$tmp1,$in1,$tmp1
	addu	$in1,$ctx
	sltu	$ctx,$in1,$ctx
	addu	$ctx,$tmp1

	addu	$in2,$tmp2
	sltu	$tmp2,$in2,$tmp2
	addu	$in2,$ctx
	sltu	$ctx,$in2,$ctx
	addu	$ctx,$tmp2

	addu	$in3,$tmp3
	addu	$in3,$ctx

	srl	$tmp0,$in0,8		# write mac value
	srl	$tmp1,$in0,16
	srl	$tmp2,$in0,24
	sb	$in0, 0($mac)
	sb	$tmp0,1($mac)
	srl	$tmp0,$in1,8
	sb	$tmp1,2($mac)
	srl	$tmp1,$in1,16
	sb	$tmp2,3($mac)
	srl	$tmp2,$in1,24
	sb	$in1, 4($mac)
	sb	$tmp0,5($mac)
	srl	$tmp0,$in2,8
	sb	$tmp1,6($mac)
	srl	$tmp1,$in2,16
	sb	$tmp2,7($mac)
	srl	$tmp2,$in2,24
	sb	$in2, 8($mac)
	sb	$tmp0,9($mac)
	srl	$tmp0,$in3,8
	sb	$tmp1,10($mac)
	srl	$tmp1,$in3,16
	sb	$tmp2,11($mac)
	srl	$tmp2,$in3,24
	sb	$in3, 12($mac)
	sb	$tmp0,13($mac)
	sb	$tmp1,14($mac)
	sb	$tmp2,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
.align	2
___
}
}}}

# Emit the generated assembly: to the file named by the last command-line
# argument when given, otherwise to stdout.  Three-argument open avoids
# mode injection via the filename and failures are no longer silent.
if (my $output = pop @ARGV) {
	open STDOUT, '>', $output or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";