poly1305-mips.pl

  1. #!/usr/bin/env perl
  2. # SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
  3. #
  4. # ====================================================================
  5. # Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
  6. # project.
  7. # ====================================================================
  8. # Poly1305 hash for MIPS.
  9. #
  10. # May 2016
  11. #
  12. # Numbers are cycles per processed byte with poly1305_blocks alone.
  13. #
  14. #               IALU/gcc
  15. # R1x000        ~5.5/+130%      (big-endian)
  16. # Octeon II     2.50/+70%       (little-endian)
  17. #
  18. # March 2019
  19. #
  20. # Add 32-bit code path.
  21. #
  22. # October 2019
  23. #
  24. # Modulo-scheduled reduction makes it possible to omit the dependency
  25. # chain at the end of the inner loop and improves performance. Also
  26. # optimize the MIPS32R2 code path for the MIPS 1004K core, per René von Dorst's suggestions.
  27. #
  28. #               IALU/gcc
  29. # R1x000        ~9.8/?          (big-endian)
  30. # Octeon II     3.65/+140%      (little-endian)
  31. # MT7621/1004K  4.75/?          (little-endian)
  32. #
  33. ######################################################################
  34. # There are a number of MIPS ABIs in use; O32 and N32/64 are the most
  35. # widely used. Then there is a new contender: NUBI. It appears that if
  36. # one picks the latter, it is possible to arrange the code in an
  37. # ABI-neutral manner. Therefore let's stick to the NUBI register layout:
  38. #
  39. ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
  40. ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
  41. ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
  42. ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
  43. #
  44. # The return value is placed in $a0. The following coding rules
  45. # facilitate interoperability:
  46. #
  47. # - never ever touch $tp, "thread pointer", former $gp [o32 can be
  48. # excluded from the rule, because it's specified volatile];
  49. # - copy return value to $t0, former $v0 [or to $a0 if you're adapting
  50. # old code];
  51. # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
  52. #
  53. # For reference here is register layout for N32/64 MIPS ABIs:
  54. #
  55. # ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
  56. # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
  57. # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
  58. # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
  59. # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
  60. #
  61. # <[email protected]>
  62. #
  63. ######################################################################
  64. $flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
  65. $v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
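# Usage sketch (inferred from the argument handling in this script): the
# first command-line argument selects the ABI flavour and an optional
# final argument names the output file, e.g.
#
#   perl poly1305-mips.pl 64 poly1305-mips64.S
#   perl poly1305-mips.pl o32 > poly1305-mips32.S
#
# With no output argument the generated assembly is written to STDOUT
# (see the "$output=pop and open STDOUT" line at the end of the file).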
  66. if ($flavour =~ /64|n32/i) {{{
  67. ######################################################################
  68. # 64-bit code path
  69. #
  70. my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
  71. my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
  72. $code.=<<___;
  73. #if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
  74. defined(_MIPS_ARCH_MIPS64R6)) \\
  75. && !defined(_MIPS_ARCH_MIPS64R2)
  76. # define _MIPS_ARCH_MIPS64R2
  77. #endif
  78. #if defined(_MIPS_ARCH_MIPS64R6)
  79. # define dmultu(rs,rt)
  80. # define mflo(rd,rs,rt) dmulu rd,rs,rt
  81. # define mfhi(rd,rs,rt) dmuhu rd,rs,rt
  82. #else
  83. # define dmultu(rs,rt) dmultu rs,rt
  84. # define mflo(rd,rs,rt) mflo rd
  85. # define mfhi(rd,rs,rt) mfhi rd
  86. #endif
  87. #ifdef __KERNEL__
  88. # define poly1305_init poly1305_init_mips
  89. # define poly1305_blocks poly1305_blocks_mips
  90. # define poly1305_emit poly1305_emit_mips
  91. #endif
  92. #if defined(__MIPSEB__) && !defined(MIPSEB)
  93. # define MIPSEB
  94. #endif
  95. #ifdef MIPSEB
  96. # define MSB 0
  97. # define LSB 7
  98. #else
  99. # define MSB 7
  100. # define LSB 0
  101. #endif
  102. .text
  103. .set noat
  104. .set noreorder
  105. .align 5
  106. .globl poly1305_init
  107. .ent poly1305_init
  108. poly1305_init:
  109. .frame $sp,0,$ra
  110. .set reorder
  111. sd $zero,0($ctx)
  112. sd $zero,8($ctx)
  113. sd $zero,16($ctx)
  114. beqz $inp,.Lno_key
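# Note: the key pointer need not be aligned. MIPS R6 removed the ldl/ldr
# unaligned loads, so the R6 path below reads aligned doublewords and
# merges them with variable shifts; pre-R6 cores use ldl/ldr pairs.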
  115. #if defined(_MIPS_ARCH_MIPS64R6)
  116. andi $tmp0,$inp,7 # $inp % 8
  117. dsubu $inp,$inp,$tmp0 # align $inp
  118. sll $tmp0,$tmp0,3 # byte to bit offset
  119. ld $in0,0($inp)
  120. ld $in1,8($inp)
  121. beqz $tmp0,.Laligned_key
  122. ld $tmp2,16($inp)
  123. subu $tmp1,$zero,$tmp0
  124. # ifdef MIPSEB
  125. dsllv $in0,$in0,$tmp0
  126. dsrlv $tmp3,$in1,$tmp1
  127. dsllv $in1,$in1,$tmp0
  128. dsrlv $tmp2,$tmp2,$tmp1
  129. # else
  130. dsrlv $in0,$in0,$tmp0
  131. dsllv $tmp3,$in1,$tmp1
  132. dsrlv $in1,$in1,$tmp0
  133. dsllv $tmp2,$tmp2,$tmp1
  134. # endif
  135. or $in0,$in0,$tmp3
  136. or $in1,$in1,$tmp2
  137. .Laligned_key:
  138. #else
  139. ldl $in0,0+MSB($inp)
  140. ldl $in1,8+MSB($inp)
  141. ldr $in0,0+LSB($inp)
  142. ldr $in1,8+LSB($inp)
  143. #endif
  144. #ifdef MIPSEB
  145. # if defined(_MIPS_ARCH_MIPS64R2)
  146. dsbh $in0,$in0 # byte swap
  147. dsbh $in1,$in1
  148. dshd $in0,$in0
  149. dshd $in1,$in1
  150. # else
  151. ori $tmp0,$zero,0xFF
  152. dsll $tmp2,$tmp0,32
  153. or $tmp0,$tmp2 # 0x000000FF000000FF
  154. and $tmp1,$in0,$tmp0 # byte swap
  155. and $tmp3,$in1,$tmp0
  156. dsrl $tmp2,$in0,24
  157. dsrl $tmp4,$in1,24
  158. dsll $tmp1,24
  159. dsll $tmp3,24
  160. and $tmp2,$tmp0
  161. and $tmp4,$tmp0
  162. dsll $tmp0,8 # 0x0000FF000000FF00
  163. or $tmp1,$tmp2
  164. or $tmp3,$tmp4
  165. and $tmp2,$in0,$tmp0
  166. and $tmp4,$in1,$tmp0
  167. dsrl $in0,8
  168. dsrl $in1,8
  169. dsll $tmp2,8
  170. dsll $tmp4,8
  171. and $in0,$tmp0
  172. and $in1,$tmp0
  173. or $tmp1,$tmp2
  174. or $tmp3,$tmp4
  175. or $in0,$tmp1
  176. or $in1,$tmp3
  177. dsrl $tmp1,$in0,32
  178. dsrl $tmp3,$in1,32
  179. dsll $in0,32
  180. dsll $in1,32
  181. or $in0,$tmp1
  182. or $in1,$tmp3
  183. # endif
  184. #endif
  185. li $tmp0,1
  186. dsll $tmp0,32 # 0x0000000100000000
  187. daddiu $tmp0,-63 # 0x00000000ffffffc1
  188. dsll $tmp0,28 # 0x0ffffffc10000000
  189. daddiu $tmp0,-1 # 0x0ffffffc0fffffff
  190. and $in0,$tmp0
  191. daddiu $tmp0,-3 # 0x0ffffffc0ffffffc
  192. and $in1,$tmp0
  193. sd $in0,24($ctx)
  194. dsrl $tmp0,$in1,2
  195. sd $in1,32($ctx)
  196. daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
  197. sd $tmp0,40($ctx)
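# Note on the constants above: the masks 0x0ffffffc0fffffff and
# 0x0ffffffc0ffffffc perform the standard Poly1305 clamping of r (the
# top four bits of every 32-bit word are cleared, as are the low two
# bits of the three upper words). Because the low two bits of r1 are
# zero, r1 >> 2 is exactly r1/4, so s1 = r1 + (r1 >> 2) = 5*r1/4; it is
# precomputed here so that product terms of weight 2**128 and above can
# be folded back modulo 2**130 - 5 without an extra multiplication.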
  198. .Lno_key:
  199. li $v0,0 # return 0
  200. jr $ra
  201. .end poly1305_init
  202. ___
  203. {
  204. my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
  205. my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
  206. ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
  207. my ($shr,$shl) = ($s6,$s7); # used on R6
  208. $code.=<<___;
  209. .align 5
  210. .globl poly1305_blocks
  211. .ent poly1305_blocks
  212. poly1305_blocks:
  213. .set noreorder
  214. dsrl $len,4 # number of complete blocks
  215. bnez $len,poly1305_blocks_internal
  216. nop
  217. jr $ra
  218. nop
  219. .end poly1305_blocks
  220. .align 5
  221. .ent poly1305_blocks_internal
  222. poly1305_blocks_internal:
  223. .set noreorder
  224. #if defined(_MIPS_ARCH_MIPS64R6)
  225. .frame $sp,8*8,$ra
  226. .mask $SAVED_REGS_MASK|0x000c0000,-8
  227. dsubu $sp,8*8
  228. sd $s7,56($sp)
  229. sd $s6,48($sp)
  230. #else
  231. .frame $sp,6*8,$ra
  232. .mask $SAVED_REGS_MASK,-8
  233. dsubu $sp,6*8
  234. #endif
  235. sd $s5,40($sp)
  236. sd $s4,32($sp)
  237. ___
  238. $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
  239. sd $s3,24($sp)
  240. sd $s2,16($sp)
  241. sd $s1,8($sp)
  242. sd $s0,0($sp)
  243. ___
  244. $code.=<<___;
  245. .set reorder
  246. #if defined(_MIPS_ARCH_MIPS64R6)
  247. andi $shr,$inp,7
  248. dsubu $inp,$inp,$shr # align $inp
  249. sll $shr,$shr,3 # byte to bit offset
  250. subu $shl,$zero,$shr
  251. #endif
  252. ld $h0,0($ctx) # load hash value
  253. ld $h1,8($ctx)
  254. ld $h2,16($ctx)
  255. ld $r0,24($ctx) # load key
  256. ld $r1,32($ctx)
  257. ld $rs1,40($ctx)
  258. dsll $len,4
  259. daddu $len,$inp # end of buffer
  260. b .Loop
  261. .align 4
  262. .Loop:
  263. #if defined(_MIPS_ARCH_MIPS64R6)
  264. ld $in0,0($inp) # load input
  265. ld $in1,8($inp)
  266. beqz $shr,.Laligned_inp
  267. ld $tmp2,16($inp)
  268. # ifdef MIPSEB
  269. dsllv $in0,$in0,$shr
  270. dsrlv $tmp3,$in1,$shl
  271. dsllv $in1,$in1,$shr
  272. dsrlv $tmp2,$tmp2,$shl
  273. # else
  274. dsrlv $in0,$in0,$shr
  275. dsllv $tmp3,$in1,$shl
  276. dsrlv $in1,$in1,$shr
  277. dsllv $tmp2,$tmp2,$shl
  278. # endif
  279. or $in0,$in0,$tmp3
  280. or $in1,$in1,$tmp2
  281. .Laligned_inp:
  282. #else
  283. ldl $in0,0+MSB($inp) # load input
  284. ldl $in1,8+MSB($inp)
  285. ldr $in0,0+LSB($inp)
  286. ldr $in1,8+LSB($inp)
  287. #endif
  288. daddiu $inp,16
  289. #ifdef MIPSEB
  290. # if defined(_MIPS_ARCH_MIPS64R2)
  291. dsbh $in0,$in0 # byte swap
  292. dsbh $in1,$in1
  293. dshd $in0,$in0
  294. dshd $in1,$in1
  295. # else
  296. ori $tmp0,$zero,0xFF
  297. dsll $tmp2,$tmp0,32
  298. or $tmp0,$tmp2 # 0x000000FF000000FF
  299. and $tmp1,$in0,$tmp0 # byte swap
  300. and $tmp3,$in1,$tmp0
  301. dsrl $tmp2,$in0,24
  302. dsrl $tmp4,$in1,24
  303. dsll $tmp1,24
  304. dsll $tmp3,24
  305. and $tmp2,$tmp0
  306. and $tmp4,$tmp0
  307. dsll $tmp0,8 # 0x0000FF000000FF00
  308. or $tmp1,$tmp2
  309. or $tmp3,$tmp4
  310. and $tmp2,$in0,$tmp0
  311. and $tmp4,$in1,$tmp0
  312. dsrl $in0,8
  313. dsrl $in1,8
  314. dsll $tmp2,8
  315. dsll $tmp4,8
  316. and $in0,$tmp0
  317. and $in1,$tmp0
  318. or $tmp1,$tmp2
  319. or $tmp3,$tmp4
  320. or $in0,$tmp1
  321. or $in1,$tmp3
  322. dsrl $tmp1,$in0,32
  323. dsrl $tmp3,$in1,32
  324. dsll $in0,32
  325. dsll $in1,32
  326. or $in0,$tmp1
  327. or $in1,$tmp3
  328. # endif
  329. #endif
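# Deferred (modulo-scheduled) reduction carried over from the previous
# iteration: $h2 holds the bits of weight 2**128 and up, and the bits at
# 2**130 and above ($h2 >> 2) are congruent to 5*($h2 >> 2) modulo
# 2**130 - 5. That term is computed as ($h2 >> 2) + (($h2 >> 2) << 2)
# and added into the low limb along with the new input block.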
  330. dsrl $tmp1,$h2,2 # modulo-scheduled reduction
  331. andi $h2,$h2,3
  332. dsll $tmp0,$tmp1,2
  333. daddu $d0,$h0,$in0 # accumulate input
  334. daddu $tmp1,$tmp0
  335. sltu $tmp0,$d0,$h0
  336. daddu $d0,$d0,$tmp1 # ... and residue
  337. sltu $tmp1,$d0,$tmp1
  338. daddu $d1,$h1,$in1
  339. daddu $tmp0,$tmp1
  340. sltu $tmp1,$d1,$h1
  341. daddu $d1,$tmp0
  342. dmultu ($r0,$d0) # h0*r0
  343. daddu $d2,$h2,$padbit
  344. sltu $tmp0,$d1,$tmp0
  345. mflo ($h0,$r0,$d0)
  346. mfhi ($h1,$r0,$d0)
  347. dmultu ($rs1,$d1) # h1*5*r1
  348. daddu $d2,$tmp1
  349. daddu $d2,$tmp0
  350. mflo ($tmp0,$rs1,$d1)
  351. mfhi ($tmp1,$rs1,$d1)
  352. dmultu ($r1,$d0) # h0*r1
  353. mflo ($tmp2,$r1,$d0)
  354. mfhi ($h2,$r1,$d0)
  355. daddu $h0,$tmp0
  356. daddu $h1,$tmp1
  357. sltu $tmp0,$h0,$tmp0
  358. dmultu ($r0,$d1) # h1*r0
  359. daddu $h1,$tmp0
  360. daddu $h1,$tmp2
  361. mflo ($tmp0,$r0,$d1)
  362. mfhi ($tmp1,$r0,$d1)
  363. dmultu ($rs1,$d2) # h2*5*r1
  364. sltu $tmp2,$h1,$tmp2
  365. daddu $h2,$tmp2
  366. mflo ($tmp2,$rs1,$d2)
  367. dmultu ($r0,$d2) # h2*r0
  368. daddu $h1,$tmp0
  369. daddu $h2,$tmp1
  370. mflo ($tmp3,$r0,$d2)
  371. sltu $tmp0,$h1,$tmp0
  372. daddu $h2,$tmp0
  373. daddu $h1,$tmp2
  374. sltu $tmp2,$h1,$tmp2
  375. daddu $h2,$tmp2
  376. daddu $h2,$tmp3
  377. bne $inp,$len,.Loop
  378. sd $h0,0($ctx) # store hash value
  379. sd $h1,8($ctx)
  380. sd $h2,16($ctx)
  381. .set noreorder
  382. #if defined(_MIPS_ARCH_MIPS64R6)
  383. ld $s7,56($sp)
  384. ld $s6,48($sp)
  385. #endif
  386. ld $s5,40($sp) # epilogue
  387. ld $s4,32($sp)
  388. ___
  389. $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
  390. ld $s3,24($sp)
  391. ld $s2,16($sp)
  392. ld $s1,8($sp)
  393. ld $s0,0($sp)
  394. ___
  395. $code.=<<___;
  396. jr $ra
  397. #if defined(_MIPS_ARCH_MIPS64R6)
  398. daddu $sp,8*8
  399. #else
  400. daddu $sp,6*8
  401. #endif
  402. .end poly1305_blocks_internal
  403. ___
  404. }
  405. {
  406. my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
  407. $code.=<<___;
  408. .align 5
  409. .globl poly1305_emit
  410. .ent poly1305_emit
  411. poly1305_emit:
  412. .frame $sp,0,$ra
  413. .set reorder
  414. ld $tmp2,16($ctx)
  415. ld $tmp0,0($ctx)
  416. ld $tmp1,8($ctx)
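# Final reduction and constant-time conditional subtraction. First the
# bits of h2 at 2**130 and above are folded back in as *5 (the -4 mask
# and the shift below combine into 5*(h2 >> 2)). Then h+5 is computed;
# if the sum reaches 2**130 then h >= 2**130 - 5 and the reduced value
# is h+5 mod 2**130. The xor/and/xor sequence selects between h and h+5
# with an all-ones or all-zero mask instead of a data-dependent branch.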
  417. li $in0,-4 # final reduction
  418. dsrl $in1,$tmp2,2
  419. and $in0,$tmp2
  420. andi $tmp2,$tmp2,3
  421. daddu $in0,$in1
  422. daddu $tmp0,$tmp0,$in0
  423. sltu $in1,$tmp0,$in0
  424. daddiu $in0,$tmp0,5 # compare to modulus
  425. daddu $tmp1,$tmp1,$in1
  426. sltiu $tmp3,$in0,5
  427. sltu $tmp4,$tmp1,$in1
  428. daddu $in1,$tmp1,$tmp3
  429. daddu $tmp2,$tmp2,$tmp4
  430. sltu $tmp3,$in1,$tmp3
  431. daddu $tmp2,$tmp2,$tmp3
  432. dsrl $tmp2,2 # see if it carried/borrowed
  433. dsubu $tmp2,$zero,$tmp2
  434. xor $in0,$tmp0
  435. xor $in1,$tmp1
  436. and $in0,$tmp2
  437. and $in1,$tmp2
  438. xor $in0,$tmp0
  439. xor $in1,$tmp1
  440. lwu $tmp0,0($nonce) # load nonce
  441. lwu $tmp1,4($nonce)
  442. lwu $tmp2,8($nonce)
  443. lwu $tmp3,12($nonce)
  444. dsll $tmp1,32
  445. dsll $tmp3,32
  446. or $tmp0,$tmp1
  447. or $tmp2,$tmp3
  448. daddu $in0,$tmp0 # accumulate nonce
  449. daddu $in1,$tmp2
  450. sltu $tmp0,$in0,$tmp0
  451. daddu $in1,$tmp0
  452. dsrl $tmp0,$in0,8 # write mac value
  453. dsrl $tmp1,$in0,16
  454. dsrl $tmp2,$in0,24
  455. sb $in0,0($mac)
  456. dsrl $tmp3,$in0,32
  457. sb $tmp0,1($mac)
  458. dsrl $tmp0,$in0,40
  459. sb $tmp1,2($mac)
  460. dsrl $tmp1,$in0,48
  461. sb $tmp2,3($mac)
  462. dsrl $tmp2,$in0,56
  463. sb $tmp3,4($mac)
  464. dsrl $tmp3,$in1,8
  465. sb $tmp0,5($mac)
  466. dsrl $tmp0,$in1,16
  467. sb $tmp1,6($mac)
  468. dsrl $tmp1,$in1,24
  469. sb $tmp2,7($mac)
  470. sb $in1,8($mac)
  471. dsrl $tmp2,$in1,32
  472. sb $tmp3,9($mac)
  473. dsrl $tmp3,$in1,40
  474. sb $tmp0,10($mac)
  475. dsrl $tmp0,$in1,48
  476. sb $tmp1,11($mac)
  477. dsrl $tmp1,$in1,56
  478. sb $tmp2,12($mac)
  479. sb $tmp3,13($mac)
  480. sb $tmp0,14($mac)
  481. sb $tmp1,15($mac)
  482. jr $ra
  483. .end poly1305_emit
  484. .rdata
  485. .asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
  486. .align 2
  487. ___
  488. }
  489. }}} else {{{
  490. ######################################################################
  491. # 32-bit code path
  492. #
  493. my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
  494. my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
  495. ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);
  496. $code.=<<___;
  497. #if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
  498. defined(_MIPS_ARCH_MIPS32R6)) \\
  499. && !defined(_MIPS_ARCH_MIPS32R2)
  500. # define _MIPS_ARCH_MIPS32R2
  501. #endif
  502. #if defined(_MIPS_ARCH_MIPS32R6)
  503. # define multu(rs,rt)
  504. # define mflo(rd,rs,rt) mulu rd,rs,rt
  505. # define mfhi(rd,rs,rt) muhu rd,rs,rt
  506. #else
  507. # define multu(rs,rt) multu rs,rt
  508. # define mflo(rd,rs,rt) mflo rd
  509. # define mfhi(rd,rs,rt) mfhi rd
  510. #endif
  511. #ifdef __KERNEL__
  512. # define poly1305_init poly1305_init_mips
  513. # define poly1305_blocks poly1305_blocks_mips
  514. # define poly1305_emit poly1305_emit_mips
  515. #endif
  516. #if defined(__MIPSEB__) && !defined(MIPSEB)
  517. # define MIPSEB
  518. #endif
  519. #ifdef MIPSEB
  520. # define MSB 0
  521. # define LSB 3
  522. #else
  523. # define MSB 3
  524. # define LSB 0
  525. #endif
  526. .text
  527. .set noat
  528. .set noreorder
  529. .align 5
  530. .globl poly1305_init
  531. .ent poly1305_init
  532. poly1305_init:
  533. .frame $sp,0,$ra
  534. .set reorder
  535. sw $zero,0($ctx)
  536. sw $zero,4($ctx)
  537. sw $zero,8($ctx)
  538. sw $zero,12($ctx)
  539. sw $zero,16($ctx)
  540. beqz $inp,.Lno_key
  541. #if defined(_MIPS_ARCH_MIPS32R6)
  542. andi $tmp0,$inp,3 # $inp % 4
  543. subu $inp,$inp,$tmp0 # align $inp
  544. sll $tmp0,$tmp0,3 # byte to bit offset
  545. lw $in0,0($inp)
  546. lw $in1,4($inp)
  547. lw $in2,8($inp)
  548. lw $in3,12($inp)
  549. beqz $tmp0,.Laligned_key
  550. lw $tmp2,16($inp)
  551. subu $tmp1,$zero,$tmp0
  552. # ifdef MIPSEB
  553. sllv $in0,$in0,$tmp0
  554. srlv $tmp3,$in1,$tmp1
  555. sllv $in1,$in1,$tmp0
  556. or $in0,$in0,$tmp3
  557. srlv $tmp3,$in2,$tmp1
  558. sllv $in2,$in2,$tmp0
  559. or $in1,$in1,$tmp3
  560. srlv $tmp3,$in3,$tmp1
  561. sllv $in3,$in3,$tmp0
  562. or $in2,$in2,$tmp3
  563. srlv $tmp2,$tmp2,$tmp1
  564. or $in3,$in3,$tmp2
  565. # else
  566. srlv $in0,$in0,$tmp0
  567. sllv $tmp3,$in1,$tmp1
  568. srlv $in1,$in1,$tmp0
  569. or $in0,$in0,$tmp3
  570. sllv $tmp3,$in2,$tmp1
  571. srlv $in2,$in2,$tmp0
  572. or $in1,$in1,$tmp3
  573. sllv $tmp3,$in3,$tmp1
  574. srlv $in3,$in3,$tmp0
  575. or $in2,$in2,$tmp3
  576. sllv $tmp2,$tmp2,$tmp1
  577. or $in3,$in3,$tmp2
  578. # endif
  579. .Laligned_key:
  580. #else
  581. lwl $in0,0+MSB($inp)
  582. lwl $in1,4+MSB($inp)
  583. lwl $in2,8+MSB($inp)
  584. lwl $in3,12+MSB($inp)
  585. lwr $in0,0+LSB($inp)
  586. lwr $in1,4+LSB($inp)
  587. lwr $in2,8+LSB($inp)
  588. lwr $in3,12+LSB($inp)
  589. #endif
  590. #ifdef MIPSEB
  591. # if defined(_MIPS_ARCH_MIPS32R2)
  592. wsbh $in0,$in0 # byte swap
  593. wsbh $in1,$in1
  594. wsbh $in2,$in2
  595. wsbh $in3,$in3
  596. rotr $in0,$in0,16
  597. rotr $in1,$in1,16
  598. rotr $in2,$in2,16
  599. rotr $in3,$in3,16
  600. # else
  601. srl $tmp0,$in0,24 # byte swap
  602. srl $tmp1,$in0,8
  603. andi $tmp2,$in0,0xFF00
  604. sll $in0,$in0,24
  605. andi $tmp1,0xFF00
  606. sll $tmp2,$tmp2,8
  607. or $in0,$tmp0
  608. srl $tmp0,$in1,24
  609. or $tmp1,$tmp2
  610. srl $tmp2,$in1,8
  611. or $in0,$tmp1
  612. andi $tmp1,$in1,0xFF00
  613. sll $in1,$in1,24
  614. andi $tmp2,0xFF00
  615. sll $tmp1,$tmp1,8
  616. or $in1,$tmp0
  617. srl $tmp0,$in2,24
  618. or $tmp2,$tmp1
  619. srl $tmp1,$in2,8
  620. or $in1,$tmp2
  621. andi $tmp2,$in2,0xFF00
  622. sll $in2,$in2,24
  623. andi $tmp1,0xFF00
  624. sll $tmp2,$tmp2,8
  625. or $in2,$tmp0
  626. srl $tmp0,$in3,24
  627. or $tmp1,$tmp2
  628. srl $tmp2,$in3,8
  629. or $in2,$tmp1
  630. andi $tmp1,$in3,0xFF00
  631. sll $in3,$in3,24
  632. andi $tmp2,0xFF00
  633. sll $tmp1,$tmp1,8
  634. or $in3,$tmp0
  635. or $tmp2,$tmp1
  636. or $in3,$tmp2
  637. # endif
  638. #endif
  639. lui $tmp0,0x0fff
  640. ori $tmp0,0xffff # 0x0fffffff
  641. and $in0,$in0,$tmp0
  642. subu $tmp0,3 # 0x0ffffffc
  643. and $in1,$in1,$tmp0
  644. and $in2,$in2,$tmp0
  645. and $in3,$in3,$tmp0
  646. sw $in0,20($ctx)
  647. sw $in1,24($ctx)
  648. sw $in2,28($ctx)
  649. sw $in3,32($ctx)
  650. srl $tmp1,$in1,2
  651. srl $tmp2,$in2,2
  652. srl $tmp3,$in3,2
  653. addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
  654. addu $in2,$in2,$tmp2
  655. addu $in3,$in3,$tmp3
  656. sw $in1,36($ctx)
  657. sw $in2,40($ctx)
  658. sw $in3,44($ctx)
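# Same clamping as in the 64-bit path, spread over four 32-bit words:
# r0 is masked with 0x0fffffff, r1..r3 with 0x0ffffffc, and
# s_i = r_i + (r_i >> 2) = 5*r_i/4 are precomputed for the modular
# folding in poly1305_blocks.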
  659. .Lno_key:
  660. li $v0,0
  661. jr $ra
  662. .end poly1305_init
  663. ___
  664. {
  665. my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";
  666. my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
  667. ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
  668. my ($d0,$d1,$d2,$d3) =
  669. ($a4,$a5,$a6,$a7);
  670. my $shr = $t2; # used on R6
  671. my $one = $t2; # used on R2
  672. $code.=<<___;
  673. .globl poly1305_blocks
  674. .align 5
  675. .ent poly1305_blocks
  676. poly1305_blocks:
  677. .frame $sp,16*4,$ra
  678. .mask $SAVED_REGS_MASK,-4
  679. .set noreorder
  680. subu $sp, $sp,4*12
  681. sw $s11,4*11($sp)
  682. sw $s10,4*10($sp)
  683. sw $s9, 4*9($sp)
  684. sw $s8, 4*8($sp)
  685. sw $s7, 4*7($sp)
  686. sw $s6, 4*6($sp)
  687. sw $s5, 4*5($sp)
  688. sw $s4, 4*4($sp)
  689. ___
  690. $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
  691. sw $s3, 4*3($sp)
  692. sw $s2, 4*2($sp)
  693. sw $s1, 4*1($sp)
  694. sw $s0, 4*0($sp)
  695. ___
  696. $code.=<<___;
  697. .set reorder
  698. srl $len,4 # number of complete blocks
  699. li $one,1
  700. beqz $len,.Labort
  701. #if defined(_MIPS_ARCH_MIPS32R6)
  702. andi $shr,$inp,3
  703. subu $inp,$inp,$shr # align $inp
  704. sll $shr,$shr,3 # byte to bit offset
  705. #endif
  706. lw $h0,0($ctx) # load hash value
  707. lw $h1,4($ctx)
  708. lw $h2,8($ctx)
  709. lw $h3,12($ctx)
  710. lw $h4,16($ctx)
  711. lw $r0,20($ctx) # load key
  712. lw $r1,24($ctx)
  713. lw $r2,28($ctx)
  714. lw $r3,32($ctx)
  715. lw $rs1,36($ctx)
  716. lw $rs2,40($ctx)
  717. lw $rs3,44($ctx)
  718. sll $len,4
  719. addu $len,$len,$inp # end of buffer
  720. b .Loop
  721. .align 4
  722. .Loop:
  723. #if defined(_MIPS_ARCH_MIPS32R6)
  724. lw $d0,0($inp) # load input
  725. lw $d1,4($inp)
  726. lw $d2,8($inp)
  727. lw $d3,12($inp)
  728. beqz $shr,.Laligned_inp
  729. lw $t0,16($inp)
  730. subu $t1,$zero,$shr
  731. # ifdef MIPSEB
  732. sllv $d0,$d0,$shr
  733. srlv $at,$d1,$t1
  734. sllv $d1,$d1,$shr
  735. or $d0,$d0,$at
  736. srlv $at,$d2,$t1
  737. sllv $d2,$d2,$shr
  738. or $d1,$d1,$at
  739. srlv $at,$d3,$t1
  740. sllv $d3,$d3,$shr
  741. or $d2,$d2,$at
  742. srlv $t0,$t0,$t1
  743. or $d3,$d3,$t0
  744. # else
  745. srlv $d0,$d0,$shr
  746. sllv $at,$d1,$t1
  747. srlv $d1,$d1,$shr
  748. or $d0,$d0,$at
  749. sllv $at,$d2,$t1
  750. srlv $d2,$d2,$shr
  751. or $d1,$d1,$at
  752. sllv $at,$d3,$t1
  753. srlv $d3,$d3,$shr
  754. or $d2,$d2,$at
  755. sllv $t0,$t0,$t1
  756. or $d3,$d3,$t0
  757. # endif
  758. .Laligned_inp:
  759. #else
  760. lwl $d0,0+MSB($inp) # load input
  761. lwl $d1,4+MSB($inp)
  762. lwl $d2,8+MSB($inp)
  763. lwl $d3,12+MSB($inp)
  764. lwr $d0,0+LSB($inp)
  765. lwr $d1,4+LSB($inp)
  766. lwr $d2,8+LSB($inp)
  767. lwr $d3,12+LSB($inp)
  768. #endif
  769. #ifdef MIPSEB
  770. # if defined(_MIPS_ARCH_MIPS32R2)
  771. wsbh $d0,$d0 # byte swap
  772. wsbh $d1,$d1
  773. wsbh $d2,$d2
  774. wsbh $d3,$d3
  775. rotr $d0,$d0,16
  776. rotr $d1,$d1,16
  777. rotr $d2,$d2,16
  778. rotr $d3,$d3,16
  779. # else
  780. srl $at,$d0,24 # byte swap
  781. srl $t0,$d0,8
  782. andi $t1,$d0,0xFF00
  783. sll $d0,$d0,24
  784. andi $t0,0xFF00
  785. sll $t1,$t1,8
  786. or $d0,$at
  787. srl $at,$d1,24
  788. or $t0,$t1
  789. srl $t1,$d1,8
  790. or $d0,$t0
  791. andi $t0,$d1,0xFF00
  792. sll $d1,$d1,24
  793. andi $t1,0xFF00
  794. sll $t0,$t0,8
  795. or $d1,$at
  796. srl $at,$d2,24
  797. or $t1,$t0
  798. srl $t0,$d2,8
  799. or $d1,$t1
  800. andi $t1,$d2,0xFF00
  801. sll $d2,$d2,24
  802. andi $t0,0xFF00
  803. sll $t1,$t1,8
  804. or $d2,$at
  805. srl $at,$d3,24
  806. or $t0,$t1
  807. srl $t1,$d3,8
  808. or $d2,$t0
  809. andi $t0,$d3,0xFF00
  810. sll $d3,$d3,24
  811. andi $t1,0xFF00
  812. sll $t0,$t0,8
  813. or $d3,$at
  814. or $t1,$t0
  815. or $d3,$t1
  816. # endif
  817. #endif
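# As in the 64-bit path, this is the deferred reduction of the previous
# result: the bits of $h4 at 2**130 and above are folded back in as *5,
# computed as ($h4 >> 2) + (($h4 >> 2) << 2).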
  818. srl $t0,$h4,2 # modulo-scheduled reduction
  819. andi $h4,$h4,3
  820. sll $at,$t0,2
  821. addu $d0,$d0,$h0 # accumulate input
  822. addu $t0,$t0,$at
  823. sltu $h0,$d0,$h0
  824. addu $d0,$d0,$t0 # ... and residue
  825. sltu $at,$d0,$t0
  826. addu $d1,$d1,$h1
  827. addu $h0,$h0,$at # carry
  828. sltu $h1,$d1,$h1
  829. addu $d1,$d1,$h0
  830. sltu $h0,$d1,$h0
  831. addu $d2,$d2,$h2
  832. addu $h1,$h1,$h0 # carry
  833. sltu $h2,$d2,$h2
  834. addu $d2,$d2,$h1
  835. sltu $h1,$d2,$h1
  836. addu $d3,$d3,$h3
  837. addu $h2,$h2,$h1 # carry
  838. sltu $h3,$d3,$h3
  839. addu $d3,$d3,$h2
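# Two multiplication strategies follow. MIPS32R2 (but not R6) cores use
# the HI/LO accumulator with multu/maddu chains; the generic path,
# including R6 which dropped the HI/LO registers, uses the explicit
# multu()/mflo()/mfhi() macro pairs defined at the top of this code path.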
  840. #if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
  841. multu $r0,$d0 # d0*r0
  842. sltu $h2,$d3,$h2
  843. maddu $rs3,$d1 # d1*s3
  844. addu $h3,$h3,$h2 # carry
  845. maddu $rs2,$d2 # d2*s2
  846. addu $h4,$h4,$padbit
  847. maddu $rs1,$d3 # d3*s1
  848. addu $h4,$h4,$h3
  849. mfhi $at
  850. mflo $h0
  851. multu $r1,$d0 # d0*r1
  852. maddu $r0,$d1 # d1*r0
  853. maddu $rs3,$d2 # d2*s3
  854. maddu $rs2,$d3 # d3*s2
  855. maddu $rs1,$h4 # h4*s1
  856. maddu $at,$one # hi*1
  857. mfhi $at
  858. mflo $h1
  859. multu $r2,$d0 # d0*r2
  860. maddu $r1,$d1 # d1*r1
  861. maddu $r0,$d2 # d2*r0
  862. maddu $rs3,$d3 # d3*s3
  863. maddu $rs2,$h4 # h4*s2
  864. maddu $at,$one # hi*1
  865. mfhi $at
  866. mflo $h2
  867. mul $t0,$r0,$h4 # h4*r0
  868. multu $r3,$d0 # d0*r3
  869. maddu $r2,$d1 # d1*r2
  870. maddu $r1,$d2 # d2*r1
  871. maddu $r0,$d3 # d3*r0
  872. maddu $rs3,$h4 # h4*s3
  873. maddu $at,$one # hi*1
  874. mfhi $at
  875. mflo $h3
  876. addiu $inp,$inp,16
  877. addu $h4,$t0,$at
  878. #else
  879. multu ($r0,$d0) # d0*r0
  880. mflo ($h0,$r0,$d0)
  881. mfhi ($h1,$r0,$d0)
  882. sltu $h2,$d3,$h2
  883. addu $h3,$h3,$h2 # carry
  884. multu ($rs3,$d1) # d1*s3
  885. mflo ($at,$rs3,$d1)
  886. mfhi ($t0,$rs3,$d1)
  887. addu $h4,$h4,$padbit
  888. addiu $inp,$inp,16
  889. addu $h4,$h4,$h3
  890. multu ($rs2,$d2) # d2*s2
  891. mflo ($a3,$rs2,$d2)
  892. mfhi ($t1,$rs2,$d2)
  893. addu $h0,$h0,$at
  894. addu $h1,$h1,$t0
  895. multu ($rs1,$d3) # d3*s1
  896. sltu $at,$h0,$at
  897. addu $h1,$h1,$at
  898. mflo ($at,$rs1,$d3)
  899. mfhi ($t0,$rs1,$d3)
  900. addu $h0,$h0,$a3
  901. addu $h1,$h1,$t1
  902. multu ($r1,$d0) # d0*r1
  903. sltu $a3,$h0,$a3
  904. addu $h1,$h1,$a3
  905. mflo ($a3,$r1,$d0)
  906. mfhi ($h2,$r1,$d0)
  907. addu $h0,$h0,$at
  908. addu $h1,$h1,$t0
  909. multu ($r0,$d1) # d1*r0
  910. sltu $at,$h0,$at
  911. addu $h1,$h1,$at
  912. mflo ($at,$r0,$d1)
  913. mfhi ($t0,$r0,$d1)
  914. addu $h1,$h1,$a3
  915. sltu $a3,$h1,$a3
  916. multu ($rs3,$d2) # d2*s3
  917. addu $h2,$h2,$a3
  918. mflo ($a3,$rs3,$d2)
  919. mfhi ($t1,$rs3,$d2)
  920. addu $h1,$h1,$at
  921. addu $h2,$h2,$t0
  922. multu ($rs2,$d3) # d3*s2
  923. sltu $at,$h1,$at
  924. addu $h2,$h2,$at
  925. mflo ($at,$rs2,$d3)
  926. mfhi ($t0,$rs2,$d3)
  927. addu $h1,$h1,$a3
  928. addu $h2,$h2,$t1
  929. multu ($rs1,$h4) # h4*s1
  930. sltu $a3,$h1,$a3
  931. addu $h2,$h2,$a3
  932. mflo ($a3,$rs1,$h4)
  933. addu $h1,$h1,$at
  934. addu $h2,$h2,$t0
  935. multu ($r2,$d0) # d0*r2
  936. sltu $at,$h1,$at
  937. addu $h2,$h2,$at
  938. mflo ($at,$r2,$d0)
  939. mfhi ($h3,$r2,$d0)
  940. addu $h1,$h1,$a3
  941. sltu $a3,$h1,$a3
  942. multu ($r1,$d1) # d1*r1
  943. addu $h2,$h2,$a3
  944. mflo ($a3,$r1,$d1)
  945. mfhi ($t1,$r1,$d1)
  946. addu $h2,$h2,$at
  947. sltu $at,$h2,$at
  948. multu ($r0,$d2) # d2*r0
  949. addu $h3,$h3,$at
  950. mflo ($at,$r0,$d2)
  951. mfhi ($t0,$r0,$d2)
  952. addu $h2,$h2,$a3
  953. addu $h3,$h3,$t1
  954. multu ($rs3,$d3) # d3*s3
  955. sltu $a3,$h2,$a3
  956. addu $h3,$h3,$a3
  957. mflo ($a3,$rs3,$d3)
  958. mfhi ($t1,$rs3,$d3)
  959. addu $h2,$h2,$at
  960. addu $h3,$h3,$t0
  961. multu ($rs2,$h4) # h4*s2
  962. sltu $at,$h2,$at
  963. addu $h3,$h3,$at
  964. mflo ($at,$rs2,$h4)
  965. addu $h2,$h2,$a3
  966. addu $h3,$h3,$t1
  967. multu ($r3,$d0) # d0*r3
  968. sltu $a3,$h2,$a3
  969. addu $h3,$h3,$a3
  970. mflo ($a3,$r3,$d0)
  971. mfhi ($t1,$r3,$d0)
  972. addu $h2,$h2,$at
  973. sltu $at,$h2,$at
  974. multu ($r2,$d1) # d1*r2
  975. addu $h3,$h3,$at
  976. mflo ($at,$r2,$d1)
  977. mfhi ($t0,$r2,$d1)
  978. addu $h3,$h3,$a3
  979. sltu $a3,$h3,$a3
  980. multu ($r0,$d3) # d3*r0
  981. addu $t1,$t1,$a3
  982. mflo ($a3,$r0,$d3)
  983. mfhi ($d3,$r0,$d3)
  984. addu $h3,$h3,$at
  985. addu $t1,$t1,$t0
  986. multu ($r1,$d2) # d2*r1
  987. sltu $at,$h3,$at
  988. addu $t1,$t1,$at
  989. mflo ($at,$r1,$d2)
  990. mfhi ($t0,$r1,$d2)
  991. addu $h3,$h3,$a3
  992. addu $t1,$t1,$d3
  993. multu ($rs3,$h4) # h4*s3
  994. sltu $a3,$h3,$a3
  995. addu $t1,$t1,$a3
  996. mflo ($a3,$rs3,$h4)
  997. addu $h3,$h3,$at
  998. addu $t1,$t1,$t0
  999. multu ($r0,$h4) # h4*r0
  1000. sltu $at,$h3,$at
  1001. addu $t1,$t1,$at
  1002. mflo ($h4,$r0,$h4)
  1003. addu $h3,$h3,$a3
  1004. sltu $a3,$h3,$a3
  1005. addu $t1,$t1,$a3
  1006. addu $h4,$h4,$t1
  1007. li $padbit,1 # if we loop, padbit is 1
  1008. #endif
  1009. bne $inp,$len,.Loop
  1010. sw $h0,0($ctx) # store hash value
  1011. sw $h1,4($ctx)
  1012. sw $h2,8($ctx)
  1013. sw $h3,12($ctx)
  1014. sw $h4,16($ctx)
  1015. .set noreorder
  1016. .Labort:
  1017. lw $s11,4*11($sp)
  1018. lw $s10,4*10($sp)
  1019. lw $s9, 4*9($sp)
  1020. lw $s8, 4*8($sp)
  1021. lw $s7, 4*7($sp)
  1022. lw $s6, 4*6($sp)
  1023. lw $s5, 4*5($sp)
  1024. lw $s4, 4*4($sp)
  1025. ___
  1026. $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
  1027. lw $s3, 4*3($sp)
  1028. lw $s2, 4*2($sp)
  1029. lw $s1, 4*1($sp)
  1030. lw $s0, 4*0($sp)
  1031. ___
  1032. $code.=<<___;
  1033. jr $ra
  1034. addu $sp,$sp,4*12
  1035. .end poly1305_blocks
  1036. ___
  1037. }
  1038. {
  1039. my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
  1040. $code.=<<___;
  1041. .align 5
  1042. .globl poly1305_emit
  1043. .ent poly1305_emit
  1044. poly1305_emit:
  1045. .frame $sp,0,$ra
  1046. .set reorder
  1047. lw $tmp4,16($ctx)
  1048. lw $tmp0,0($ctx)
  1049. lw $tmp1,4($ctx)
  1050. lw $tmp2,8($ctx)
  1051. lw $tmp3,12($ctx)
  1052. li $in0,-4 # final reduction
  1053. srl $ctx,$tmp4,2
  1054. and $in0,$in0,$tmp4
  1055. andi $tmp4,$tmp4,3
  1056. addu $ctx,$ctx,$in0
  1057. addu $tmp0,$tmp0,$ctx
  1058. sltu $ctx,$tmp0,$ctx
  1059. addiu $in0,$tmp0,5 # compare to modulus
  1060. addu $tmp1,$tmp1,$ctx
  1061. sltiu $in1,$in0,5
  1062. sltu $ctx,$tmp1,$ctx
  1063. addu $in1,$in1,$tmp1
  1064. addu $tmp2,$tmp2,$ctx
  1065. sltu $in2,$in1,$tmp1
  1066. sltu $ctx,$tmp2,$ctx
  1067. addu $in2,$in2,$tmp2
  1068. addu $tmp3,$tmp3,$ctx
  1069. sltu $in3,$in2,$tmp2
  1070. sltu $ctx,$tmp3,$ctx
  1071. addu $in3,$in3,$tmp3
  1072. addu $tmp4,$tmp4,$ctx
  1073. sltu $ctx,$in3,$tmp3
  1074. addu $ctx,$tmp4
  1075. srl $ctx,2 # see if it carried/borrowed
  1076. subu $ctx,$zero,$ctx
  1077. xor $in0,$tmp0
  1078. xor $in1,$tmp1
  1079. xor $in2,$tmp2
  1080. xor $in3,$tmp3
  1081. and $in0,$ctx
  1082. and $in1,$ctx
  1083. and $in2,$ctx
  1084. and $in3,$ctx
  1085. xor $in0,$tmp0
  1086. xor $in1,$tmp1
  1087. xor $in2,$tmp2
  1088. xor $in3,$tmp3
  1089. lw $tmp0,0($nonce) # load nonce
  1090. lw $tmp1,4($nonce)
  1091. lw $tmp2,8($nonce)
  1092. lw $tmp3,12($nonce)
  1093. addu $in0,$tmp0 # accumulate nonce
  1094. sltu $ctx,$in0,$tmp0
  1095. addu $in1,$tmp1
  1096. sltu $tmp1,$in1,$tmp1
  1097. addu $in1,$ctx
  1098. sltu $ctx,$in1,$ctx
  1099. addu $ctx,$tmp1
  1100. addu $in2,$tmp2
  1101. sltu $tmp2,$in2,$tmp2
  1102. addu $in2,$ctx
  1103. sltu $ctx,$in2,$ctx
  1104. addu $ctx,$tmp2
  1105. addu $in3,$tmp3
  1106. addu $in3,$ctx
  1107. srl $tmp0,$in0,8 # write mac value
  1108. srl $tmp1,$in0,16
  1109. srl $tmp2,$in0,24
  1110. sb $in0, 0($mac)
  1111. sb $tmp0,1($mac)
  1112. srl $tmp0,$in1,8
  1113. sb $tmp1,2($mac)
  1114. srl $tmp1,$in1,16
  1115. sb $tmp2,3($mac)
  1116. srl $tmp2,$in1,24
  1117. sb $in1, 4($mac)
  1118. sb $tmp0,5($mac)
  1119. srl $tmp0,$in2,8
  1120. sb $tmp1,6($mac)
  1121. srl $tmp1,$in2,16
  1122. sb $tmp2,7($mac)
  1123. srl $tmp2,$in2,24
  1124. sb $in2, 8($mac)
  1125. sb $tmp0,9($mac)
  1126. srl $tmp0,$in3,8
  1127. sb $tmp1,10($mac)
  1128. srl $tmp1,$in3,16
  1129. sb $tmp2,11($mac)
  1130. srl $tmp2,$in3,24
  1131. sb $in3, 12($mac)
  1132. sb $tmp0,13($mac)
  1133. sb $tmp1,14($mac)
  1134. sb $tmp2,15($mac)
  1135. jr $ra
  1136. .end poly1305_emit
  1137. .rdata
  1138. .asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
  1139. .align 2
  1140. ___
  1141. }
  1142. }}}
  1143. $output=pop and open STDOUT,">$output";
  1144. print $code;
  1145. close STDOUT;