csum_partial.S

/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License. See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Quick'n'dirty IP checksum ...
 *
 * Copyright (C) 1998, 1999 Ralf Baechle
 * Copyright (C) 1999 Silicon Graphics, Inc.
 * Copyright (C) 2007 Maciej W. Rozycki
 * Copyright (C) 2014 Imagination Technologies Ltd.
 */
#include <linux/errno.h>
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/export.h>
#include <asm/regdef.h>

#ifdef CONFIG_64BIT
/*
 * As we are sharing the code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the register definitions
 * from the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD	ld
#define LOAD32	lwu
#define ADD	daddu
#define NBYTES	8

#else

#define LOAD	lw
#define LOAD32	lw
#define ADD	addu
#define NBYTES	4

#endif /* USE_DOUBLE */

#define UNIT(unit)	((unit)*NBYTES)

#define ADDC(sum,reg)			\
	.set	push;			\
	.set	noat;			\
	ADD	sum, reg;		\
	sltu	v1, sum, reg;		\
	ADD	sum, v1;		\
	.set	pop

#define ADDC32(sum,reg)			\
	.set	push;			\
	.set	noat;			\
	addu	sum, reg;		\
	sltu	v1, sum, reg;		\
	addu	sum, v1;		\
	.set	pop
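/*
 * ADDC/ADDC32 accumulate with an end-around carry, as required by the
 * ones'-complement Internet checksum.  Roughly, in C (illustration only):
 *
 *	sum += reg;
 *	sum += (sum < reg);	(the carry-out, computed into v1 by sltu)
 */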
#define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)	\
	LOAD	_t0, (offset + UNIT(0))(src);			\
	LOAD	_t1, (offset + UNIT(1))(src);			\
	LOAD	_t2, (offset + UNIT(2))(src);			\
	LOAD	_t3, (offset + UNIT(3))(src);			\
	ADDC(_t0, _t1);						\
	ADDC(_t2, _t3);						\
	ADDC(sum, _t0);						\
	ADDC(sum, _t2)

#ifdef USE_DOUBLE
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
	CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)
#else
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
	CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3);	\
	CSUM_BIGCHUNK1(src, offset + 0x10, sum, _t0, _t1, _t2, _t3)
#endif
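/*
 * Each CSUM_BIGCHUNK sums a 32-byte block: a single CSUM_BIGCHUNK1 of four
 * NBYTES-sized loads on 64-bit builds, two CSUM_BIGCHUNK1s on 32-bit builds.
 * Issuing all loads before the adds lets the adds overlap the load latency.
 */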
/*
 * a0: source address
 * a1: length of the area to checksum
 * a2: partial checksum
 */
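/*
 * For reference, the C-level declaration is roughly:
 *
 *	__wsum csum_partial(const void *buff, int len, __wsum sum);
 */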
#define src a0
#define sum v0

	.text
	.set	noreorder
	.align	5
LEAF(csum_partial)
EXPORT_SYMBOL(csum_partial)
	move	sum, zero
	move	t7, zero

	sltiu	t8, a1, 0x8
	bnez	t8, .Lsmall_csumcpy		/* < 8 bytes to copy */
	move	t2, a1

	andi	t7, src, 0x1			/* odd buffer? */

.Lhword_align:
	beqz	t7, .Lword_align
	andi	t8, src, 0x2

	lbu	t0, (src)
	LONG_SUBU	a1, a1, 0x1
#ifdef __MIPSEL__
	sll	t0, t0, 8
#endif
	ADDC(sum, t0)
	PTR_ADDU	src, src, 0x1
	andi	t8, src, 0x2

.Lword_align:
	beqz	t8, .Ldword_align
	sltiu	t8, a1, 56

	lhu	t0, (src)
	LONG_SUBU	a1, a1, 0x2
	ADDC(sum, t0)
	sltiu	t8, a1, 56
	PTR_ADDU	src, src, 0x2

.Ldword_align:
	bnez	t8, .Ldo_end_words
	move	t8, a1

	andi	t8, src, 0x4
	beqz	t8, .Lqword_align
	andi	t8, src, 0x8

	LOAD32	t0, 0x00(src)
	LONG_SUBU	a1, a1, 0x4
	ADDC(sum, t0)
	PTR_ADDU	src, src, 0x4
	andi	t8, src, 0x8

.Lqword_align:
	beqz	t8, .Loword_align
	andi	t8, src, 0x10

#ifdef USE_DOUBLE
	ld	t0, 0x00(src)
	LONG_SUBU	a1, a1, 0x8
	ADDC(sum, t0)
#else
	lw	t0, 0x00(src)
	lw	t1, 0x04(src)
	LONG_SUBU	a1, a1, 0x8
	ADDC(sum, t0)
	ADDC(sum, t1)
#endif
	PTR_ADDU	src, src, 0x8
	andi	t8, src, 0x10

.Loword_align:
	beqz	t8, .Lbegin_movement
	LONG_SRL	t8, a1, 0x7

#ifdef USE_DOUBLE
	ld	t0, 0x00(src)
	ld	t1, 0x08(src)
	ADDC(sum, t0)
	ADDC(sum, t1)
#else
	CSUM_BIGCHUNK1(src, 0x00, sum, t0, t1, t3, t4)
#endif
	LONG_SUBU	a1, a1, 0x10
	PTR_ADDU	src, src, 0x10
	LONG_SRL	t8, a1, 0x7

.Lbegin_movement:
	beqz	t8, 1f
	andi	t2, a1, 0x40

.Lmove_128bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
	LONG_SUBU	t8, t8, 0x01
	.set	reorder				/* DADDI_WAR */
	PTR_ADDU	src, src, 0x80
	bnez	t8, .Lmove_128bytes
	.set	noreorder

1:
	beqz	t2, 1f
	andi	t2, a1, 0x20

.Lmove_64bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
	PTR_ADDU	src, src, 0x40

1:
	beqz	t2, .Ldo_end_words
	andi	t8, a1, 0x1c

.Lmove_32bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	andi	t8, a1, 0x1c
	PTR_ADDU	src, src, 0x20

.Ldo_end_words:
	beqz	t8, .Lsmall_csumcpy
	andi	t2, a1, 0x3
	LONG_SRL	t8, t8, 0x2

.Lend_words:
	LOAD32	t0, (src)
	LONG_SUBU	t8, t8, 0x1
	ADDC(sum, t0)
	.set	reorder				/* DADDI_WAR */
	PTR_ADDU	src, src, 0x4
	bnez	t8, .Lend_words
	.set	noreorder

/* unknown src alignment and < 8 bytes to go */
.Lsmall_csumcpy:
	move	a1, t2

	andi	t0, a1, 4
	beqz	t0, 1f
	andi	t0, a1, 2

	/* Still a full word to go */
	ulw	t1, (src)
	PTR_ADDIU	src, 4
#ifdef USE_DOUBLE
	dsll	t1, t1, 32			/* clear lower 32bit */
#endif
	ADDC(sum, t1)

1:	move	t1, zero
	beqz	t0, 1f
	andi	t0, a1, 1

	/* Still a halfword to go */
	ulhu	t1, (src)
	PTR_ADDIU	src, 2

1:	beqz	t0, 1f
	sll	t1, t1, 16

	lbu	t2, (src)
	nop

#ifdef __MIPSEB__
	sll	t2, t2, 8
#endif
	or	t1, t2

1:	ADDC(sum, t1)

	/* fold checksum */
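	/*
	 * On 64-bit builds the running sum is held in a 64-bit register, so
	 * the two 32-bit halves are added together first, with the carry
	 * folded back in.  Roughly:
	 *
	 *	sum = (sum >> 32) + (sum & 0xffffffff) + carry (kept in v1)
	 */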
#ifdef USE_DOUBLE
	dsll32	v1, sum, 0
	daddu	sum, v1
	sltu	v1, sum, v1
	dsra32	sum, sum, 0
	addu	sum, v1
#endif

	/* odd buffer alignment? */
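	/*
	 * If the source buffer started at an odd address, the 16-bit words
	 * of the checksum were formed one byte out of phase with the
	 * caller's view, so the bytes of the result must be swapped within
	 * each halfword.  CPUs covered by the #if below do this with wsbh;
	 * the others fall back to the shift/mask sequence.
	 */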
#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR5) || \
    defined(CONFIG_CPU_LOONGSON64)
	.set	push
	.set	arch=mips32r2
	wsbh	v1, sum
	movn	sum, v1, t7
	.set	pop
#else
	beqz	t7, 1f			/* odd buffer alignment? */
	lui	v1, 0x00ff
	addu	v1, 0x00ff
	and	t0, sum, v1
	sll	t0, t0, 8
	srl	sum, sum, 8
	and	sum, sum, v1
	or	sum, sum, t0
1:
#endif
	.set	reorder
	/* Add the passed partial csum. */
	ADDC32(sum, a2)
	jr	ra
	.set	noreorder
	END(csum_partial)
/*
 * checksum and copy routines based on memcpy.S
 *
 *	csum_partial_copy_nocheck(src, dst, len)
 *	__csum_partial_copy_kernel(src, dst, len)
 *
 * See "Spec" in memcpy.S for details.  Unlike __copy_user, all
 * functions in this file use the standard calling convention.
 */
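/*
 * Approximate C-level signature of the nocheck variant (see asm/checksum.h
 * for the authoritative declarations):
 *
 *	__wsum csum_partial_copy_nocheck(const void *src, void *dst, int len);
 */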
#define src a0
#define dst a1
#define len a2
#define sum v0
#define odd t8

/*
 * All exception handlers simply return 0.
 */

/* Instruction type */
#define LD_INSN 1
#define ST_INSN 2
#define LEGACY_MODE 1
#define EVA_MODE    2
#define USEROP   1
#define KERNELOP 2

/*
 * Wrapper to add an entry in the exception table
 * in case the insn causes a memory exception.
 * Arguments:
 *	insn    : Load/store instruction
 *	type    : Instruction type
 *	reg     : Register
 *	addr    : Address
 *	handler : Exception handler
 */
#define EXC(insn, type, reg, addr)			\
	.if \mode == LEGACY_MODE;			\
9:		insn reg, addr;				\
		.section __ex_table,"a";		\
		PTR_WD	9b, .L_exc;			\
		.previous;				\
	/* This is enabled in EVA mode */		\
	.else;						\
		/* If loading from user or storing to user */	\
		.if ((\from == USEROP) && (type == LD_INSN)) || \
		    ((\to == USEROP) && (type == ST_INSN));	\
9:			__BUILD_EVA_INSN(insn##e, reg, addr);	\
			.section __ex_table,"a";	\
			PTR_WD	9b, .L_exc;		\
			.previous;			\
		.else;					\
			/* EVA without exception */	\
			insn reg, addr;			\
		.endif;					\
	.endif
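/*
 * In LEGACY_MODE the wrapper emits the plain instruction under a local label
 * and records a (faulting insn, .L_exc) pair in __ex_table, so a fault during
 * the access is redirected to .L_exc, which returns 0.  In EVA mode only the
 * accesses that actually touch user memory get the EVA form of the
 * instruction (via __BUILD_EVA_INSN) plus an exception table entry; purely
 * kernel-side accesses are emitted unprotected.
 */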
#undef LOAD

#ifdef USE_DOUBLE

#define LOADK	ld /* No exception */
#define LOAD(reg, addr)		EXC(ld, LD_INSN, reg, addr)
#define LOADBU(reg, addr)	EXC(lbu, LD_INSN, reg, addr)
#define LOADL(reg, addr)	EXC(ldl, LD_INSN, reg, addr)
#define LOADR(reg, addr)	EXC(ldr, LD_INSN, reg, addr)
#define STOREB(reg, addr)	EXC(sb, ST_INSN, reg, addr)
#define STOREL(reg, addr)	EXC(sdl, ST_INSN, reg, addr)
#define STORER(reg, addr)	EXC(sdr, ST_INSN, reg, addr)
#define STORE(reg, addr)	EXC(sd, ST_INSN, reg, addr)
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

#else

#define LOADK	lw /* No exception */
#define LOAD(reg, addr)		EXC(lw, LD_INSN, reg, addr)
#define LOADBU(reg, addr)	EXC(lbu, LD_INSN, reg, addr)
#define LOADL(reg, addr)	EXC(lwl, LD_INSN, reg, addr)
#define LOADR(reg, addr)	EXC(lwr, LD_INSN, reg, addr)
#define STOREB(reg, addr)	EXC(sb, ST_INSN, reg, addr)
#define STOREL(reg, addr)	EXC(swl, ST_INSN, reg, addr)
#define STORER(reg, addr)	EXC(swr, ST_INSN, reg, addr)
#define STORE(reg, addr)	EXC(sw, ST_INSN, reg, addr)
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST	LOADL
#define STFIRST STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#define SHIFT_DISCARD_REVERT SRLV
#else
#define LDFIRST LOADL
#define LDREST	LOADR
#define STFIRST STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#define SHIFT_DISCARD_REVERT SLLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)

#define ADDRMASK (NBYTES-1)

#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	.set	noat
#else
	.set	at=v1
#endif

	.macro __BUILD_CSUM_PARTIAL_COPY_USER mode, from, to

	li	sum, -1
	move	odd, zero
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	bnez	t2, .Lcopy_bytes_checklen\@
	and	t0, src, ADDRMASK
	andi	odd, dst, 0x1			/* odd buffer? */
	bnez	t1, .Ldst_unaligned\@
	nop
	bnez	t0, .Lsrc_unaligned_dst_aligned\@
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
.Lboth_aligned\@:
	SRL	t0, len, LOG_NBYTES+3		# +3 for 8 units/iter
	beqz	t0, .Lcleanup_both_aligned\@	# len < 8*NBYTES
	nop
	SUB	len, 8*NBYTES			# subtract here for bgez loop
	.align	4
1:
	LOAD(t0, UNIT(0)(src))
	LOAD(t1, UNIT(1)(src))
	LOAD(t2, UNIT(2)(src))
	LOAD(t3, UNIT(3)(src))
	LOAD(t4, UNIT(4)(src))
	LOAD(t5, UNIT(5)(src))
	LOAD(t6, UNIT(6)(src))
	LOAD(t7, UNIT(7)(src))
	SUB	len, len, 8*NBYTES
	ADD	src, src, 8*NBYTES
	STORE(t0, UNIT(0)(dst))
	ADDC(t0, t1)
	STORE(t1, UNIT(1)(dst))
	ADDC(sum, t0)
	STORE(t2, UNIT(2)(dst))
	ADDC(t2, t3)
	STORE(t3, UNIT(3)(dst))
	ADDC(sum, t2)
	STORE(t4, UNIT(4)(dst))
	ADDC(t4, t5)
	STORE(t5, UNIT(5)(dst))
	ADDC(sum, t4)
	STORE(t6, UNIT(6)(dst))
	ADDC(t6, t7)
	STORE(t7, UNIT(7)(dst))
	ADDC(sum, t6)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 8*NBYTES
	bgez	len, 1b
	.set	noreorder
	ADD	len, 8*NBYTES			# revert len (see above)

	/*
	 * len == the number of bytes left to copy < 8*NBYTES
	 */
.Lcleanup_both_aligned\@:
#define rem t7
	beqz	len, .Ldone\@
	sltu	t0, len, 4*NBYTES
	bnez	t0, .Lless_than_4units\@
	and	rem, len, (NBYTES-1)		# rem = len % NBYTES
	/*
	 * len >= 4*NBYTES
	 */
	LOAD(t0, UNIT(0)(src))
	LOAD(t1, UNIT(1)(src))
	LOAD(t2, UNIT(2)(src))
	LOAD(t3, UNIT(3)(src))
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
	STORE(t0, UNIT(0)(dst))
	ADDC(t0, t1)
	STORE(t1, UNIT(1)(dst))
	ADDC(sum, t0)
	STORE(t2, UNIT(2)(dst))
	ADDC(t2, t3)
	STORE(t3, UNIT(3)(dst))
	ADDC(sum, t2)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	beqz	len, .Ldone\@
	.set	noreorder
.Lless_than_4units\@:
	/*
	 * rem = len % NBYTES
	 */
	beq	rem, len, .Lcopy_bytes\@
	nop
1:
	LOAD(t0, 0(src))
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE(t0, 0(dst))
	ADDC(sum, t0)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	rem, len, 1b
	.set	noreorder

	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
	 * because can't assume read-access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
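	/*
	 * Example: with three bytes left, rem = 24 bits are kept, the other
	 * (8*NBYTES - 24) bits are shifted out of t0, STREST stores just
	 * those three bytes so the last one lands at dst + len - 1, and
	 * SHIFT_DISCARD_REVERT moves the kept bytes back to their original
	 * positions so the value added into the checksum matches what was
	 * stored.
	 */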
#define bits t2
	beqz	len, .Ldone\@
	ADD	t1, dst, len			# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3			# rem = number of bits to keep
	LOAD(t0, 0(src))
	SUB	bits, bits, rem			# bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
	STREST(t0, -1(t1))
	SHIFT_DISCARD_REVERT t0, t0, bits
	.set	reorder
	ADDC(sum, t0)
	b	.Ldone\@
	.set	noreorder
.Ldst_unaligned\@:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; T1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
#define match rem
	LDFIRST(t3, FIRST(0)(src))
	ADD	t2, zero, NBYTES
	LDREST(t3, REST(0)(src))
	SUB	t2, t2, t1			# t2 = number of bytes copied
	xor	match, t0, t1
	STFIRST(t3, FIRST(0)(dst))
	SLL	t4, t1, 3			# t4 = number of bits to discard
	SHIFT_DISCARD t3, t3, t4
	/* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
	ADDC(sum, t3)

	beq	len, t2, .Ldone\@
	SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, .Lboth_aligned\@
	ADD	src, src, t2

.Lsrc_unaligned_dst_aligned\@:
	SRL	t0, len, LOG_NBYTES+2		# +2 for 4 units/iter
	beqz	t0, .Lcleanup_src_unaligned\@
	and	rem, len, (4*NBYTES-1)		# rem = len % 4*NBYTES
1:
	/*
	 * Avoid consecutive LD*'s to the same register since some mips
	 * implementations can't issue them in the same cycle.
	 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
	 * are to the same unit (unless src is aligned, but it's not).
	 */
	LDFIRST(t0, FIRST(0)(src))
	LDFIRST(t1, FIRST(1)(src))
	SUB	len, len, 4*NBYTES
	LDREST(t0, REST(0)(src))
	LDREST(t1, REST(1)(src))
	LDFIRST(t2, FIRST(2)(src))
	LDFIRST(t3, FIRST(3)(src))
	LDREST(t2, REST(2)(src))
	LDREST(t3, REST(3)(src))
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop					# improves slotting
#endif
	STORE(t0, UNIT(0)(dst))
	ADDC(t0, t1)
	STORE(t1, UNIT(1)(dst))
	ADDC(sum, t0)
	STORE(t2, UNIT(2)(dst))
	ADDC(t2, t3)
	STORE(t3, UNIT(3)(dst))
	ADDC(sum, t2)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcleanup_src_unaligned\@:
	beqz	len, .Ldone\@
	and	rem, len, NBYTES-1		# rem = len % NBYTES
	beq	rem, len, .Lcopy_bytes\@
	nop
1:
	LDFIRST(t0, FIRST(0)(src))
	LDREST(t0, REST(0)(src))
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE(t0, 0(dst))
	ADDC(sum, t0)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcopy_bytes_checklen\@:
	beqz	len, .Ldone\@
	nop
.Lcopy_bytes\@:
	/* 0 < len < NBYTES */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define SHIFT_START 0
#define SHIFT_INC 8
#else
#define SHIFT_START 8*(NBYTES-1)
#define SHIFT_INC -8
#endif
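	/*
	 * Each COPY_BYTE stores one byte and also shifts it into its position
	 * within the partial word t2 (low byte first on little-endian, high
	 * byte first on big-endian), so the whole tail is folded into the
	 * checksum with a single ADDC at .Lcopy_bytes_done.
	 */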
	move	t2, zero			# partial word
	li	t3, SHIFT_START			# shift
#define COPY_BYTE(N)				\
	LOADBU(t0, N(src));			\
	SUB	len, len, 1;			\
	STOREB(t0, N(dst));			\
	SLLV	t0, t0, t3;			\
	addu	t3, SHIFT_INC;			\
	beqz	len, .Lcopy_bytes_done\@;	\
	or	t2, t0

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
	LOADBU(t0, NBYTES-2(src))
	SUB	len, len, 1
	STOREB(t0, NBYTES-2(dst))
	SLLV	t0, t0, t3
	or	t2, t0
.Lcopy_bytes_done\@:
	ADDC(sum, t2)
.Ldone\@:
	/* fold checksum */
	.set	push
	.set	noat
#ifdef USE_DOUBLE
	dsll32	v1, sum, 0
	daddu	sum, v1
	sltu	v1, sum, v1
	dsra32	sum, sum, 0
	addu	sum, v1
#endif

#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR5) || \
    defined(CONFIG_CPU_LOONGSON64)
	.set	push
	.set	arch=mips32r2
	wsbh	v1, sum
	movn	sum, v1, odd
	.set	pop
#else
	beqz	odd, 1f			/* odd buffer alignment? */
	lui	v1, 0x00ff
	addu	v1, 0x00ff
	and	t0, sum, v1
	sll	t0, t0, 8
	srl	sum, sum, 8
	and	sum, sum, v1
	or	sum, sum, t0
1:
#endif
	.set	pop
	.set	reorder
	jr	ra
	.set	noreorder
	.endm

	.set	noreorder
.L_exc:
	jr	ra
	li	v0, 0

FEXPORT(__csum_partial_copy_nocheck)
EXPORT_SYMBOL(__csum_partial_copy_nocheck)
#ifndef CONFIG_EVA
FEXPORT(__csum_partial_copy_to_user)
EXPORT_SYMBOL(__csum_partial_copy_to_user)
FEXPORT(__csum_partial_copy_from_user)
EXPORT_SYMBOL(__csum_partial_copy_from_user)
#endif
__BUILD_CSUM_PARTIAL_COPY_USER LEGACY_MODE USEROP USEROP

#ifdef CONFIG_EVA
LEAF(__csum_partial_copy_to_user)
__BUILD_CSUM_PARTIAL_COPY_USER EVA_MODE KERNELOP USEROP
END(__csum_partial_copy_to_user)

LEAF(__csum_partial_copy_from_user)
__BUILD_CSUM_PARTIAL_COPY_USER EVA_MODE USEROP KERNELOP
END(__csum_partial_copy_from_user)
#endif