checksum_64.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas ([email protected])
 *
 * Severely hacked about by Paul Mackerras ([email protected]).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
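/*
 * A rough C model of what this routine computes (a sketch for reference
 * only, not part of the build; the names follow the register comments
 * above, and the deferred adde/addze carries are modelled as an
 * end-around carry on each step):
 *
 *	u64 s = sum;
 *	while (len >= 8) {
 *		u64 w = *(const u64 *)buff;
 *		s += w;
 *		if (s < w)		// end-around carry (adde)
 *			s++;
 *		buff += 8;
 *		len -= 8;
 *	}
 *	// ...then the 4/2/1 byte tails, as at .Lcsum_tail_*...
 *	// fold the 64-bit sum to 32 bits, as at .Lcsum_finish:
 *	return (u32)(((s << 32 | s >> 32) + s) >> 32);
 */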
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

/*
 * If only halfword aligned, align to a double word. Since odd
 * aligned addresses should be rare and they would require more
 * work to calculate the correct checksum, we ignore that case
 * and take the potential slowdown of unaligned loads.
 */
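/*
 * For example (a worked case, not from the original comment): if buff
 * is 2 mod 8, (buff >> 1) & 0x3 is 1, so the loop below consumes
 * 4 - 1 = 3 halfwords (6 bytes) and leaves r3 on the next doubleword
 * boundary; 4 mod 8 and 6 mod 8 take 2 and 1 halfwords respectively.
 */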
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
/*
 * We unroll the loop such that each iteration is 64 bytes with an
 * entry and exit limb of 64 bytes, meaning a minimum size of
 * 128 bytes.
 */
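/*
 * For example (illustrative numbers, not from the original comment):
 * with len = 200, len >> 7 is non-zero so this path is taken;
 * ctr = (200 >> 6) - 1 = 2, so the loop plus the exit limb cover
 * (2 + 1) * 64 = 192 bytes, leaving 200 & 63 = 8 bytes for
 * .Lcsum_tail_doublewords.
 */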
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords	/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)
	ld	r10,16(r3)
	ld	r11,24(r3)

/*
 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
 * because of the XER dependency. This means the fastest this loop can
 * go is 16 cycles per iteration. The scheduling of the loop below has
 * been shown to hit this on both POWER6 and POWER7.
 */
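/*
 * (Eight adde instructions per 64-byte iteration at 2 cycles each give
 * the 16-cycle figure above, i.e. a best case of about 4 bytes per
 * cycle for this loop.)
 */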
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b

	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
EXPORT_SYMBOL(__csum_partial)

	.macro srcnr
100:
	EX_TABLE(100b,.Lerror_nr)
	.endm

	.macro source
150:
	EX_TABLE(150b,.Lerror)
	.endm

	.macro dstnr
200:
	EX_TABLE(200b,.Lerror_nr)
	.endm

	.macro dest
250:
	EX_TABLE(250b,.Lerror)
	.endm
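/*
 * (Descriptive note: each macro above drops a numbered local label
 * immediately in front of the access it is paired with, written as
 * "srcnr; lhz ...", "dest; std ..." and so on below, and records an
 * exception-table entry for it. A fault on that load or store then
 * branches to .Lerror, used inside the unrolled loop where the saved
 * non-volatile registers and stack frame must be unwound, or to
 * .Lerror_nr, where there is nothing to restore.)
 */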

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in 0xffffffff (32-bit), while copying the block to dst.
 * If an access exception occurs, it returns 0.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len)
 */
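/*
 * A rough C model (a sketch for reference only, not part of the build;
 * fault handling via the exception table is elided):
 *
 *	u64 s = 0xffffffff;
 *	while (len >= 8) {
 *		u64 w = *(const u64 *)src;
 *		s += w;
 *		if (s < w)		// end-around carry (adde)
 *			s++;
 *		*(u64 *)dst = w;	// copy as we sum
 *		src += 8;
 *		dst += 8;
 *		len -= 8;
 *	}
 *	// ...4/2/1 byte tails as at .Lcopy_tail_*, then the same
 *	// 64-bit to 32-bit fold as in __csum_partial; a faulting
 *	// access instead returns 0 via .Lerror / .Lerror_nr.
 */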
_GLOBAL(csum_partial_copy_generic)
	li	r6,-1
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

/*
 * If only halfword aligned, align to a double word. Since odd
 * aligned addresses should be rare and they would require more
 * work to calculate the correct checksum, we ignore that case
 * and take the potential slowdown of unaligned loads.
 *
 * If the source and destination are relatively unaligned we only
 * align the source. This keeps things simple.
 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
/*
 * We unroll the loop such that each iteration is 64 bytes with an
 * entry and exit limb of 64 bytes, meaning a minimum size of
 * 128 bytes.
 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords	/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

/*
 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
 * because of the XER dependency. This means the fastest this loop can
 * go is 16 cycles per iteration. The scheduling of the loop below has
 * been shown to hit this on both POWER6 and POWER7.
 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b

	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lerror:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lerror_nr:
	li	r3,0
	blr

EXPORT_SYMBOL(csum_partial_copy_generic)

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *			   const struct in6_addr *daddr,
 *			   __u32 len, __u8 proto, __wsum sum)
 */
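/*
 * A rough C model of the pseudo-header sum computed below (a sketch for
 * reference only, not part of the build; the little-endian rotation of
 * len + proto into the right byte lane is elided, and the addresses are
 * simply treated as pairs of 64-bit words as the ld instructions do):
 *
 *	const u64 *a = (const u64 *)saddr, *d = (const u64 *)daddr;
 *	u64 s = a[0], t;
 *	t = a[1];                    s += t; if (s < t) s++;
 *	t = d[0];                    s += t; if (s < t) s++;
 *	t = d[1];                    s += t; if (s < t) s++;
 *	t = (u64)len + proto + sum;  s += t; if (s < t) s++;
 *	// fold s to 32 bits, then to 16 bits, and return the one's
 *	// complement of the low 16 bits as the final __sum16.
 */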
_GLOBAL(csum_ipv6_magic)
	ld	r8, 0(r3)
	ld	r9, 8(r3)
	add	r5, r5, r6
	addc	r0, r8, r9
	ld	r10, 0(r4)
	ld	r11, 8(r4)
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	rotldi	r5, r5, 8
#endif
	adde	r0, r0, r10
	add	r5, r5, r7
	adde	r0, r0, r11
	adde	r0, r0, r5
	addze	r0, r0

	rotldi	r3, r0, 32		/* fold two 32 bit halves together */
	add	r3, r0, r3
	srdi	r0, r3, 32

	rotlwi	r3, r0, 16		/* fold two 16 bit halves together */
	add	r3, r0, r3

	not	r3, r3
	rlwinm	r3, r3, 16, 16, 31

	blr
EXPORT_SYMBOL(csum_ipv6_magic)