checksum.S 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357
  1. /* SPDX-License-Identifier: GPL-2.0-or-later */
  2. /*
  3. * INET An implementation of the TCP/IP protocol suite for the LINUX
  4. * operating system. INET is implemented using the BSD Socket
  5. * interface as the means of communication with the user level.
  6. *
  7. * IP/TCP/UDP checksumming routines
  8. *
  9. * Xtensa version: Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
  10. * Optimized by Joe Taylor
  11. */
  12. #include <linux/errno.h>
  13. #include <linux/linkage.h>
  14. #include <asm/asmmacro.h>
  15. #include <asm/core.h>
  16. /*
  17. * computes a partial checksum, e.g. for TCP/UDP fragments
  18. */
  19. /*
  20. * unsigned int csum_partial(const unsigned char *buf, int len,
  21. * unsigned int sum);
  22. * a2 = buf
  23. * a3 = len
  24. * a4 = sum
  25. *
  26. * This function assumes 2- or 4-byte alignment. Other alignments will fail!
  27. */
  28. /* ONES_ADD converts twos-complement math to ones-complement. */
  29. #define ONES_ADD(sum, val) \
  30. add sum, sum, val ; \
  31. bgeu sum, val, 99f ; \
  32. addi sum, sum, 1 ; \
  33. 99: ;
  34. .text
  35. ENTRY(csum_partial)
  36. /*
  37. * Experiments with Ethernet and SLIP connections show that buf
  38. * is aligned on either a 2-byte or 4-byte boundary.
  39. */
  40. abi_entry_default
  41. extui a5, a2, 0, 2
  42. bnez a5, 8f /* branch if 2-byte aligned */
  43. /* Fall-through on common case, 4-byte alignment */
  44. 1:
  45. srli a5, a3, 5 /* 32-byte chunks */
  46. #if XCHAL_HAVE_LOOPS
  47. loopgtz a5, 2f
  48. #else
  49. beqz a5, 2f
  50. slli a5, a5, 5
  51. add a5, a5, a2 /* a5 = end of last 32-byte chunk */
  52. .Loop1:
  53. #endif
  54. l32i a6, a2, 0
  55. l32i a7, a2, 4
  56. ONES_ADD(a4, a6)
  57. ONES_ADD(a4, a7)
  58. l32i a6, a2, 8
  59. l32i a7, a2, 12
  60. ONES_ADD(a4, a6)
  61. ONES_ADD(a4, a7)
  62. l32i a6, a2, 16
  63. l32i a7, a2, 20
  64. ONES_ADD(a4, a6)
  65. ONES_ADD(a4, a7)
  66. l32i a6, a2, 24
  67. l32i a7, a2, 28
  68. ONES_ADD(a4, a6)
  69. ONES_ADD(a4, a7)
  70. addi a2, a2, 4*8
  71. #if !XCHAL_HAVE_LOOPS
  72. blt a2, a5, .Loop1
  73. #endif
  74. 2:
  75. extui a5, a3, 2, 3 /* remaining 4-byte chunks */
  76. #if XCHAL_HAVE_LOOPS
  77. loopgtz a5, 3f
  78. #else
  79. beqz a5, 3f
  80. slli a5, a5, 2
  81. add a5, a5, a2 /* a5 = end of last 4-byte chunk */
  82. .Loop2:
  83. #endif
  84. l32i a6, a2, 0
  85. ONES_ADD(a4, a6)
  86. addi a2, a2, 4
  87. #if !XCHAL_HAVE_LOOPS
  88. blt a2, a5, .Loop2
  89. #endif
  90. 3:
  91. _bbci.l a3, 1, 5f /* remaining 2-byte chunk */
  92. l16ui a6, a2, 0
  93. ONES_ADD(a4, a6)
  94. addi a2, a2, 2
  95. 5:
  96. _bbci.l a3, 0, 7f /* remaining 1-byte chunk */
  97. 6: l8ui a6, a2, 0
  98. #ifdef __XTENSA_EB__
  99. slli a6, a6, 8 /* load byte into bits 8..15 */
  100. #endif
  101. ONES_ADD(a4, a6)
  102. 7:
  103. mov a2, a4
  104. abi_ret_default
  105. /* uncommon case, buf is 2-byte aligned */
  106. 8:
  107. beqz a3, 7b /* branch if len == 0 */
  108. beqi a3, 1, 6b /* branch if len == 1 */
  109. extui a5, a2, 0, 1
  110. bnez a5, 8f /* branch if 1-byte aligned */
  111. l16ui a6, a2, 0 /* common case, len >= 2 */
  112. ONES_ADD(a4, a6)
  113. addi a2, a2, 2 /* adjust buf */
  114. addi a3, a3, -2 /* adjust len */
  115. j 1b /* now buf is 4-byte aligned */
  116. /* case: odd-byte aligned, len > 1
  117. * This case is dog slow, so don't give us an odd address.
  118. * (I don't think this ever happens, but just in case.)
  119. */
  120. 8:
  121. srli a5, a3, 2 /* 4-byte chunks */
  122. #if XCHAL_HAVE_LOOPS
  123. loopgtz a5, 2f
  124. #else
  125. beqz a5, 2f
  126. slli a5, a5, 2
  127. add a5, a5, a2 /* a5 = end of last 4-byte chunk */
  128. .Loop3:
  129. #endif
  130. l8ui a6, a2, 0 /* bits 24..31 */
  131. l16ui a7, a2, 1 /* bits 8..23 */
  132. l8ui a8, a2, 3 /* bits 0.. 8 */
  133. #ifdef __XTENSA_EB__
  134. slli a6, a6, 24
  135. #else
  136. slli a8, a8, 24
  137. #endif
  138. slli a7, a7, 8
  139. or a7, a7, a6
  140. or a7, a7, a8
  141. ONES_ADD(a4, a7)
  142. addi a2, a2, 4
  143. #if !XCHAL_HAVE_LOOPS
  144. blt a2, a5, .Loop3
  145. #endif
  146. 2:
  147. _bbci.l a3, 1, 3f /* remaining 2-byte chunk, still odd addr */
  148. l8ui a6, a2, 0
  149. l8ui a7, a2, 1
  150. #ifdef __XTENSA_EB__
  151. slli a6, a6, 8
  152. #else
  153. slli a7, a7, 8
  154. #endif
  155. or a7, a7, a6
  156. ONES_ADD(a4, a7)
  157. addi a2, a2, 2
  158. 3:
  159. j 5b /* branch to handle the remaining byte */
  160. ENDPROC(csum_partial)
  161. /*
  162. * Copy from ds while checksumming, otherwise like csum_partial
  163. */
  164. /*
  165. unsigned int csum_partial_copy_generic (const char *src, char *dst, int len)
  166. a2 = src
  167. a3 = dst
  168. a4 = len
  169. a5 = sum
  170. a8 = temp
  171. a9 = temp
  172. a10 = temp
  173. This function is optimized for 4-byte aligned addresses. Other
  174. alignments work, but not nearly as efficiently.
  175. */
  176. ENTRY(csum_partial_copy_generic)
  177. abi_entry_default
  178. movi a5, -1
  179. or a10, a2, a3
  180. /* We optimize the following alignment tests for the 4-byte
  181. aligned case. Two bbsi.l instructions might seem more optimal
  182. (commented out below). However, both labels 5: and 3: are out
  183. of the imm8 range, so the assembler relaxes them into
  184. equivalent bbci.l, j combinations, which is actually
  185. slower. */
  186. extui a9, a10, 0, 2
  187. beqz a9, 1f /* branch if both are 4-byte aligned */
  188. bbsi.l a10, 0, 5f /* branch if one address is odd */
  189. j 3f /* one address is 2-byte aligned */
  190. /* _bbsi.l a10, 0, 5f */ /* branch if odd address */
  191. /* _bbsi.l a10, 1, 3f */ /* branch if 2-byte-aligned address */
  192. 1:
  193. /* src and dst are both 4-byte aligned */
  194. srli a10, a4, 5 /* 32-byte chunks */
  195. #if XCHAL_HAVE_LOOPS
  196. loopgtz a10, 2f
  197. #else
  198. beqz a10, 2f
  199. slli a10, a10, 5
  200. add a10, a10, a2 /* a10 = end of last 32-byte src chunk */
  201. .Loop5:
  202. #endif
  203. EX(10f) l32i a9, a2, 0
  204. EX(10f) l32i a8, a2, 4
  205. EX(10f) s32i a9, a3, 0
  206. EX(10f) s32i a8, a3, 4
  207. ONES_ADD(a5, a9)
  208. ONES_ADD(a5, a8)
  209. EX(10f) l32i a9, a2, 8
  210. EX(10f) l32i a8, a2, 12
  211. EX(10f) s32i a9, a3, 8
  212. EX(10f) s32i a8, a3, 12
  213. ONES_ADD(a5, a9)
  214. ONES_ADD(a5, a8)
  215. EX(10f) l32i a9, a2, 16
  216. EX(10f) l32i a8, a2, 20
  217. EX(10f) s32i a9, a3, 16
  218. EX(10f) s32i a8, a3, 20
  219. ONES_ADD(a5, a9)
  220. ONES_ADD(a5, a8)
  221. EX(10f) l32i a9, a2, 24
  222. EX(10f) l32i a8, a2, 28
  223. EX(10f) s32i a9, a3, 24
  224. EX(10f) s32i a8, a3, 28
  225. ONES_ADD(a5, a9)
  226. ONES_ADD(a5, a8)
  227. addi a2, a2, 32
  228. addi a3, a3, 32
  229. #if !XCHAL_HAVE_LOOPS
  230. blt a2, a10, .Loop5
  231. #endif
  232. 2:
  233. extui a10, a4, 2, 3 /* remaining 4-byte chunks */
  234. extui a4, a4, 0, 2 /* reset len for general-case, 2-byte chunks */
  235. #if XCHAL_HAVE_LOOPS
  236. loopgtz a10, 3f
  237. #else
  238. beqz a10, 3f
  239. slli a10, a10, 2
  240. add a10, a10, a2 /* a10 = end of last 4-byte src chunk */
  241. .Loop6:
  242. #endif
  243. EX(10f) l32i a9, a2, 0
  244. EX(10f) s32i a9, a3, 0
  245. ONES_ADD(a5, a9)
  246. addi a2, a2, 4
  247. addi a3, a3, 4
  248. #if !XCHAL_HAVE_LOOPS
  249. blt a2, a10, .Loop6
  250. #endif
  251. 3:
  252. /*
  253. Control comes to here in two cases: (1) It may fall through
  254. to here from the 4-byte alignment case to process, at most,
  255. one 2-byte chunk. (2) It branches to here from above if
  256. either src or dst is 2-byte aligned, and we process all bytes
  257. here, except for perhaps a trailing odd byte. It's
  258. inefficient, so align your addresses to 4-byte boundaries.
  259. a2 = src
  260. a3 = dst
  261. a4 = len
  262. a5 = sum
  263. */
  264. srli a10, a4, 1 /* 2-byte chunks */
  265. #if XCHAL_HAVE_LOOPS
  266. loopgtz a10, 4f
  267. #else
  268. beqz a10, 4f
  269. slli a10, a10, 1
  270. add a10, a10, a2 /* a10 = end of last 2-byte src chunk */
  271. .Loop7:
  272. #endif
  273. EX(10f) l16ui a9, a2, 0
  274. EX(10f) s16i a9, a3, 0
  275. ONES_ADD(a5, a9)
  276. addi a2, a2, 2
  277. addi a3, a3, 2
  278. #if !XCHAL_HAVE_LOOPS
  279. blt a2, a10, .Loop7
  280. #endif
  281. 4:
  282. /* This section processes a possible trailing odd byte. */
  283. _bbci.l a4, 0, 8f /* 1-byte chunk */
  284. EX(10f) l8ui a9, a2, 0
  285. EX(10f) s8i a9, a3, 0
  286. #ifdef __XTENSA_EB__
  287. slli a9, a9, 8 /* shift byte to bits 8..15 */
  288. #endif
  289. ONES_ADD(a5, a9)
  290. 8:
  291. mov a2, a5
  292. abi_ret_default
  293. 5:
  294. /* Control branch to here when either src or dst is odd. We
  295. process all bytes using 8-bit accesses. Grossly inefficient,
  296. so don't feed us an odd address. */
  297. srli a10, a4, 1 /* handle in pairs for 16-bit csum */
  298. #if XCHAL_HAVE_LOOPS
  299. loopgtz a10, 6f
  300. #else
  301. beqz a10, 6f
  302. slli a10, a10, 1
  303. add a10, a10, a2 /* a10 = end of last odd-aligned, 2-byte src chunk */
  304. .Loop8:
  305. #endif
  306. EX(10f) l8ui a9, a2, 0
  307. EX(10f) l8ui a8, a2, 1
  308. EX(10f) s8i a9, a3, 0
  309. EX(10f) s8i a8, a3, 1
  310. #ifdef __XTENSA_EB__
  311. slli a9, a9, 8 /* combine into a single 16-bit value */
  312. #else /* for checksum computation */
  313. slli a8, a8, 8
  314. #endif
  315. or a9, a9, a8
  316. ONES_ADD(a5, a9)
  317. addi a2, a2, 2
  318. addi a3, a3, 2
  319. #if !XCHAL_HAVE_LOOPS
  320. blt a2, a10, .Loop8
  321. #endif
  322. 6:
  323. j 4b /* process the possible trailing odd byte */
  324. ENDPROC(csum_partial_copy_generic)
  325. # Exception handler:
  326. .section .fixup, "ax"
  327. 10:
  328. movi a2, 0
  329. abi_ret_default
  330. .previous