/* fastcopy.S */
/*
 * Copyright (C) 2008-2009 Michal Simek <[email protected]>
 * Copyright (C) 2008-2009 PetaLogix
 * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License.  See the file COPYING in the main directory of this
 * archive for more details.
 *
 * Written by Jim Law <[email protected]>
 *
 * intended to replace:
 *	memcpy in memcpy.c and
 *	memmove in memmove.c
 * ... in arch/microblaze/lib
 *
 *
 * assly_fastcopy.S
 *
 * Attempt at quicker memcpy and memmove for MicroBlaze
 *	Input : Operand1 in Reg r5 - destination address
 *		Operand2 in Reg r6 - source address
 *		Operand3 in Reg r7 - number of bytes to transfer
 *	Output: Result in Reg r3 - starting destination address
 *
 *
 * Explanation:
 *	Perform (possibly unaligned) copy of a block of memory
 *	between mem locations with size of xfer spec'd in bytes
 */
  31. #include <linux/linkage.h>
  32. .text
  33. .globl memcpy
  34. .type memcpy, @function
  35. .ent memcpy
  36. memcpy:
  37. fast_memcpy_ascending:
  38. /* move d to return register as value of function */
  39. addi r3, r5, 0
  40. addi r4, r0, 4 /* n = 4 */
  41. cmpu r4, r4, r7 /* n = c - n (unsigned) */
  42. blti r4, a_xfer_end /* if n < 0, less than one word to transfer */
  43. /* transfer first 0~3 bytes to get aligned dest address */
  44. andi r4, r5, 3 /* n = d & 3 */
  45. /* if zero, destination already aligned */
  46. beqi r4, a_dalign_done
  47. /* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
  48. rsubi r4, r4, 4
  49. rsub r7, r4, r7 /* c = c - n adjust c */
  50. a_xfer_first_loop:
  51. /* if no bytes left to transfer, transfer the bulk */
  52. beqi r4, a_dalign_done
  53. lbui r11, r6, 0 /* h = *s */
  54. sbi r11, r5, 0 /* *d = h */
  55. addi r6, r6, 1 /* s++ */
  56. addi r5, r5, 1 /* d++ */
  57. brid a_xfer_first_loop /* loop */
  58. addi r4, r4, -1 /* n-- (IN DELAY SLOT) */
  59. a_dalign_done:
  60. addi r4, r0, 32 /* n = 32 */
  61. cmpu r4, r4, r7 /* n = c - n (unsigned) */
  62. /* if n < 0, less than one block to transfer */
  63. blti r4, a_block_done
  64. a_block_xfer:
  65. andi r4, r7, 0xffffffe0 /* n = c & ~31 */
  66. rsub r7, r4, r7 /* c = c - n */
  67. andi r9, r6, 3 /* t1 = s & 3 */
  68. /* if temp != 0, unaligned transfers needed */
  69. bnei r9, a_block_unaligned
  70. a_block_aligned:
  71. lwi r9, r6, 0 /* t1 = *(s + 0) */
  72. lwi r10, r6, 4 /* t2 = *(s + 4) */
  73. lwi r11, r6, 8 /* t3 = *(s + 8) */
  74. lwi r12, r6, 12 /* t4 = *(s + 12) */
  75. swi r9, r5, 0 /* *(d + 0) = t1 */
  76. swi r10, r5, 4 /* *(d + 4) = t2 */
  77. swi r11, r5, 8 /* *(d + 8) = t3 */
  78. swi r12, r5, 12 /* *(d + 12) = t4 */
  79. lwi r9, r6, 16 /* t1 = *(s + 16) */
  80. lwi r10, r6, 20 /* t2 = *(s + 20) */
  81. lwi r11, r6, 24 /* t3 = *(s + 24) */
  82. lwi r12, r6, 28 /* t4 = *(s + 28) */
  83. swi r9, r5, 16 /* *(d + 16) = t1 */
  84. swi r10, r5, 20 /* *(d + 20) = t2 */
  85. swi r11, r5, 24 /* *(d + 24) = t3 */
  86. swi r12, r5, 28 /* *(d + 28) = t4 */
  87. addi r6, r6, 32 /* s = s + 32 */
  88. addi r4, r4, -32 /* n = n - 32 */
  89. bneid r4, a_block_aligned /* while (n) loop */
  90. addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
  91. bri a_block_done
  92. a_block_unaligned:
  93. andi r8, r6, 0xfffffffc /* as = s & ~3 */
  94. add r6, r6, r4 /* s = s + n */
  95. lwi r11, r8, 0 /* h = *(as + 0) */
  96. addi r9, r9, -1
  97. beqi r9, a_block_u1 /* t1 was 1 => 1 byte offset */
  98. addi r9, r9, -1
  99. beqi r9, a_block_u2 /* t1 was 2 => 2 byte offset */
  100. a_block_u3:
  101. bslli r11, r11, 24 /* h = h << 24 */
  102. a_bu3_loop:
  103. lwi r12, r8, 4 /* v = *(as + 4) */
  104. bsrli r9, r12, 8 /* t1 = v >> 8 */
  105. or r9, r11, r9 /* t1 = h | t1 */
  106. swi r9, r5, 0 /* *(d + 0) = t1 */
  107. bslli r11, r12, 24 /* h = v << 24 */
  108. lwi r12, r8, 8 /* v = *(as + 8) */
  109. bsrli r9, r12, 8 /* t1 = v >> 8 */
  110. or r9, r11, r9 /* t1 = h | t1 */
  111. swi r9, r5, 4 /* *(d + 4) = t1 */
  112. bslli r11, r12, 24 /* h = v << 24 */
  113. lwi r12, r8, 12 /* v = *(as + 12) */
  114. bsrli r9, r12, 8 /* t1 = v >> 8 */
  115. or r9, r11, r9 /* t1 = h | t1 */
  116. swi r9, r5, 8 /* *(d + 8) = t1 */
  117. bslli r11, r12, 24 /* h = v << 24 */
  118. lwi r12, r8, 16 /* v = *(as + 16) */
  119. bsrli r9, r12, 8 /* t1 = v >> 8 */
  120. or r9, r11, r9 /* t1 = h | t1 */
  121. swi r9, r5, 12 /* *(d + 12) = t1 */
  122. bslli r11, r12, 24 /* h = v << 24 */
  123. lwi r12, r8, 20 /* v = *(as + 20) */
  124. bsrli r9, r12, 8 /* t1 = v >> 8 */
  125. or r9, r11, r9 /* t1 = h | t1 */
  126. swi r9, r5, 16 /* *(d + 16) = t1 */
  127. bslli r11, r12, 24 /* h = v << 24 */
  128. lwi r12, r8, 24 /* v = *(as + 24) */
  129. bsrli r9, r12, 8 /* t1 = v >> 8 */
  130. or r9, r11, r9 /* t1 = h | t1 */
  131. swi r9, r5, 20 /* *(d + 20) = t1 */
  132. bslli r11, r12, 24 /* h = v << 24 */
  133. lwi r12, r8, 28 /* v = *(as + 28) */
  134. bsrli r9, r12, 8 /* t1 = v >> 8 */
  135. or r9, r11, r9 /* t1 = h | t1 */
  136. swi r9, r5, 24 /* *(d + 24) = t1 */
  137. bslli r11, r12, 24 /* h = v << 24 */
  138. lwi r12, r8, 32 /* v = *(as + 32) */
  139. bsrli r9, r12, 8 /* t1 = v >> 8 */
  140. or r9, r11, r9 /* t1 = h | t1 */
  141. swi r9, r5, 28 /* *(d + 28) = t1 */
  142. bslli r11, r12, 24 /* h = v << 24 */
  143. addi r8, r8, 32 /* as = as + 32 */
  144. addi r4, r4, -32 /* n = n - 32 */
  145. bneid r4, a_bu3_loop /* while (n) loop */
  146. addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
  147. bri a_block_done
  148. a_block_u1:
  149. bslli r11, r11, 8 /* h = h << 8 */
  150. a_bu1_loop:
  151. lwi r12, r8, 4 /* v = *(as + 4) */
  152. bsrli r9, r12, 24 /* t1 = v >> 24 */
  153. or r9, r11, r9 /* t1 = h | t1 */
  154. swi r9, r5, 0 /* *(d + 0) = t1 */
  155. bslli r11, r12, 8 /* h = v << 8 */
  156. lwi r12, r8, 8 /* v = *(as + 8) */
  157. bsrli r9, r12, 24 /* t1 = v >> 24 */
  158. or r9, r11, r9 /* t1 = h | t1 */
  159. swi r9, r5, 4 /* *(d + 4) = t1 */
  160. bslli r11, r12, 8 /* h = v << 8 */
  161. lwi r12, r8, 12 /* v = *(as + 12) */
  162. bsrli r9, r12, 24 /* t1 = v >> 24 */
  163. or r9, r11, r9 /* t1 = h | t1 */
  164. swi r9, r5, 8 /* *(d + 8) = t1 */
  165. bslli r11, r12, 8 /* h = v << 8 */
  166. lwi r12, r8, 16 /* v = *(as + 16) */
  167. bsrli r9, r12, 24 /* t1 = v >> 24 */
  168. or r9, r11, r9 /* t1 = h | t1 */
  169. swi r9, r5, 12 /* *(d + 12) = t1 */
  170. bslli r11, r12, 8 /* h = v << 8 */
  171. lwi r12, r8, 20 /* v = *(as + 20) */
  172. bsrli r9, r12, 24 /* t1 = v >> 24 */
  173. or r9, r11, r9 /* t1 = h | t1 */
  174. swi r9, r5, 16 /* *(d + 16) = t1 */
  175. bslli r11, r12, 8 /* h = v << 8 */
  176. lwi r12, r8, 24 /* v = *(as + 24) */
  177. bsrli r9, r12, 24 /* t1 = v >> 24 */
  178. or r9, r11, r9 /* t1 = h | t1 */
  179. swi r9, r5, 20 /* *(d + 20) = t1 */
  180. bslli r11, r12, 8 /* h = v << 8 */
  181. lwi r12, r8, 28 /* v = *(as + 28) */
  182. bsrli r9, r12, 24 /* t1 = v >> 24 */
  183. or r9, r11, r9 /* t1 = h | t1 */
  184. swi r9, r5, 24 /* *(d + 24) = t1 */
  185. bslli r11, r12, 8 /* h = v << 8 */
  186. lwi r12, r8, 32 /* v = *(as + 32) */
  187. bsrli r9, r12, 24 /* t1 = v >> 24 */
  188. or r9, r11, r9 /* t1 = h | t1 */
  189. swi r9, r5, 28 /* *(d + 28) = t1 */
  190. bslli r11, r12, 8 /* h = v << 8 */
  191. addi r8, r8, 32 /* as = as + 32 */
  192. addi r4, r4, -32 /* n = n - 32 */
  193. bneid r4, a_bu1_loop /* while (n) loop */
  194. addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
  195. bri a_block_done
  196. a_block_u2:
  197. bslli r11, r11, 16 /* h = h << 16 */
  198. a_bu2_loop:
  199. lwi r12, r8, 4 /* v = *(as + 4) */
  200. bsrli r9, r12, 16 /* t1 = v >> 16 */
  201. or r9, r11, r9 /* t1 = h | t1 */
  202. swi r9, r5, 0 /* *(d + 0) = t1 */
  203. bslli r11, r12, 16 /* h = v << 16 */
  204. lwi r12, r8, 8 /* v = *(as + 8) */
  205. bsrli r9, r12, 16 /* t1 = v >> 16 */
  206. or r9, r11, r9 /* t1 = h | t1 */
  207. swi r9, r5, 4 /* *(d + 4) = t1 */
  208. bslli r11, r12, 16 /* h = v << 16 */
  209. lwi r12, r8, 12 /* v = *(as + 12) */
  210. bsrli r9, r12, 16 /* t1 = v >> 16 */
  211. or r9, r11, r9 /* t1 = h | t1 */
  212. swi r9, r5, 8 /* *(d + 8) = t1 */
  213. bslli r11, r12, 16 /* h = v << 16 */
  214. lwi r12, r8, 16 /* v = *(as + 16) */
  215. bsrli r9, r12, 16 /* t1 = v >> 16 */
  216. or r9, r11, r9 /* t1 = h | t1 */
  217. swi r9, r5, 12 /* *(d + 12) = t1 */
  218. bslli r11, r12, 16 /* h = v << 16 */
  219. lwi r12, r8, 20 /* v = *(as + 20) */
  220. bsrli r9, r12, 16 /* t1 = v >> 16 */
  221. or r9, r11, r9 /* t1 = h | t1 */
  222. swi r9, r5, 16 /* *(d + 16) = t1 */
  223. bslli r11, r12, 16 /* h = v << 16 */
  224. lwi r12, r8, 24 /* v = *(as + 24) */
  225. bsrli r9, r12, 16 /* t1 = v >> 16 */
  226. or r9, r11, r9 /* t1 = h | t1 */
  227. swi r9, r5, 20 /* *(d + 20) = t1 */
  228. bslli r11, r12, 16 /* h = v << 16 */
  229. lwi r12, r8, 28 /* v = *(as + 28) */
  230. bsrli r9, r12, 16 /* t1 = v >> 16 */
  231. or r9, r11, r9 /* t1 = h | t1 */
  232. swi r9, r5, 24 /* *(d + 24) = t1 */
  233. bslli r11, r12, 16 /* h = v << 16 */
  234. lwi r12, r8, 32 /* v = *(as + 32) */
  235. bsrli r9, r12, 16 /* t1 = v >> 16 */
  236. or r9, r11, r9 /* t1 = h | t1 */
  237. swi r9, r5, 28 /* *(d + 28) = t1 */
  238. bslli r11, r12, 16 /* h = v << 16 */
  239. addi r8, r8, 32 /* as = as + 32 */
  240. addi r4, r4, -32 /* n = n - 32 */
  241. bneid r4, a_bu2_loop /* while (n) loop */
  242. addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
  243. a_block_done:
  244. addi r4, r0, 4 /* n = 4 */
  245. cmpu r4, r4, r7 /* n = c - n (unsigned) */
  246. blti r4, a_xfer_end /* if n < 0, less than one word to transfer */
  247. a_word_xfer:
  248. andi r4, r7, 0xfffffffc /* n = c & ~3 */
  249. addi r10, r0, 0 /* offset = 0 */
  250. andi r9, r6, 3 /* t1 = s & 3 */
  251. /* if temp != 0, unaligned transfers needed */
  252. bnei r9, a_word_unaligned
  253. a_word_aligned:
  254. lw r9, r6, r10 /* t1 = *(s+offset) */
  255. sw r9, r5, r10 /* *(d+offset) = t1 */
  256. addi r4, r4,-4 /* n-- */
  257. bneid r4, a_word_aligned /* loop */
  258. addi r10, r10, 4 /* offset++ (IN DELAY SLOT) */
  259. bri a_word_done
  260. a_word_unaligned:
  261. andi r8, r6, 0xfffffffc /* as = s & ~3 */
  262. lwi r11, r8, 0 /* h = *(as + 0) */
  263. addi r8, r8, 4 /* as = as + 4 */
  264. addi r9, r9, -1
  265. beqi r9, a_word_u1 /* t1 was 1 => 1 byte offset */
  266. addi r9, r9, -1
  267. beqi r9, a_word_u2 /* t1 was 2 => 2 byte offset */
  268. a_word_u3:
  269. bslli r11, r11, 24 /* h = h << 24 */
  270. a_wu3_loop:
  271. lw r12, r8, r10 /* v = *(as + offset) */
  272. bsrli r9, r12, 8 /* t1 = v >> 8 */
  273. or r9, r11, r9 /* t1 = h | t1 */
  274. sw r9, r5, r10 /* *(d + offset) = t1 */
  275. bslli r11, r12, 24 /* h = v << 24 */
  276. addi r4, r4,-4 /* n = n - 4 */
  277. bneid r4, a_wu3_loop /* while (n) loop */
  278. addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */
  279. bri a_word_done
  280. a_word_u1:
  281. bslli r11, r11, 8 /* h = h << 8 */
  282. a_wu1_loop:
  283. lw r12, r8, r10 /* v = *(as + offset) */
  284. bsrli r9, r12, 24 /* t1 = v >> 24 */
  285. or r9, r11, r9 /* t1 = h | t1 */
  286. sw r9, r5, r10 /* *(d + offset) = t1 */
  287. bslli r11, r12, 8 /* h = v << 8 */
  288. addi r4, r4,-4 /* n = n - 4 */
  289. bneid r4, a_wu1_loop /* while (n) loop */
  290. addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */
  291. bri a_word_done
  292. a_word_u2:
  293. bslli r11, r11, 16 /* h = h << 16 */
  294. a_wu2_loop:
  295. lw r12, r8, r10 /* v = *(as + offset) */
  296. bsrli r9, r12, 16 /* t1 = v >> 16 */
  297. or r9, r11, r9 /* t1 = h | t1 */
  298. sw r9, r5, r10 /* *(d + offset) = t1 */
  299. bslli r11, r12, 16 /* h = v << 16 */
  300. addi r4, r4,-4 /* n = n - 4 */
  301. bneid r4, a_wu2_loop /* while (n) loop */
  302. addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */
  303. a_word_done:
  304. add r5, r5, r10 /* d = d + offset */
  305. add r6, r6, r10 /* s = s + offset */
  306. rsub r7, r10, r7 /* c = c - offset */
  307. a_xfer_end:
  308. a_xfer_end_loop:
  309. beqi r7, a_done /* while (c) */
  310. lbui r9, r6, 0 /* t1 = *s */
  311. addi r6, r6, 1 /* s++ */
  312. sbi r9, r5, 0 /* *d = t1 */
  313. addi r7, r7, -1 /* c-- */
  314. brid a_xfer_end_loop /* loop */
  315. addi r5, r5, 1 /* d++ (IN DELAY SLOT) */
  316. a_done:
  317. rtsd r15, 8
  318. nop
  319. .size memcpy, . - memcpy
  320. .end memcpy
  321. /*----------------------------------------------------------------------------*/
  322. .globl memmove
  323. .type memmove, @function
  324. .ent memmove
  325. memmove:
  326. cmpu r4, r5, r6 /* n = s - d */
  327. bgei r4,fast_memcpy_ascending
  328. fast_memcpy_descending:
  329. /* move d to return register as value of function */
  330. addi r3, r5, 0
  331. add r5, r5, r7 /* d = d + c */
  332. add r6, r6, r7 /* s = s + c */
  333. addi r4, r0, 4 /* n = 4 */
  334. cmpu r4, r4, r7 /* n = c - n (unsigned) */
  335. blti r4,d_xfer_end /* if n < 0, less than one word to transfer */
  336. /* transfer first 0~3 bytes to get aligned dest address */
  337. andi r4, r5, 3 /* n = d & 3 */
  338. /* if zero, destination already aligned */
  339. beqi r4,d_dalign_done
  340. rsub r7, r4, r7 /* c = c - n adjust c */
  341. d_xfer_first_loop:
  342. /* if no bytes left to transfer, transfer the bulk */
  343. beqi r4,d_dalign_done
  344. addi r6, r6, -1 /* s-- */
  345. addi r5, r5, -1 /* d-- */
  346. lbui r11, r6, 0 /* h = *s */
  347. sbi r11, r5, 0 /* *d = h */
  348. brid d_xfer_first_loop /* loop */
  349. addi r4, r4, -1 /* n-- (IN DELAY SLOT) */
  350. d_dalign_done:
  351. addi r4, r0, 32 /* n = 32 */
  352. cmpu r4, r4, r7 /* n = c - n (unsigned) */
  353. /* if n < 0, less than one block to transfer */
  354. blti r4, d_block_done
  355. d_block_xfer:
  356. andi r4, r7, 0xffffffe0 /* n = c & ~31 */
  357. rsub r7, r4, r7 /* c = c - n */
  358. andi r9, r6, 3 /* t1 = s & 3 */
  359. /* if temp != 0, unaligned transfers needed */
  360. bnei r9, d_block_unaligned
  361. d_block_aligned:
  362. addi r6, r6, -32 /* s = s - 32 */
  363. addi r5, r5, -32 /* d = d - 32 */
  364. lwi r9, r6, 28 /* t1 = *(s + 28) */
  365. lwi r10, r6, 24 /* t2 = *(s + 24) */
  366. lwi r11, r6, 20 /* t3 = *(s + 20) */
  367. lwi r12, r6, 16 /* t4 = *(s + 16) */
  368. swi r9, r5, 28 /* *(d + 28) = t1 */
  369. swi r10, r5, 24 /* *(d + 24) = t2 */
  370. swi r11, r5, 20 /* *(d + 20) = t3 */
  371. swi r12, r5, 16 /* *(d + 16) = t4 */
  372. lwi r9, r6, 12 /* t1 = *(s + 12) */
  373. lwi r10, r6, 8 /* t2 = *(s + 8) */
  374. lwi r11, r6, 4 /* t3 = *(s + 4) */
  375. lwi r12, r6, 0 /* t4 = *(s + 0) */
  376. swi r9, r5, 12 /* *(d + 12) = t1 */
  377. swi r10, r5, 8 /* *(d + 8) = t2 */
  378. swi r11, r5, 4 /* *(d + 4) = t3 */
  379. addi r4, r4, -32 /* n = n - 32 */
  380. bneid r4, d_block_aligned /* while (n) loop */
  381. swi r12, r5, 0 /* *(d + 0) = t4 (IN DELAY SLOT) */
  382. bri d_block_done
  383. d_block_unaligned:
  384. andi r8, r6, 0xfffffffc /* as = s & ~3 */
  385. rsub r6, r4, r6 /* s = s - n */
  386. lwi r11, r8, 0 /* h = *(as + 0) */
  387. addi r9, r9, -1
  388. beqi r9,d_block_u1 /* t1 was 1 => 1 byte offset */
  389. addi r9, r9, -1
  390. beqi r9,d_block_u2 /* t1 was 2 => 2 byte offset */
  391. d_block_u3:
  392. bsrli r11, r11, 8 /* h = h >> 8 */
  393. d_bu3_loop:
  394. addi r8, r8, -32 /* as = as - 32 */
  395. addi r5, r5, -32 /* d = d - 32 */
  396. lwi r12, r8, 28 /* v = *(as + 28) */
  397. bslli r9, r12, 24 /* t1 = v << 24 */
  398. or r9, r11, r9 /* t1 = h | t1 */
  399. swi r9, r5, 28 /* *(d + 28) = t1 */
  400. bsrli r11, r12, 8 /* h = v >> 8 */
  401. lwi r12, r8, 24 /* v = *(as + 24) */
  402. bslli r9, r12, 24 /* t1 = v << 24 */
  403. or r9, r11, r9 /* t1 = h | t1 */
  404. swi r9, r5, 24 /* *(d + 24) = t1 */
  405. bsrli r11, r12, 8 /* h = v >> 8 */
  406. lwi r12, r8, 20 /* v = *(as + 20) */
  407. bslli r9, r12, 24 /* t1 = v << 24 */
  408. or r9, r11, r9 /* t1 = h | t1 */
  409. swi r9, r5, 20 /* *(d + 20) = t1 */
  410. bsrli r11, r12, 8 /* h = v >> 8 */
  411. lwi r12, r8, 16 /* v = *(as + 16) */
  412. bslli r9, r12, 24 /* t1 = v << 24 */
  413. or r9, r11, r9 /* t1 = h | t1 */
  414. swi r9, r5, 16 /* *(d + 16) = t1 */
  415. bsrli r11, r12, 8 /* h = v >> 8 */
  416. lwi r12, r8, 12 /* v = *(as + 12) */
  417. bslli r9, r12, 24 /* t1 = v << 24 */
  418. or r9, r11, r9 /* t1 = h | t1 */
  419. swi r9, r5, 12 /* *(d + 112) = t1 */
  420. bsrli r11, r12, 8 /* h = v >> 8 */
  421. lwi r12, r8, 8 /* v = *(as + 8) */
  422. bslli r9, r12, 24 /* t1 = v << 24 */
  423. or r9, r11, r9 /* t1 = h | t1 */
  424. swi r9, r5, 8 /* *(d + 8) = t1 */
  425. bsrli r11, r12, 8 /* h = v >> 8 */
  426. lwi r12, r8, 4 /* v = *(as + 4) */
  427. bslli r9, r12, 24 /* t1 = v << 24 */
  428. or r9, r11, r9 /* t1 = h | t1 */
  429. swi r9, r5, 4 /* *(d + 4) = t1 */
  430. bsrli r11, r12, 8 /* h = v >> 8 */
  431. lwi r12, r8, 0 /* v = *(as + 0) */
  432. bslli r9, r12, 24 /* t1 = v << 24 */
  433. or r9, r11, r9 /* t1 = h | t1 */
  434. swi r9, r5, 0 /* *(d + 0) = t1 */
  435. addi r4, r4, -32 /* n = n - 32 */
  436. bneid r4, d_bu3_loop /* while (n) loop */
  437. bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */
  438. bri d_block_done
  439. d_block_u1:
  440. bsrli r11, r11, 24 /* h = h >> 24 */
  441. d_bu1_loop:
  442. addi r8, r8, -32 /* as = as - 32 */
  443. addi r5, r5, -32 /* d = d - 32 */
  444. lwi r12, r8, 28 /* v = *(as + 28) */
  445. bslli r9, r12, 8 /* t1 = v << 8 */
  446. or r9, r11, r9 /* t1 = h | t1 */
  447. swi r9, r5, 28 /* *(d + 28) = t1 */
  448. bsrli r11, r12, 24 /* h = v >> 24 */
  449. lwi r12, r8, 24 /* v = *(as + 24) */
  450. bslli r9, r12, 8 /* t1 = v << 8 */
  451. or r9, r11, r9 /* t1 = h | t1 */
  452. swi r9, r5, 24 /* *(d + 24) = t1 */
  453. bsrli r11, r12, 24 /* h = v >> 24 */
  454. lwi r12, r8, 20 /* v = *(as + 20) */
  455. bslli r9, r12, 8 /* t1 = v << 8 */
  456. or r9, r11, r9 /* t1 = h | t1 */
  457. swi r9, r5, 20 /* *(d + 20) = t1 */
  458. bsrli r11, r12, 24 /* h = v >> 24 */
  459. lwi r12, r8, 16 /* v = *(as + 16) */
  460. bslli r9, r12, 8 /* t1 = v << 8 */
  461. or r9, r11, r9 /* t1 = h | t1 */
  462. swi r9, r5, 16 /* *(d + 16) = t1 */
  463. bsrli r11, r12, 24 /* h = v >> 24 */
  464. lwi r12, r8, 12 /* v = *(as + 12) */
  465. bslli r9, r12, 8 /* t1 = v << 8 */
  466. or r9, r11, r9 /* t1 = h | t1 */
  467. swi r9, r5, 12 /* *(d + 112) = t1 */
  468. bsrli r11, r12, 24 /* h = v >> 24 */
  469. lwi r12, r8, 8 /* v = *(as + 8) */
  470. bslli r9, r12, 8 /* t1 = v << 8 */
  471. or r9, r11, r9 /* t1 = h | t1 */
  472. swi r9, r5, 8 /* *(d + 8) = t1 */
  473. bsrli r11, r12, 24 /* h = v >> 24 */
  474. lwi r12, r8, 4 /* v = *(as + 4) */
  475. bslli r9, r12, 8 /* t1 = v << 8 */
  476. or r9, r11, r9 /* t1 = h | t1 */
  477. swi r9, r5, 4 /* *(d + 4) = t1 */
  478. bsrli r11, r12, 24 /* h = v >> 24 */
  479. lwi r12, r8, 0 /* v = *(as + 0) */
  480. bslli r9, r12, 8 /* t1 = v << 8 */
  481. or r9, r11, r9 /* t1 = h | t1 */
  482. swi r9, r5, 0 /* *(d + 0) = t1 */
  483. addi r4, r4, -32 /* n = n - 32 */
  484. bneid r4, d_bu1_loop /* while (n) loop */
  485. bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */
  486. bri d_block_done
  487. d_block_u2:
  488. bsrli r11, r11, 16 /* h = h >> 16 */
  489. d_bu2_loop:
  490. addi r8, r8, -32 /* as = as - 32 */
  491. addi r5, r5, -32 /* d = d - 32 */
  492. lwi r12, r8, 28 /* v = *(as + 28) */
  493. bslli r9, r12, 16 /* t1 = v << 16 */
  494. or r9, r11, r9 /* t1 = h | t1 */
  495. swi r9, r5, 28 /* *(d + 28) = t1 */
  496. bsrli r11, r12, 16 /* h = v >> 16 */
  497. lwi r12, r8, 24 /* v = *(as + 24) */
  498. bslli r9, r12, 16 /* t1 = v << 16 */
  499. or r9, r11, r9 /* t1 = h | t1 */
  500. swi r9, r5, 24 /* *(d + 24) = t1 */
  501. bsrli r11, r12, 16 /* h = v >> 16 */
  502. lwi r12, r8, 20 /* v = *(as + 20) */
  503. bslli r9, r12, 16 /* t1 = v << 16 */
  504. or r9, r11, r9 /* t1 = h | t1 */
  505. swi r9, r5, 20 /* *(d + 20) = t1 */
  506. bsrli r11, r12, 16 /* h = v >> 16 */
  507. lwi r12, r8, 16 /* v = *(as + 16) */
  508. bslli r9, r12, 16 /* t1 = v << 16 */
  509. or r9, r11, r9 /* t1 = h | t1 */
  510. swi r9, r5, 16 /* *(d + 16) = t1 */
  511. bsrli r11, r12, 16 /* h = v >> 16 */
  512. lwi r12, r8, 12 /* v = *(as + 12) */
  513. bslli r9, r12, 16 /* t1 = v << 16 */
  514. or r9, r11, r9 /* t1 = h | t1 */
  515. swi r9, r5, 12 /* *(d + 112) = t1 */
  516. bsrli r11, r12, 16 /* h = v >> 16 */
  517. lwi r12, r8, 8 /* v = *(as + 8) */
  518. bslli r9, r12, 16 /* t1 = v << 16 */
  519. or r9, r11, r9 /* t1 = h | t1 */
  520. swi r9, r5, 8 /* *(d + 8) = t1 */
  521. bsrli r11, r12, 16 /* h = v >> 16 */
  522. lwi r12, r8, 4 /* v = *(as + 4) */
  523. bslli r9, r12, 16 /* t1 = v << 16 */
  524. or r9, r11, r9 /* t1 = h | t1 */
  525. swi r9, r5, 4 /* *(d + 4) = t1 */
  526. bsrli r11, r12, 16 /* h = v >> 16 */
  527. lwi r12, r8, 0 /* v = *(as + 0) */
  528. bslli r9, r12, 16 /* t1 = v << 16 */
  529. or r9, r11, r9 /* t1 = h | t1 */
  530. swi r9, r5, 0 /* *(d + 0) = t1 */
  531. addi r4, r4, -32 /* n = n - 32 */
  532. bneid r4, d_bu2_loop /* while (n) loop */
  533. bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
  534. d_block_done:
  535. addi r4, r0, 4 /* n = 4 */
  536. cmpu r4, r4, r7 /* n = c - n (unsigned) */
  537. blti r4,d_xfer_end /* if n < 0, less than one word to transfer */
  538. d_word_xfer:
  539. andi r4, r7, 0xfffffffc /* n = c & ~3 */
  540. rsub r5, r4, r5 /* d = d - n */
  541. rsub r6, r4, r6 /* s = s - n */
  542. rsub r7, r4, r7 /* c = c - n */
  543. andi r9, r6, 3 /* t1 = s & 3 */
  544. /* if temp != 0, unaligned transfers needed */
  545. bnei r9, d_word_unaligned
  546. d_word_aligned:
  547. addi r4, r4,-4 /* n-- */
  548. lw r9, r6, r4 /* t1 = *(s+n) */
  549. bneid r4, d_word_aligned /* loop */
  550. sw r9, r5, r4 /* *(d+n) = t1 (IN DELAY SLOT) */
  551. bri d_word_done
  552. d_word_unaligned:
  553. andi r8, r6, 0xfffffffc /* as = s & ~3 */
  554. lw r11, r8, r4 /* h = *(as + n) */
  555. addi r9, r9, -1
  556. beqi r9,d_word_u1 /* t1 was 1 => 1 byte offset */
  557. addi r9, r9, -1
  558. beqi r9,d_word_u2 /* t1 was 2 => 2 byte offset */
  559. d_word_u3:
  560. bsrli r11, r11, 8 /* h = h >> 8 */
  561. d_wu3_loop:
  562. addi r4, r4,-4 /* n = n - 4 */
  563. lw r12, r8, r4 /* v = *(as + n) */
  564. bslli r9, r12, 24 /* t1 = v << 24 */
  565. or r9, r11, r9 /* t1 = h | t1 */
  566. sw r9, r5, r4 /* *(d + n) = t1 */
  567. bneid r4, d_wu3_loop /* while (n) loop */
  568. bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */
  569. bri d_word_done
  570. d_word_u1:
  571. bsrli r11, r11, 24 /* h = h >> 24 */
  572. d_wu1_loop:
  573. addi r4, r4,-4 /* n = n - 4 */
  574. lw r12, r8, r4 /* v = *(as + n) */
  575. bslli r9, r12, 8 /* t1 = v << 8 */
  576. or r9, r11, r9 /* t1 = h | t1 */
  577. sw r9, r5, r4 /* *(d + n) = t1 */
  578. bneid r4, d_wu1_loop /* while (n) loop */
  579. bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */
  580. bri d_word_done
  581. d_word_u2:
  582. bsrli r11, r11, 16 /* h = h >> 16 */
  583. d_wu2_loop:
  584. addi r4, r4,-4 /* n = n - 4 */
  585. lw r12, r8, r4 /* v = *(as + n) */
  586. bslli r9, r12, 16 /* t1 = v << 16 */
  587. or r9, r11, r9 /* t1 = h | t1 */
  588. sw r9, r5, r4 /* *(d + n) = t1 */
  589. bneid r4, d_wu2_loop /* while (n) loop */
  590. bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
  591. d_word_done:
  592. d_xfer_end:
  593. d_xfer_end_loop:
  594. beqi r7, a_done /* while (c) */
  595. addi r6, r6, -1 /* s-- */
  596. lbui r9, r6, 0 /* t1 = *s */
  597. addi r5, r5, -1 /* d-- */
  598. sbi r9, r5, 0 /* *d = t1 */
  599. brid d_xfer_end_loop /* loop */
  600. addi r7, r7, -1 /* c-- (IN DELAY SLOT) */
  601. d_done:
  602. rtsd r15, 8
  603. nop
  604. .size memmove, . - memmove
  605. .end memmove