/*
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2012 Tensilica Inc.
 */

#include <linux/linkage.h>
#include <asm/asmmacro.h>
#include <asm/core.h>

/*
 * void *memcpy(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and multiple
 *     of 4 (or 8) length.
 *
 * Register use:
 *	a0/  return address
 *	a1/  stack pointer
 *	a2/  return value
 *	a3/  src
 *	a4/  length
 *	a5/  dst
 *	a6/  tmp
 *	a7/  tmp
 *	a8/  tmp
 *	a9/  tmp
 *	a10/ tmp
 *	a11/ tmp
 */
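
/*
 * Roughly, the algorithm above corresponds to the C sketch below.  This is
 * illustrative only: it ignores the IRAM/IROM caveat, the 16-byte unrolling,
 * and the SRC-based merge used for an unaligned source, and the function
 * name is hypothetical.
 *
 *	void *memcpy_sketch(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		// align dst to a 4-byte boundary with 1- and 2-byte copies
 *		while (((unsigned long)d & 3) && len) {
 *			*d++ = *s++;
 *			len--;
 *		}
 *		if (((unsigned long)s & 3) == 0) {
 *			// aligned source: copy a 32-bit word at a time
 *			while (len >= 4) {
 *				*(unsigned int *)d = *(const unsigned int *)s;
 *				d += 4;
 *				s += 4;
 *				len -= 4;
 *			}
 *		}
 *		// tail bytes (this also stands in for the unaligned-source
 *		// case, which the assembly handles with SRC instead)
 *		while (len--)
 *			*d++ = *s++;
 *		return dst;
 *	}
 */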
	.text

/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbytecopydone
	add	a7, a3, a4	# a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	s8i	a6, a5, 0
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lnextbyte	# continue loop if $a3:src != $a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
	abi_ret_default

/*
 * Destination is unaligned
 */
	.align	4
.Ldst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte

	# copy 1 byte
	l8ui	a6, a3,  0
	addi	a3, a3,  1
	addi	a4, a4, -1
	s8i	a6, a5,  0
	addi	a5, a5,  1
	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
					# return to main algorithm
.Ldst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	addi	a4, a4, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	j	.Ldstaligned	# dst is now aligned, return to main algorithm

ENTRY(__memcpy)
WEAK(memcpy)

	abi_entry_default
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lcommon:
	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
/*
 * Destination and source are word-aligned, use word copy.
 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
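	#
	# Each iteration is, in effect, the C below (a rough sketch; the real
	# loop interleaves loads and stores across the two temporaries a6/a7
	# to hide load-use latency):
	#
	#	((unsigned int *)dst)[0] = ((const unsigned int *)src)[0];
	#	((unsigned int *)dst)[1] = ((const unsigned int *)src)[1];
	#	((unsigned int *)dst)[2] = ((const unsigned int *)src)[2];
	#	((unsigned int *)dst)[3] = ((const unsigned int *)src)[3];
	#	src += 16;
	#	dst += 16;
	#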
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop1done
	slli	a8, a7, 4
	add	a8, a8, a3	# a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	s32i	a6, a5,  0
	l32i	a6, a3,  8
	s32i	a7, a5,  4
	l32i	a7, a3, 12
	s32i	a6, a5,  8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .Loop1	# continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
	bbci.l	a4, 3, .L2
	# copy 8 bytes
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a3, a3,  8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
	addi	a5, a5,  8
.L2:
	bbsi.l	a4, 2, .L3
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L3:
	# copy 4 bytes
	l32i	a6, a3,  0
	addi	a3, a3,  4
	s32i	a6, a5,  0
	addi	a5, a5,  4
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L4:
	# copy 2 bytes
	l16ui	a6, a3,  0
	addi	a3, a3,  2
	s16i	a6, a5,  0
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L5:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	abi_ret_default

/*
 * Destination is aligned, Source is unaligned
 */
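
/*
 * For an unaligned source the loop below does a funnel-shift copy:
 * __ssa8 loads the shift-amount register SAR from the low two bits of the
 * source address, and each __src_b then merges two adjacent aligned source
 * words into one aligned destination word.  Roughly, in C for a
 * little-endian core with shift = 8 * (src & 3) and shift != 0 (a sketch
 * only; the __ssa8/__src_b macros hide the endianness details):
 *
 *	dst_word = (lo_word >> shift) | (hi_word << (32 - shift));
 */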
	.align	4
.Lsrcunaligned:
	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset

/* set to 1 when running on ISS (simulator) with the
   lint or ferret client, or 0 to save a few cycles */
#define SIM_CHECKS_ALIGNMENT	1
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop2done
	slli	a10, a7, 4
	add	a10, a10, a3	# a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	l32i	a9, a3, 12
	__src_b	a7, a7, a8
	s32i	a7, a5,  4
	l32i	a6, a3, 16
	__src_b	a8, a8, a9
	s32i	a8, a5,  8
	addi	a3, a3, 16
	__src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .Loop2	# continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
	bbci.l	a4, 3, .L12
	# copy 8 bytes
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a3, a3,  8
	__src_b	a7, a7, a8
	s32i	a7, a5,  4
	addi	a5, a5,  8
	mov	a6, a8
.L12:
	bbci.l	a4, 2, .L13
	# copy 4 bytes
	l32i	a7, a3,  4
	addi	a3, a3,  4
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a5, a5,  4
	mov	a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .L14
	bbsi.l	a4, 0, .L15
.Ldone:	abi_ret_default
.L14:
	# copy 2 bytes
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L15
	abi_ret_default
.L15:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	abi_ret_default

ENDPROC(__memcpy)

/*
 * void bcopy(const void *src, void *dest, size_t n);
 */
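
/*
 * bcopy() takes its arguments in the opposite order from memmove();
 * semantically (and this is exactly what the wrapper below does by swapping
 * a2/a3 before falling into .Lmovecommon):
 *
 *	void bcopy(const void *src, void *dest, size_t n)
 *	{
 *		memmove(dest, src, n);
 *	}
 */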
ENTRY(bcopy)

	abi_entry_default
	# a2=src, a3=dst, a4=len
	mov	a5, a3
	mov	a3, a2
	mov	a2, a5
	j	.Lmovecommon	# go to common code for memmove+bcopy

ENDPROC(bcopy)

/*
 * void *memmove(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memmove() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If end of source doesn't overlap destination then use memcpy.
 *   Otherwise do memcpy backwards.
 *
 * Register use:
 *	a0/  return address
 *	a1/  stack pointer
 *	a2/  return value
 *	a3/  src
 *	a4/  length
 *	a5/  dst
 *	a6/  tmp
 *	a7/  tmp
 *	a8/  tmp
 *	a9/  tmp
 *	a10/ tmp
 *	a11/ tmp
 */
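
/*
 * The overlap test at .Lmovecommon relies on unsigned wrap-around: if
 * (dst - src) >= len as an unsigned comparison, then either the buffers do
 * not overlap or dst lies below src, and the forward memcpy path is safe;
 * otherwise the copy proceeds backwards from the end.  A rough C sketch of
 * that decision (illustrative only, byte-at-a-time):
 *
 *	void *memmove_sketch(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		if ((unsigned long)d - (unsigned long)s >= len)
 *			return memcpy(dst, src, len);	// forward copy is safe
 *		while (len--)		// otherwise copy backwards
 *			d[len] = s[len];	// (the assembly does it by words)
 *		return dst;
 *	}
 */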
/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbackbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbackbytecopydone
	sub	a7, a3, a4	# a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lbacknextbyte	# continue loop if
					# $a3:src != $a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
	abi_ret_default

/*
 * Destination is unaligned
 */
	.align	4
.Lbackdst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte

	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	addi	a4, a4, -1
	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
					# return to main algorithm
.Lbackdst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a4, a4, -2
	j	.Lbackdstaligned	# dst is now aligned,
					# return to main algorithm

ENTRY(__memmove)
WEAK(memmove)

	abi_entry_default
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lmovecommon:
	sub	a6, a5, a3
	bgeu	a6, a4, .Lcommon

	add	a5, a5, a4
	add	a3, a3, a4

	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
.Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
/*
 * Destination and source are word-aligned, use word copy.
 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .LbackLoop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .LbackLoop1done
	slli	a8, a7, 4
	sub	a8, a3, a8	# a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop1:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a6, a3,  8
	addi	a5, a5, -16
	s32i	a7, a5, 12
	l32i	a7, a3,  4
	s32i	a6, a5,  8
	l32i	a6, a3,  0
	s32i	a7, a5,  4
	s32i	a6, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .LbackLoop1	# continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop1done:
	bbci.l	a4, 3, .Lback2
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a5, a5, -8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
.Lback2:
	bbsi.l	a4, 2, .Lback3
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback3:
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a6, a3,  0
	addi	a5, a5, -4
	s32i	a6, a5,  0
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback4:
	# copy 2 bytes
	addi	a3, a3, -2
	l16ui	a6, a3,  0
	addi	a5, a5, -2
	s16i	a6, a5,  0
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback5:
	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	abi_ret_default

/*
 * Destination is aligned, Source is unaligned
 */
	.align	4
.Lbacksrcunaligned:
	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS with
					 * the lint or ferret client, or 0
					 * to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .LbackLoop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .LbackLoop2done
	slli	a10, a7, 4
	sub	a10, a3, a10	# a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop2:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a8, a3,  8
	addi	a5, a5, -16
	__src_b	a6, a7, a6
	s32i	a6, a5, 12
	l32i	a9, a3,  4
	__src_b	a7, a8, a7
	s32i	a7, a5,  8
	l32i	a6, a3,  0
	__src_b	a8, a9, a8
	s32i	a8, a5,  4
	__src_b	a9, a6, a9
	s32i	a9, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .LbackLoop2	# continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop2done:
	bbci.l	a4, 3, .Lback12
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a7, a3,  4
	l32i	a8, a3,  0
	addi	a5, a5, -8
	__src_b	a6, a7, a6
	s32i	a6, a5,  4
	__src_b	a7, a8, a7
	s32i	a7, a5,  0
	mov	a6, a8
.Lback12:
	bbci.l	a4, 2, .Lback13
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a7, a3,  0
	addi	a5, a5, -4
	__src_b	a6, a7, a6
	s32i	a6, a5,  0
	mov	a6, a7
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .Lback14
	bbsi.l	a4, 0, .Lback15
.Lbackdone:
	abi_ret_default
.Lback14:
	# copy 2 bytes
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	bbsi.l	a4, 0, .Lback15
	abi_ret_default
.Lback15:
	# copy 1 byte
	addi	a3, a3, -1
	addi	a5, a5, -1
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	abi_ret_default

ENDPROC(__memmove)