
/*
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2012 Tensilica Inc.
 */

#include <variant/core.h>

	.macro	src_b	r, w0, w1
#ifdef __XTENSA_EB__
	src	\r, \w0, \w1
#else
	src	\r, \w1, \w0
#endif
	.endm

	.macro	ssa8	r
#ifdef __XTENSA_EB__
	ssa8b	\r
#else
	ssa8l	\r
#endif
	.endm
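
/*
 * The two macros above hide the endianness difference in the funnel
 * shift: ssa8 loads SAR from the low two bits of the address (8*(addr&3)
 * on little-endian), and src_b then extracts an unaligned 32-bit word
 * from two adjacent aligned words.  A worked little-endian example
 * (illustrative, not from the code below): with a source offset of 1,
 * SAR = 8; if w0 holds bytes b0..b3 and w1 holds b4..b7, then
 * src_b r, w0, w1 yields b1 b2 b3 b4, i.e. the unaligned word that
 * starts one byte into w0.
 */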
/*
 * void *memcpy(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and multiple
 *     of 4 (or 8) length.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */
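
/*
 * As a reading aid, here is a rough C model of the general-case flow
 * described above.  It is a sketch only: it ignores the IRAM/IROM
 * cases, collapses the 1/2-byte alignment steps and the 8/4/2/1-byte
 * tail into simple loops, and cannot express the SRC-based shifting
 * copy, which keeps all source reads 32-bit and word-aligned.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	void *memcpy_model(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		// align dst to a 4-byte boundary
 *		while (((uintptr_t)d & 3) && len) {
 *			*d++ = *s++;
 *			len--;
 *		}
 *		// main loop: 16 bytes per iteration
 *		// (four l32i/s32i pairs in the assembly)
 *		while (len >= 16) {
 *			for (int i = 0; i < 16; i++)
 *				d[i] = s[i];
 *			d += 16;
 *			s += 16;
 *			len -= 16;
 *		}
 *		// tail: 8, 4, 2, 1 bytes keyed off the low bits of len
 *		while (len--)
 *			*d++ = *s++;
 *		return dst;
 *	}
 */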
	.text

/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbytecopydone
	add	a7, a3, a4	# a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	s8i	a6, a5, 0
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
	retw

/*
 * Destination is unaligned
 */
	.align	4
.Ldst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte

	# copy 1 byte
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	addi	a4, a4, -1
	s8i	a6, a5, 0
	addi	a5, a5, 1
	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
					# return to main algorithm
.Ldst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a3, a3, 2
	addi	a4, a4, -2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
	addi	a5, a5, 2
	j	.Ldstaligned	# dst is now aligned, return to main algorithm


	.align	4
	.global	memcpy
	.type	memcpy,@function
memcpy:

	entry	sp, 16		# minimal stack frame
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lcommon:
	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop1done
	slli	a8, a7, 4
	add	a8, a8, a3	# a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
	l32i	a6, a3, 0
	l32i	a7, a3, 4
	s32i	a6, a5, 0
	l32i	a6, a3, 8
	s32i	a7, a5, 4
	l32i	a7, a3, 12
	s32i	a6, a5, 8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .Loop1	# continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
	bbci.l	a4, 3, .L2
	# copy 8 bytes
	l32i	a6, a3, 0
	l32i	a7, a3, 4
	addi	a3, a3, 8
	s32i	a6, a5, 0
	s32i	a7, a5, 4
	addi	a5, a5, 8
.L2:
	bbsi.l	a4, 2, .L3
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	retw
.L3:
	# copy 4 bytes
	l32i	a6, a3, 0
	addi	a3, a3, 4
	s32i	a6, a5, 0
	addi	a5, a5, 4
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	retw
.L4:
	# copy 2 bytes
	l16ui	a6, a3, 0
	addi	a3, a3, 2
	s16i	a6, a5, 0
	addi	a5, a5, 2
	bbsi.l	a4, 0, .L5
	retw
.L5:
	# copy 1 byte
	l8ui	a6, a3, 0
	s8i	a6, a5, 0
	retw
/*
 * Destination is aligned, Source is unaligned
 */
	.align	4
.Lsrcunaligned:
	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	ssa8	a3		# set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS (simulator)
					 * with the lint or ferret client, or 0
					 * to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop2done
	slli	a10, a7, 4
	add	a10, a10, a3	# a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
	l32i	a7, a3, 4
	l32i	a8, a3, 8
	src_b	a6, a6, a7
	s32i	a6, a5, 0
	l32i	a9, a3, 12
	src_b	a7, a7, a8
	s32i	a7, a5, 4
	l32i	a6, a3, 16
	src_b	a8, a8, a9
	s32i	a8, a5, 8
	addi	a3, a3, 16
	src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .Loop2	# continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
	bbci.l	a4, 3, .L12
	# copy 8 bytes
	l32i	a7, a3, 4
	l32i	a8, a3, 8
	src_b	a6, a6, a7
	s32i	a6, a5, 0
	addi	a3, a3, 8
	src_b	a7, a7, a8
	s32i	a7, a5, 4
	addi	a5, a5, 8
	mov	a6, a8
.L12:
	bbci.l	a4, 2, .L13
	# copy 4 bytes
	l32i	a7, a3, 4
	addi	a3, a3, 4
	src_b	a6, a6, a7
	s32i	a6, a5, 0
	addi	a5, a5, 4
	mov	a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .L14
	bbsi.l	a4, 0, .L15
.Ldone:	retw
.L14:
	# copy 2 bytes
	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a3, a3, 2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
	addi	a5, a5, 2
	bbsi.l	a4, 0, .L15
	retw
.L15:
	# copy 1 byte
	l8ui	a6, a3, 0
	s8i	a6, a5, 0
	retw
/*
 * void bcopy(const void *src, void *dest, size_t n);
 */
	.align	4
	.global	bcopy
	.type	bcopy,@function
bcopy:
	entry	sp, 16		# minimal stack frame
	# a2=src, a3=dst, a4=len
	mov	a5, a3
	mov	a3, a2
	mov	a2, a5
	j	.Lmovecommon	# go to common code for memmove+bcopy
/*
 * void *memmove(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memmove() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If end of source doesn't overlap destination then use memcpy.
 *   Otherwise do memcpy backwards.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */
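
/*
 * A rough C model of the overlap test used at .Lmovecommon below: one
 * unsigned compare of (dst - src) against len covers both the "dst
 * below src" and the "regions don't overlap at all" cases, because
 * the subtraction wraps around when dst < src.  This mirrors the
 * assembly, which shares the forward memcpy path whenever a forward
 * copy is safe:
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	void *memmove_model(void *dst, const void *src, size_t len)
 *	{
 *		if ((uintptr_t)dst - (uintptr_t)src >= len)
 *			return memcpy(dst, src, len);	// forward copy is safe
 *		// otherwise copy backwards from the end
 *		unsigned char *d = (unsigned char *)dst + len;
 *		const unsigned char *s = (const unsigned char *)src + len;
 *		while (len--)
 *			*--d = *--s;
 *		return dst;
 *	}
 */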
/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbackbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbackbytecopydone
	sub	a7, a3, a4	# a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lbacknextbyte # continue loop if
				       # $a3:src != $a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
	retw

/*
 * Destination is unaligned
 */
	.align	4
.Lbackdst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte

	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
	addi	a4, a4, -1
	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
					# return to main algorithm
.Lbackdst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
	addi	a3, a3, -2
	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a5, a5, -2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
	addi	a4, a4, -2
	j	.Lbackdstaligned	# dst is now aligned,
					# return to main algorithm


	.align	4
	.global	memmove
	.type	memmove,@function
memmove:

	entry	sp, 16		# minimal stack frame
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lmovecommon:
	sub	a6, a5, a3
	bgeu	a6, a4, .Lcommon	# forward copy is safe if
					# (dst - src) >= len (unsigned)

	add	a5, a5, a4
	add	a3, a3, a4

	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
.Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .backLoop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .backLoop1done
	slli	a8, a7, 4
	sub	a8, a3, a8	# a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a6, a3, 8
	addi	a5, a5, -16
	s32i	a7, a5, 12
	l32i	a7, a3, 4
	s32i	a6, a5, 8
	l32i	a6, a3, 0
	s32i	a7, a5, 4
	s32i	a6, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .backLoop1 # continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1done:
	bbci.l	a4, 3, .Lback2
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a6, a3, 0
	l32i	a7, a3, 4
	addi	a5, a5, -8
	s32i	a6, a5, 0
	s32i	a7, a5, 4
.Lback2:
	bbsi.l	a4, 2, .Lback3
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	retw
.Lback3:
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a6, a3, 0
	addi	a5, a5, -4
	s32i	a6, a5, 0
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	retw
.Lback4:
	# copy 2 bytes
	addi	a3, a3, -2
	l16ui	a6, a3, 0
	addi	a5, a5, -2
	s16i	a6, a5, 0
	bbsi.l	a4, 0, .Lback5
	retw
.Lback5:
	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
	retw
/*
 * Destination is aligned, Source is unaligned
 */
	.align	4
.Lbacksrcunaligned:
	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	ssa8	a3		# set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS with
					 * the lint or ferret client, or 0
					 * to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .backLoop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .backLoop2done
	slli	a10, a7, 4
	sub	a10, a3, a10	# a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a8, a3, 8
	addi	a5, a5, -16
	src_b	a6, a7, a6
	s32i	a6, a5, 12
	l32i	a9, a3, 4
	src_b	a7, a8, a7
	s32i	a7, a5, 8
	l32i	a6, a3, 0
	src_b	a8, a9, a8
	s32i	a8, a5, 4
	src_b	a9, a6, a9
	s32i	a9, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2done:
	bbci.l	a4, 3, .Lback12
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a7, a3, 4
	l32i	a8, a3, 0
	addi	a5, a5, -8
	src_b	a6, a7, a6
	s32i	a6, a5, 4
	src_b	a7, a8, a7
	s32i	a7, a5, 0
	mov	a6, a8
.Lback12:
	bbci.l	a4, 2, .Lback13
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a7, a3, 0
	addi	a5, a5, -4
	src_b	a6, a7, a6
	s32i	a6, a5, 0
	mov	a6, a7
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .Lback14
	bbsi.l	a4, 0, .Lback15
.Lbackdone:
	retw
.Lback14:
	# copy 2 bytes
	addi	a3, a3, -2
	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a5, a5, -2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
	bbsi.l	a4, 0, .Lback15
	retw
.Lback15:
	# copy 1 byte
	addi	a3, a3, -1
	addi	a5, a5, -1
	l8ui	a6, a3, 0
	s8i	a6, a5, 0
	retw

/*
 * Local Variables:
 * mode:fundamental
 * comment-start: "# "
 * comment-start-skip: "# *"
 * End:
 */