/* memcpy.S: Sparc optimized memcpy and memmove code
 * Hand optimized from GNU libc's memcpy and memmove
 * Copyright (C) 1991,1996 Free Software Foundation
 * Copyright (C) 1995 Linus Torvalds (Linus.Torvalds@helsinki.fi)
 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
 * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be)
 * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
 */

#ifdef __KERNEL__

#define FUNC(x) \
	.globl x; \
	.type x,@function; \
	.align 4; \
x:

#undef FASTER_REVERSE
#undef FASTER_NONALIGNED
#define FASTER_ALIGNED

/* In kernel these functions don't return a value.
 * One should use macros in asm/string.h for that purpose.
 * We return 0, so that bugs are more apparent.
 */
#define SETUP_RETL
#define RETL_INSN	clr %o0

#else

/* libc */
#include "DEFS.h"

#define FASTER_REVERSE
#define FASTER_NONALIGNED
#define FASTER_ALIGNED

#define SETUP_RETL	mov %o0, %g6
#define RETL_INSN	mov %g6, %o0

#endif
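
/* SETUP_RETL / RETL_INSN implement the return-value convention selected
 * above: in the libc build the original dst pointer is saved in %g6 on
 * entry and moved back into %o0 in the retl delay slot; in the kernel
 * build the routines simply return 0 (see the comment above).
 */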

/* Both these macros have to start with exactly the same insn: the copy
 * loop branches to "82f + 4" with the first ldd of MOVE_BIGCHUNK already
 * issued in the branch delay slot, so MOVE_BIGALIGNCHUNK must begin with
 * that identical ldd for the skipped instruction to be harmless.
 */
#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd [%src + (offset) + 0x00], %t0; \
	ldd [%src + (offset) + 0x08], %t2; \
	ldd [%src + (offset) + 0x10], %t4; \
	ldd [%src + (offset) + 0x18], %t6; \
	st %t0, [%dst + (offset) + 0x00]; \
	st %t1, [%dst + (offset) + 0x04]; \
	st %t2, [%dst + (offset) + 0x08]; \
	st %t3, [%dst + (offset) + 0x0c]; \
	st %t4, [%dst + (offset) + 0x10]; \
	st %t5, [%dst + (offset) + 0x14]; \
	st %t6, [%dst + (offset) + 0x18]; \
	st %t7, [%dst + (offset) + 0x1c];

#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd [%src + (offset) + 0x00], %t0; \
	ldd [%src + (offset) + 0x08], %t2; \
	ldd [%src + (offset) + 0x10], %t4; \
	ldd [%src + (offset) + 0x18], %t6; \
	std %t0, [%dst + (offset) + 0x00]; \
	std %t2, [%dst + (offset) + 0x08]; \
	std %t4, [%dst + (offset) + 0x10]; \
	std %t6, [%dst + (offset) + 0x18];

#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd [%src - (offset) - 0x10], %t0; \
	ldd [%src - (offset) - 0x08], %t2; \
	st %t0, [%dst - (offset) - 0x10]; \
	st %t1, [%dst - (offset) - 0x0c]; \
	st %t2, [%dst - (offset) - 0x08]; \
	st %t3, [%dst - (offset) - 0x04];

#define MOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd [%src - (offset) - 0x10], %t0; \
	ldd [%src - (offset) - 0x08], %t2; \
	std %t0, [%dst - (offset) - 0x10]; \
	std %t2, [%dst - (offset) - 0x08];

#define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
	ldub [%src - (offset) - 0x02], %t0; \
	ldub [%src - (offset) - 0x01], %t1; \
	stb %t0, [%dst - (offset) - 0x02]; \
	stb %t1, [%dst - (offset) - 0x01];
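
/* Summary of the forward-copy helpers above: MOVE_BIGCHUNK and
 * MOVE_BIGALIGNCHUNK move 32 bytes per expansion (word stores vs.
 * doubleword stores), MOVE_LASTCHUNK / MOVE_LASTALIGNCHUNK move 16 bytes
 * and MOVE_SHORTCHUNK moves 2 bytes.  The RMOVE_* macros below are their
 * mirror images, working downwards from the end of the buffers for the
 * overlapping memmove case.
 */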

/* Both these macros have to start with exactly the same insn; as above,
 * the reverse copy path branches to "74f + 4" with the first ldd already
 * executed in the delay slot.
 */
#define RMOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd [%src - (offset) - 0x20], %t0; \
	ldd [%src - (offset) - 0x18], %t2; \
	ldd [%src - (offset) - 0x10], %t4; \
	ldd [%src - (offset) - 0x08], %t6; \
	st %t0, [%dst - (offset) - 0x20]; \
	st %t1, [%dst - (offset) - 0x1c]; \
	st %t2, [%dst - (offset) - 0x18]; \
	st %t3, [%dst - (offset) - 0x14]; \
	st %t4, [%dst - (offset) - 0x10]; \
	st %t5, [%dst - (offset) - 0x0c]; \
	st %t6, [%dst - (offset) - 0x08]; \
	st %t7, [%dst - (offset) - 0x04];

#define RMOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd [%src - (offset) - 0x20], %t0; \
	ldd [%src - (offset) - 0x18], %t2; \
	ldd [%src - (offset) - 0x10], %t4; \
	ldd [%src - (offset) - 0x08], %t6; \
	std %t0, [%dst - (offset) - 0x20]; \
	std %t2, [%dst - (offset) - 0x18]; \
	std %t4, [%dst - (offset) - 0x10]; \
	std %t6, [%dst - (offset) - 0x08];

#define RMOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd [%src + (offset) + 0x00], %t0; \
	ldd [%src + (offset) + 0x08], %t2; \
	st %t0, [%dst + (offset) + 0x00]; \
	st %t1, [%dst + (offset) + 0x04]; \
	st %t2, [%dst + (offset) + 0x08]; \
	st %t3, [%dst + (offset) + 0x0c];

#define RMOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
	ldub [%src + (offset) + 0x00], %t0; \
	ldub [%src + (offset) + 0x01], %t1; \
	stb %t0, [%dst + (offset) + 0x00]; \
	stb %t1, [%dst + (offset) + 0x01];
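
/* SMOVE_CHUNK and SMOVE_ALIGNCHUNK handle copies where src and dst have
 * different word alignment: 16 bytes of word-aligned source are loaded,
 * then neighbouring words are merged with sll/srl by shil and shir
 * (shil + shir == 32) before being stored with std.  %prev carries the
 * leftover bits from one expansion into the next, and offset2 shifts the
 * stores to line up with the bytes already written at the head of the
 * destination.
 */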

#define SMOVE_CHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
	ldd [%src + (offset) + 0x00], %t0; \
	ldd [%src + (offset) + 0x08], %t2; \
	srl %t0, shir, %t5; \
	srl %t1, shir, %t6; \
	sll %t0, shil, %t0; \
	or %t5, %prev, %t5; \
	sll %t1, shil, %prev; \
	or %t6, %t0, %t0; \
	srl %t2, shir, %t1; \
	srl %t3, shir, %t6; \
	sll %t2, shil, %t2; \
	or %t1, %prev, %t1; \
	std %t4, [%dst + (offset) + (offset2) - 0x04]; \
	std %t0, [%dst + (offset) + (offset2) + 0x04]; \
	sll %t3, shil, %prev; \
	or %t6, %t2, %t4;

#define SMOVE_ALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
	ldd [%src + (offset) + 0x00], %t0; \
	ldd [%src + (offset) + 0x08], %t2; \
	srl %t0, shir, %t4; \
	srl %t1, shir, %t5; \
	sll %t0, shil, %t6; \
	or %t4, %prev, %t0; \
	sll %t1, shil, %prev; \
	or %t5, %t6, %t1; \
	srl %t2, shir, %t4; \
	srl %t3, shir, %t5; \
	sll %t2, shil, %t6; \
	or %t4, %prev, %t2; \
	sll %t3, shil, %prev; \
	or %t5, %t6, %t3; \
	std %t0, [%dst + (offset) + (offset2) + 0x00]; \
	std %t2, [%dst + (offset) + (offset2) + 0x08];

	.text
	.align 4

#ifdef FASTER_REVERSE

70:	/* rdword_align */
	andcc %o1, 1, %g0
	be 4f
	andcc %o1, 2, %g0
	ldub [%o1 - 1], %g2
	sub %o1, 1, %o1
	stb %g2, [%o0 - 1]
	sub %o2, 1, %o2
	be 3f
	sub %o0, 1, %o0
4:
	lduh [%o1 - 2], %g2
	sub %o1, 2, %o1
	sth %g2, [%o0 - 2]
	sub %o2, 2, %o2
	b 3f
	sub %o0, 2, %o0

#endif /* FASTER_REVERSE */

0:
	retl
	nop		! Only bcopy returns here and it returns void...
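
/* memmove: if dst is below src, or the regions do not overlap, the copy
 * can be done forwards and we simply fall into the memcpy code at 9:/0:
 * below.  Only a genuinely overlapping, dst-above-src move takes the
 * backwards path (FASTER_REVERSE) or the simple reverse byte loop.
 */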
#ifdef __KERNEL__
FUNC(amemmove)
FUNC(__memmove)
#endif
FUNC(memmove)
	cmp %o0, %o1
	SETUP_RETL
	bleu 9f
	sub %o0, %o1, %o4
	add %o1, %o2, %o3
	cmp %o3, %o0
	bleu 0f
	andcc %o4, 3, %o5

#ifndef FASTER_REVERSE

	add %o1, %o2, %o1
	add %o0, %o2, %o0
	sub %o1, 1, %o1
	sub %o0, 1, %o0

1:	/* reverse_bytes */
	ldub [%o1], %o4
	subcc %o2, 1, %o2
	stb %o4, [%o0]
	sub %o1, 1, %o1
	bne 1b
	sub %o0, 1, %o0

	retl
	RETL_INSN

#else /* FASTER_REVERSE */

	add %o1, %o2, %o1
	add %o0, %o2, %o0
	bne 77f
	cmp %o2, 15
	bleu 91f
	andcc %o1, 3, %g0
	bne 70b
3:
	andcc %o1, 4, %g0
	be 2f
	mov %o2, %g1
	ld [%o1 - 4], %o4
	sub %g1, 4, %g1
	st %o4, [%o0 - 4]
	sub %o1, 4, %o1
	sub %o0, 4, %o0
2:
	andcc %g1, 0xffffff80, %g7
	be 3f
	andcc %o0, 4, %g0
	be 74f + 4
5:
	RMOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
	RMOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
	RMOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
	RMOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
	subcc %g7, 128, %g7
	sub %o1, 128, %o1
	bne 5b
	sub %o0, 128, %o0
3:
	andcc %g1, 0x70, %g7
	be 72f
	andcc %g1, 8, %g0
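	/* Jump-table dispatch: each RMOVE_LASTCHUNK expansion below is six
	 * instructions (24 bytes) and copies 16 bytes, so for the remaining
	 * %g7 (a multiple of 16, at most 0x70) we jump %g7 + %g7/2 bytes
	 * back from label 72.
	 */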
	sethi %hi(72f), %o5
	srl %g7, 1, %o4
	add %g7, %o4, %o4
	sub %o1, %g7, %o1
	sub %o5, %o4, %o5
	jmpl %o5 + %lo(72f), %g0
	sub %o0, %g7, %o0

71:	/* rmemcpy_table */
	RMOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
	RMOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
	RMOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
	RMOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
	RMOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
	RMOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
	RMOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

72:	/* rmemcpy_table_end */
	be 73f
	andcc %g1, 4, %g0
	ldd [%o1 - 0x08], %g2
	sub %o0, 8, %o0
	sub %o1, 8, %o1
	st %g2, [%o0]
	st %g3, [%o0 + 0x04]

73:	/* rmemcpy_last7 */
	be 1f
	andcc %g1, 2, %g0
	ld [%o1 - 4], %g2
	sub %o1, 4, %o1
	st %g2, [%o0 - 4]
	sub %o0, 4, %o0
1:
	be 1f
	andcc %g1, 1, %g0
	lduh [%o1 - 2], %g2
	sub %o1, 2, %o1
	sth %g2, [%o0 - 2]
	sub %o0, 2, %o0
1:
	be 1f
	nop
	ldub [%o1 - 1], %g2
	stb %g2, [%o0 - 1]
1:
	retl
	RETL_INSN

74:	/* rldd_std */
	RMOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
	RMOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
	RMOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
	RMOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
	subcc %g7, 128, %g7
	sub %o1, 128, %o1
	bne 74b
	sub %o0, 128, %o0
	andcc %g1, 0x70, %g7
	be 72b
	andcc %g1, 8, %g0
	sethi %hi(72b), %o5
	srl %g7, 1, %o4
	add %g7, %o4, %o4
	sub %o1, %g7, %o1
	sub %o5, %o4, %o5
	jmpl %o5 + %lo(72b), %g0
	sub %o0, %g7, %o0

75:	/* rshort_end */
	and %o2, 0xe, %o3
2:
	sethi %hi(76f), %o5
	sll %o3, 3, %o4
	sub %o0, %o3, %o0
	sub %o5, %o4, %o5
	sub %o1, %o3, %o1
	jmpl %o5 + %lo(76f), %g0
	andcc %o2, 1, %g0
	RMOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
	RMOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
	RMOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
	RMOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
	RMOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
	RMOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
	RMOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)

76:	/* rshort_table_end */
	be 1f
	nop
	ldub [%o1 - 1], %g2
	stb %g2, [%o0 - 1]
1:
	retl
	RETL_INSN

91:	/* rshort_aligned_end */
	bne 75b
	andcc %o2, 8, %g0
	be 1f
	andcc %o2, 4, %g0
	ld [%o1 - 0x08], %g2
	ld [%o1 - 0x04], %g3
	sub %o1, 8, %o1
	st %g2, [%o0 - 0x08]
	st %g3, [%o0 - 0x04]
	sub %o0, 8, %o0
1:
	b 73b
	mov %o2, %g1

77:	/* rnon_aligned */
	cmp %o2, 15
	bleu 75b
	andcc %o0, 3, %g0
	be 64f
	andcc %o0, 1, %g0
	be 63f
	andcc %o0, 2, %g0
	ldub [%o1 - 1], %g5
	sub %o1, 1, %o1
	stb %g5, [%o0 - 1]
	sub %o0, 1, %o0
	be 64f
	sub %o2, 1, %o2
63:
	ldub [%o1 - 1], %g5
	sub %o1, 2, %o1
	stb %g5, [%o0 - 1]
	sub %o0, 2, %o0
	ldub [%o1], %g5
	sub %o2, 2, %o2
	stb %g5, [%o0]
64:
	and %o1, 3, %g2
	and %o1, -4, %o1
	and %o2, 0xc, %g3
	add %o1, 4, %o1
	cmp %g3, 4
	sll %g2, 3, %g4
	mov 32, %g2
	be 4f
	sub %g2, %g4, %g7
	blu 3f
	cmp %g3, 8
	be 2f
	srl %o2, 2, %g3
	ld [%o1 - 4], %o3
	add %o0, -8, %o0
	ld [%o1 - 8], %o4
	add %o1, -16, %o1
	b 7f
	add %g3, 1, %g3
2:
	ld [%o1 - 4], %o4
	add %o0, -4, %o0
	ld [%o1 - 8], %g1
	add %o1, -12, %o1
	b 8f
	add %g3, 2, %g3
3:
	ld [%o1 - 4], %o5
	add %o0, -12, %o0
	ld [%o1 - 8], %o3
	add %o1, -20, %o1
	b 6f
	srl %o2, 2, %g3
4:
	ld [%o1 - 4], %g1
	srl %o2, 2, %g3
	ld [%o1 - 8], %o5
	add %o1, -24, %o1
	add %o0, -16, %o0
	add %g3, -1, %g3
	ld [%o1 + 12], %o3
5:
	sll %o5, %g4, %g2
	srl %g1, %g7, %g5
	or %g2, %g5, %g2
	st %g2, [%o0 + 12]
6:
	ld [%o1 + 8], %o4
	sll %o3, %g4, %g2
	srl %o5, %g7, %g5
	or %g2, %g5, %g2
	st %g2, [%o0 + 8]
7:
	ld [%o1 + 4], %g1
	sll %o4, %g4, %g2
	srl %o3, %g7, %g5
	or %g2, %g5, %g2
	st %g2, [%o0 + 4]
8:
	ld [%o1], %o5
	sll %g1, %g4, %g2
	srl %o4, %g7, %g5
	addcc %g3, -4, %g3
	or %g2, %g5, %g2
	add %o1, -16, %o1
	st %g2, [%o0]
	add %o0, -16, %o0
	bne,a 5b
	ld [%o1 + 12], %o3
	sll %o5, %g4, %g2
	srl %g1, %g7, %g5
	srl %g4, 3, %g3
	or %g2, %g5, %g2
	add %o1, %g3, %o1
	andcc %o2, 2, %g0
	st %g2, [%o0 + 12]
	be 1f
	andcc %o2, 1, %g0
	ldub [%o1 + 15], %g5
	add %o1, -2, %o1
	stb %g5, [%o0 + 11]
	add %o0, -2, %o0
	ldub [%o1 + 16], %g5
	stb %g5, [%o0 + 12]
1:
	be 1f
	nop
	ldub [%o1 + 15], %g5
	stb %g5, [%o0 + 11]
1:
	retl
	RETL_INSN

#endif /* FASTER_REVERSE */

/* NOTE: This code is executed just for the cases
 *	 where %src (= %o1) & 3 is != 0.
 *	 We need to align it to 4.  So, for (%src & 3):
 *	 == 1 we need to do ldub, lduh
 *	 == 2 lduh
 *	 == 3 just ldub
 *	 so even if it looks weird, the branches
 *	 are correct here. -jj
 */
78:	/* dword_align */
	andcc %o1, 1, %g0
	be 4f
	andcc %o1, 2, %g0
	ldub [%o1], %g2
	add %o1, 1, %o1
	stb %g2, [%o0]
	sub %o2, 1, %o2
	bne 3f
	add %o0, 1, %o0
4:
	lduh [%o1], %g2
	add %o1, 2, %o1
	sth %g2, [%o0]
	sub %o2, 2, %o2
	b 3f
	add %o0, 2, %o0
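
/* memcpy(dst %o0, src %o1, len %o2): if dst and src have different word
 * alignment ((dst - src) & 3 != 0) we take the non_aligned path at 86.
 * Otherwise copies of up to 15 bytes go to 90 (short copy), and longer
 * ones align src to 4, move 128-byte blocks, dispatch the remaining
 * 16-byte chunks through the jump table at 79/80, and finish the last
 * 7 bytes at 81.
 */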
#ifdef __KERNEL__
FUNC(__memcpy)
#endif
FUNC(memcpy)	/* %o0=dst %o1=src %o2=len */
	sub %o0, %o1, %o4
	SETUP_RETL
9:
	andcc %o4, 3, %o5
0:
	bne 86f
	cmp %o2, 15
	bleu 90f
	andcc %o1, 3, %g0
	bne 78b
3:
	andcc %o1, 4, %g0
	be 2f
	mov %o2, %g1
	ld [%o1], %o4
	sub %g1, 4, %g1
	st %o4, [%o0]
	add %o1, 4, %o1
	add %o0, 4, %o0
2:
	andcc %g1, 0xffffff80, %g7
	be 3f
	andcc %o0, 4, %g0
	be 82f + 4
5:
	MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
	subcc %g7, 128, %g7
	add %o1, 128, %o1
	bne 5b
	add %o0, 128, %o0
3:
	andcc %g1, 0x70, %g7
	be 80f
	andcc %g1, 8, %g0
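	/* Same jump-table trick as in the reverse copy above: each
	 * MOVE_LASTCHUNK is 24 bytes of code for 16 bytes of data, hence
	 * the %g7 + %g7/2 offset back from label 80.
	 */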
	sethi %hi(80f), %o5
	srl %g7, 1, %o4
	add %g7, %o4, %o4
	add %o1, %g7, %o1
	sub %o5, %o4, %o5
	jmpl %o5 + %lo(80f), %g0
	add %o0, %g7, %o0

79:	/* memcpy_table */
	MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

80:	/* memcpy_table_end */
	be 81f
	andcc %g1, 4, %g0
	ldd [%o1], %g2
	add %o0, 8, %o0
	st %g2, [%o0 - 0x08]
	add %o1, 8, %o1
	st %g3, [%o0 - 0x04]

81:	/* memcpy_last7 */
	be 1f
	andcc %g1, 2, %g0
	ld [%o1], %g2
	add %o1, 4, %o1
	st %g2, [%o0]
	add %o0, 4, %o0
1:
	be 1f
	andcc %g1, 1, %g0
	lduh [%o1], %g2
	add %o1, 2, %o1
	sth %g2, [%o0]
	add %o0, 2, %o0
1:
	be 1f
	nop
	ldub [%o1], %g2
	stb %g2, [%o0]
1:
	retl
	RETL_INSN

82:	/* ldd_std */
	MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
	subcc %g7, 128, %g7
	add %o1, 128, %o1
	bne 82b
	add %o0, 128, %o0

#ifndef FASTER_ALIGNED

	andcc %g1, 0x70, %g7
	be 80b
	andcc %g1, 8, %g0
	sethi %hi(80b), %o5
	srl %g7, 1, %o4
	add %g7, %o4, %o4
	add %o1, %g7, %o1
	sub %o5, %o4, %o5
	jmpl %o5 + %lo(80b), %g0
	add %o0, %g7, %o0

#else /* FASTER_ALIGNED */

	andcc %g1, 0x70, %g7
	be 84f
	andcc %g1, 8, %g0
	sethi %hi(84f), %o5
	add %o1, %g7, %o1
	sub %o5, %g7, %o5
	jmpl %o5 + %lo(84f), %g0
	add %o0, %g7, %o0

83:	/* amemcpy_table */
	MOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

84:	/* amemcpy_table_end */
	be 85f
	andcc %g1, 4, %g0
	ldd [%o1], %g2
	add %o0, 8, %o0
	std %g2, [%o0 - 0x08]
	add %o1, 8, %o1

85:	/* amemcpy_last7 */
	be 1f
	andcc %g1, 2, %g0
	ld [%o1], %g2
	add %o1, 4, %o1
	st %g2, [%o0]
	add %o0, 4, %o0
1:
	be 1f
	andcc %g1, 1, %g0
	lduh [%o1], %g2
	add %o1, 2, %o1
	sth %g2, [%o0]
	add %o0, 2, %o0
1:
	be 1f
	nop
	ldub [%o1], %g2
	stb %g2, [%o0]
1:
	retl
	RETL_INSN

#endif /* FASTER_ALIGNED */
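
/* non_aligned: dst and src disagree in their low two address bits, so no
 * amount of word alignment lines them up.  Copies of at most 6 bytes go
 * straight to the byte/short tail at 88; with FASTER_NONALIGNED, copies
 * of 256 bytes or more use the SMOVE_* code at 87.  Everything else
 * aligns dst to a word boundary and then merges shifted source words in
 * the loop at 5/7/8/9 below.
 */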
86:	/* non_aligned */
	cmp %o2, 6
	bleu 88f

#ifdef FASTER_NONALIGNED

	cmp %o2, 256
	bcc 87f

#endif /* FASTER_NONALIGNED */

	andcc %o0, 3, %g0
	be 61f
	andcc %o0, 1, %g0
	be 60f
	andcc %o0, 2, %g0
	ldub [%o1], %g5
	add %o1, 1, %o1
	stb %g5, [%o0]
	sub %o2, 1, %o2
	bne 61f
	add %o0, 1, %o0
60:
	ldub [%o1], %g3
	add %o1, 2, %o1
	stb %g3, [%o0]
	sub %o2, 2, %o2
	ldub [%o1 - 1], %g3
	add %o0, 2, %o0
	stb %g3, [%o0 - 1]
61:
	and %o1, 3, %g2
	and %o2, 0xc, %g3
	and %o1, -4, %o1
	cmp %g3, 4
	sll %g2, 3, %g4
	mov 32, %g2
	be 4f
	sub %g2, %g4, %g7
	blu 3f
	cmp %g3, 0x8
	be 2f
	srl %o2, 2, %g3
	ld [%o1], %o3
	add %o0, -8, %o0
	ld [%o1 + 4], %o4
	b 8f
	add %g3, 1, %g3
2:
	ld [%o1], %o4
	add %o0, -12, %o0
	ld [%o1 + 4], %o5
	add %g3, 2, %g3
	b 9f
	add %o1, -4, %o1
3:
	ld [%o1], %g1
	add %o0, -4, %o0
	ld [%o1 + 4], %o3
	srl %o2, 2, %g3
	b 7f
	add %o1, 4, %o1
4:
	ld [%o1], %o5
	cmp %o2, 7
	ld [%o1 + 4], %g1
	srl %o2, 2, %g3
	bleu 10f
	add %o1, 8, %o1
	ld [%o1], %o3
	add %g3, -1, %g3
5:
	sll %o5, %g4, %g2
	srl %g1, %g7, %g5
	or %g2, %g5, %g2
	st %g2, [%o0]
7:
	ld [%o1 + 4], %o4
	sll %g1, %g4, %g2
	srl %o3, %g7, %g5
	or %g2, %g5, %g2
	st %g2, [%o0 + 4]
8:
	ld [%o1 + 8], %o5
	sll %o3, %g4, %g2
	srl %o4, %g7, %g5
	or %g2, %g5, %g2
	st %g2, [%o0 + 8]
9:
	ld [%o1 + 12], %g1
	sll %o4, %g4, %g2
	srl %o5, %g7, %g5
	addcc %g3, -4, %g3
	or %g2, %g5, %g2
	add %o1, 16, %o1
	st %g2, [%o0 + 12]
	add %o0, 16, %o0
	bne,a 5b
	ld [%o1], %o3
10:
	sll %o5, %g4, %g2
	srl %g1, %g7, %g5
	srl %g7, 3, %g3
	or %g2, %g5, %g2
	sub %o1, %g3, %o1
	andcc %o2, 2, %g0
	st %g2, [%o0]
	be 1f
	andcc %o2, 1, %g0
	ldub [%o1], %g2
	add %o1, 2, %o1
	stb %g2, [%o0 + 4]
	add %o0, 2, %o0
	ldub [%o1 - 1], %g2
	stb %g2, [%o0 + 3]
1:
	be 1f
	nop
	ldub [%o1], %g2
	stb %g2, [%o0 + 4]
1:
	retl
	RETL_INSN

#ifdef FASTER_NONALIGNED

87:	/* faster_nonaligned */
	andcc %o1, 3, %g0
	be 3f
	andcc %o1, 1, %g0
	be 4f
	andcc %o1, 2, %g0
	ldub [%o1], %g2
	add %o1, 1, %o1
	stb %g2, [%o0]
	sub %o2, 1, %o2
	bne 3f
	add %o0, 1, %o0
4:
	lduh [%o1], %g2
	add %o1, 2, %o1
	srl %g2, 8, %g3
	sub %o2, 2, %o2
	stb %g3, [%o0]
	add %o0, 2, %o0
	stb %g2, [%o0 - 1]
3:
	andcc %o1, 4, %g0
	bne 2f
	cmp %o5, 1
	ld [%o1], %o4
	srl %o4, 24, %g2
	stb %g2, [%o0]
	srl %o4, 16, %g3
	stb %g3, [%o0 + 1]
	srl %o4, 8, %g2
	stb %g2, [%o0 + 2]
	sub %o2, 4, %o2
	stb %o4, [%o0 + 3]
	add %o1, 4, %o1
	add %o0, 4, %o0
2:
	be 33f
	cmp %o5, 2
	be 32f
	sub %o2, 4, %o2
31:
	ld [%o1], %g2
	add %o1, 4, %o1
	srl %g2, 24, %g3
	and %o0, 7, %g5
	stb %g3, [%o0]
	cmp %g5, 7
	sll %g2, 8, %g1
	add %o0, 4, %o0
	be 41f
	and %o2, 0xffffffc0, %o3
	ld [%o0 - 7], %o4
4:
	SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
	SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
	SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
	SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
	subcc %o3, 64, %o3
	add %o1, 64, %o1
	bne 4b
	add %o0, 64, %o0
	andcc %o2, 0x30, %o3
	be,a 1f
	srl %g1, 16, %g2
4:
	SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
	subcc %o3, 16, %o3
	add %o1, 16, %o1
	bne 4b
	add %o0, 16, %o0
	srl %g1, 16, %g2
1:
	st %o4, [%o0 - 7]
	sth %g2, [%o0 - 3]
	srl %g1, 8, %g4
	b 88f
	stb %g4, [%o0 - 1]
32:
	ld [%o1], %g2
	add %o1, 4, %o1
	srl %g2, 16, %g3
	and %o0, 7, %g5
	sth %g3, [%o0]
	cmp %g5, 6
	sll %g2, 16, %g1
	add %o0, 4, %o0
	be 42f
	and %o2, 0xffffffc0, %o3
	ld [%o0 - 6], %o4
4:
	SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
	SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
	SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
	SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
	subcc %o3, 64, %o3
	add %o1, 64, %o1
	bne 4b
	add %o0, 64, %o0
	andcc %o2, 0x30, %o3
	be,a 1f
	srl %g1, 16, %g2
4:
	SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
	subcc %o3, 16, %o3
	add %o1, 16, %o1
	bne 4b
	add %o0, 16, %o0
	srl %g1, 16, %g2
1:
	st %o4, [%o0 - 6]
	b 88f
	sth %g2, [%o0 - 2]
33:
	ld [%o1], %g2
	sub %o2, 4, %o2
	srl %g2, 24, %g3
	and %o0, 7, %g5
	stb %g3, [%o0]
	cmp %g5, 5
	srl %g2, 8, %g4
	sll %g2, 24, %g1
	sth %g4, [%o0 + 1]
	add %o1, 4, %o1
	be 43f
	and %o2, 0xffffffc0, %o3
	ld [%o0 - 1], %o4
	add %o0, 4, %o0
4:
	SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
	SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
	SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
	SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
	subcc %o3, 64, %o3
	add %o1, 64, %o1
	bne 4b
	add %o0, 64, %o0
	andcc %o2, 0x30, %o3
	be,a 1f
	srl %g1, 24, %g2
4:
	SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
	subcc %o3, 16, %o3
	add %o1, 16, %o1
	bne 4b
	add %o0, 16, %o0
	srl %g1, 24, %g2
1:
	st %o4, [%o0 - 5]
	b 88f
	stb %g2, [%o0 - 1]
41:
	SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
	SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
	SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
	SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
	subcc %o3, 64, %o3
	add %o1, 64, %o1
	bne 41b
	add %o0, 64, %o0
	andcc %o2, 0x30, %o3
	be,a 1f
	srl %g1, 16, %g2
4:
	SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
	subcc %o3, 16, %o3
	add %o1, 16, %o1
	bne 4b
	add %o0, 16, %o0
	srl %g1, 16, %g2
1:
	sth %g2, [%o0 - 3]
	srl %g1, 8, %g4
	b 88f
	stb %g4, [%o0 - 1]
43:
	SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
	SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
	SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
	SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
	subcc %o3, 64, %o3
	add %o1, 64, %o1
	bne 43b
	add %o0, 64, %o0
	andcc %o2, 0x30, %o3
	be,a 1f
	srl %g1, 24, %g2
4:
	SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
	subcc %o3, 16, %o3
	add %o1, 16, %o1
	bne 4b
	add %o0, 16, %o0
	srl %g1, 24, %g2
1:
	stb %g2, [%o0 + 3]
	b 88f
	add %o0, 4, %o0
42:
	SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
	SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
	SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
	SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
	subcc %o3, 64, %o3
	add %o1, 64, %o1
	bne 42b
	add %o0, 64, %o0
	andcc %o2, 0x30, %o3
	be,a 1f
	srl %g1, 16, %g2
4:
	SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
	subcc %o3, 16, %o3
	add %o1, 16, %o1
	bne 4b
	add %o0, 16, %o0
	srl %g1, 16, %g2
1:
	sth %g2, [%o0 - 2]
	/* Fall through */

#endif /* FASTER_NONALIGNED */
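
/* short_end: copy the remaining 0-14 bytes two at a time by jumping into
 * the MOVE_SHORTCHUNK table; each entry is four instructions (16 bytes)
 * for 2 bytes of data, so the offset back from label 89 is (%o2 & 0xe) * 8.
 * A final odd byte, if any, is handled after the table.
 */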
88:	/* short_end */
	and %o2, 0xe, %o3
20:
	sethi %hi(89f), %o5
	sll %o3, 3, %o4
	add %o0, %o3, %o0
	sub %o5, %o4, %o5
	add %o1, %o3, %o1
	jmpl %o5 + %lo(89f), %g0
	andcc %o2, 1, %g0
	MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)

89:	/* short_table_end */
	be 1f
	nop
	ldub [%o1], %g2
	stb %g2, [%o0]
1:
	retl
	RETL_INSN

90:	/* short_aligned_end */
	bne 88b
	andcc %o2, 8, %g0
	be 1f
	andcc %o2, 4, %g0
	ld [%o1 + 0x00], %g2
	ld [%o1 + 0x04], %g3
	add %o1, 8, %o1
	st %g2, [%o0 + 0x00]
	st %g3, [%o0 + 0x04]
	add %o0, 8, %o0
1:
	b 81b
	mov %o2, %g1