/* memcpy.S: Sparc optimized memcpy and memmove code
 * Hand optimized from GNU libc's memcpy and memmove
 * Copyright (C) 1991,1996 Free Software Foundation
 * Copyright (C) 1995 Linus Torvalds (Linus.Torvalds@helsinki.fi)
 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
 * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be)
 * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
 */

#ifdef __KERNEL__

#define FUNC(x)                 \
        .globl x;               \
        .type x,@function;      \
        .align 4;               \
x:

#undef FASTER_REVERSE
#undef FASTER_NONALIGNED
#define FASTER_ALIGNED

/* In kernel these functions don't return a value.
 * One should use macros in asm/string.h for that purpose.
 * We return 0, so that bugs are more apparent.
 */
#define SETUP_RETL
#define RETL_INSN       clr %o0

#else

/* libc */

#include "DEFS.h"

#define FASTER_REVERSE
#define FASTER_NONALIGNED
#define FASTER_ALIGNED

#define SETUP_RETL      mov %o0, %g6
#define RETL_INSN       mov %g6, %o0

#endif

/* Both these macros have to start with exactly the same insn */
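/* The MOVE_*CHUNK macros below unroll the forward copy loops: a BIGCHUNK
 * moves 32 bytes with four ldd loads, a LASTCHUNK 16 bytes, a SHORTCHUNK
 * 2 bytes; the *ALIGNCHUNK variants store with std for doubleword aligned
 * destinations.  The requirement that both big-chunk flavours start with
 * the identical insn appears to come from the "be 82f + 4" / "be 74f + 4"
 * branches further down, which execute that shared first ldd in the branch
 * delay slot and then resume one instruction past the aligned-loop label.
 */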
#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
        ldd [%src + (offset) + 0x00], %t0; \
        ldd [%src + (offset) + 0x08], %t2; \
        ldd [%src + (offset) + 0x10], %t4; \
        ldd [%src + (offset) + 0x18], %t6; \
        st %t0, [%dst + (offset) + 0x00]; \
        st %t1, [%dst + (offset) + 0x04]; \
        st %t2, [%dst + (offset) + 0x08]; \
        st %t3, [%dst + (offset) + 0x0c]; \
        st %t4, [%dst + (offset) + 0x10]; \
        st %t5, [%dst + (offset) + 0x14]; \
        st %t6, [%dst + (offset) + 0x18]; \
        st %t7, [%dst + (offset) + 0x1c];

#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
        ldd [%src + (offset) + 0x00], %t0; \
        ldd [%src + (offset) + 0x08], %t2; \
        ldd [%src + (offset) + 0x10], %t4; \
        ldd [%src + (offset) + 0x18], %t6; \
        std %t0, [%dst + (offset) + 0x00]; \
        std %t2, [%dst + (offset) + 0x08]; \
        std %t4, [%dst + (offset) + 0x10]; \
        std %t6, [%dst + (offset) + 0x18];

#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
        ldd [%src - (offset) - 0x10], %t0; \
        ldd [%src - (offset) - 0x08], %t2; \
        st %t0, [%dst - (offset) - 0x10]; \
        st %t1, [%dst - (offset) - 0x0c]; \
        st %t2, [%dst - (offset) - 0x08]; \
        st %t3, [%dst - (offset) - 0x04];

#define MOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1, t2, t3) \
        ldd [%src - (offset) - 0x10], %t0; \
        ldd [%src - (offset) - 0x08], %t2; \
        std %t0, [%dst - (offset) - 0x10]; \
        std %t2, [%dst - (offset) - 0x08];

#define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
        ldub [%src - (offset) - 0x02], %t0; \
        ldub [%src - (offset) - 0x01], %t1; \
        stb %t0, [%dst - (offset) - 0x02]; \
        stb %t1, [%dst - (offset) - 0x01];
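/* The RMOVE_* counterparts copy the same chunk sizes but walk backwards
 * from the end of the buffers; they serve the reverse path of memmove,
 * taken when source and destination overlap and the destination lies
 * above the source.
 */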
/* Both these macros have to start with exactly the same insn */
#define RMOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
        ldd [%src - (offset) - 0x20], %t0; \
        ldd [%src - (offset) - 0x18], %t2; \
        ldd [%src - (offset) - 0x10], %t4; \
        ldd [%src - (offset) - 0x08], %t6; \
        st %t0, [%dst - (offset) - 0x20]; \
        st %t1, [%dst - (offset) - 0x1c]; \
        st %t2, [%dst - (offset) - 0x18]; \
        st %t3, [%dst - (offset) - 0x14]; \
        st %t4, [%dst - (offset) - 0x10]; \
        st %t5, [%dst - (offset) - 0x0c]; \
        st %t6, [%dst - (offset) - 0x08]; \
        st %t7, [%dst - (offset) - 0x04];

#define RMOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
        ldd [%src - (offset) - 0x20], %t0; \
        ldd [%src - (offset) - 0x18], %t2; \
        ldd [%src - (offset) - 0x10], %t4; \
        ldd [%src - (offset) - 0x08], %t6; \
        std %t0, [%dst - (offset) - 0x20]; \
        std %t2, [%dst - (offset) - 0x18]; \
        std %t4, [%dst - (offset) - 0x10]; \
        std %t6, [%dst - (offset) - 0x08];

#define RMOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
        ldd [%src + (offset) + 0x00], %t0; \
        ldd [%src + (offset) + 0x08], %t2; \
        st %t0, [%dst + (offset) + 0x00]; \
        st %t1, [%dst + (offset) + 0x04]; \
        st %t2, [%dst + (offset) + 0x08]; \
        st %t3, [%dst + (offset) + 0x0c];

#define RMOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
        ldub [%src + (offset) + 0x00], %t0; \
        ldub [%src + (offset) + 0x01], %t1; \
        stb %t0, [%dst + (offset) + 0x00]; \
        stb %t1, [%dst + (offset) + 0x01];
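/* The SMOVE_* macros handle the case where source and destination are not
 * mutually word aligned: each 32-bit output word is merged from two
 * neighbouring input words.  Roughly, with shil + shir == 32 and the
 * big-endian word order of sparc,
 *
 *      out = (prev << shil) | (next >> shir)
 *
 * where %prev carries the shifted previous word from one invocation to
 * the next.  Both variants consume 16 bytes of source per invocation and
 * store the merged result with std.
 */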
#define SMOVE_CHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
        ldd [%src + (offset) + 0x00], %t0; \
        ldd [%src + (offset) + 0x08], %t2; \
        srl %t0, shir, %t5; \
        srl %t1, shir, %t6; \
        sll %t0, shil, %t0; \
        or %t5, %prev, %t5; \
        sll %t1, shil, %prev; \
        or %t6, %t0, %t0; \
        srl %t2, shir, %t1; \
        srl %t3, shir, %t6; \
        sll %t2, shil, %t2; \
        or %t1, %prev, %t1; \
        std %t4, [%dst + (offset) + (offset2) - 0x04]; \
        std %t0, [%dst + (offset) + (offset2) + 0x04]; \
        sll %t3, shil, %prev; \
        or %t6, %t2, %t4;

#define SMOVE_ALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
        ldd [%src + (offset) + 0x00], %t0; \
        ldd [%src + (offset) + 0x08], %t2; \
        srl %t0, shir, %t4; \
        srl %t1, shir, %t5; \
        sll %t0, shil, %t6; \
        or %t4, %prev, %t0; \
        sll %t1, shil, %prev; \
        or %t5, %t6, %t1; \
        srl %t2, shir, %t4; \
        srl %t3, shir, %t5; \
        sll %t2, shil, %t6; \
        or %t4, %prev, %t2; \
        sll %t3, shil, %prev; \
        or %t5, %t6, %t3; \
        std %t0, [%dst + (offset) + (offset2) + 0x00]; \
        std %t2, [%dst + (offset) + (offset2) + 0x08];

        .text
        .align 4

#ifdef FASTER_REVERSE

70:     /* rdword_align */

        andcc %o1, 1, %g0
        be 4f
        andcc %o1, 2, %g0

        ldub [%o1 - 1], %g2
        sub %o1, 1, %o1
        stb %g2, [%o0 - 1]
        sub %o2, 1, %o2
        be 3f
        sub %o0, 1, %o0
4:
        lduh [%o1 - 2], %g2
        sub %o1, 2, %o1
        sth %g2, [%o0 - 2]
        sub %o2, 2, %o2
        b 3f
        sub %o0, 2, %o0

#endif /* FASTER_REVERSE */

0:
        retl
        nop             ! Only bcopy returns here and it returns void...
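/* memmove: if the destination does not lie above the source, or if the
 * regions do not overlap (src + len <= dst), the copy is done forwards by
 * falling into the memcpy body at 9:/0: below; otherwise both pointers are
 * moved to the end of the regions and the copy runs backwards.
 */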
#ifdef __KERNEL__
FUNC(amemmove)
FUNC(__memmove)
#endif
FUNC(memmove)
        cmp %o0, %o1
        SETUP_RETL
        bleu 9f
        sub %o0, %o1, %o4

        add %o1, %o2, %o3
        cmp %o3, %o0
        bleu 0f
        andcc %o4, 3, %o5

#ifndef FASTER_REVERSE

        add %o1, %o2, %o1
        add %o0, %o2, %o0
        sub %o1, 1, %o1
        sub %o0, 1, %o0

1:      /* reverse_bytes */

        ldub [%o1], %o4
        subcc %o2, 1, %o2
        stb %o4, [%o0]
        sub %o1, 1, %o1
        bne 1b
        sub %o0, 1, %o0

        retl
        RETL_INSN

#else /* FASTER_REVERSE */

        add %o1, %o2, %o1
        add %o0, %o2, %o0
        bne 77f
        cmp %o2, 15

        bleu 91f
        andcc %o1, 3, %g0

        bne 70b
3:
        andcc %o1, 4, %g0

        be 2f
        mov %o2, %g1

        ld [%o1 - 4], %o4
        sub %g1, 4, %g1
        st %o4, [%o0 - 4]
        sub %o1, 4, %o1
        sub %o0, 4, %o0
2:
        andcc %g1, 0xffffff80, %g7
        be 3f
        andcc %o0, 4, %g0

        be 74f + 4
5:
        RMOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
        RMOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
        RMOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
        RMOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
        subcc %g7, 128, %g7
        sub %o1, 128, %o1
        bne 5b
        sub %o0, 128, %o0
3:
        andcc %g1, 0x70, %g7
        be 72f
        andcc %g1, 8, %g0

        sethi %hi(72f), %o5
        srl %g7, 1, %o4
        add %g7, %o4, %o4
        sub %o1, %g7, %o1
        sub %o5, %o4, %o5
        jmpl %o5 + %lo(72f), %g0
        sub %o0, %g7, %o0

71:     /* rmemcpy_table */
        RMOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
        RMOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
        RMOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
        RMOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
        RMOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
        RMOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
        RMOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

72:     /* rmemcpy_table_end */

        be 73f
        andcc %g1, 4, %g0

        ldd [%o1 - 0x08], %g2
        sub %o0, 8, %o0
        sub %o1, 8, %o1
        st %g2, [%o0]
        st %g3, [%o0 + 0x04]

73:     /* rmemcpy_last7 */

        be 1f
        andcc %g1, 2, %g0

        ld [%o1 - 4], %g2
        sub %o1, 4, %o1
        st %g2, [%o0 - 4]
        sub %o0, 4, %o0
1:
        be 1f
        andcc %g1, 1, %g0

        lduh [%o1 - 2], %g2
        sub %o1, 2, %o1
        sth %g2, [%o0 - 2]
        sub %o0, 2, %o0
1:
        be 1f
        nop

        ldub [%o1 - 1], %g2
        stb %g2, [%o0 - 1]
1:
        retl
        RETL_INSN

74:     /* rldd_std */
        RMOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
        RMOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
        RMOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
        RMOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
        subcc %g7, 128, %g7
        sub %o1, 128, %o1
        bne 74b
        sub %o0, 128, %o0

        andcc %g1, 0x70, %g7
        be 72b
        andcc %g1, 8, %g0

        sethi %hi(72b), %o5
        srl %g7, 1, %o4
        add %g7, %o4, %o4
        sub %o1, %g7, %o1
        sub %o5, %o4, %o5
        jmpl %o5 + %lo(72b), %g0
        sub %o0, %g7, %o0
75:     /* rshort_end */

        and %o2, 0xe, %o3
2:
        sethi %hi(76f), %o5
        sll %o3, 3, %o4
        sub %o0, %o3, %o0
        sub %o5, %o4, %o5
        sub %o1, %o3, %o1
        jmpl %o5 + %lo(76f), %g0
        andcc %o2, 1, %g0

        RMOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
        RMOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
        RMOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
        RMOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
        RMOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
        RMOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
        RMOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)

76:     /* rshort_table_end */

        be 1f
        nop
        ldub [%o1 - 1], %g2
        stb %g2, [%o0 - 1]
1:
        retl
        RETL_INSN

91:     /* rshort_aligned_end */

        bne 75b
        andcc %o2, 8, %g0

        be 1f
        andcc %o2, 4, %g0

        ld [%o1 - 0x08], %g2
        ld [%o1 - 0x04], %g3
        sub %o1, 8, %o1
        st %g2, [%o0 - 0x08]
        st %g3, [%o0 - 0x04]
        sub %o0, 8, %o0
1:
        b 73b
        mov %o2, %g1

77:     /* rnon_aligned */
        cmp %o2, 15
        bleu 75b
        andcc %o0, 3, %g0

        be 64f
        andcc %o0, 1, %g0

        be 63f
        andcc %o0, 2, %g0

        ldub [%o1 - 1], %g5
        sub %o1, 1, %o1
        stb %g5, [%o0 - 1]
        sub %o0, 1, %o0
        be 64f
        sub %o2, 1, %o2
63:
        ldub [%o1 - 1], %g5
        sub %o1, 2, %o1
        stb %g5, [%o0 - 1]
        sub %o0, 2, %o0
        ldub [%o1], %g5
        sub %o2, 2, %o2
        stb %g5, [%o0]
64:
        and %o1, 3, %g2
        and %o1, -4, %o1
        and %o2, 0xc, %g3
        add %o1, 4, %o1
        cmp %g3, 4
        sll %g2, 3, %g4
        mov 32, %g2
        be 4f
        sub %g2, %g4, %g7

        blu 3f
        cmp %g3, 8

        be 2f
        srl %o2, 2, %g3

        ld [%o1 - 4], %o3
        add %o0, -8, %o0
        ld [%o1 - 8], %o4
        add %o1, -16, %o1
        b 7f
        add %g3, 1, %g3
2:
        ld [%o1 - 4], %o4
        add %o0, -4, %o0
        ld [%o1 - 8], %g1
        add %o1, -12, %o1
        b 8f
        add %g3, 2, %g3
3:
        ld [%o1 - 4], %o5
        add %o0, -12, %o0
        ld [%o1 - 8], %o3
        add %o1, -20, %o1
        b 6f
        srl %o2, 2, %g3
4:
        ld [%o1 - 4], %g1
        srl %o2, 2, %g3
        ld [%o1 - 8], %o5
        add %o1, -24, %o1
        add %o0, -16, %o0
        add %g3, -1, %g3

        ld [%o1 + 12], %o3
5:
        sll %o5, %g4, %g2
        srl %g1, %g7, %g5
        or %g2, %g5, %g2
        st %g2, [%o0 + 12]
6:
        ld [%o1 + 8], %o4
        sll %o3, %g4, %g2
        srl %o5, %g7, %g5
        or %g2, %g5, %g2
        st %g2, [%o0 + 8]
7:
        ld [%o1 + 4], %g1
        sll %o4, %g4, %g2
        srl %o3, %g7, %g5
        or %g2, %g5, %g2
        st %g2, [%o0 + 4]
8:
        ld [%o1], %o5
        sll %g1, %g4, %g2
        srl %o4, %g7, %g5
        addcc %g3, -4, %g3
        or %g2, %g5, %g2
        add %o1, -16, %o1
        st %g2, [%o0]
        add %o0, -16, %o0
        bne,a 5b
        ld [%o1 + 12], %o3

        sll %o5, %g4, %g2
        srl %g1, %g7, %g5
        srl %g4, 3, %g3
        or %g2, %g5, %g2
        add %o1, %g3, %o1
        andcc %o2, 2, %g0
        st %g2, [%o0 + 12]
        be 1f
        andcc %o2, 1, %g0

        ldub [%o1 + 15], %g5
        add %o1, -2, %o1
        stb %g5, [%o0 + 11]
        add %o0, -2, %o0
        ldub [%o1 + 16], %g5
        stb %g5, [%o0 + 12]
1:
        be 1f
        nop
        ldub [%o1 + 15], %g5
        stb %g5, [%o0 + 11]
1:
        retl
        RETL_INSN

#endif /* FASTER_REVERSE */
/* NOTE: This code is executed only for the cases where
 * %src (=%o1) & 3 is != 0.  We need to align it to 4.
 * So, for (%src & 3):
 *      1 we need to do ldub,lduh
 *      2 lduh
 *      3 just ldub
 * so even if it looks weird, the branches are correct here. -jj
 */
78:     /* dword_align */

        andcc %o1, 1, %g0
        be 4f
        andcc %o1, 2, %g0

        ldub [%o1], %g2
        add %o1, 1, %o1
        stb %g2, [%o0]
        sub %o2, 1, %o2
        bne 3f
        add %o0, 1, %o0
4:
        lduh [%o1], %g2
        add %o1, 2, %o1
        sth %g2, [%o0]
        sub %o2, 2, %o2
        b 3f
        add %o0, 2, %o0
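/* memcpy proper: %o4 = dst - src is computed first; the 9:/0: labels below
 * are also the entry points used by memmove when a forward copy is safe.
 * The dword_align helper above (78:) is reached via "bne 78b" when the
 * source must first be brought up to word alignment.
 */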
FUNC(memcpy)    /* %o0=dst %o1=src %o2=len */

        sub %o0, %o1, %o4
        SETUP_RETL
9:
        andcc %o4, 3, %o5
0:
        bne 86f
        cmp %o2, 15

        bleu 90f
        andcc %o1, 3, %g0

        bne 78b
3:
        andcc %o1, 4, %g0

        be 2f
        mov %o2, %g1

        ld [%o1], %o4
        sub %g1, 4, %g1
        st %o4, [%o0]
        add %o1, 4, %o1
        add %o0, 4, %o0
2:
        andcc %g1, 0xffffff80, %g7
        be 3f
        andcc %o0, 4, %g0

        be 82f + 4
5:
        MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
        subcc %g7, 128, %g7
        add %o1, 128, %o1
        bne 5b
        add %o0, 128, %o0
3:
        andcc %g1, 0x70, %g7
        be 80f
        andcc %g1, 8, %g0

        sethi %hi(80f), %o5
        srl %g7, 1, %o4
        add %g7, %o4, %o4
        add %o1, %g7, %o1
        sub %o5, %o4, %o5
        jmpl %o5 + %lo(80f), %g0
        add %o0, %g7, %o0
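/* Computed jump into the MOVE_LASTCHUNK table: %g7 holds the remaining
 * 16-byte groups (len & 0x70) and each MOVE_LASTCHUNK expands to six
 * instructions (24 bytes), so the code offset back from 80f is
 * %g7 + %g7/2.  The rmemcpy table above uses the same trick.
 */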
79:     /* memcpy_table */

        MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

80:     /* memcpy_table_end */
        be 81f
        andcc %g1, 4, %g0

        ldd [%o1], %g2
        add %o0, 8, %o0
        st %g2, [%o0 - 0x08]
        add %o1, 8, %o1
        st %g3, [%o0 - 0x04]

81:     /* memcpy_last7 */

        be 1f
        andcc %g1, 2, %g0

        ld [%o1], %g2
        add %o1, 4, %o1
        st %g2, [%o0]
        add %o0, 4, %o0
1:
        be 1f
        andcc %g1, 1, %g0

        lduh [%o1], %g2
        add %o1, 2, %o1
        sth %g2, [%o0]
        add %o0, 2, %o0
1:
        be 1f
        nop

        ldub [%o1], %g2
        stb %g2, [%o0]
1:
        retl
        RETL_INSN
82:     /* ldd_std */
        MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
        subcc %g7, 128, %g7
        add %o1, 128, %o1
        bne 82b
        add %o0, 128, %o0

#ifndef FASTER_ALIGNED

        andcc %g1, 0x70, %g7
        be 80b
        andcc %g1, 8, %g0

        sethi %hi(80b), %o5
        srl %g7, 1, %o4
        add %g7, %o4, %o4
        add %o1, %g7, %o1
        sub %o5, %o4, %o5
        jmpl %o5 + %lo(80b), %g0
        add %o0, %g7, %o0

#else /* FASTER_ALIGNED */

        andcc %g1, 0x70, %g7
        be 84f
        andcc %g1, 8, %g0

        sethi %hi(84f), %o5
        add %o1, %g7, %o1
        sub %o5, %g7, %o5
        jmpl %o5 + %lo(84f), %g0
        add %o0, %g7, %o0

83:     /* amemcpy_table */

        MOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

84:     /* amemcpy_table_end */
        be 85f
        andcc %g1, 4, %g0

        ldd [%o1], %g2
        add %o0, 8, %o0
        std %g2, [%o0 - 0x08]
        add %o1, 8, %o1
85:     /* amemcpy_last7 */
        be 1f
        andcc %g1, 2, %g0

        ld [%o1], %g2
        add %o1, 4, %o1
        st %g2, [%o0]
        add %o0, 4, %o0
1:
        be 1f
        andcc %g1, 1, %g0

        lduh [%o1], %g2
        add %o1, 2, %o1
        sth %g2, [%o0]
        add %o0, 2, %o0
1:
        be 1f
        nop

        ldub [%o1], %g2
        stb %g2, [%o0]
1:
        retl
        RETL_INSN

#endif /* FASTER_ALIGNED */
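/* 86: reached when (dst - src) & 3 != 0, so source and destination can
 * never be word aligned at the same time.  Very short copies (len <= 6)
 * go straight to the byte/halfword tail at 88:; with FASTER_NONALIGNED
 * and len >= 256 the 87: path below is taken; otherwise the destination
 * is word aligned here and whole words are built with the shift-and-or
 * loop that follows.
 */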
86:     /* non_aligned */
        cmp %o2, 6
        bleu 88f

#ifdef FASTER_NONALIGNED

        cmp %o2, 256
        bcc 87f

#endif /* FASTER_NONALIGNED */

        andcc %o0, 3, %g0
        be 61f
        andcc %o0, 1, %g0

        be 60f
        andcc %o0, 2, %g0

        ldub [%o1], %g5
        add %o1, 1, %o1
        stb %g5, [%o0]
        sub %o2, 1, %o2
        bne 61f
        add %o0, 1, %o0
60:
        ldub [%o1], %g3
        add %o1, 2, %o1
        stb %g3, [%o0]
        sub %o2, 2, %o2
        ldub [%o1 - 1], %g3
        add %o0, 2, %o0
        stb %g3, [%o0 - 1]
61:
        and %o1, 3, %g2
        and %o2, 0xc, %g3
        and %o1, -4, %o1
        cmp %g3, 4
        sll %g2, 3, %g4
        mov 32, %g2
        be 4f
        sub %g2, %g4, %g7

        blu 3f
        cmp %g3, 0x8

        be 2f
        srl %o2, 2, %g3

        ld [%o1], %o3
        add %o0, -8, %o0
        ld [%o1 + 4], %o4
        b 8f
        add %g3, 1, %g3
2:
        ld [%o1], %o4
        add %o0, -12, %o0
        ld [%o1 + 4], %o5
        add %g3, 2, %g3
        b 9f
        add %o1, -4, %o1
3:
        ld [%o1], %g1
        add %o0, -4, %o0
        ld [%o1 + 4], %o3
        srl %o2, 2, %g3
        b 7f
        add %o1, 4, %o1
4:
        ld [%o1], %o5
        cmp %o2, 7
        ld [%o1 + 4], %g1
        srl %o2, 2, %g3
        bleu 10f
        add %o1, 8, %o1

        ld [%o1], %o3
        add %g3, -1, %g3
5:
        sll %o5, %g4, %g2
        srl %g1, %g7, %g5
        or %g2, %g5, %g2
        st %g2, [%o0]
7:
        ld [%o1 + 4], %o4
        sll %g1, %g4, %g2
        srl %o3, %g7, %g5
        or %g2, %g5, %g2
        st %g2, [%o0 + 4]
8:
        ld [%o1 + 8], %o5
        sll %o3, %g4, %g2
        srl %o4, %g7, %g5
        or %g2, %g5, %g2
        st %g2, [%o0 + 8]
9:
        ld [%o1 + 12], %g1
        sll %o4, %g4, %g2
        srl %o5, %g7, %g5
        addcc %g3, -4, %g3
        or %g2, %g5, %g2
        add %o1, 16, %o1
        st %g2, [%o0 + 12]
        add %o0, 16, %o0
        bne,a 5b
        ld [%o1], %o3
10:
        sll %o5, %g4, %g2
        srl %g1, %g7, %g5
        srl %g7, 3, %g3
        or %g2, %g5, %g2
        sub %o1, %g3, %o1
        andcc %o2, 2, %g0
        st %g2, [%o0]
        be 1f
        andcc %o2, 1, %g0

        ldub [%o1], %g2
        add %o1, 2, %o1
        stb %g2, [%o0 + 4]
        add %o0, 2, %o0
        ldub [%o1 - 1], %g2
        stb %g2, [%o0 + 3]
1:
        be 1f
        nop
        ldub [%o1], %g2
        stb %g2, [%o0 + 4]
1:
        retl
        RETL_INSN
#ifdef FASTER_NONALIGNED

87:     /* faster_nonaligned */

        andcc %o1, 3, %g0
        be 3f
        andcc %o1, 1, %g0

        be 4f
        andcc %o1, 2, %g0

        ldub [%o1], %g2
        add %o1, 1, %o1
        stb %g2, [%o0]
        sub %o2, 1, %o2
        bne 3f
        add %o0, 1, %o0
4:
        lduh [%o1], %g2
        add %o1, 2, %o1
        srl %g2, 8, %g3
        sub %o2, 2, %o2
        stb %g3, [%o0]
        add %o0, 2, %o0
        stb %g2, [%o0 - 1]
3:
        andcc %o1, 4, %g0

        bne 2f
        cmp %o5, 1

        ld [%o1], %o4
        srl %o4, 24, %g2
        stb %g2, [%o0]
        srl %o4, 16, %g3
        stb %g3, [%o0 + 1]
        srl %o4, 8, %g2
        stb %g2, [%o0 + 2]
        sub %o2, 4, %o2
        stb %o4, [%o0 + 3]
        add %o1, 4, %o1
        add %o0, 4, %o0
2:
        be 33f
        cmp %o5, 2

        be 32f
        sub %o2, 4, %o2
31:
        ld [%o1], %g2
        add %o1, 4, %o1
        srl %g2, 24, %g3
        and %o0, 7, %g5
        stb %g3, [%o0]
        cmp %g5, 7
        sll %g2, 8, %g1
        add %o0, 4, %o0
        be 41f
        and %o2, 0xffffffc0, %o3
        ld [%o0 - 7], %o4
4:
        SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        subcc %o3, 64, %o3
        add %o1, 64, %o1
        bne 4b
        add %o0, 64, %o0

        andcc %o2, 0x30, %o3
        be,a 1f
        srl %g1, 16, %g2
4:
        SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        subcc %o3, 16, %o3
        add %o1, 16, %o1
        bne 4b
        add %o0, 16, %o0

        srl %g1, 16, %g2
1:
        st %o4, [%o0 - 7]
        sth %g2, [%o0 - 3]
        srl %g1, 8, %g4
        b 88f
        stb %g4, [%o0 - 1]
32:
        ld [%o1], %g2
        add %o1, 4, %o1
        srl %g2, 16, %g3
        and %o0, 7, %g5
        sth %g3, [%o0]
        cmp %g5, 6
        sll %g2, 16, %g1
        add %o0, 4, %o0
        be 42f
        and %o2, 0xffffffc0, %o3
        ld [%o0 - 6], %o4
4:
        SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        subcc %o3, 64, %o3
        add %o1, 64, %o1
        bne 4b
        add %o0, 64, %o0

        andcc %o2, 0x30, %o3
        be,a 1f
        srl %g1, 16, %g2
4:
        SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        subcc %o3, 16, %o3
        add %o1, 16, %o1
        bne 4b
        add %o0, 16, %o0

        srl %g1, 16, %g2
1:
        st %o4, [%o0 - 6]
        b 88f
        sth %g2, [%o0 - 2]
33:
        ld [%o1], %g2
        sub %o2, 4, %o2
        srl %g2, 24, %g3
        and %o0, 7, %g5
        stb %g3, [%o0]
        cmp %g5, 5
        srl %g2, 8, %g4
        sll %g2, 24, %g1
        sth %g4, [%o0 + 1]
        add %o1, 4, %o1
        be 43f
        and %o2, 0xffffffc0, %o3

        ld [%o0 - 1], %o4
        add %o0, 4, %o0
4:
        SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
        SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
        SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
        SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
        subcc %o3, 64, %o3
        add %o1, 64, %o1
        bne 4b
        add %o0, 64, %o0

        andcc %o2, 0x30, %o3
        be,a 1f
        srl %g1, 24, %g2
4:
        SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
        subcc %o3, 16, %o3
        add %o1, 16, %o1
        bne 4b
        add %o0, 16, %o0

        srl %g1, 24, %g2
1:
        st %o4, [%o0 - 5]
        b 88f
        stb %g2, [%o0 - 1]
41:
        SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        subcc %o3, 64, %o3
        add %o1, 64, %o1
        bne 41b
        add %o0, 64, %o0

        andcc %o2, 0x30, %o3
        be,a 1f
        srl %g1, 16, %g2
4:
        SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        subcc %o3, 16, %o3
        add %o1, 16, %o1
        bne 4b
        add %o0, 16, %o0

        srl %g1, 16, %g2
1:
        sth %g2, [%o0 - 3]
        srl %g1, 8, %g4
        b 88f
        stb %g4, [%o0 - 1]
43:
        SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
        SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
        SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
        SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
        subcc %o3, 64, %o3
        add %o1, 64, %o1
        bne 43b
        add %o0, 64, %o0

        andcc %o2, 0x30, %o3
        be,a 1f
        srl %g1, 24, %g2
4:
        SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
        subcc %o3, 16, %o3
        add %o1, 16, %o1
        bne 4b
        add %o0, 16, %o0

        srl %g1, 24, %g2
1:
        stb %g2, [%o0 + 3]
        b 88f
        add %o0, 4, %o0
42:
        SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        subcc %o3, 64, %o3
        add %o1, 64, %o1
        bne 42b
        add %o0, 64, %o0

        andcc %o2, 0x30, %o3
        be,a 1f
        srl %g1, 16, %g2
4:
        SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        subcc %o3, 16, %o3
        add %o1, 16, %o1
        bne 4b
        add %o0, 16, %o0

        srl %g1, 16, %g2
1:
        sth %g2, [%o0 - 2]
        /* Fall through */

#endif /* FASTER_NONALIGNED */
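/* 88: copies the final few bytes via another computed jump: %o3 = len & 0xe,
 * and each MOVE_SHORTCHUNK is four instructions (16 bytes) moving two bytes,
 * so shifting %o3 left by 3 gives the code offset back from 89f; a last odd
 * byte, if any, is handled after the table.  The reverse path uses the same
 * scheme at 75:/76: above.
 */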
88:     /* short_end */

        and %o2, 0xe, %o3
20:
        sethi %hi(89f), %o5
        sll %o3, 3, %o4
        add %o0, %o3, %o0
        sub %o5, %o4, %o5
        add %o1, %o3, %o1
        jmpl %o5 + %lo(89f), %g0
        andcc %o2, 1, %g0

        MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)

89:     /* short_table_end */

        be 1f
        nop

        ldub [%o1], %g2
        stb %g2, [%o0]
1:
        retl
        RETL_INSN

90:     /* short_aligned_end */
        bne 88b
        andcc %o2, 8, %g0

        be 1f
        andcc %o2, 4, %g0

        ld [%o1 + 0x00], %g2
        ld [%o1 + 0x04], %g3
        add %o1, 8, %o1
        st %g2, [%o0 + 0x00]
        st %g3, [%o0 + 0x04]
        add %o0, 8, %o0
1:
        b 81b
        mov %o2, %g1