/* NG4memcpy.S: Niagara-4 optimized memcpy.
 *
 * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
#define FPRS_FEF	0x04

/* On T4 it is very expensive to access ASRs like %fprs and
 * %asi, avoiding a read or a write can save ~50 cycles.
 */
#define FPU_ENTER			\
	rd	%fprs, %o5;		\
	andcc	%o5, FPRS_FEF, %g0;	\
	be,a,pn	%icc, 999f;		\
	 wr	%g0, FPRS_FEF, %fprs;	\
	999:

#ifdef MEMCPY_DEBUG
#define VISEntryHalf FPU_ENTER; \
		     clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf FPU_ENTER
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif

#define GLOBAL_SPARE	%g5
#endif

#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80	/* ASI_P */
#endif
#endif

#ifndef EX_LD
#define EX_LD(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#ifndef MEMCPY_DEBUG
#define STORE(type,src,addr)	type src, [addr]
#else
#define STORE(type,src,addr)	type##a src, [addr] %asi
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif
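
/* STORE_INIT goes through the block-initializing quad-LDD ASI, so stores to
 * a 64-byte-aligned destination line that is about to be completely
 * overwritten do not have to fetch the line's old contents first.
 */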
#ifndef FUNC_NAME
#define FUNC_NAME	NG4memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
	.align	64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
#ifdef MEMCPY_DEBUG
	wr	%g0, 0x80, %asi
#endif
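
	/* Trap immediately if any of bits 63:31 of the length are set;
	 * a size that large is treated as a caller bug rather than copied.
	 */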
	srlx	%o2, 31, %g2
	cmp	%g2, 0
	tne	%XCC, 5
	PREAMBLE
	mov	%o0, %o3
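
	/* %o3 preserves the original destination so .Lexit can return it.
	 * Dispatch on length: 0 exits, <= 3 bytes goes to .Ltiny, <= 19 to
	 * .Lsmall, < 128 to .Lmedium, and everything else falls into the
	 * large-copy path below.
	 */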
	brz,pn	%o2, .Lexit
	 cmp	%o2, 3
	ble,pn	%icc, .Ltiny
	 cmp	%o2, 19
	ble,pn	%icc, .Lsmall
	 or	%o0, %o1, %g2
	cmp	%o2, 128
	bl,pn	%icc, .Lmedium
	 nop

.Llarge:/* len >= 0x80 */
	/* First get dest 8 byte aligned. */
	sub	%g0, %o0, %g1
	and	%g1, 0x7, %g1
	brz,pt	%g1, 51f
	 sub	%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
	add	%o1, 1, %o1
	subcc	%g1, 1, %g1
	add	%o0, 1, %o0
	bne,pt	%icc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01))
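
	/* Prefetch the next 512 bytes of source with a strong read hint
	 * before entering the main copy loops.
	 */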
51:	LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x200, #n_reads_strong)

	/* Check if we can use the straight fully aligned
	 * loop, or we require the alignaddr/faligndata variant.
	 */
	andcc	%o1, 0x7, %o5
	bne,pn	%icc, .Llarge_src_unaligned
	 sub	%g0, %o0, %g1

	/* Legitimize the use of initializing stores by getting dest
	 * to be 64-byte aligned.
	 */
	and	%g1, 0x3f, %g1
	brz,pt	%g1, .Llarge_aligned
	 sub	%o2, %g1, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
	add	%o1, 8, %o1
	subcc	%g1, 8, %g1
	add	%o0, 8, %o0
	bne,pt	%icc, 1b
	 EX_ST(STORE(stx, %g2, %o0 - 0x08))

.Llarge_aligned:
	/* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
	andn	%o2, 0x3f, %o4
	sub	%o2, %o4, %o2
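
	/* The loop below moves %o4 bytes (len rounded down to a multiple of
	 * 64); %o2 now holds only the remainder for the cleanup paths.
	 */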
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	add	%o1, 0x40, %o1
	EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
	subcc	%o4, 0x40, %o4
	EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
	EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
	EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
	EX_ST(STORE_INIT(%g1, %o0))
	add	%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0))
	add	%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
	EX_ST(STORE_INIT(%g3, %o0))
	add	%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
	add	%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
	EX_ST(STORE_INIT(%o5, %o0))
	add	%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0))
	add	%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g3, %o0))
	add	%o0, 0x08, %o0
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
	add	%o0, 0x08, %o0
	bne,pt	%icc, 1b
	 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
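
	/* Stores done through STORE_ASI are not ordered like ordinary
	 * stores, so fence them off before the cleanup copies below.
	 */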
	membar	#StoreLoad | #StoreStore
	brz,pn	%o2, .Lexit
	 cmp	%o2, 19
	ble,pn	%icc, .Lsmall_unaligned
	 nop
	ba,a,pt	%icc, .Lmedium_noprefetch

.Lexit:	retl
	 mov	EX_RETVAL(%o3), %o0

.Llarge_src_unaligned:
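	/* Source and destination are not mutually 8-byte aligned.  Use VIS:
	 * alignaddr rounds %o1 down to an 8-byte boundary (into %g1) and
	 * records the byte offset in %gsr, then faligndata assembles each
	 * aligned 8-byte store from two neighbouring source doublewords.
	 */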
	andn	%o2, 0x3f, %o4
	sub	%o2, %o4, %o2
	VISEntryHalf
	alignaddr	%o1, %g0, %g1
	add	%o1, %o4, %o1
	EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
1:	EX_LD(LOAD(ldd, %g1 + 0x08, %f2))
	subcc	%o4, 0x40, %o4
	EX_LD(LOAD(ldd, %g1 + 0x10, %f4))
	EX_LD(LOAD(ldd, %g1 + 0x18, %f6))
	EX_LD(LOAD(ldd, %g1 + 0x20, %f8))
	EX_LD(LOAD(ldd, %g1 + 0x28, %f10))
	EX_LD(LOAD(ldd, %g1 + 0x30, %f12))
	EX_LD(LOAD(ldd, %g1 + 0x38, %f14))
	faligndata	%f0, %f2, %f16
	EX_LD(LOAD(ldd, %g1 + 0x40, %f0))
	faligndata	%f2, %f4, %f18
	add	%g1, 0x40, %g1
	faligndata	%f4, %f6, %f20
	faligndata	%f6, %f8, %f22
	faligndata	%f8, %f10, %f24
	faligndata	%f10, %f12, %f26
	faligndata	%f12, %f14, %f28
	faligndata	%f14, %f0, %f30
	EX_ST(STORE(std, %f16, %o0 + 0x00))
	EX_ST(STORE(std, %f18, %o0 + 0x08))
	EX_ST(STORE(std, %f20, %o0 + 0x10))
	EX_ST(STORE(std, %f22, %o0 + 0x18))
	EX_ST(STORE(std, %f24, %o0 + 0x20))
	EX_ST(STORE(std, %f26, %o0 + 0x28))
	EX_ST(STORE(std, %f28, %o0 + 0x30))
	EX_ST(STORE(std, %f30, %o0 + 0x38))
	add	%o0, 0x40, %o0
	bne,pt	%icc, 1b
	 LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
	VISExitHalf

	brz,pn	%o2, .Lexit
	 cmp	%o2, 19
	ble,pn	%icc, .Lsmall_unaligned
	 nop
	ba,a,pt	%icc, .Lmedium_unaligned

.Lmedium:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
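	/* %g2 still holds (dst | src) from the dispatch above, so testing
	 * its low bits checks the 8-byte alignment of both pointers at once.
	 */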
	andcc	%g2, 0x7, %g0
	bne,pn	%icc, .Lmedium_unaligned
	 nop

.Lmedium_noprefetch:
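	/* Copy 32 bytes at a time, then 8 bytes at a time, then one 32-bit
	 * word, leaving at most three bytes for .Ltiny.
	 */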
	andncc	%o2, 0x20 - 1, %o5
	be,pn	%icc, 2f
	 sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
	EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
	add	%o1, 0x20, %o1
	subcc	%o5, 0x20, %o5
	EX_ST(STORE(stx, %g1, %o0 + 0x00))
	EX_ST(STORE(stx, %g2, %o0 + 0x08))
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
	EX_ST(STORE(stx, %o4, %o0 + 0x18))
	bne,pt	%icc, 1b
	 add	%o0, 0x20, %o0
2:	andcc	%o2, 0x18, %o5
	be,pt	%icc, 3f
	 sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	add	%o1, 0x08, %o1
	add	%o0, 0x08, %o0
	subcc	%o5, 0x08, %o5
	bne,pt	%icc, 1b
	 EX_ST(STORE(stx, %g1, %o0 - 0x08))
3:	brz,pt	%o2, .Lexit
	 cmp	%o2, 0x04
	bl,pn	%icc, .Ltiny
	 nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
	add	%o1, 0x04, %o1
	add	%o0, 0x04, %o0
	subcc	%o2, 0x04, %o2
	bne,pn	%icc, .Ltiny
	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
	ba,a,pt	%icc, .Lexit

.Lmedium_unaligned:
	/* First get dest 8 byte aligned. */
	sub	%g0, %o0, %g1
	and	%g1, 0x7, %g1
	brz,pt	%g1, 2f
	 sub	%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
	add	%o1, 1, %o1
	subcc	%g1, 1, %g1
	add	%o0, 1, %o0
	bne,pt	%icc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01))
2:
	and	%o1, 0x7, %g1
	brz,pn	%g1, .Lmedium_noprefetch
	 sll	%g1, 3, %g1
	mov	64, %g2
	sub	%g2, %g1, %g2
	andn	%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
	sllx	%o4, %g1, %o4
	andn	%o2, 0x08 - 1, %o5
	sub	%o2, %o5, %o2
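
	/* The source is not 8-byte aligned: read aligned doublewords and
	 * rebuild each output doubleword with shifts.  %o4 carries the
	 * bytes left over from the previous doubleword (already shifted
	 * left by %g1 bits); each new doubleword is shifted right by %g2
	 * bits and ORed in to supply the low bytes.
	 */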
1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
	add	%o1, 0x08, %o1
	subcc	%o5, 0x08, %o5
	srlx	%g3, %g2, GLOBAL_SPARE
	or	GLOBAL_SPARE, %o4, GLOBAL_SPARE
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
	add	%o0, 0x08, %o0
	bne,pt	%icc, 1b
	 sllx	%g3, %g1, %o4
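
	/* Convert the bit shift count back to a byte offset and advance %o1
	 * to the first source byte not yet copied, then finish any tail
	 * byte by byte.
	 */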
	srl	%g1, 3, %g1
	add	%o1, %g1, %o1
	brz,pn	%o2, .Lexit
	 nop
	ba,pt	%icc, .Lsmall_unaligned
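
	/* At most three bytes remain; copy them one at a time, exiting as
	 * soon as the count reaches zero.
	 */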
.Ltiny:
	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
	subcc	%o2, 1, %o2
	be,pn	%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x00))
	EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
	subcc	%o2, 1, %o2
	be,pn	%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x01))
	EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
	ba,pt	%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x02))

.Lsmall:
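	/* 4 to 19 bytes.  If dst and src are both word aligned, copy 32-bit
	 * words and let .Ltiny mop up the last 0-3 bytes; otherwise branch
	 * to the byte-at-a-time loop.
	 */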
	andcc	%g2, 0x3, %g0
	bne,pn	%icc, .Lsmall_unaligned
	 andn	%o2, 0x4 - 1, %o5
	sub	%o2, %o5, %o2
1:
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
	add	%o1, 0x04, %o1
	subcc	%o5, 0x04, %o5
	add	%o0, 0x04, %o0
	bne,pt	%icc, 1b
	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
	brz,pt	%o2, .Lexit
	 nop
	ba,a,pt	%icc, .Ltiny

.Lsmall_unaligned:
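	/* Byte-at-a-time copy for short or oddly aligned residues. */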
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
	add	%o1, 1, %o1
	add	%o0, 1, %o0
	subcc	%o2, 1, %o2
	bne,pt	%icc, 1b
	 EX_ST(STORE(stb, %g1, %o0 - 0x01))
	ba,a,pt	%icc, .Lexit
	.size	FUNC_NAME, .-FUNC_NAME