copyuser_power7.S
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>
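
/*
 * The errN macros below mark user-access instructions as recoverable:
 * each drops an entry into __ex_table pairing the address of the
 * tagged instruction with a fixup label (.Ldo_errN), so a fault on a
 * user address branches to the matching cleanup path instead of
 * oopsing.  err1 covers the minimal-state paths, err2 the GPR copy
 * loop, err3/err4 the VMX paths.
 */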
	.macro err1
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Ldo_err1
	.previous
	.endm

	.macro err2
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldo_err2
	.previous
	.endm

#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	.section __ex_table,"a"
	.align 3
	.llong 300b,.Ldo_err3
	.previous
	.endm

	.macro err4
400:
	.section __ex_table,"a"
	.align 3
	.llong 400b,.Ldo_err4
	.previous
	.endm
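
/*
 * Fault fixups: each .Ldo_errN entry restores whatever state its copy
 * loop had live (saved non-volatile GPRs, the VMX context via
 * exit_vmx_usercopy, the stack frame), reloads the r3/r4/r5 saved at
 * entry and falls back to __copy_tofrom_user_base, which redoes the
 * copy cautiously and returns the number of bytes left uncopied.
 */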
.Ldo_err4:
	ld r16,STK_REG(R16)(r1)
	ld r15,STK_REG(R15)(r1)
	ld r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl .exit_vmx_usercopy
	ld r0,STACKFRAMESIZE+16(r1)
	mtlr r0
	b .Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld r22,STK_REG(R22)(r1)
	ld r21,STK_REG(R21)(r1)
	ld r20,STK_REG(R20)(r1)
	ld r19,STK_REG(R19)(r1)
	ld r18,STK_REG(R18)(r1)
	ld r17,STK_REG(R17)(r1)
	ld r16,STK_REG(R16)(r1)
	ld r15,STK_REG(R15)(r1)
	ld r14,STK_REG(R14)(r1)
.Lexit:
	addi r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld r3,48(r1)
	ld r4,56(r1)
	ld r5,64(r1)
	b __copy_tofrom_user_base


_GLOBAL(__copy_tofrom_user_power7)
#ifdef CONFIG_ALTIVEC
	cmpldi r5,16
	cmpldi cr1,r5,4096

	std r3,48(r1)
	std r4,56(r1)
	std r5,64(r1)

	blt .Lshort_copy
	bgt cr1,.Lvmx_copy
#else
	cmpldi r5,16

	std r3,48(r1)
	std r4,56(r1)
	std r5,64(r1)

	blt .Lshort_copy
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
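	/*
	 * neg puts the byte count needed to reach the next 8B boundary
	 * in the low bits of r6; mtocrf 0x01 copies the low nibble into
	 * cr7 so the bf tests below can peel off a 1, 2 and 4 byte copy
	 * as required.
	 */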
	neg r6,r4
	mtocrf 0x01,r6
	clrldi r6,r6,(64-3)

	bf cr7*4+3,1f
err1;	lbz r0,0(r4)
	addi r4,r4,1
err1;	stb r0,0(r3)
	addi r3,r3,1

1:	bf cr7*4+2,2f
err1;	lhz r0,0(r4)
	addi r4,r4,2
err1;	sth r0,0(r3)
	addi r3,r3,2

2:	bf cr7*4+1,3f
err1;	lwz r0,0(r4)
	addi r4,r4,4
err1;	stw r0,0(r3)
	addi r3,r3,4

3:	sub r5,r5,r6
	cmpldi r5,128
	blt 5f

	mflr r0
	stdu r1,-STACKFRAMESIZE(r1)
	std r14,STK_REG(R14)(r1)
	std r15,STK_REG(R15)(r1)
	std r16,STK_REG(R16)(r1)
	std r17,STK_REG(R17)(r1)
	std r18,STK_REG(R18)(r1)
	std r19,STK_REG(R19)(r1)
	std r20,STK_REG(R20)(r1)
	std r21,STK_REG(R21)(r1)
	std r22,STK_REG(R22)(r1)
	std r0,STACKFRAMESIZE+16(r1)

	srdi r6,r5,7
	mtctr r6

	/* Now do cacheline (128B) sized loads and stores. */
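	/*
	 * Sixteen 8-byte GPRs are moved per iteration, i.e. one 128B
	 * cacheline.  r13 is skipped because the 64-bit kernel reserves
	 * it for the PACA pointer.
	 */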
	.align 5
4:
err2;	ld r0,0(r4)
err2;	ld r6,8(r4)
err2;	ld r7,16(r4)
err2;	ld r8,24(r4)
err2;	ld r9,32(r4)
err2;	ld r10,40(r4)
err2;	ld r11,48(r4)
err2;	ld r12,56(r4)
err2;	ld r14,64(r4)
err2;	ld r15,72(r4)
err2;	ld r16,80(r4)
err2;	ld r17,88(r4)
err2;	ld r18,96(r4)
err2;	ld r19,104(r4)
err2;	ld r20,112(r4)
err2;	ld r21,120(r4)
	addi r4,r4,128
err2;	std r0,0(r3)
err2;	std r6,8(r3)
err2;	std r7,16(r3)
err2;	std r8,24(r3)
err2;	std r9,32(r3)
err2;	std r10,40(r3)
err2;	std r11,48(r3)
err2;	std r12,56(r3)
err2;	std r14,64(r3)
err2;	std r15,72(r3)
err2;	std r16,80(r3)
err2;	std r17,88(r3)
err2;	std r18,96(r3)
err2;	std r19,104(r3)
err2;	std r20,112(r3)
err2;	std r21,120(r3)
	addi r3,r3,128
	bdnz 4b

	clrldi r5,r5,(64-7)

	ld r14,STK_REG(R14)(r1)
	ld r15,STK_REG(R15)(r1)
	ld r16,STK_REG(R16)(r1)
	ld r17,STK_REG(R17)(r1)
	ld r18,STK_REG(R18)(r1)
	ld r19,STK_REG(R19)(r1)
	ld r20,STK_REG(R20)(r1)
	ld r21,STK_REG(R21)(r1)
	ld r22,STK_REG(R22)(r1)
	addi r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
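	/*
	 * The remaining length (now < 128B) is divided by 16 and moved
	 * into cr7: each set bit selects one 64B, 32B or 16B block
	 * below, and anything under 16B falls through to .Lshort_copy.
	 */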
5:	srdi r6,r5,4
	mtocrf 0x01,r6

6:	bf cr7*4+1,7f
err1;	ld r0,0(r4)
err1;	ld r6,8(r4)
err1;	ld r7,16(r4)
err1;	ld r8,24(r4)
err1;	ld r9,32(r4)
err1;	ld r10,40(r4)
err1;	ld r11,48(r4)
err1;	ld r12,56(r4)
	addi r4,r4,64
err1;	std r0,0(r3)
err1;	std r6,8(r3)
err1;	std r7,16(r3)
err1;	std r8,24(r3)
err1;	std r9,32(r3)
err1;	std r10,40(r3)
err1;	std r11,48(r3)
err1;	std r12,56(r3)
	addi r3,r3,64

	/* Up to 63B to go */
7:	bf cr7*4+2,8f
err1;	ld r0,0(r4)
err1;	ld r6,8(r4)
err1;	ld r7,16(r4)
err1;	ld r8,24(r4)
	addi r4,r4,32
err1;	std r0,0(r3)
err1;	std r6,8(r3)
err1;	std r7,16(r3)
err1;	std r8,24(r3)
	addi r3,r3,32

	/* Up to 31B to go */
8:	bf cr7*4+3,9f
err1;	ld r0,0(r4)
err1;	ld r6,8(r4)
	addi r4,r4,16
err1;	std r0,0(r3)
err1;	std r6,8(r3)
	addi r3,r3,16

9:	clrldi r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf 0x01,r5
	bf cr7*4+0,12f
err1;	lwz r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz r6,4(r4)
	addi r4,r4,8
err1;	stw r0,0(r3)
err1;	stw r6,4(r3)
	addi r3,r3,8

12:	bf cr7*4+1,13f
err1;	lwz r0,0(r4)
	addi r4,r4,4
err1;	stw r0,0(r3)
	addi r3,r3,4

13:	bf cr7*4+2,14f
err1;	lhz r0,0(r4)
	addi r4,r4,2
err1;	sth r0,0(r3)
	addi r3,r3,2

14:	bf cr7*4+3,15f
err1;	lbz r0,0(r4)
err1;	stb r0,0(r3)

15:	li r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi r1,r1,STACKFRAMESIZE
	b .Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
	mflr r0
	std r0,16(r1)
	stdu r1,-STACKFRAMESIZE(r1)
	bl .enter_vmx_usercopy
	cmpwi cr1,r3,0
	ld r0,STACKFRAMESIZE+16(r1)
	ld r3,STACKFRAMESIZE+48(r1)
	ld r4,STACKFRAMESIZE+56(r1)
	ld r5,STACKFRAMESIZE+64(r1)
	mtlr r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions.  We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
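	/*
	 * The touch targets are described in GPRs using the Power ISA
	 * data-stream form of dcbt/dcbtst: TH=0b01000 supplies the
	 * stream start address, TH=0b01010 supplies the unit count
	 * (cachelines, capped at 0x3FF), prefetch depth and stream ID,
	 * and a final TH=0b01010 touch with the GO bit set starts all
	 * configured streams.
	 */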
	clrrdi r6,r4,7
	clrrdi r9,r3,7
	ori r9,r9,1		/* stream=1 */

	srdi r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi r7,0x3FF
	ble 1f
	li r7,0x3FF
1:	lis r0,0x0E00		/* depth=7 */
	sldi r7,r7,7
	or r7,r7,r0
	ori r10,r7,1		/* stream=1 */

	lis r8,0x8000		/* GO=1 */
	clrldi r8,r8,32

	.machine push
	.machine "power4"
	/* setup read stream 0 */
	dcbt r0,r6,0b01000	/* addr from */
	dcbt r0,r7,0b01010	/* length and depth from */
	/* setup write stream 1 */
	dcbtst r0,r9,0b01000	/* addr to */
	dcbtst r0,r10,0b01010	/* length and depth to */
	eieio
	dcbt r0,r8,0b01010	/* all streams GO */
	.machine pop

	beq cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
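	/*
	 * "Relatively aligned" means source and destination share the
	 * same offset within a 16B quadword: xor-ing the two addresses
	 * and keeping the low four bits tests exactly that.  When they
	 * differ, plain lvx/stvx cannot be used and the lvsl/vperm
	 * realignment loop at .Lvmx_unaligned_copy takes over.
	 */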
	xor r6,r4,r3
	rldicl. r6,r6,0,(64-4)
	bne .Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg r6,r3
	mtocrf 0x01,r6
	clrldi r6,r6,(64-4)

	bf cr7*4+3,1f
err3;	lbz r0,0(r4)
	addi r4,r4,1
err3;	stb r0,0(r3)
	addi r3,r3,1

1:	bf cr7*4+2,2f
err3;	lhz r0,0(r4)
	addi r4,r4,2
err3;	sth r0,0(r3)
	addi r3,r3,2

2:	bf cr7*4+1,3f
err3;	lwz r0,0(r4)
	addi r4,r4,4
err3;	stw r0,0(r3)
	addi r3,r3,4

3:	bf cr7*4+0,4f
err3;	ld r0,0(r4)
	addi r4,r4,8
err3;	std r0,0(r3)
	addi r3,r3,8

4:	sub r5,r5,r6

	/* Get the destination 128B aligned */
	neg r6,r3
	srdi r7,r6,4
	mtocrf 0x01,r7
	clrldi r6,r6,(64-7)

	li r9,16
	li r10,32
	li r11,48

	bf cr7*4+3,5f
err3;	lvx vr1,r0,r4
	addi r4,r4,16
err3;	stvx vr1,r0,r3
	addi r3,r3,16

5:	bf cr7*4+2,6f
err3;	lvx vr1,r0,r4
err3;	lvx vr0,r4,r9
	addi r4,r4,32
err3;	stvx vr1,r0,r3
err3;	stvx vr0,r3,r9
	addi r3,r3,32

6:	bf cr7*4+1,7f
err3;	lvx vr3,r0,r4
err3;	lvx vr2,r4,r9
err3;	lvx vr1,r4,r10
err3;	lvx vr0,r4,r11
	addi r4,r4,64
err3;	stvx vr3,r0,r3
err3;	stvx vr2,r3,r9
err3;	stvx vr1,r3,r10
err3;	stvx vr0,r3,r11
	addi r3,r3,64

7:	sub r5,r5,r6
	srdi r6,r5,7

	std r14,STK_REG(R14)(r1)
	std r15,STK_REG(R15)(r1)
	std r16,STK_REG(R16)(r1)

	li r12,64
	li r14,80
	li r15,96
	li r16,112

	mtctr r6

	/*
	 * Now do cacheline sized loads and stores.  By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
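	/*
	 * Eight 16B VMX registers per iteration again cover one 128B
	 * cacheline.  r0 as the RA operand of lvx/stvx means "offset 0";
	 * r9-r16 hold the remaining offsets 16..112.
	 */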
	.align 5
8:
err4;	lvx vr7,r0,r4
err4;	lvx vr6,r4,r9
err4;	lvx vr5,r4,r10
err4;	lvx vr4,r4,r11
err4;	lvx vr3,r4,r12
err4;	lvx vr2,r4,r14
err4;	lvx vr1,r4,r15
err4;	lvx vr0,r4,r16
	addi r4,r4,128
err4;	stvx vr7,r0,r3
err4;	stvx vr6,r3,r9
err4;	stvx vr5,r3,r10
err4;	stvx vr4,r3,r11
err4;	stvx vr3,r3,r12
err4;	stvx vr2,r3,r14
err4;	stvx vr1,r3,r15
err4;	stvx vr0,r3,r16
	addi r3,r3,128
	bdnz 8b

	ld r14,STK_REG(R14)(r1)
	ld r15,STK_REG(R15)(r1)
	ld r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi r5,r5,(64-7)
	srdi r6,r5,4
	mtocrf 0x01,r6

	bf cr7*4+1,9f
err3;	lvx vr3,r0,r4
err3;	lvx vr2,r4,r9
err3;	lvx vr1,r4,r10
err3;	lvx vr0,r4,r11
	addi r4,r4,64
err3;	stvx vr3,r0,r3
err3;	stvx vr2,r3,r9
err3;	stvx vr1,r3,r10
err3;	stvx vr0,r3,r11
	addi r3,r3,64

9:	bf cr7*4+2,10f
err3;	lvx vr1,r0,r4
err3;	lvx vr0,r4,r9
	addi r4,r4,32
err3;	stvx vr1,r0,r3
err3;	stvx vr0,r3,r9
	addi r3,r3,32

10:	bf cr7*4+3,11f
err3;	lvx vr1,r0,r4
	addi r4,r4,16
err3;	stvx vr1,r0,r3
	addi r3,r3,16

	/* Up to 15B to go */
11:	clrldi r5,r5,(64-4)
	mtocrf 0x01,r5
	bf cr7*4+0,12f
err3;	ld r0,0(r4)
	addi r4,r4,8
err3;	std r0,0(r3)
	addi r3,r3,8

12:	bf cr7*4+1,13f
err3;	lwz r0,0(r4)
	addi r4,r4,4
err3;	stw r0,0(r3)
	addi r3,r3,4

13:	bf cr7*4+2,14f
err3;	lhz r0,0(r4)
	addi r4,r4,2
err3;	sth r0,0(r3)
	addi r3,r3,2

14:	bf cr7*4+3,15f
err3;	lbz r0,0(r4)
err3;	stb r0,0(r3)

15:	addi r1,r1,STACKFRAMESIZE
	b .exit_vmx_usercopy	/* tail call optimise */
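	/*
	 * By this point the frame has been popped and the caller's LR
	 * restored, so exit_vmx_usercopy returns straight to our caller;
	 * its zero return value doubles as the "0 bytes uncopied"
	 * success result.
	 */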
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg r6,r3
	mtocrf 0x01,r6
	clrldi r6,r6,(64-4)

	bf cr7*4+3,1f
err3;	lbz r0,0(r4)
	addi r4,r4,1
err3;	stb r0,0(r3)
	addi r3,r3,1

1:	bf cr7*4+2,2f
err3;	lhz r0,0(r4)
	addi r4,r4,2
err3;	sth r0,0(r3)
	addi r3,r3,2

2:	bf cr7*4+1,3f
err3;	lwz r0,0(r4)
	addi r4,r4,4
err3;	stw r0,0(r3)
	addi r3,r3,4

3:	bf cr7*4+0,4f
err3;	lwz r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz r7,4(r4)
	addi r4,r4,8
err3;	stw r0,0(r3)
err3;	stw r7,4(r3)
	addi r3,r3,8

4:	sub r5,r5,r6

	/* Get the destination 128B aligned */
	neg r6,r3
	srdi r7,r6,4
	mtocrf 0x01,r7
	clrldi r6,r6,(64-7)

	li r9,16
	li r10,32
	li r11,48
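	/*
	 * Software realignment: lvsl builds a permute mask from the low
	 * four bits of the source address.  Each lvx then fetches a
	 * naturally aligned quadword, and vperm of two consecutive
	 * quadwords through that mask reassembles the 16 unaligned
	 * source bytes so they can be stored with an aligned stvx.
	 */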
	lvsl vr16,0,r4		/* Setup permute control vector */
err3;	lvx vr0,0,r4
	addi r4,r4,16

	bf cr7*4+3,5f
err3;	lvx vr1,r0,r4
	vperm vr8,vr0,vr1,vr16
	addi r4,r4,16
err3;	stvx vr8,r0,r3
	addi r3,r3,16
	vor vr0,vr1,vr1

5:	bf cr7*4+2,6f
err3;	lvx vr1,r0,r4
	vperm vr8,vr0,vr1,vr16
err3;	lvx vr0,r4,r9
	vperm vr9,vr1,vr0,vr16
	addi r4,r4,32
err3;	stvx vr8,r0,r3
err3;	stvx vr9,r3,r9
	addi r3,r3,32

6:	bf cr7*4+1,7f
err3;	lvx vr3,r0,r4
	vperm vr8,vr0,vr3,vr16
err3;	lvx vr2,r4,r9
	vperm vr9,vr3,vr2,vr16
err3;	lvx vr1,r4,r10
	vperm vr10,vr2,vr1,vr16
err3;	lvx vr0,r4,r11
	vperm vr11,vr1,vr0,vr16
	addi r4,r4,64
err3;	stvx vr8,r0,r3
err3;	stvx vr9,r3,r9
err3;	stvx vr10,r3,r10
err3;	stvx vr11,r3,r11
	addi r3,r3,64

7:	sub r5,r5,r6
	srdi r6,r5,7

	std r14,STK_REG(R14)(r1)
	std r15,STK_REG(R15)(r1)
	std r16,STK_REG(R16)(r1)

	li r12,64
	li r14,80
	li r15,96
	li r16,112

	mtctr r6

	/*
	 * Now do cacheline sized loads and stores.  By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align 5
8:
err4;	lvx vr7,r0,r4
	vperm vr8,vr0,vr7,vr16
err4;	lvx vr6,r4,r9
	vperm vr9,vr7,vr6,vr16
err4;	lvx vr5,r4,r10
	vperm vr10,vr6,vr5,vr16
err4;	lvx vr4,r4,r11
	vperm vr11,vr5,vr4,vr16
err4;	lvx vr3,r4,r12
	vperm vr12,vr4,vr3,vr16
err4;	lvx vr2,r4,r14
	vperm vr13,vr3,vr2,vr16
err4;	lvx vr1,r4,r15
	vperm vr14,vr2,vr1,vr16
err4;	lvx vr0,r4,r16
	vperm vr15,vr1,vr0,vr16
	addi r4,r4,128
err4;	stvx vr8,r0,r3
err4;	stvx vr9,r3,r9
err4;	stvx vr10,r3,r10
err4;	stvx vr11,r3,r11
err4;	stvx vr12,r3,r12
err4;	stvx vr13,r3,r14
err4;	stvx vr14,r3,r15
err4;	stvx vr15,r3,r16
	addi r3,r3,128
	bdnz 8b

	ld r14,STK_REG(R14)(r1)
	ld r15,STK_REG(R15)(r1)
	ld r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi r5,r5,(64-7)
	srdi r6,r5,4
	mtocrf 0x01,r6

	bf cr7*4+1,9f
err3;	lvx vr3,r0,r4
	vperm vr8,vr0,vr3,vr16
err3;	lvx vr2,r4,r9
	vperm vr9,vr3,vr2,vr16
err3;	lvx vr1,r4,r10
	vperm vr10,vr2,vr1,vr16
err3;	lvx vr0,r4,r11
	vperm vr11,vr1,vr0,vr16
	addi r4,r4,64
err3;	stvx vr8,r0,r3
err3;	stvx vr9,r3,r9
err3;	stvx vr10,r3,r10
err3;	stvx vr11,r3,r11
	addi r3,r3,64

9:	bf cr7*4+2,10f
err3;	lvx vr1,r0,r4
	vperm vr8,vr0,vr1,vr16
err3;	lvx vr0,r4,r9
	vperm vr9,vr1,vr0,vr16
	addi r4,r4,32
err3;	stvx vr8,r0,r3
err3;	stvx vr9,r3,r9
	addi r3,r3,32

10:	bf cr7*4+3,11f
err3;	lvx vr1,r0,r4
	vperm vr8,vr0,vr1,vr16
	addi r4,r4,16
err3;	stvx vr8,r0,r3
	addi r3,r3,16

	/* Up to 15B to go */
11:	clrldi r5,r5,(64-4)
	addi r4,r4,-16		/* Unwind the +16 load offset */
	mtocrf 0x01,r5
	bf cr7*4+0,12f
err3;	lwz r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz r6,4(r4)
	addi r4,r4,8
err3;	stw r0,0(r3)
err3;	stw r6,4(r3)
	addi r3,r3,8

12:	bf cr7*4+1,13f
err3;	lwz r0,0(r4)
	addi r4,r4,4
err3;	stw r0,0(r3)
	addi r3,r3,4

13:	bf cr7*4+2,14f
err3;	lhz r0,0(r4)
	addi r4,r4,2
err3;	sth r0,0(r3)
	addi r3,r3,2

14:	bf cr7*4+3,15f
err3;	lbz r0,0(r4)
err3;	stb r0,0(r3)

15:	addi r1,r1,STACKFRAMESIZE
	b .exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */