vector.S 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  1. #include <linux/config.h>
  2. #include <asm/ppc_asm.h>
  3. #include <asm/reg.h>
  4. /*
  5. * The routines below are in assembler so we can closely control the
  6. * usage of floating-point registers. These routines must be called
  7. * with preempt disabled.
  8. */
  /*
   * Floating-point constants (0.0, 1.0 and 0.5) and the LDCONST(fr, name)
   * macro, which loads one of them into the FP register "fr".
   *
   * With CONFIG_PPC32 the constants are single-precision words in .data
   * and LDCONST uses r11 as a scratch register to build the address.
   * Otherwise they are double-precision TOC entries that LDCONST loads
   * relative to r2.
   */
#ifdef CONFIG_PPC32
	.data
fpzero:
	.long	0			/* 0.0 in single-precision FP */
fpone:
	.long	0x3f800000		/* 1.0 in single-precision FP */
fphalf:
	.long	0x3f000000		/* 0.5 in single-precision FP */

#define LDCONST(fr, name)	\
	lis	r11,name@ha;	\
	lfs	fr,name@l(r11)

#else

	.section ".toc","aw"
fpzero:
	.tc	FD_0_0[TC],0					/* 0.0 */
fpone:
	.tc	FD_3ff00000_0[TC],0x3ff0000000000000		/* 1.0 */
fphalf:
	.tc	FD_3fe00000_0[TC],0x3fe0000000000000		/* 0.5 */

#define LDCONST(fr, name)	\
	lfd	fr,name@toc(r2)

#endif

	/* Everything that follows is code. */
	.text
  /*
   * fpenable - internal routine to enable floating point and set FPSCR to 0.
   * Don't call it from C; it doesn't use the normal calling convention.
   *
   * The routines in this file copy LR into r12 before the "bl" that gets
   * here, and later leave through fpdisable, which returns via r12.
   *
   * On return:
   *	r1	points at a new 64-byte stack frame
   *	r10	MSR value read on entry
   *	r11	clobbered
   *	fr1	0.0
   *	fr31	FPSCR value read on entry
   * The previous contents of fr31, fr1 and fr0 are stored at 8(r1),
   * 16(r1) and 24(r1) respectively.
   */
fpenable:
#ifdef CONFIG_PPC32
	stwu	r1,-64(r1)
#else
	stdu	r1,-64(r1)
#endif
	mfmsr	r10			/* keep the incoming MSR in r10 */
	ori	r11,r10,MSR_FP
	mtmsr	r11			/* set MSR_FP before touching any FPR */
	isync
	stfd	fr31,8(r1)
	stfd	fr1,16(r1)
	stfd	fr0,24(r1)
	mffs	fr31			/* fr31 = FPSCR on entry */
	LDCONST(fr1, fpzero)
	mtfsf	0xff,fr1		/* FPSCR = 0 */
	blr
  /*
   * fpdisable - common exit path for the routines in this file, which
   * reach it with a plain branch ("b") rather than a call.
   *
   * Expects r10 (saved MSR), r12 (return address), fr31 (saved FPSCR) and
   * the 64-byte stack frame to be as fpenable and the routine's
   * "mflr r12" left them.  Puts back FPSCR, fr0, fr1, fr31 and the MSR,
   * pops the frame and returns to the address held in r12.
   */
fpdisable:
	mtfsf	0xff,fr31		/* FPSCR back first: fr31 is reloaded below */
	lfd	fr0,24(r1)
	lfd	fr1,16(r1)
	lfd	fr31,8(r1)
	mtmsr	r10			/* MSR back to the value fpenable read */
	isync
	addi	r1,r1,64		/* drop the frame fpenable created */
	mtlr	r12
	blr
  /*
   * Vector add, floating point.
   * r3 -> destination, r4 -> first source, r5 -> second source.
   *
   * Each operand is four consecutive single-precision values; element i
   * of the destination is set to (r4)[i] + (r5)[i].
   * Scratch: r0, r6, r10-r12, CTR.  fr0 and fr1 are saved and restored by
   * fpenable/fpdisable.
   */
_GLOBAL(vaddfp)
	mflr	r12			/* fpdisable returns through r12 */
	bl	fpenable
	li	r6,0			/* r6 = byte offset of current element */
	li	r0,4
	mtctr	r0			/* four elements */
1:	lfsx	fr0,r4,r6
	lfsx	fr1,r5,r6
	fadds	fr0,fr0,fr1
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable
  /*
   * Vector subtract, floating point.
   * r3 -> destination, r4 -> minuend, r5 -> subtrahend.
   *
   * Each operand is four consecutive single-precision values; element i
   * of the destination is set to (r4)[i] - (r5)[i].
   * Scratch: r0, r6, r10-r12, CTR.  fr0 and fr1 are saved and restored by
   * fpenable/fpdisable.
   */
_GLOBAL(vsubfp)
	mflr	r12			/* fpdisable returns through r12 */
	bl	fpenable
	li	r6,0			/* r6 = byte offset of current element */
	li	r0,4
	mtctr	r0			/* four elements */
1:	lfsx	fr0,r4,r6
	lfsx	fr1,r5,r6
	fsubs	fr0,fr0,fr1
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable
  /*
   * Vector multiply and add, floating point.
   * r3 -> destination, r4 -> a, r5 -> b, r6 -> c.
   *
   * Each operand is four consecutive single-precision values; element i
   * of the destination is set to a[i] * c[i] + b[i].
   * Scratch: r0, r7, r10-r12, CTR.  fr2 is preserved here in the 32(r1)
   * slot of the frame set up by fpenable; fr0 and fr1 are handled by
   * fpenable/fpdisable.
   */
_GLOBAL(vmaddfp)
	mflr	r12			/* fpdisable returns through r12 */
	bl	fpenable
	stfd	fr2,32(r1)
	li	r7,0			/* r7 = byte offset of current element */
	li	r0,4
	mtctr	r0			/* four elements */
1:	lfsx	fr0,r4,r7		/* a[i] */
	lfsx	fr1,r5,r7		/* b[i] */
	lfsx	fr2,r6,r7		/* c[i] */
	fmadds	fr0,fr0,fr2,fr1		/* a[i] * c[i] + b[i] */
	stfsx	fr0,r3,r7
	addi	r7,r7,4
	bdnz	1b
	lfd	fr2,32(r1)
	b	fpdisable
  /*
   * Vector negative multiply and subtract, floating point.
   * r3 -> destination, r4 -> a, r5 -> b, r6 -> c.
   *
   * Each operand is four consecutive single-precision values; element i
   * of the destination is set to -(a[i] * c[i] - b[i]).
   * Scratch: r0, r7, r10-r12, CTR.  fr2 is preserved here in the 32(r1)
   * slot of the frame set up by fpenable; fr0 and fr1 are handled by
   * fpenable/fpdisable.
   */
_GLOBAL(vnmsubfp)
	mflr	r12			/* fpdisable returns through r12 */
	bl	fpenable
	stfd	fr2,32(r1)
	li	r7,0			/* r7 = byte offset of current element */
	li	r0,4
	mtctr	r0			/* four elements */
1:	lfsx	fr0,r4,r7		/* a[i] */
	lfsx	fr1,r5,r7		/* b[i] */
	lfsx	fr2,r6,r7		/* c[i] */
	fnmsubs	fr0,fr0,fr2,fr1		/* -(a[i] * c[i] - b[i]) */
	stfsx	fr0,r3,r7
	addi	r7,r7,4
	bdnz	1b
	lfd	fr2,32(r1)
	b	fpdisable
  /*
   * Vector reciprocal estimate.  We just compute 1.0/x.
   * r3 -> destination, r4 -> source.
   *
   * Both operands are four consecutive single-precision values.
   * Scratch: r0, r6, r10-r12, CTR.  fr0 and fr1 are saved and restored by
   * fpenable/fpdisable.
   */
_GLOBAL(vrefp)
	mflr	r12			/* fpdisable returns through r12 */
	bl	fpenable
	LDCONST(fr1, fpone)		/* fr1 = 1.0 for the whole loop */
	li	r6,0			/* r6 = byte offset of current element */
	li	r0,4
	mtctr	r0			/* four elements */
1:	lfsx	fr0,r4,r6
	fdivs	fr0,fr1,fr0		/* 1.0 / x */
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable
  /*
   * Vector reciprocal square-root estimate, floating point.
   * We use the frsqrte instruction for the initial estimate followed
   * by 2 iterations of Newton-Raphson to get sufficient accuracy.
   * r3 -> destination, r4 -> source.
   *
   * Both operands are four consecutive single-precision values.
   * Scratch: r0, r6, r10-r12, CTR.  fr2-fr5 are preserved here in the
   * 32(r1)..56(r1) slots of the frame set up by fpenable; fr0 and fr1
   * are handled by fpenable/fpdisable.
   */
_GLOBAL(vrsqrtefp)
	mflr	r12			/* fpdisable returns through r12 */
	bl	fpenable
	stfd	fr2,32(r1)
	stfd	fr3,40(r1)
	stfd	fr4,48(r1)
	stfd	fr5,56(r1)
	LDCONST(fr4, fpone)		/* fr4 = 1.0 */
	LDCONST(fr5, fphalf)		/* fr5 = 0.5 */
	li	r6,0			/* r6 = byte offset of current element */
	li	r0,4
	mtctr	r0			/* four elements */
1:	lfsx	fr0,r4,r6		/* s = source element */
	frsqrte	fr1,fr0			/* r = frsqrte(s) */
	/* first Newton-Raphson step */
	fmuls	fr3,fr1,fr0		/* r * s */
	fmuls	fr2,fr1,fr5		/* r * 0.5 */
	fnmsubs	fr3,fr1,fr3,fr4		/* 1 - s * r * r */
	fmadds	fr1,fr2,fr3,fr1		/* r = r + 0.5 * r * (1 - s * r * r) */
	/* second Newton-Raphson step */
	fmuls	fr3,fr1,fr0		/* r * s */
	fmuls	fr2,fr1,fr5		/* r * 0.5 */
	fnmsubs	fr3,fr1,fr3,fr4		/* 1 - s * r * r */
	fmadds	fr1,fr2,fr3,fr1		/* r = r + 0.5 * r * (1 - s * r * r) */
	stfsx	fr1,r3,r6
	addi	r6,r6,4
	bdnz	1b
	lfd	fr2,32(r1)
	lfd	fr3,40(r1)
	lfd	fr4,48(r1)
	lfd	fr5,56(r1)
	b	fpdisable