vector.S 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
  1. #include <asm/ppc_asm.h>
  2. #include <asm/reg.h>
  3. /*
  4. * The routines below are in assembler so we can closely control the
  5. * usage of floating-point registers. These routines must be called
  6. * with preempt disabled.
  7. */
  #ifdef CONFIG_PPC32
  /*
   * 32-bit build: the constants are single-precision words kept in .data
   * and are addressed with an absolute @ha/@l pair.
   */
          .data
  fpzero:
          .long   0                       /* 0.0 in single-precision FP */
  fpone:
          .long   0x3f800000              /* 1.0 in single-precision FP */
  fphalf:
          .long   0x3f000000              /* 0.5 in single-precision FP */

  /* LDCONST(fr, name): fr = constant at 'name' (lfs). Clobbers r11. */
  #define LDCONST(fr, name) \
          lis     r11,name@ha; \
          lfs     fr,name@l(r11)
  #else
  /*
   * Otherwise: the constants are doubleword TOC entries and are addressed
   * relative to r2.
   */
          .section ".toc","aw"
  fpzero:
          .tc     FD_0_0[TC],0
  fpone:
          .tc     FD_3ff00000_0[TC],0x3ff0000000000000    /* 1.0 */
  fphalf:
          .tc     FD_3fe00000_0[TC],0x3fe0000000000000    /* 0.5 */

  /* LDCONST(fr, name): fr = constant at 'name' (lfd via r2). */
  #define LDCONST(fr, name) \
          lfd     fr,name@toc(r2)
  #endif

          .text
  /*
   * fpenable - internal helper: enable floating point and set FPSCR to 0.
   *
   * Don't call it from C; it doesn't use the normal calling convention.
   * It is entered with 'bl', so the caller has to copy LR into r12 first;
   * fpdisable later restores LR from r12.
   *
   * State on return:
   *   r1   -> new 64-byte frame; 8(r1) = fr31, 16(r1) = fr1, 24(r1) = fr0
   *   r10  =  MSR as it was on entry (before MSR_FP was or'ed in)
   *   r11  =  clobbered
   *   fr1  =  the fpzero constant
   *   fr31 =  FPSCR as it was on entry
   */
  fpenable:
  #ifdef CONFIG_PPC32
          stwu    r1,-64(r1)
  #else
          stdu    r1,-64(r1)
  #endif
          mfmsr   r10                     /* r10 keeps the original MSR */
          ori     r11,r10,MSR_FP
          mtmsr   r11
          isync                           /* MSR_FP in effect before any FP op */
          stfd    fr31,8(r1)              /* save the FPRs we are about to use */
          stfd    fr1,16(r1)
          stfd    fr0,24(r1)
          mffs    fr31                    /* fr31 keeps the original FPSCR */
          LDCONST(fr1, fpzero)
          MTFSF_L(fr1)                    /* FPSCR <- fr1 (zero) */
          blr
  /*
   * fpdisable - common exit path; each routine finishes with 'b fpdisable'.
   *
   * Expects the state set up by fpenable: fr31 = FPSCR to put back,
   * r10 = MSR to put back, fr0/fr1/fr31 saved in the 64-byte frame at r1,
   * and r12 = return address of the routine's caller.
   * Returns straight to that caller.
   */
  fpdisable:
          MTFSF_L(fr31)                   /* put back FPSCR before fr31 is reloaded */
          lfd     fr0,24(r1)
          lfd     fr1,16(r1)
          lfd     fr31,8(r1)
          mtmsr   r10                     /* put back the MSR saved in fpenable */
          isync
          addi    r1,r1,64                /* pop the frame pushed by fpenable */
          mtlr    r12
          blr
  /*
   * vaddfp - vector add, floating point.
   *
   * r3 -> destination, r4 -> first source, r5 -> second source.
   * For each of the 4 single-precision elements: dst[i] = a[i] + b[i].
   *
   * Scratch: r0, r6, r12, CTR (plus r10/r11 via fpenable).
   * fr0/fr1 are saved and restored by fpenable/fpdisable.
   */
  _GLOBAL(vaddfp)
          mflr    r12                     /* 'bl' below overwrites LR */
          bl      fpenable
          li      r6,0                    /* r6 = byte offset of current element */
          li      r0,4
          mtctr   r0                      /* 4 elements to process */
  .Lvaddfp_next:
          lfsx    fr1,r5,r6               /* b[i] */
          lfsx    fr0,r4,r6               /* a[i] */
          fadds   fr0,fr0,fr1
          stfsx   fr0,r3,r6
          addi    r6,r6,4
          bdnz    .Lvaddfp_next
          b       fpdisable               /* restores state and returns */
  /*
   * vsubfp - vector subtract, floating point.
   *
   * r3 -> destination, r4 -> first source, r5 -> second source.
   * For each of the 4 single-precision elements: dst[i] = a[i] - b[i].
   *
   * Scratch: r0, r6, r12, CTR (plus r10/r11 via fpenable).
   * fr0/fr1 are saved and restored by fpenable/fpdisable.
   */
  _GLOBAL(vsubfp)
          mflr    r12                     /* 'bl' below overwrites LR */
          bl      fpenable
          li      r6,0                    /* r6 = byte offset of current element */
          li      r0,4
          mtctr   r0                      /* 4 elements to process */
  .Lvsubfp_next:
          lfsx    fr1,r5,r6               /* b[i] */
          lfsx    fr0,r4,r6               /* a[i] */
          fsubs   fr0,fr0,fr1
          stfsx   fr0,r3,r6
          addi    r6,r6,4
          bdnz    .Lvsubfp_next
          b       fpdisable               /* restores state and returns */
  /*
   * vmaddfp - vector multiply and add, floating point.
   *
   * r3 -> destination, r4 -> a, r5 -> b, r6 -> c.
   * For each of the 4 single-precision elements: dst[i] = a[i] * c[i] + b[i].
   *
   * Scratch: r0, r7, r12, CTR (plus r10/r11 via fpenable).
   * fr2 is saved at 32(r1) in the frame pushed by fpenable and reloaded
   * before leaving; fr0/fr1 are handled by fpenable/fpdisable.
   */
  _GLOBAL(vmaddfp)
          mflr    r12                     /* 'bl' below overwrites LR */
          bl      fpenable
          stfd    fr2,32(r1)
          li      r7,0                    /* r7 = byte offset of current element */
          li      r0,4
          mtctr   r0                      /* 4 elements to process */
  .Lvmaddfp_next:
          lfsx    fr2,r6,r7               /* c[i] */
          lfsx    fr1,r5,r7               /* b[i] */
          lfsx    fr0,r4,r7               /* a[i] */
          fmadds  fr0,fr0,fr2,fr1         /* a * c + b */
          stfsx   fr0,r3,r7
          addi    r7,r7,4
          bdnz    .Lvmaddfp_next
          lfd     fr2,32(r1)
          b       fpdisable               /* restores state and returns */
  /*
   * vnmsubfp - vector negative multiply and subtract, floating point.
   *
   * r3 -> destination, r4 -> a, r5 -> b, r6 -> c.
   * For each of the 4 single-precision elements:
   *     dst[i] = -(a[i] * c[i] - b[i])
   *
   * Scratch: r0, r7, r12, CTR (plus r10/r11 via fpenable).
   * fr2 is saved at 32(r1) in the frame pushed by fpenable and reloaded
   * before leaving; fr0/fr1 are handled by fpenable/fpdisable.
   */
  _GLOBAL(vnmsubfp)
          mflr    r12                     /* 'bl' below overwrites LR */
          bl      fpenable
          stfd    fr2,32(r1)
          li      r7,0                    /* r7 = byte offset of current element */
          li      r0,4
          mtctr   r0                      /* 4 elements to process */
  .Lvnmsubfp_next:
          lfsx    fr2,r6,r7               /* c[i] */
          lfsx    fr1,r5,r7               /* b[i] */
          lfsx    fr0,r4,r7               /* a[i] */
          fnmsubs fr0,fr0,fr2,fr1         /* -(a * c - b) */
          stfsx   fr0,r3,r7
          addi    r7,r7,4
          bdnz    .Lvnmsubfp_next
          lfd     fr2,32(r1)
          b       fpdisable               /* restores state and returns */
  /*
   * vrefp - vector reciprocal estimate. We just compute 1.0/x.
   *
   * r3 -> destination, r4 -> source.
   * For each of the 4 single-precision elements: dst[i] = 1.0 / src[i].
   *
   * Scratch: r0, r6, r12, CTR (plus r10/r11 via fpenable and LDCONST).
   * fr0/fr1 are saved and restored by fpenable/fpdisable.
   */
  _GLOBAL(vrefp)
          mflr    r12                     /* 'bl' below overwrites LR */
          bl      fpenable
          LDCONST(fr1, fpone)             /* fr1 = 1.0, constant across the loop */
          li      r6,0                    /* r6 = byte offset of current element */
          li      r0,4
          mtctr   r0                      /* 4 elements to process */
  .Lvrefp_next:
          lfsx    fr0,r4,r6
          fdivs   fr0,fr1,fr0             /* 1.0 / x */
          stfsx   fr0,r3,r6
          addi    r6,r6,4
          bdnz    .Lvrefp_next
          b       fpdisable               /* restores state and returns */
  /*
   * Vector reciprocal square-root estimate, floating point.
   * We use the frsqrte instruction for the initial estimate followed
   * by 2 iterations of Newton-Raphson to get sufficient accuracy.
   * r3 -> destination, r4 -> source.
   *
   * Special operands: when the first product r * s is a NaN, the raw
   * frsqrte result is stored and the refinement is skipped. That happens
   * for s = +/-0 (r = +/-Inf) and s = +Inf (r = +0), where Inf * 0 would
   * otherwise turn the correct estimate into a NaN, and for NaN or
   * negative s, where r is already a NaN.
   *
   * Scratch: r0, r6, r12, CTR, cr0 (plus r10/r11 via fpenable and LDCONST).
   * fr2-fr5 are saved at 32..56(r1) in the frame pushed by fpenable and
   * reloaded before leaving; fr0/fr1 are handled by fpenable/fpdisable.
   */
  _GLOBAL(vrsqrtefp)
          mflr    r12                     /* 'bl' below overwrites LR */
          bl      fpenable
          stfd    fr2,32(r1)
          stfd    fr3,40(r1)
          stfd    fr4,48(r1)
          stfd    fr5,56(r1)
          li      r0,4
          LDCONST(fr4, fpone)
          LDCONST(fr5, fphalf)
          mtctr   r0                      /* 4 elements to process */
          li      r6,0                    /* r6 = byte offset of current element */
  1:      lfsx    fr0,r4,r6
          frsqrte fr1,fr0                 /* r = frsqrte(s) */
          fmuls   fr3,fr1,fr0             /* r * s */
          fcmpu   cr0,fr3,fr3             /* unordered <=> r * s is a NaN */
          bun     2f                      /* special operand: keep raw estimate */
          fmuls   fr2,fr1,fr5             /* r * 0.5 */
          fnmsubs fr3,fr1,fr3,fr4         /* 1 - s * r * r */
          fmadds  fr1,fr2,fr3,fr1         /* r = r + 0.5 * r * (1 - s * r * r) */
          fmuls   fr3,fr1,fr0             /* r * s */
          fmuls   fr2,fr1,fr5             /* r * 0.5 */
          fnmsubs fr3,fr1,fr3,fr4         /* 1 - s * r * r */
          fmadds  fr1,fr2,fr3,fr1         /* r = r + 0.5 * r * (1 - s * r * r) */
  2:      stfsx   fr1,r3,r6
          addi    r6,r6,4
          bdnz    1b
          lfd     fr5,56(r1)
          lfd     fr4,48(r1)
          lfd     fr3,40(r1)
          lfd     fr2,32(r1)
          b       fpdisable               /* restores state and returns */