checksum_64.S

/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed.  So this code does not
 * attempt to use doubleword instructions.
 */
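/*
 * Added note (not in the original source): at the C level this routine is
 * typically declared along the lines of
 *
 *	__sum16 ip_fast_csum(const void *iph, unsigned int ihl);
 *
 * where ihl is the header length in 32-bit words.  The exact type names
 * are an assumption here; the authoritative declaration lives in the
 * kernel's asm/checksum.h.
 */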
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)
	lwzu	r5,4(r3)
	addic.	r4,r4,-2
	addc	r0,r0,r5
	mtctr	r4
	blelr-
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31		/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

/*
 * Compute checksum of TCP or UDP pseudo-header:
 * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
 * No real gain trying to do this specially for 64 bit, but
 * the 32 bit addition may spill into the upper bits of
 * the doubleword so we still must fold it down from 64.
 */
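/*
 * Added note (not in the original source): the pseudo-header checksum sums
 * the source and destination addresses plus a word that combines the
 * protocol number and the TCP/UDP length.  The rlwimi below packs proto
 * into the upper halfword of len, e.g. len = 0x14 (20) and proto = 6 (TCP)
 * gives r5 = 0x00060014, which is then accumulated like any other 32-bit
 * word.
 */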
_GLOBAL(csum_tcpudp_magic)
	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
	addc	r0,r3,r4	/* add 4 32-bit words together */
	adde	r0,r0,r5
	adde	r0,r0,r7
	rldicl	r4,r0,32,0	/* fold 64 bit value */
	add	r0,r4,r0
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

#define STACKFRAMESIZE	256
#define STK_REG(i)	(112 + ((i)-14)*8)
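/*
 * Added note (not in the original source): STK_REG(i) computes the save
 * slot used below for the non-volatile GPRs r14-r16 within the 256-byte
 * stack frame; offset 112 skips the ABI-defined frame header and parameter
 * save area.  For example, STK_REG(r14) = 112, STK_REG(r15) = 120 and
 * STK_REG(r16) = 128.
 */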

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
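/*
 * Added note (not in the original source): the matching C declaration is
 * usually of the form
 *
 *	__wsum csum_partial(const void *buff, int len, __wsum sum);
 *
 * The typedef names are an assumption; see the kernel's checksum headers
 * for the real prototype.
 */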
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
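	/*
	 * Added note (not in the original source): the next few instructions
	 * count how many halfwords are needed to reach a doubleword boundary.
	 * For example, with buff at offset 6 within a doubleword, r6 = 3 and
	 * a single halfword load brings the pointer to alignment.
	 */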
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
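	/*
	 * Added note (not in the original source): the srdi/subi/mtctr
	 * sequence below sets ctr = len/64 - 1; the loop handles one 64-byte
	 * block per iteration and the trailing unrolled limb handles the last
	 * block.  E.g. len = 256 gives ctr = 3, so three iterations plus the
	 * exit limb cover all four blocks, with the andi. picking up the
	 * remaining 0-63 bytes.
	 */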
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords	/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)
	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
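	/*
	 * Added note (not in the original source): each iteration issues
	 * eight adde instructions, so the serial XER dependency alone gives
	 * the 8 x 2 = 16 cycle floor mentioned above.
	 */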
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b

	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively, and (for an error on
 * src) zeroes the rest of dst.
 *
 * This code needs to be reworked to take advantage of 64 bit sum+copy.
 * However, due to tokenring halfword alignment problems this will be very
 * tricky.  For now we'll leave it until we instrument it somehow.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
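/*
 * Added note (not in the original source): the corresponding C declaration
 * is commonly of the form
 *
 *	__wsum csum_partial_copy_generic(const void *src, void *dst, int len,
 *					 __wsum sum, int *src_err, int *dst_err);
 *
 * The exact types are an assumption.  As the code below shows, src_err and
 * dst_err are only written to when they are non-NULL.
 */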
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0
	subi	r3,r3,4
	subi	r4,r4,4
	srwi.	r6,r5,2
	beq	3f		/* if we're doing < 4 bytes */
	andi.	r9,r4,2		/* Align dst to longword boundary */
	beq+	1f
81:	lhz	r6,4(r3)	/* do 2 bytes to get aligned */
	addi	r3,r3,2
	subi	r5,r5,2
91:	sth	r6,4(r4)
	addi	r4,r4,2
	addc	r0,r0,r6
	srwi.	r6,r5,2		/* # words to do */
	beq	3f
1:	mtctr	r6
82:	lwzu	r6,4(r3)	/* the bdnz has zero overhead, so it should */
92:	stwu	r6,4(r4)	/* be unnecessary to unroll this loop */
	adde	r0,r0,r6
	bdnz	82b
	andi.	r5,r5,3
3:	cmpwi	0,r5,2
	blt+	4f
83:	lhz	r6,4(r3)
	addi	r3,r3,2
	subi	r5,r5,2
93:	sth	r6,4(r4)
	addi	r4,r4,2
	adde	r0,r0,r6
4:	cmpwi	0,r5,1
	bne+	5f
84:	lbz	r6,4(r3)
94:	stb	r6,4(r4)
	slwi	r6,r6,8		/* Upper byte of word */
	adde	r0,r0,r6
5:	addze	r3,r0		/* add in final carry (unlikely with 64-bit regs) */
	rldicl	r4,r3,32,0	/* fold 64 bit value */
	add	r3,r4,r3
	srdi	r3,r3,32
	blr

/* These shouldn't go in the fixup section, since that would
   cause the ex_table addresses to get out of order. */

	.globl src_error_1
src_error_1:
	li	r6,0
	subi	r5,r5,2
95:	sth	r6,4(r4)
	addi	r4,r4,2
	srwi.	r6,r5,2
	beq	3f
	mtctr	r6
	.globl src_error_2
src_error_2:
	li	r6,0
96:	stwu	r6,4(r4)
	bdnz	96b
3:	andi.	r5,r5,3
	beq	src_error
	.globl src_error_3
src_error_3:
	li	r6,0
	mtctr	r5
	addi	r4,r4,3
97:	stbu	r6,1(r4)
	bdnz	97b
	.globl src_error
src_error:
	cmpdi	0,r7,0
	beq	1f
	li	r6,-EFAULT
	stw	r6,0(r7)
1:	addze	r3,r0
	blr

	.globl dst_error
dst_error:
	cmpdi	0,r8,0
	beq	1f
	li	r6,-EFAULT
	stw	r6,0(r8)
1:	addze	r3,r0
	blr
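
/*
 * Added note (not in the original source): each entry below pairs the
 * address of a load or store that may fault (the numeric labels above)
 * with the fixup handler to branch to, which is how the src_error_N and
 * dst_error paths get invoked when a user access faults.
 */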
	.section __ex_table,"a"
	.align	3
	.llong	81b,src_error_1
	.llong	91b,dst_error
	.llong	82b,src_error_2
	.llong	92b,dst_error
	.llong	83b,src_error_3
	.llong	93b,dst_error
	.llong	84b,src_error_3
	.llong	94b,dst_error
	.llong	95b,dst_error
	.llong	96b,dst_error
	.llong	97b,dst_error