/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *		    Optimized by Joe Taylor
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/errno.h>
#include <linux/linkage.h>
#include <asm/variant/core.h>
/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *                           unsigned int sum);
 *	a2 = buf
 *	a3 = len
 *	a4 = sum
 *
 * This function assumes 2- or 4-byte alignment.  Other alignments will fail!
 */
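
/*
 * For reference, a minimal C sketch of the sum this routine computes,
 * assuming a little-endian core and a 4-byte-aligned buf (the name
 * csum_partial_c is hypothetical, for illustration only):
 *
 *	unsigned int csum_partial_c(const unsigned char *buf, int len,
 *				    unsigned int sum)
 *	{
 *		while (len >= 4) {		// 32-bit chunks
 *			unsigned int w = *(const unsigned int *)buf;
 *			sum += w;
 *			if (sum < w)		// end-around carry
 *				sum++;
 *			buf += 4;
 *			len -= 4;
 *		}
 *		if (len & 2) {			// trailing 2-byte chunk
 *			unsigned int w = *(const unsigned short *)buf;
 *			sum += w;
 *			if (sum < w)
 *				sum++;
 *			buf += 2;
 *		}
 *		if (len & 1) {			// trailing byte, bits 0..7
 *			unsigned int w = *buf;
 *			sum += w;
 *			if (sum < w)
 *				sum++;
 *		}
 *		return sum;
 *	}
 */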
/* ONES_ADD converts twos-complement math to ones-complement. */

#define ONES_ADD(sum, val)	  \
	add	sum, sum, val	; \
	bgeu	sum, val, 99f	; \
	addi	sum, sum, 1	; \
99:				;
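
/*
 * Worked example: 0xffff0000 + 0x00020000 = 0x100010000, which
 * truncates to 0x00010000 with a carry out; folding the carry back in
 * (the addi above) gives the ones-complement sum 0x00010001.  The bgeu
 * detects the carry: after the add, sum < val exactly when the 32-bit
 * addition wrapped around.
 */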
.text
ENTRY(csum_partial)
	/*
	 * Experiments with Ethernet and SLIP connections show that buf
	 * is aligned on either a 2-byte or 4-byte boundary.
	 */
	entry	sp, 32
	extui	a5, a2, 0, 2
	bnez	a5, 8f		/* branch if 2-byte aligned */

	/* Fall-through on common case, 4-byte alignment */
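	/*
	 * The length is consumed piecewise: len = 32*(len >> 5)
	 * + 4*((len >> 2) & 7) + (len & 3).  The srli below counts the
	 * 32-byte chunks, the extui at label 2 extracts bits 2..4 as
	 * the count of remaining 4-byte chunks, and the final 2-byte
	 * and 1-byte pieces are tested with bbci on bits 1 and 0.
	 */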
1:
	srli	a5, a3, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 5
	add	a5, a5, a2	/* a5 = end of last 32-byte chunk */
.Loop1:
#endif
	l32i	a6, a2, 0
	l32i	a7, a2, 4
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 8
	l32i	a7, a2, 12
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 16
	l32i	a7, a2, 20
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 24
	l32i	a7, a2, 28
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	addi	a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop1
#endif
2:
	extui	a5, a3, 2, 3	/* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 3f
#else
	beqz	a5, 3f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop2:
#endif
	l32i	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop2
#endif
3:
	_bbci.l	a3, 1, 5f	/* remaining 2-byte chunk */
	l16ui	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 2
5:
	_bbci.l	a3, 0, 7f	/* remaining 1-byte chunk */
6:	l8ui	a6, a2, 0
#ifdef __XTENSA_EB__
	slli	a6, a6, 8	/* load byte into bits 8..15 */
#endif
	ONES_ADD(a4, a6)
7:
	mov	a2, a4
	retw
	/* uncommon case, buf is 2-byte aligned */
8:
	beqz	a3, 7b		/* branch if len == 0 */
	beqi	a3, 1, 6b	/* branch if len == 1 */

	extui	a5, a2, 0, 1
	bnez	a5, 8f		/* branch if 1-byte aligned */

	l16ui	a6, a2, 0	/* common case, len >= 2 */
	ONES_ADD(a4, a6)
	addi	a2, a2, 2	/* adjust buf */
	addi	a3, a3, -2	/* adjust len */
	j	1b		/* now buf is 4-byte aligned */

	/* case: odd-byte aligned, len > 1
	 * This case is dog slow, so don't give us an odd address.
	 * (I don't think this ever happens, but just in case.)
	 */
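	/*
	 * Each misaligned word below is reassembled from three narrower
	 * loads: on a little-endian core, bytes b0 b1 b2 b3 starting at
	 * an odd address combine as b0 | b1<<8 | b2<<16 | b3<<24 (the
	 * l16ui supplies the middle two bytes), which is the same
	 * 32-bit value an aligned l32i would have produced.
	 */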
8:
	srli	a5, a3, 2	/* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop3:
#endif
	l8ui	a6, a2, 0	/* bits 24..31 */
	l16ui	a7, a2, 1	/* bits  8..23 */
	l8ui	a8, a2, 3	/* bits  0.. 7 */
#ifdef __XTENSA_EB__
	slli	a6, a6, 24
#else
	slli	a8, a8, 24
#endif
	slli	a7, a7, 8
	or	a7, a7, a6
	or	a7, a7, a8
	ONES_ADD(a4, a7)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop3
#endif
2:
	_bbci.l	a3, 1, 3f	/* remaining 2-byte chunk, still odd addr */
	l8ui	a6, a2, 0
	l8ui	a7, a2, 1
#ifdef __XTENSA_EB__
	slli	a6, a6, 8
#else
	slli	a7, a7, 8
#endif
	or	a7, a7, a6
	ONES_ADD(a4, a7)
	addi	a2, a2, 2
3:
	j	5b		/* branch to handle the remaining byte */
/*
 * Copy from src while checksumming, otherwise like csum_partial.
 *
 * The macros SRC and DST specify the type of access for the instruction;
 * thus we can call a custom exception handler for each access type.
 */

#define SRC(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6001f;		\
	.previous

#define DST(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6002f;		\
	.previous
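
/*
 * Each SRC/DST use records a (faulting instruction, fixup handler)
 * address pair in the kernel's __ex_table.  If the tagged access
 * faults, the exception handler looks up the faulting PC in that table
 * and resumes at 6001f (source errors) or 6002f (destination errors)
 * in the .fixup section at the end of this file.
 */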
/*
 * unsigned int csum_partial_copy_generic(const char *src, char *dst,
 *                                        int len, int sum,
 *                                        int *src_err_ptr,
 *                                        int *dst_err_ptr)
 *	a2  = src
 *	a3  = dst
 *	a4  = len
 *	a5  = sum
 *	a6  = src_err_ptr
 *	a7  = dst_err_ptr
 *	a8  = temp
 *	a9  = temp
 *	a10 = temp
 *	a11 = original len for exception handling
 *	a12 = original dst for exception handling
 *
 * This function is optimized for 4-byte aligned addresses.  Other
 * alignments work, but not nearly as efficiently.
 */
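
/*
 * Rough C-level behavior, as an illustrative sketch only:
 *
 *	memcpy(dst, src, len);
 *	sum = csum_partial(dst, len, sum);
 *	// on a faulting source access:      *src_err_ptr = -EFAULT and
 *	//                                   the destination is zeroed
 *	// on a faulting destination access: *dst_err_ptr = -EFAULT
 */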
ENTRY(csum_partial_copy_generic)
	entry	sp, 32
	mov	a12, a3
	mov	a11, a4
	or	a10, a2, a3

	/* We optimize the following alignment tests for the 4-byte
	   aligned case.  Two bbsi.l instructions might seem more optimal
	   (commented out below).  However, both labels 5: and 3: are out
	   of the imm8 range, so the assembler relaxes them into
	   equivalent bbci.l, j combinations, which is actually
	   slower. */

	extui	a9, a10, 0, 2
	beqz	a9, 1f		/* branch if both are 4-byte aligned */
	bbsi.l	a10, 0, 5f	/* branch if one address is odd */
	j	3f		/* one address is 2-byte aligned */

/*	_bbsi.l	a10, 0, 5f */	/* branch if odd address */
/*	_bbsi.l	a10, 1, 3f */	/* branch if 2-byte-aligned address */
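	/*
	 * For illustration: an out-of-range "bbsi.l a10, 0, 5f" would be
	 * relaxed by the assembler into "bbci.l a10, 0, .Lskip; j 5f;
	 * .Lskip:" (.Lskip is a hypothetical label), so the common
	 * 4-byte-aligned path would pay for an extra taken branch.
	 */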
1:
	/* src and dst are both 4-byte aligned */
	srli	a10, a4, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 2f
#else
	beqz	a10, 2f
	slli	a10, a10, 5
	add	a10, a10, a2	/* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
SRC(	l32i	a9, a2, 0	)
SRC(	l32i	a8, a2, 4	)
DST(	s32i	a9, a3, 0	)
DST(	s32i	a8, a3, 4	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 8	)
SRC(	l32i	a8, a2, 12	)
DST(	s32i	a9, a3, 8	)
DST(	s32i	a8, a3, 12	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 16	)
SRC(	l32i	a8, a2, 20	)
DST(	s32i	a9, a3, 16	)
DST(	s32i	a8, a3, 20	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 24	)
SRC(	l32i	a8, a2, 28	)
DST(	s32i	a9, a3, 24	)
DST(	s32i	a8, a3, 28	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
	addi	a2, a2, 32
	addi	a3, a3, 32
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop5
#endif
2:
	extui	a10, a4, 2, 3	/* remaining 4-byte chunks */
	extui	a4, a4, 0, 2	/* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 3f
#else
	beqz	a10, 3f
	slli	a10, a10, 2
	add	a10, a10, a2	/* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
SRC(	l32i	a9, a2, 0	)
DST(	s32i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 4
	addi	a3, a3, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop6
#endif
3:
	/*
	 * Control arrives here in two cases: (1) it falls through from
	 * the 4-byte alignment case above to process, at most, one
	 * 2-byte chunk; (2) it branches here from above if either src
	 * or dst is 2-byte aligned, and we process all bytes here,
	 * except for perhaps a trailing odd byte.  It's inefficient,
	 * so align your addresses to 4-byte boundaries.
	 *
	 *	a2 = src
	 *	a3 = dst
	 *	a4 = len
	 *	a5 = sum
	 */
	srli	a10, a4, 1	/* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 4f
#else
	beqz	a10, 4f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
SRC(	l16ui	a9, a2, 0	)
DST(	s16i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop7
#endif
4:
	/* This section processes a possible trailing odd byte. */
	_bbci.l	a4, 0, 8f	/* 1-byte chunk */

SRC(	l8ui	a9, a2, 0	)
DST(	s8i	a9, a3, 0	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* shift byte to bits 8..15 */
#endif
	ONES_ADD(a5, a9)
8:
	mov	a2, a5
	retw
5:
	/* Control branches here when either src or dst is odd.  We
	   process all bytes using 8-bit accesses.  Grossly inefficient,
	   so don't feed us an odd address. */

	srli	a10, a4, 1	/* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 6f
#else
	beqz	a10, 6f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
SRC(	l8ui	a9, a2, 0	)
SRC(	l8ui	a8, a2, 1	)
DST(	s8i	a9, a3, 0	)
DST(	s8i	a8, a3, 1	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* combine into a single 16-bit value */
#else				/* for checksum computation */
	slli	a8, a8, 8
#endif
	or	a9, a9, a8
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop8
#endif
6:
	j	4b		/* process the possible trailing odd byte */
# Exception handler:
.section .fixup, "ax"

/*
	a6  = src_err_ptr
	a7  = dst_err_ptr
	a11 = original len for exception handling
	a12 = original dst for exception handling
*/

6001:
	_movi	a2, -EFAULT
	s32i	a2, a6, 0	/* src_err_ptr */

	# clear the complete destination - computing the rest
	# is too much work
	movi	a2, 0
#if XCHAL_HAVE_LOOPS
	loopgtz	a11, 2f
#else
	beqz	a11, 2f
	add	a11, a11, a12	/* a11 = ending address */
.Leloop:
#endif
	s8i	a2, a12, 0
	addi	a12, a12, 1
#if !XCHAL_HAVE_LOOPS
	blt	a12, a11, .Leloop
#endif
2:
	retw

6002:
	movi	a2, -EFAULT
	s32i	a2, a7, 0	/* dst_err_ptr */
	movi	a2, 0
	retw

.previous