checksum.S

/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Authors:	Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Tom May, <ftom@netcom.com>
 *		Pentium Pro/II routines:
 *		Alexander Kjeldaas <astor@guardian.no>
 *		Finn Arne Gangstad <finnag@guardian.no>
 *		Lots of code moved from tcp.c and ip.c; see those files
 *		for more names.
 *
 * Changes:	Ingo Molnar, converted csum_partial_copy() to 2.1 exception
 *		handling.
 *		Andi Kleen, add zeroing on error
 *		converted to pure assembler
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <asm/errno.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
*/
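
/*
 * Roughly speaking, the routine below adds the buffer up in 32-bit chunks
 * with end-around carry; folding the result down to the final 16-bit
 * Internet checksum is left to csum_fold().  A rough C sketch of the
 * semantics (csum_sketch is just an illustrative name; equivalent modulo
 * that final fold, and ignoring the odd-start-address rotation handled
 * below):
 *
 *	unsigned int csum_sketch(const unsigned char *buf, int len,
 *				 unsigned int sum)
 *	{
 *		unsigned long long acc = sum;
 *		while (len >= 2) {
 *			acc += buf[0] | (buf[1] << 8);	little-endian word
 *			buf += 2;
 *			len -= 2;
 *		}
 *		if (len)
 *			acc += buf[0];			trailing byte
 *		while (acc >> 32)			end-around carry
 *			acc = (acc & 0xffffffffULL) + (acc >> 32);
 *		return (unsigned int)acc;
 *	}
 */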
.text
.align 4
.globl csum_partial

#ifndef CONFIG_X86_USE_PPRO_CHECKSUM

/*
 * Experiments with Ethernet and SLIP connections show that buff
 * is aligned on either a 2-byte or 4-byte boundary.  We get at
 * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
 * Fortunately, it is easy to convert 2-byte alignment to 4-byte
 * alignment for the unrolled loop.
 */
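
/*
 * Converting 2-byte to 4-byte alignment just means checksumming one
 * 16-bit word first (label 10 below), so that the unrolled loop then
 * runs on a 4-byte-aligned pointer.
 */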
csum_partial:
        pushl %esi
        pushl %ebx
        movl 20(%esp),%eax      # Function arg: unsigned int sum
        movl 16(%esp),%ecx      # Function arg: int len
        movl 12(%esp),%esi      # Function arg: unsigned char *buff
        testl $3, %esi          # Check alignment.
        jz 2f                   # Jump if alignment is ok.
        testl $1, %esi          # Check alignment.
        jz 10f                  # Jump if 2-byte aligned.

        # buf is odd
        dec %ecx
        jl 8f
        movzbl (%esi), %ebx
        adcl %ebx, %eax
        roll $8, %eax
        inc %esi
        testl $2, %esi
        jz 2f
10:
        subl $2, %ecx           # Alignment uses up two bytes.
        jae 1f                  # Jump if we had at least two bytes.
        addl $2, %ecx           # ecx was < 2.  Deal with it.
        jmp 4f
1:      movw (%esi), %bx
        addl $2, %esi
        addw %bx, %ax
        adcl $0, %eax
2:
        movl %ecx, %edx
        shrl $5, %ecx           # number of 32-byte blocks
        jz 2f
        testl %esi, %esi        # clear the carry flag for the adcl chain
1:      movl (%esi), %ebx
        adcl %ebx, %eax
        movl 4(%esi), %ebx
        adcl %ebx, %eax
        movl 8(%esi), %ebx
        adcl %ebx, %eax
        movl 12(%esi), %ebx
        adcl %ebx, %eax
        movl 16(%esi), %ebx
        adcl %ebx, %eax
        movl 20(%esi), %ebx
        adcl %ebx, %eax
        movl 24(%esi), %ebx
        adcl %ebx, %eax
        movl 28(%esi), %ebx
        adcl %ebx, %eax
        lea 32(%esi), %esi
        dec %ecx
        jne 1b
        adcl $0, %eax
2:      movl %edx, %ecx
        andl $0x1c, %edx
        je 4f
        shrl $2, %edx           # This clears CF
3:      adcl (%esi), %eax
        lea 4(%esi), %esi
        dec %edx
        jne 3b
        adcl $0, %eax
4:      andl $3, %ecx
        jz 7f
        cmpl $2, %ecx
        jb 5f
        movw (%esi),%cx
        leal 2(%esi),%esi
        je 6f
        shll $16,%ecx
5:      movb (%esi),%cl
6:      addl %ecx,%eax
        adcl $0, %eax
7:
        testl $1, 12(%esp)      # was buff odd-aligned?
        jz 8f
        roll $8, %eax           # if so, the sum is byte-swapped: rotate it back
8:
        popl %ebx
        popl %esi
        ret

#else

/* Version for PentiumII/PPro */

csum_partial:
        pushl %esi
        pushl %ebx
        movl 20(%esp),%eax      # Function arg: unsigned int sum
        movl 16(%esp),%ecx      # Function arg: int len
        movl 12(%esp),%esi      # Function arg: const unsigned char *buf
        testl $3, %esi
        jnz 25f
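        # Computed entry into the unrolled loop at 40/45 below: %ebx is set
        # to -(number of dwords left over after the 128-byte blocks), and
        # each "adcl x(%esi), %eax" there assembles to 3 bytes, so
        # "lea 45f(%ebx,%ebx,2), %ebx" (i.e. 45f + 3*%ebx) points at the
        # first adcl that still has work to do -- a Duff's-device-style jump.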
10:
        movl %ecx, %edx
        movl %ecx, %ebx
        andl $0x7c, %ebx
        shrl $7, %ecx
        addl %ebx,%esi
        shrl $2, %ebx
        negl %ebx
        lea 45f(%ebx,%ebx,2), %ebx
        testl %esi, %esi        # clear the carry flag before the adcl chain
        jmp *%ebx

        # Handle 2-byte-aligned regions
20:     addw (%esi), %ax
        lea 2(%esi), %esi
        adcl $0, %eax
        jmp 10b
25:
        testl $1, %esi
        jz 30f
        # buf is odd
        dec %ecx
        jl 90f
        movzbl (%esi), %ebx
        addl %ebx, %eax
        adcl $0, %eax
        roll $8, %eax
        inc %esi
        testl $2, %esi
        jz 10b

30:     subl $2, %ecx
        ja 20b
        je 32f
        addl $2, %ecx
        jz 80f
        movzbl (%esi),%ebx      # csumming 1 byte, 2-aligned
        addl %ebx, %eax
        adcl $0, %eax
        jmp 80f
32:
        addw (%esi), %ax        # csumming 2 bytes, 2-aligned
        adcl $0, %eax
        jmp 80f

40:
        addl -128(%esi), %eax
        adcl -124(%esi), %eax
        adcl -120(%esi), %eax
        adcl -116(%esi), %eax
        adcl -112(%esi), %eax
        adcl -108(%esi), %eax
        adcl -104(%esi), %eax
        adcl -100(%esi), %eax
        adcl -96(%esi), %eax
        adcl -92(%esi), %eax
        adcl -88(%esi), %eax
        adcl -84(%esi), %eax
        adcl -80(%esi), %eax
        adcl -76(%esi), %eax
        adcl -72(%esi), %eax
        adcl -68(%esi), %eax
        adcl -64(%esi), %eax
        adcl -60(%esi), %eax
        adcl -56(%esi), %eax
        adcl -52(%esi), %eax
        adcl -48(%esi), %eax
        adcl -44(%esi), %eax
        adcl -40(%esi), %eax
        adcl -36(%esi), %eax
        adcl -32(%esi), %eax
        adcl -28(%esi), %eax
        adcl -24(%esi), %eax
        adcl -20(%esi), %eax
        adcl -16(%esi), %eax
        adcl -12(%esi), %eax
        adcl -8(%esi), %eax
        adcl -4(%esi), %eax
45:
        lea 128(%esi), %esi
        adcl $0, %eax
        dec %ecx
        jge 40b
        movl %edx, %ecx
50:     andl $3, %ecx
        jz 80f

        # Handle the last 1-3 bytes without jumping
        notl %ecx               # 1->2, 2->1, 3->0, higher bits are masked
        movl $0xffffff,%ebx     # by the shll and shrl instructions
        shll $3,%ecx
        shrl %cl,%ebx
        andl -128(%esi),%ebx    # esi is 4-aligned so should be ok
        addl %ebx,%eax
        adcl $0,%eax
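        # Worked example of the masking above: for len&3 == 2, notl turns
        # %ecx into 0xfffffffd, shll $3 makes %cl == 0xe8, and shrl uses
        # only the low five bits of %cl (i.e. 8), so %ebx becomes
        # 0xffffff >> 8 == 0xffff: exactly the mask keeping the last two bytes.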
80:
        testl $1, 12(%esp)      # was buf odd-aligned?
        jz 90f
        roll $8, %eax           # if so, the sum is byte-swapped: rotate it back
90:
        popl %ebx
        popl %esi
        ret

#endif

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst,
				int len, int sum, int *src_err_ptr, int *dst_err_ptr)
*/

/*
 * Copy from ds while checksumming, otherwise like csum_partial
 *
 * The macros SRC and DST specify the type of access for the instruction,
 * so we can call a custom exception handler for all access types.
 *
 * FIXME: could someone double-check whether I haven't mixed up some SRC and
 *	  DST definitions? It's damn hard to trigger all cases.  I hope I got
 *	  them all but there's no guarantee.
 */
#define SRC(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6001f;		\
	.previous

#define DST(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6002f;		\
	.previous
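
/*
 * Each SRC()/DST() use emits an __ex_table entry pairing the address of a
 * potentially faulting instruction with a fixup address.  On a fault, the
 * kernel's page fault handler looks the faulting address up in this table
 * and resumes execution at the fixup label instead of oopsing: 6001
 * (defined below) for source faults, 6002 for destination faults.
 */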

.align 4
.globl csum_partial_copy_generic

#ifndef CONFIG_X86_USE_PPRO_CHECKSUM

#define ARGBASE 16
#define FP 12

csum_partial_copy_generic:
        subl $4,%esp
        pushl %edi
        pushl %esi
        pushl %ebx
        movl ARGBASE+16(%esp),%eax      # sum
        movl ARGBASE+12(%esp),%ecx      # len
        movl ARGBASE+4(%esp),%esi       # src
        movl ARGBASE+8(%esp),%edi       # dst
        testl $2, %edi                  # Check alignment.
        jz 2f                           # Jump if alignment is ok.
        subl $2, %ecx                   # Alignment uses up two bytes.
        jae 1f                          # Jump if we had at least two bytes.
        addl $2, %ecx                   # ecx was < 2.  Deal with it.
        jmp 4f
SRC(1:  movw (%esi), %bx        )
        addl $2, %esi
DST(    movw %bx, (%edi)        )
        addl $2, %edi
        addw %bx, %ax
        adcl $0, %eax
2:
        movl %ecx, FP(%esp)
        shrl $5, %ecx
        jz 2f
        testl %esi, %esi        # clear the carry flag for the adcl chain
SRC(1:  movl (%esi), %ebx       )
SRC(    movl 4(%esi), %edx      )
        adcl %ebx, %eax
DST(    movl %ebx, (%edi)       )
        adcl %edx, %eax
DST(    movl %edx, 4(%edi)      )
SRC(    movl 8(%esi), %ebx      )
SRC(    movl 12(%esi), %edx     )
        adcl %ebx, %eax
DST(    movl %ebx, 8(%edi)      )
        adcl %edx, %eax
DST(    movl %edx, 12(%edi)     )
SRC(    movl 16(%esi), %ebx     )
SRC(    movl 20(%esi), %edx     )
        adcl %ebx, %eax
DST(    movl %ebx, 16(%edi)     )
        adcl %edx, %eax
DST(    movl %edx, 20(%edi)     )
SRC(    movl 24(%esi), %ebx     )
SRC(    movl 28(%esi), %edx     )
        adcl %ebx, %eax
DST(    movl %ebx, 24(%edi)     )
        adcl %edx, %eax
DST(    movl %edx, 28(%edi)     )
        lea 32(%esi), %esi
        lea 32(%edi), %edi
        dec %ecx
        jne 1b
        adcl $0, %eax
2:      movl FP(%esp), %edx
        movl %edx, %ecx
        andl $0x1c, %edx
        je 4f
        shrl $2, %edx           # This clears CF
SRC(3:  movl (%esi), %ebx       )
        adcl %ebx, %eax
DST(    movl %ebx, (%edi)       )
        lea 4(%esi), %esi
        lea 4(%edi), %edi
        dec %edx
        jne 3b
        adcl $0, %eax
4:      andl $3, %ecx
        jz 7f
        cmpl $2, %ecx
        jb 5f
SRC(    movw (%esi), %cx        )
        leal 2(%esi), %esi
DST(    movw %cx, (%edi)        )
        leal 2(%edi), %edi
        je 6f
        shll $16,%ecx
SRC(5:  movb (%esi), %cl        )
DST(    movb %cl, (%edi)        )
6:      addl %ecx, %eax
        adcl $0, %eax
7:
5000:

# Exception handler:
.section .fixup, "ax"

6001:
        movl ARGBASE+20(%esp), %ebx     # src_err_ptr
        movl $-EFAULT, (%ebx)

        # zero the complete destination - computing the rest
        # is too much work
        movl ARGBASE+8(%esp), %edi      # dst
        movl ARGBASE+12(%esp), %ecx     # len
        xorl %eax,%eax
        rep ; stosb

        jmp 5000b

6002:
        movl ARGBASE+24(%esp), %ebx     # dst_err_ptr
        movl $-EFAULT,(%ebx)
        jmp 5000b

.previous

        popl %ebx
        popl %esi
        popl %edi
        popl %ecx               # equivalent to addl $4,%esp
        ret

#else

/* Version for PentiumII/PPro */

#define ROUND1(x) \
	SRC(movl x(%esi), %ebx) ;	\
	addl %ebx, %eax		;	\
	DST(movl %ebx, x(%edi)) ;

#define ROUND(x) \
	SRC(movl x(%esi), %ebx) ;	\
	adcl %ebx, %eax		;	\
	DST(movl %ebx, x(%edi)) ;

#define ARGBASE 12

csum_partial_copy_generic:
        pushl %ebx
        pushl %edi
        pushl %esi
        movl ARGBASE+4(%esp),%esi       # src
        movl ARGBASE+8(%esp),%edi       # dst
        movl ARGBASE+12(%esp),%ecx      # len
        movl ARGBASE+16(%esp),%eax      # sum
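        # Computed entry into the unrolled loop at 1/3 below, same trick as
        # in csum_partial above: %ebx becomes -(len & 0x3c), and each
        # ROUND/ROUND1 expands to 8 bytes of code, so
        # "lea 3f(%ebx,%ebx), %ebx" (i.e. 3f + 2*%ebx) points at the first
        # ROUND still needed on the first, partial pass.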
#       movl %ecx, %edx
        movl %ecx, %ebx
        movl %esi, %edx
        shrl $6, %ecx
        andl $0x3c, %ebx
        negl %ebx
        subl %ebx, %esi
        subl %ebx, %edi
        lea -1(%esi),%edx
        andl $-32,%edx          # %edx: 32-byte-aligned cursor into the source
        lea 3f(%ebx,%ebx), %ebx
        testl %esi, %esi        # clear the carry flag before the adcl chain
        jmp *%ebx
1:      addl $64,%esi
        addl $64,%edi
        SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl)  # touch two source cache lines up front (warms the cache; any fault is taken here)
        ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
        ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
        ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
        ROUND (-16) ROUND(-12) ROUND(-8)  ROUND(-4)
3:      adcl $0,%eax
        addl $64, %edx
        dec %ecx
        jge 1b
4:      movl ARGBASE+12(%esp),%edx      # len
        andl $3, %edx
        jz 7f
        cmpl $2, %edx
        jb 5f
SRC(    movw (%esi), %dx        )
        leal 2(%esi), %esi
DST(    movw %dx, (%edi)        )
        leal 2(%edi), %edi
        je 6f
        shll $16,%edx
5:
SRC(    movb (%esi), %dl        )
DST(    movb %dl, (%edi)        )
6:      addl %edx, %eax
        adcl $0, %eax
7:
.section .fixup, "ax"
6001:   movl ARGBASE+20(%esp), %ebx     # src_err_ptr
        movl $-EFAULT, (%ebx)
        # zero the complete destination (computing the rest is too much work)
        movl ARGBASE+8(%esp),%edi       # dst
        movl ARGBASE+12(%esp),%ecx      # len
        xorl %eax,%eax
        rep; stosb
        jmp 7b
6002:   movl ARGBASE+24(%esp), %ebx     # dst_err_ptr
        movl $-EFAULT, (%ebx)
        jmp 7b
.previous

        popl %esi
        popl %edi
        popl %ebx
        ret

#undef ROUND
#undef ROUND1

#endif