ip_vs_proto_tcp.c

/*
 * ip_vs_proto_tcp.c:	TCP load balancing support for IPVS
 *
 * Authors:	Wensong Zhang <wensong@linuxvirtualserver.org>
 *		Julian Anastasov <ja@ssi.bg>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Changes:
 *
 */

#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/tcp.h>			/* for tcphdr */
#include <net/ip.h>
#include <net/tcp.h>			/* for csum_tcpudp_magic */
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#include <net/ip_vs.h>

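/*
 * Look up the connection entry for an incoming packet: read the two
 * TCP ports at proto_off and match the <saddr, sport, daddr, dport>
 * tuple (or the reversed tuple when "inverse" is set).
 */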
static struct ip_vs_conn *
tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
		const struct ip_vs_iphdr *iph, unsigned int proto_off,
		int inverse)
{
	__be16 _ports[2], *pptr;

	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
	if (pptr == NULL)
		return NULL;

	if (likely(!inverse)) {
		return ip_vs_conn_in_get(af, iph->protocol,
					 &iph->saddr, pptr[0],
					 &iph->daddr, pptr[1]);
	} else {
		return ip_vs_conn_in_get(af, iph->protocol,
					 &iph->daddr, pptr[1],
					 &iph->saddr, pptr[0]);
	}
}

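/*
 * Connection lookup for packets seen in the other direction; identical
 * to tcp_conn_in_get() except that it searches with ip_vs_conn_out_get().
 */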
static struct ip_vs_conn *
tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
		 const struct ip_vs_iphdr *iph, unsigned int proto_off,
		 int inverse)
{
	__be16 _ports[2], *pptr;

	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
	if (pptr == NULL)
		return NULL;

	if (likely(!inverse)) {
		return ip_vs_conn_out_get(af, iph->protocol,
					  &iph->saddr, pptr[0],
					  &iph->daddr, pptr[1]);
	} else {
		return ip_vs_conn_out_get(af, iph->protocol,
					  &iph->daddr, pptr[1],
					  &iph->saddr, pptr[0]);
	}
}

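/*
 * Schedule a new connection: only SYN packets that hit a configured
 * virtual service are passed to ip_vs_schedule(); everything else is
 * left alone (return 1), or dropped when IPVS decides it is overloaded
 * (ip_vs_todrop()).
 */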
static int
tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
		  int *verdict, struct ip_vs_conn **cpp)
{
	struct ip_vs_service *svc;
	struct tcphdr _tcph, *th;
	struct ip_vs_iphdr iph;

	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);

	th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
	if (th == NULL) {
		*verdict = NF_DROP;
		return 0;
	}

	if (th->syn &&
	    (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
				     th->dest))) {
		if (ip_vs_todrop()) {
			/*
			 * It seems that we are very loaded.
			 * We have to drop this packet :(
			 */
			ip_vs_service_put(svc);
			*verdict = NF_DROP;
			return 0;
		}

		/*
		 * Let the virtual server select a real server for the
		 * incoming connection, and create a connection entry.
		 */
		*cpp = ip_vs_schedule(svc, skb);
		if (!*cpp) {
			*verdict = ip_vs_leave(svc, skb, pp);
			return 0;
		}
		ip_vs_service_put(svc);
	}
	return 1;
}

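/*
 * Incrementally update the TCP checksum when only one address and one
 * port change (the payload is untouched), instead of recomputing it.
 */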
static inline void
tcp_fast_csum_update(int af, struct tcphdr *tcph,
		     const union nf_inet_addr *oldip,
		     const union nf_inet_addr *newip,
		     __be16 oldport, __be16 newport)
{
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		tcph->check =
			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
					 ip_vs_check_diff2(oldport, newport,
						~csum_unfold(tcph->check))));
	else
#endif
		tcph->check =
			csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
					 ip_vs_check_diff2(oldport, newport,
						~csum_unfold(tcph->check))));
}

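/*
 * Adjust the pseudo-header part of the checksum for an address and
 * length change; used when skb->ip_summed == CHECKSUM_PARTIAL and the
 * device will finish the checksum later.
 */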
static inline void
tcp_partial_csum_update(int af, struct tcphdr *tcph,
			const union nf_inet_addr *oldip,
			const union nf_inet_addr *newip,
			__be16 oldlen, __be16 newlen)
{
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		tcph->check =
			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
					 ip_vs_check_diff2(oldlen, newlen,
						~csum_unfold(tcph->check))));
	else
#endif
		tcph->check =
			csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
					 ip_vs_check_diff2(oldlen, newlen,
						~csum_unfold(tcph->check))));
}

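/*
 * Source NAT for replies from the real server: rewrite the TCP source
 * port to the virtual service port and fix up the checksum, calling the
 * application helper first if one is bound to the connection.
 */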
static int
tcp_snat_handler(struct sk_buff *skb,
		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
{
	struct tcphdr *tcph;
	unsigned int tcphoff;
	int oldlen;

#ifdef CONFIG_IP_VS_IPV6
	if (cp->af == AF_INET6)
		tcphoff = sizeof(struct ipv6hdr);
	else
#endif
		tcphoff = ip_hdrlen(skb);
	oldlen = skb->len - tcphoff;

	/* csum_check requires unshared skb */
	if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
		return 0;

	if (unlikely(cp->app != NULL)) {
		/* Some checks before mangling */
		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
			return 0;

		/* Call application helper if needed */
		if (!ip_vs_app_pkt_out(cp, skb))
			return 0;
	}

	tcph = (void *)skb_network_header(skb) + tcphoff;
	tcph->source = cp->vport;

	/* Adjust TCP checksums */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
					htonl(oldlen),
					htonl(skb->len - tcphoff));
	} else if (!cp->app) {
		/* Only port and addr are changed, do fast csum update */
		tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
				     cp->dport, cp->vport);
		if (skb->ip_summed == CHECKSUM_COMPLETE)
			skb->ip_summed = CHECKSUM_NONE;
	} else {
		/* full checksum calculation */
		tcph->check = 0;
		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
						      &cp->caddr.in6,
						      skb->len - tcphoff,
						      cp->protocol, skb->csum);
		else
#endif
			tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
							cp->caddr.ip,
							skb->len - tcphoff,
							cp->protocol,
							skb->csum);

		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
			  pp->name, tcph->check,
			  (char*)&(tcph->check) - (char*)tcph);
	}
	return 1;
}

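/*
 * Destination NAT for packets going to the real server: rewrite the TCP
 * destination port to the real server port and fix up the checksum,
 * after giving any bound application helper a chance to mangle the
 * payload and adjust sequence numbers.
 */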
static int
tcp_dnat_handler(struct sk_buff *skb,
		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
{
	struct tcphdr *tcph;
	unsigned int tcphoff;
	int oldlen;

#ifdef CONFIG_IP_VS_IPV6
	if (cp->af == AF_INET6)
		tcphoff = sizeof(struct ipv6hdr);
	else
#endif
		tcphoff = ip_hdrlen(skb);
	oldlen = skb->len - tcphoff;

	/* csum_check requires unshared skb */
	if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
		return 0;

	if (unlikely(cp->app != NULL)) {
		/* Some checks before mangling */
		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
			return 0;

		/*
		 * Attempt ip_vs_app call.
		 * It will fix ip_vs_conn and iph ack_seq stuff
		 */
		if (!ip_vs_app_pkt_in(cp, skb))
			return 0;
	}

	tcph = (void *)skb_network_header(skb) + tcphoff;
	tcph->dest = cp->dport;

	/*
	 * Adjust TCP checksums
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
					htonl(oldlen),
					htonl(skb->len - tcphoff));
	} else if (!cp->app) {
		/* Only port and addr are changed, do fast csum update */
		tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
				     cp->vport, cp->dport);
		if (skb->ip_summed == CHECKSUM_COMPLETE)
			skb->ip_summed = CHECKSUM_NONE;
	} else {
		/* full checksum calculation */
		tcph->check = 0;
		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			tcph->check = csum_ipv6_magic(&cp->caddr.in6,
						      &cp->daddr.in6,
						      skb->len - tcphoff,
						      cp->protocol, skb->csum);
		else
#endif
			tcph->check = csum_tcpudp_magic(cp->caddr.ip,
							cp->daddr.ip,
							skb->len - tcphoff,
							cp->protocol,
							skb->csum);
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	}
	return 1;
}

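/*
 * Verify the TCP checksum of a received packet (IPv4 or IPv6) before it
 * is mangled; returns 0 on a bad checksum, 1 otherwise.
 */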
static int
tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
{
	unsigned int tcphoff;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		tcphoff = sizeof(struct ipv6hdr);
	else
#endif
		tcphoff = ip_hdrlen(skb);

	switch (skb->ip_summed) {
	case CHECKSUM_NONE:
		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
	case CHECKSUM_COMPLETE:
#ifdef CONFIG_IP_VS_IPV6
		if (af == AF_INET6) {
			if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
					    &ipv6_hdr(skb)->daddr,
					    skb->len - tcphoff,
					    ipv6_hdr(skb)->nexthdr,
					    skb->csum)) {
				IP_VS_DBG_RL_PKT(0, pp, skb, 0,
						 "Failed checksum for");
				return 0;
			}
		} else
#endif
			if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
					      ip_hdr(skb)->daddr,
					      skb->len - tcphoff,
					      ip_hdr(skb)->protocol,
					      skb->csum)) {
				IP_VS_DBG_RL_PKT(0, pp, skb, 0,
						 "Failed checksum for");
				return 0;
			}
		break;
	default:
		/* No need to checksum. */
		break;
	}

	return 1;
}

#define TCP_DIR_INPUT		0
#define TCP_DIR_OUTPUT		4
#define TCP_DIR_INPUT_ONLY	8

static const int tcp_state_off[IP_VS_DIR_LAST] = {
	[IP_VS_DIR_INPUT]		=	TCP_DIR_INPUT,
	[IP_VS_DIR_OUTPUT]		=	TCP_DIR_OUTPUT,
	[IP_VS_DIR_INPUT_ONLY]		=	TCP_DIR_INPUT_ONLY,
};

/*
 *	Timeout table[state]
 */
static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
	[IP_VS_TCP_S_NONE]		=	2*HZ,
	[IP_VS_TCP_S_ESTABLISHED]	=	15*60*HZ,
	[IP_VS_TCP_S_SYN_SENT]		=	2*60*HZ,
	[IP_VS_TCP_S_SYN_RECV]		=	1*60*HZ,
	[IP_VS_TCP_S_FIN_WAIT]		=	2*60*HZ,
	[IP_VS_TCP_S_TIME_WAIT]		=	2*60*HZ,
	[IP_VS_TCP_S_CLOSE]		=	10*HZ,
	[IP_VS_TCP_S_CLOSE_WAIT]	=	60*HZ,
	[IP_VS_TCP_S_LAST_ACK]		=	30*HZ,
	[IP_VS_TCP_S_LISTEN]		=	2*60*HZ,
	[IP_VS_TCP_S_SYNACK]		=	120*HZ,
	[IP_VS_TCP_S_LAST]		=	2*HZ,
};

static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
	[IP_VS_TCP_S_NONE]		=	"NONE",
	[IP_VS_TCP_S_ESTABLISHED]	=	"ESTABLISHED",
	[IP_VS_TCP_S_SYN_SENT]		=	"SYN_SENT",
	[IP_VS_TCP_S_SYN_RECV]		=	"SYN_RECV",
	[IP_VS_TCP_S_FIN_WAIT]		=	"FIN_WAIT",
	[IP_VS_TCP_S_TIME_WAIT]		=	"TIME_WAIT",
	[IP_VS_TCP_S_CLOSE]		=	"CLOSE",
	[IP_VS_TCP_S_CLOSE_WAIT]	=	"CLOSE_WAIT",
	[IP_VS_TCP_S_LAST_ACK]		=	"LAST_ACK",
	[IP_VS_TCP_S_LISTEN]		=	"LISTEN",
	[IP_VS_TCP_S_SYNACK]		=	"SYNACK",
	[IP_VS_TCP_S_LAST]		=	"BUG!",
};

#define sNO IP_VS_TCP_S_NONE
#define sES IP_VS_TCP_S_ESTABLISHED
#define sSS IP_VS_TCP_S_SYN_SENT
#define sSR IP_VS_TCP_S_SYN_RECV
#define sFW IP_VS_TCP_S_FIN_WAIT
#define sTW IP_VS_TCP_S_TIME_WAIT
#define sCL IP_VS_TCP_S_CLOSE
#define sCW IP_VS_TCP_S_CLOSE_WAIT
#define sLA IP_VS_TCP_S_LAST_ACK
#define sLI IP_VS_TCP_S_LISTEN
#define sSA IP_VS_TCP_S_SYNACK

struct tcp_states_t {
	int next_state[IP_VS_TCP_S_LAST];
};

static const char * tcp_state_name(int state)
{
	if (state >= IP_VS_TCP_S_LAST)
		return "ERR!";
	return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
}

static struct tcp_states_t tcp_states [] = {
/*	INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},

/*	OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*	INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};

static struct tcp_states_t tcp_states_dos [] = {
/*	INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},

/*	OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*	INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};

static struct tcp_states_t *tcp_state_table = tcp_states;


static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
{
	int on = (flags & 1);		/* secure_tcp */

	/*
	** FIXME: change secure_tcp to independent sysctl var
	** or make it per-service or per-app because it is valid
	** for most if not for all of the applications. Something
	** like "capabilities" (flags) for each object.
	*/
	tcp_state_table = (on? tcp_states_dos : tcp_states);
}

static int
tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
{
	return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
				       tcp_state_name_table, sname, to);
}

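/*
 * Map the TCP flags of a segment to a row index in the state table:
 * 0 = SYN, 1 = FIN, 2 = ACK, 3 = RST (checked first), -1 if none is set.
 */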
static inline int tcp_state_idx(struct tcphdr *th)
{
	if (th->rst)
		return 3;
	if (th->syn)
		return 0;
	if (th->fin)
		return 1;
	if (th->ack)
		return 2;
	return -1;
}

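/*
 * Drive the per-connection TCP state machine: select the transition row
 * from the packet direction and flags, update cp->state and the
 * active/inactive counters of the destination, then reload cp->timeout.
 */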
static inline void
set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
	      int direction, struct tcphdr *th)
{
	int state_idx;
	int new_state = IP_VS_TCP_S_CLOSE;
	int state_off = tcp_state_off[direction];

	/*
	 *    Update state offset to INPUT_ONLY if necessary
	 *    or delete NO_OUTPUT flag if output packet detected
	 */
	if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
		if (state_off == TCP_DIR_OUTPUT)
			cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
		else
			state_off = TCP_DIR_INPUT_ONLY;
	}

	if ((state_idx = tcp_state_idx(th)) < 0) {
		IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
		goto tcp_state_out;
	}

	new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];

  tcp_state_out:
	if (new_state != cp->state) {
		struct ip_vs_dest *dest = cp->dest;

		IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
			      "%s:%d state: %s->%s conn->refcnt:%d\n",
			      pp->name,
			      ((state_off == TCP_DIR_OUTPUT) ?
			       "output " : "input "),
			      th->syn ? 'S' : '.',
			      th->fin ? 'F' : '.',
			      th->ack ? 'A' : '.',
			      th->rst ? 'R' : '.',
			      IP_VS_DBG_ADDR(cp->af, &cp->daddr),
			      ntohs(cp->dport),
			      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
			      ntohs(cp->cport),
			      tcp_state_name(cp->state),
			      tcp_state_name(new_state),
			      atomic_read(&cp->refcnt));

		if (dest) {
			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
			    (new_state != IP_VS_TCP_S_ESTABLISHED)) {
				atomic_dec(&dest->activeconns);
				atomic_inc(&dest->inactconns);
				cp->flags |= IP_VS_CONN_F_INACTIVE;
			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
				   (new_state == IP_VS_TCP_S_ESTABLISHED)) {
				atomic_inc(&dest->activeconns);
				atomic_dec(&dest->inactconns);
				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
			}
		}
	}

	cp->timeout = pp->timeout_table[cp->state = new_state];
}

/*
 *	Handle state transitions
 */
static int
tcp_state_transition(struct ip_vs_conn *cp, int direction,
		     const struct sk_buff *skb,
		     struct ip_vs_protocol *pp)
{
	struct tcphdr _tcph, *th;

#ifdef CONFIG_IP_VS_IPV6
	int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
#else
	int ihl = ip_hdrlen(skb);
#endif

	th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
	if (th == NULL)
		return 0;

	spin_lock(&cp->lock);
	set_tcp_state(pp, cp, direction, th);
	spin_unlock(&cp->lock);

	return 1;
}

/*
 *	Hash table for TCP application incarnations
 */
#define	TCP_APP_TAB_BITS	4
#define	TCP_APP_TAB_SIZE	(1 << TCP_APP_TAB_BITS)
#define	TCP_APP_TAB_MASK	(TCP_APP_TAB_SIZE - 1)

static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
static DEFINE_SPINLOCK(tcp_app_lock);

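/* Hash a port number into the application incarnation table. */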
static inline __u16 tcp_app_hashkey(__be16 port)
{
	return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
		& TCP_APP_TAB_MASK;
}

static int tcp_register_app(struct ip_vs_app *inc)
{
	struct ip_vs_app *i;
	__u16 hash;
	__be16 port = inc->port;
	int ret = 0;

	hash = tcp_app_hashkey(port);

	spin_lock_bh(&tcp_app_lock);
	list_for_each_entry(i, &tcp_apps[hash], p_list) {
		if (i->port == port) {
			ret = -EEXIST;
			goto out;
		}
	}
	list_add(&inc->p_list, &tcp_apps[hash]);
	atomic_inc(&ip_vs_protocol_tcp.appcnt);

  out:
	spin_unlock_bh(&tcp_app_lock);
	return ret;
}


static void
tcp_unregister_app(struct ip_vs_app *inc)
{
	spin_lock_bh(&tcp_app_lock);
	atomic_dec(&ip_vs_protocol_tcp.appcnt);
	list_del(&inc->p_list);
	spin_unlock_bh(&tcp_app_lock);
}

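/*
 * Bind a new NAT connection to the application helper (if any) that is
 * registered on its virtual port, e.g. the ip_vs_ftp helper.
 */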
static int
tcp_app_conn_bind(struct ip_vs_conn *cp)
{
	int hash;
	struct ip_vs_app *inc;
	int result = 0;

	/* Default binding: bind app only for NAT */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
		return 0;

	/* Lookup application incarnations and bind the right one */
	hash = tcp_app_hashkey(cp->vport);

	spin_lock(&tcp_app_lock);
	list_for_each_entry(inc, &tcp_apps[hash], p_list) {
		if (inc->port == cp->vport) {
			if (unlikely(!ip_vs_app_inc_get(inc)))
				break;
			spin_unlock(&tcp_app_lock);

			IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
				      "%s:%u to app %s on port %u\n",
				      __func__,
				      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
				      ntohs(cp->cport),
				      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
				      ntohs(cp->vport),
				      inc->name, ntohs(inc->port));

			cp->app = inc;
			if (inc->init_conn)
				result = inc->init_conn(inc, cp);
			goto out;
		}
	}
	spin_unlock(&tcp_app_lock);

  out:
	return result;
}

/*
 *	Set LISTEN timeout. (ip_vs_conn_put will setup timer)
 */
void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
{
	spin_lock(&cp->lock);
	cp->state = IP_VS_TCP_S_LISTEN;
	cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
	spin_unlock(&cp->lock);
}


static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
{
	IP_VS_INIT_HASH_TABLE(tcp_apps);
	pp->timeout_table = tcp_timeouts;
}


static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
{
}


struct ip_vs_protocol ip_vs_protocol_tcp = {
	.name =			"TCP",
	.protocol =		IPPROTO_TCP,
	.num_states =		IP_VS_TCP_S_LAST,
	.dont_defrag =		0,
	.appcnt =		ATOMIC_INIT(0),
	.init =			ip_vs_tcp_init,
	.exit =			ip_vs_tcp_exit,
	.register_app =		tcp_register_app,
	.unregister_app =	tcp_unregister_app,
	.conn_schedule =	tcp_conn_schedule,
	.conn_in_get =		tcp_conn_in_get,
	.conn_out_get =		tcp_conn_out_get,
	.snat_handler =		tcp_snat_handler,
	.dnat_handler =		tcp_dnat_handler,
	.csum_check =		tcp_csum_check,
	.state_name =		tcp_state_name,
	.state_transition =	tcp_state_transition,
	.app_conn_bind =	tcp_app_conn_bind,
	.debug_packet =		ip_vs_tcpudp_debug_packet,
	.timeout_change =	tcp_timeout_change,
	.set_state_timeout =	tcp_set_state_timeout,
};