ip_vs_proto_tcp.c 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698
  1. /*
  2. * ip_vs_proto_tcp.c: TCP load balancing support for IPVS
  3. *
  4. * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
  5. * Julian Anastasov <ja@ssi.bg>
  6. *
  7. * This program is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU General Public License
  9. * as published by the Free Software Foundation; either version
  10. * 2 of the License, or (at your option) any later version.
  11. *
  12. * Changes:
  13. *
  14. */
  15. #include <linux/kernel.h>
  16. #include <linux/ip.h>
  17. #include <linux/tcp.h> /* for tcphdr */
  18. #include <net/ip.h>
  19. #include <net/tcp.h> /* for csum_tcpudp_magic */
  20. #include <linux/netfilter.h>
  21. #include <linux/netfilter_ipv4.h>
  22. #include <net/ip_vs.h>
  23. static struct ip_vs_conn *
  24. tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
  25. const struct ip_vs_iphdr *iph, unsigned int proto_off,
  26. int inverse)
  27. {
  28. __be16 _ports[2], *pptr;
  29. pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
  30. if (pptr == NULL)
  31. return NULL;
  32. if (likely(!inverse)) {
  33. return ip_vs_conn_in_get(af, iph->protocol,
  34. &iph->saddr, pptr[0],
  35. &iph->daddr, pptr[1]);
  36. } else {
  37. return ip_vs_conn_in_get(af, iph->protocol,
  38. &iph->daddr, pptr[1],
  39. &iph->saddr, pptr[0]);
  40. }
  41. }
  42. static struct ip_vs_conn *
  43. tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
  44. const struct ip_vs_iphdr *iph, unsigned int proto_off,
  45. int inverse)
  46. {
  47. __be16 _ports[2], *pptr;
  48. pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
  49. if (pptr == NULL)
  50. return NULL;
  51. if (likely(!inverse)) {
  52. return ip_vs_conn_out_get(af, iph->protocol,
  53. &iph->saddr, pptr[0],
  54. &iph->daddr, pptr[1]);
  55. } else {
  56. return ip_vs_conn_out_get(af, iph->protocol,
  57. &iph->daddr, pptr[1],
  58. &iph->saddr, pptr[0]);
  59. }
  60. }
  61. static int
  62. tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
  63. int *verdict, struct ip_vs_conn **cpp)
  64. {
  65. struct ip_vs_service *svc;
  66. struct tcphdr _tcph, *th;
  67. struct ip_vs_iphdr iph;
  68. ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
  69. th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
  70. if (th == NULL) {
  71. *verdict = NF_DROP;
  72. return 0;
  73. }
  74. if (th->syn &&
  75. (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
  76. th->dest))) {
  77. if (ip_vs_todrop()) {
  78. /*
  79. * It seems that we are very loaded.
  80. * We have to drop this packet :(
  81. */
  82. ip_vs_service_put(svc);
  83. *verdict = NF_DROP;
  84. return 0;
  85. }
  86. /*
  87. * Let the virtual server select a real server for the
  88. * incoming connection, and create a connection entry.
  89. */
  90. *cpp = ip_vs_schedule(svc, skb);
  91. if (!*cpp) {
  92. *verdict = ip_vs_leave(svc, skb, pp);
  93. return 0;
  94. }
  95. ip_vs_service_put(svc);
  96. }
  97. return 1;
  98. }
  99. static inline void
  100. tcp_fast_csum_update(int af, struct tcphdr *tcph,
  101. const union nf_inet_addr *oldip,
  102. const union nf_inet_addr *newip,
  103. __be16 oldport, __be16 newport)
  104. {
  105. #ifdef CONFIG_IP_VS_IPV6
  106. if (af == AF_INET6)
  107. tcph->check =
  108. csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
  109. ip_vs_check_diff2(oldport, newport,
  110. ~csum_unfold(tcph->check))));
  111. else
  112. #endif
  113. tcph->check =
  114. csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
  115. ip_vs_check_diff2(oldport, newport,
  116. ~csum_unfold(tcph->check))));
  117. }
  118. static int
  119. tcp_snat_handler(struct sk_buff *skb,
  120. struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
  121. {
  122. struct tcphdr *tcph;
  123. unsigned int tcphoff;
  124. #ifdef CONFIG_IP_VS_IPV6
  125. if (cp->af == AF_INET6)
  126. tcphoff = sizeof(struct ipv6hdr);
  127. else
  128. #endif
  129. tcphoff = ip_hdrlen(skb);
  130. /* csum_check requires unshared skb */
  131. if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
  132. return 0;
  133. if (unlikely(cp->app != NULL)) {
  134. /* Some checks before mangling */
  135. if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
  136. return 0;
  137. /* Call application helper if needed */
  138. if (!ip_vs_app_pkt_out(cp, skb))
  139. return 0;
  140. }
  141. tcph = (void *)skb_network_header(skb) + tcphoff;
  142. tcph->source = cp->vport;
  143. /* Adjust TCP checksums */
  144. if (!cp->app && (tcph->check != 0)) {
  145. /* Only port and addr are changed, do fast csum update */
  146. tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
  147. cp->dport, cp->vport);
  148. if (skb->ip_summed == CHECKSUM_COMPLETE)
  149. skb->ip_summed = CHECKSUM_NONE;
  150. } else {
  151. /* full checksum calculation */
  152. tcph->check = 0;
  153. skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
  154. #ifdef CONFIG_IP_VS_IPV6
  155. if (cp->af == AF_INET6)
  156. tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
  157. &cp->caddr.in6,
  158. skb->len - tcphoff,
  159. cp->protocol, skb->csum);
  160. else
  161. #endif
  162. tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
  163. cp->caddr.ip,
  164. skb->len - tcphoff,
  165. cp->protocol,
  166. skb->csum);
  167. IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
  168. pp->name, tcph->check,
  169. (char*)&(tcph->check) - (char*)tcph);
  170. }
  171. return 1;
  172. }
  173. static int
  174. tcp_dnat_handler(struct sk_buff *skb,
  175. struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
  176. {
  177. struct tcphdr *tcph;
  178. unsigned int tcphoff;
  179. #ifdef CONFIG_IP_VS_IPV6
  180. if (cp->af == AF_INET6)
  181. tcphoff = sizeof(struct ipv6hdr);
  182. else
  183. #endif
  184. tcphoff = ip_hdrlen(skb);
  185. /* csum_check requires unshared skb */
  186. if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
  187. return 0;
  188. if (unlikely(cp->app != NULL)) {
  189. /* Some checks before mangling */
  190. if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
  191. return 0;
  192. /*
  193. * Attempt ip_vs_app call.
  194. * It will fix ip_vs_conn and iph ack_seq stuff
  195. */
  196. if (!ip_vs_app_pkt_in(cp, skb))
  197. return 0;
  198. }
  199. tcph = (void *)skb_network_header(skb) + tcphoff;
  200. tcph->dest = cp->dport;
  201. /*
  202. * Adjust TCP checksums
  203. */
  204. if (!cp->app && (tcph->check != 0)) {
  205. /* Only port and addr are changed, do fast csum update */
  206. tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
  207. cp->vport, cp->dport);
  208. if (skb->ip_summed == CHECKSUM_COMPLETE)
  209. skb->ip_summed = CHECKSUM_NONE;
  210. } else {
  211. /* full checksum calculation */
  212. tcph->check = 0;
  213. skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
  214. #ifdef CONFIG_IP_VS_IPV6
  215. if (cp->af == AF_INET6)
  216. tcph->check = csum_ipv6_magic(&cp->caddr.in6,
  217. &cp->daddr.in6,
  218. skb->len - tcphoff,
  219. cp->protocol, skb->csum);
  220. else
  221. #endif
  222. tcph->check = csum_tcpudp_magic(cp->caddr.ip,
  223. cp->daddr.ip,
  224. skb->len - tcphoff,
  225. cp->protocol,
  226. skb->csum);
  227. skb->ip_summed = CHECKSUM_UNNECESSARY;
  228. }
  229. return 1;
  230. }
  231. static int
  232. tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
  233. {
  234. unsigned int tcphoff;
  235. #ifdef CONFIG_IP_VS_IPV6
  236. if (af == AF_INET6)
  237. tcphoff = sizeof(struct ipv6hdr);
  238. else
  239. #endif
  240. tcphoff = ip_hdrlen(skb);
  241. switch (skb->ip_summed) {
  242. case CHECKSUM_NONE:
  243. skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
  244. case CHECKSUM_COMPLETE:
  245. #ifdef CONFIG_IP_VS_IPV6
  246. if (af == AF_INET6) {
  247. if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
  248. &ipv6_hdr(skb)->daddr,
  249. skb->len - tcphoff,
  250. ipv6_hdr(skb)->nexthdr,
  251. skb->csum)) {
  252. IP_VS_DBG_RL_PKT(0, pp, skb, 0,
  253. "Failed checksum for");
  254. return 0;
  255. }
  256. } else
  257. #endif
  258. if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
  259. ip_hdr(skb)->daddr,
  260. skb->len - tcphoff,
  261. ip_hdr(skb)->protocol,
  262. skb->csum)) {
  263. IP_VS_DBG_RL_PKT(0, pp, skb, 0,
  264. "Failed checksum for");
  265. return 0;
  266. }
  267. break;
  268. default:
  269. /* No need to checksum. */
  270. break;
  271. }
  272. return 1;
  273. }
  274. #define TCP_DIR_INPUT 0
  275. #define TCP_DIR_OUTPUT 4
  276. #define TCP_DIR_INPUT_ONLY 8
  277. static const int tcp_state_off[IP_VS_DIR_LAST] = {
  278. [IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
  279. [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
  280. [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
  281. };
  282. /*
  283. * Timeout table[state]
  284. */
  285. static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
  286. [IP_VS_TCP_S_NONE] = 2*HZ,
  287. [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
  288. [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
  289. [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
  290. [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
  291. [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
  292. [IP_VS_TCP_S_CLOSE] = 10*HZ,
  293. [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
  294. [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
  295. [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
  296. [IP_VS_TCP_S_SYNACK] = 120*HZ,
  297. [IP_VS_TCP_S_LAST] = 2*HZ,
  298. };
  299. static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
  300. [IP_VS_TCP_S_NONE] = "NONE",
  301. [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
  302. [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
  303. [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
  304. [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
  305. [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
  306. [IP_VS_TCP_S_CLOSE] = "CLOSE",
  307. [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
  308. [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
  309. [IP_VS_TCP_S_LISTEN] = "LISTEN",
  310. [IP_VS_TCP_S_SYNACK] = "SYNACK",
  311. [IP_VS_TCP_S_LAST] = "BUG!",
  312. };
  313. #define sNO IP_VS_TCP_S_NONE
  314. #define sES IP_VS_TCP_S_ESTABLISHED
  315. #define sSS IP_VS_TCP_S_SYN_SENT
  316. #define sSR IP_VS_TCP_S_SYN_RECV
  317. #define sFW IP_VS_TCP_S_FIN_WAIT
  318. #define sTW IP_VS_TCP_S_TIME_WAIT
  319. #define sCL IP_VS_TCP_S_CLOSE
  320. #define sCW IP_VS_TCP_S_CLOSE_WAIT
  321. #define sLA IP_VS_TCP_S_LAST_ACK
  322. #define sLI IP_VS_TCP_S_LISTEN
  323. #define sSA IP_VS_TCP_S_SYNACK
  324. struct tcp_states_t {
  325. int next_state[IP_VS_TCP_S_LAST];
  326. };
  327. static const char * tcp_state_name(int state)
  328. {
  329. if (state >= IP_VS_TCP_S_LAST)
  330. return "ERR!";
  331. return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
  332. }
  333. static struct tcp_states_t tcp_states [] = {
  334. /* INPUT */
  335. /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
  336. /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
  337. /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
  338. /*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
  339. /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
  340. /* OUTPUT */
  341. /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
  342. /*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
  343. /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
  344. /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
  345. /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
  346. /* INPUT-ONLY */
  347. /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
  348. /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
  349. /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
  350. /*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
  351. /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
  352. };
  353. static struct tcp_states_t tcp_states_dos [] = {
  354. /* INPUT */
  355. /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
  356. /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
  357. /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
  358. /*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
  359. /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
  360. /* OUTPUT */
  361. /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
  362. /*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
  363. /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
  364. /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
  365. /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
  366. /* INPUT-ONLY */
  367. /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
  368. /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
  369. /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
  370. /*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
  371. /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
  372. };
  373. static struct tcp_states_t *tcp_state_table = tcp_states;
  374. static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
  375. {
  376. int on = (flags & 1); /* secure_tcp */
  377. /*
  378. ** FIXME: change secure_tcp to independent sysctl var
  379. ** or make it per-service or per-app because it is valid
  380. ** for most if not for all of the applications. Something
  381. ** like "capabilities" (flags) for each object.
  382. */
  383. tcp_state_table = (on? tcp_states_dos : tcp_states);
  384. }
  385. static int
  386. tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
  387. {
  388. return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
  389. tcp_state_name_table, sname, to);
  390. }
  391. static inline int tcp_state_idx(struct tcphdr *th)
  392. {
  393. if (th->rst)
  394. return 3;
  395. if (th->syn)
  396. return 0;
  397. if (th->fin)
  398. return 1;
  399. if (th->ack)
  400. return 2;
  401. return -1;
  402. }
  403. static inline void
  404. set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
  405. int direction, struct tcphdr *th)
  406. {
  407. int state_idx;
  408. int new_state = IP_VS_TCP_S_CLOSE;
  409. int state_off = tcp_state_off[direction];
  410. /*
  411. * Update state offset to INPUT_ONLY if necessary
  412. * or delete NO_OUTPUT flag if output packet detected
  413. */
  414. if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
  415. if (state_off == TCP_DIR_OUTPUT)
  416. cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
  417. else
  418. state_off = TCP_DIR_INPUT_ONLY;
  419. }
  420. if ((state_idx = tcp_state_idx(th)) < 0) {
  421. IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
  422. goto tcp_state_out;
  423. }
  424. new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
  425. tcp_state_out:
  426. if (new_state != cp->state) {
  427. struct ip_vs_dest *dest = cp->dest;
  428. IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
  429. "%s:%d state: %s->%s conn->refcnt:%d\n",
  430. pp->name,
  431. ((state_off == TCP_DIR_OUTPUT) ?
  432. "output " : "input "),
  433. th->syn ? 'S' : '.',
  434. th->fin ? 'F' : '.',
  435. th->ack ? 'A' : '.',
  436. th->rst ? 'R' : '.',
  437. IP_VS_DBG_ADDR(cp->af, &cp->daddr),
  438. ntohs(cp->dport),
  439. IP_VS_DBG_ADDR(cp->af, &cp->caddr),
  440. ntohs(cp->cport),
  441. tcp_state_name(cp->state),
  442. tcp_state_name(new_state),
  443. atomic_read(&cp->refcnt));
  444. if (dest) {
  445. if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
  446. (new_state != IP_VS_TCP_S_ESTABLISHED)) {
  447. atomic_dec(&dest->activeconns);
  448. atomic_inc(&dest->inactconns);
  449. cp->flags |= IP_VS_CONN_F_INACTIVE;
  450. } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
  451. (new_state == IP_VS_TCP_S_ESTABLISHED)) {
  452. atomic_inc(&dest->activeconns);
  453. atomic_dec(&dest->inactconns);
  454. cp->flags &= ~IP_VS_CONN_F_INACTIVE;
  455. }
  456. }
  457. }
  458. cp->timeout = pp->timeout_table[cp->state = new_state];
  459. }
  460. /*
  461. * Handle state transitions
  462. */
  463. static int
  464. tcp_state_transition(struct ip_vs_conn *cp, int direction,
  465. const struct sk_buff *skb,
  466. struct ip_vs_protocol *pp)
  467. {
  468. struct tcphdr _tcph, *th;
  469. #ifdef CONFIG_IP_VS_IPV6
  470. int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
  471. #else
  472. int ihl = ip_hdrlen(skb);
  473. #endif
  474. th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
  475. if (th == NULL)
  476. return 0;
  477. spin_lock(&cp->lock);
  478. set_tcp_state(pp, cp, direction, th);
  479. spin_unlock(&cp->lock);
  480. return 1;
  481. }
  482. /*
  483. * Hash table for TCP application incarnations
  484. */
  485. #define TCP_APP_TAB_BITS 4
  486. #define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS)
  487. #define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1)
  488. static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
  489. static DEFINE_SPINLOCK(tcp_app_lock);
  490. static inline __u16 tcp_app_hashkey(__be16 port)
  491. {
  492. return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
  493. & TCP_APP_TAB_MASK;
  494. }
  495. static int tcp_register_app(struct ip_vs_app *inc)
  496. {
  497. struct ip_vs_app *i;
  498. __u16 hash;
  499. __be16 port = inc->port;
  500. int ret = 0;
  501. hash = tcp_app_hashkey(port);
  502. spin_lock_bh(&tcp_app_lock);
  503. list_for_each_entry(i, &tcp_apps[hash], p_list) {
  504. if (i->port == port) {
  505. ret = -EEXIST;
  506. goto out;
  507. }
  508. }
  509. list_add(&inc->p_list, &tcp_apps[hash]);
  510. atomic_inc(&ip_vs_protocol_tcp.appcnt);
  511. out:
  512. spin_unlock_bh(&tcp_app_lock);
  513. return ret;
  514. }
  515. static void
  516. tcp_unregister_app(struct ip_vs_app *inc)
  517. {
  518. spin_lock_bh(&tcp_app_lock);
  519. atomic_dec(&ip_vs_protocol_tcp.appcnt);
  520. list_del(&inc->p_list);
  521. spin_unlock_bh(&tcp_app_lock);
  522. }
  523. static int
  524. tcp_app_conn_bind(struct ip_vs_conn *cp)
  525. {
  526. int hash;
  527. struct ip_vs_app *inc;
  528. int result = 0;
  529. /* Default binding: bind app only for NAT */
  530. if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
  531. return 0;
  532. /* Lookup application incarnations and bind the right one */
  533. hash = tcp_app_hashkey(cp->vport);
  534. spin_lock(&tcp_app_lock);
  535. list_for_each_entry(inc, &tcp_apps[hash], p_list) {
  536. if (inc->port == cp->vport) {
  537. if (unlikely(!ip_vs_app_inc_get(inc)))
  538. break;
  539. spin_unlock(&tcp_app_lock);
  540. IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
  541. "%s:%u to app %s on port %u\n",
  542. __func__,
  543. IP_VS_DBG_ADDR(cp->af, &cp->caddr),
  544. ntohs(cp->cport),
  545. IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
  546. ntohs(cp->vport),
  547. inc->name, ntohs(inc->port));
  548. cp->app = inc;
  549. if (inc->init_conn)
  550. result = inc->init_conn(inc, cp);
  551. goto out;
  552. }
  553. }
  554. spin_unlock(&tcp_app_lock);
  555. out:
  556. return result;
  557. }
  558. /*
  559. * Set LISTEN timeout. (ip_vs_conn_put will setup timer)
  560. */
  561. void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
  562. {
  563. spin_lock(&cp->lock);
  564. cp->state = IP_VS_TCP_S_LISTEN;
  565. cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
  566. spin_unlock(&cp->lock);
  567. }
  568. static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
  569. {
  570. IP_VS_INIT_HASH_TABLE(tcp_apps);
  571. pp->timeout_table = tcp_timeouts;
  572. }
  573. static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
  574. {
  575. }
  576. struct ip_vs_protocol ip_vs_protocol_tcp = {
  577. .name = "TCP",
  578. .protocol = IPPROTO_TCP,
  579. .num_states = IP_VS_TCP_S_LAST,
  580. .dont_defrag = 0,
  581. .appcnt = ATOMIC_INIT(0),
  582. .init = ip_vs_tcp_init,
  583. .exit = ip_vs_tcp_exit,
  584. .register_app = tcp_register_app,
  585. .unregister_app = tcp_unregister_app,
  586. .conn_schedule = tcp_conn_schedule,
  587. .conn_in_get = tcp_conn_in_get,
  588. .conn_out_get = tcp_conn_out_get,
  589. .snat_handler = tcp_snat_handler,
  590. .dnat_handler = tcp_dnat_handler,
  591. .csum_check = tcp_csum_check,
  592. .state_name = tcp_state_name,
  593. .state_transition = tcp_state_transition,
  594. .app_conn_bind = tcp_app_conn_bind,
  595. .debug_packet = ip_vs_tcpudp_debug_packet,
  596. .timeout_change = tcp_timeout_change,
  597. .set_state_timeout = tcp_set_state_timeout,
  598. };