/* ip_vs_sync.c */
  1. /*
  2. * IPVS An implementation of the IP virtual server support for the
  3. * LINUX operating system. IPVS is now implemented as a module
  4. * over the NetFilter framework. IPVS can be used to build a
  5. * high-performance and highly available server based on a
  6. * cluster of servers.
  7. *
  8. * Version: $Id: ip_vs_sync.c,v 1.13 2003/06/08 09:31:19 wensong Exp $
  9. *
  10. * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
  11. *
  12. * ip_vs_sync: sync connection info from master load balancer to backups
  13. * through multicast
  14. *
  15. * Changes:
  16. * Alexandre Cassen : Added master & backup support at a time.
  17. * Alexandre Cassen : Added SyncID support for incoming sync
  18. * messages filtering.
  19. * Justin Ossevoort : Fix endian problem on sync message size.
  20. */
  21. #include <linux/module.h>
  22. #include <linux/slab.h>
  23. #include <linux/inetdevice.h>
  24. #include <linux/net.h>
  25. #include <linux/completion.h>
  26. #include <linux/delay.h>
  27. #include <linux/skbuff.h>
  28. #include <linux/in.h>
  29. #include <linux/igmp.h> /* for ip_mc_join_group */
  30. #include <linux/udp.h>
  31. #include <net/ip.h>
  32. #include <net/sock.h>
  33. #include <asm/uaccess.h> /* for get_fs and set_fs */
  34. #include <net/ip_vs.h>
  35. #define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
  36. #define IP_VS_SYNC_PORT 8848 /* multicast port */
  37. /*
  38. * IPVS sync connection entry
  39. */
/*
 *	IPVS sync connection entry -- the on-wire record for one connection.
 *	Multi-byte fields are carried in network byte order; address and
 *	port fields are copied straight from struct ip_vs_conn, which
 *	already keeps them in network order.
 */
struct ip_vs_sync_conn {
	__u8			reserved;	/* pad byte, must be sent as zero */

	/* Protocol, addresses and port numbers */
	__u8			protocol;	/* Which protocol (TCP/UDP) */
	__u16			cport;
	__u16			vport;
	__u16			dport;
	__u32			caddr;		/* client address */
	__u32			vaddr;		/* virtual address */
	__u32			daddr;		/* destination address */

	/* Flags and state transition */
	__u16			flags;		/* status flags */
	__u16			state;		/* state info */

	/* The sequence options start here */
};
/*
 *	Optional trailer for a sync entry: TCP sequence-number adjustment
 *	state, present only when IP_VS_CONN_F_SEQ_MASK is set in flags.
 */
struct ip_vs_sync_conn_options {
	struct ip_vs_seq	in_seq;		/* incoming seq. struct */
	struct ip_vs_seq	out_seq;	/* outgoing seq. struct */
};
/* timeout given to connections created from received sync entries */
#define IP_VS_SYNC_CONN_TIMEOUT (3*60*HZ)

/* entry without / with the trailing sequence options */
#define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn))
#define FULL_CONN_SIZE  \
(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
  63. /*
  64. The master mulitcasts messages to the backup load balancers in the
  65. following format.
  66. 0 1 2 3
  67. 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  68. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  69. | Count Conns | SyncID | Size |
  70. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  71. | |
  72. | IPVS Sync Connection (1) |
  73. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  74. | . |
  75. | . |
  76. | . |
  77. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  78. | |
  79. | IPVS Sync Connection (n) |
  80. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  81. */
/* on-wire header length = sizeof(struct ip_vs_sync_mesg) */
#define SYNC_MESG_HEADER_LEN	4

/*
 *	Sync message header.  'size' is kept in HOST byte order while the
 *	message is being built and converted to network order just before
 *	transmission (see ip_vs_send_sync_msg / ip_vs_process_message).
 */
struct ip_vs_sync_mesg {
	__u8                    nr_conns;	/* number of entries that follow */
	__u8                    syncid;		/* sync instance id for filtering */
	__u16                   size;		/* total message length in bytes */

	/* ip_vs_sync_conn entries start here */
};
/* the maximum length of sync (sending/receiving) message, derived from
   the multicast interface MTU by set_sync_mesg_maxlen() */
static int sync_send_mesg_maxlen;
static int sync_recv_mesg_maxlen;
/*
 *	In-kernel buffer accumulating sync entries before multicast.
 *	Lives either as 'curr_sb' (being filled) or on ip_vs_sync_queue
 *	(full, awaiting transmission by the master loop).
 */
struct ip_vs_sync_buff {
	struct list_head        list;		/* link on ip_vs_sync_queue */
	unsigned long           firstuse;	/* jiffies of first entry, for the
						   2-second flush in the master loop */

	/* pointers for the message data */
	struct ip_vs_sync_mesg  *mesg;		/* message being built */
	unsigned char           *head;		/* write position for next entry */
	unsigned char           *end;		/* one past the last usable byte */
};
/* the sync_buff list head and the lock protecting it */
static LIST_HEAD(ip_vs_sync_queue);
static DEFINE_SPINLOCK(ip_vs_sync_lock);

/* current sync_buff for accepting new conn entries; protected by
   curr_sb_lock, which is taken from packet processing context */
static struct ip_vs_sync_buff   *curr_sb = NULL;
static DEFINE_SPINLOCK(curr_sb_lock);

/* ipvs sync daemon state (bitmask of IP_VS_STATE_MASTER/BACKUP) */
volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
volatile int ip_vs_master_syncid = 0;
volatile int ip_vs_backup_syncid = 0;

/* multicast interface name */
char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];

/* multicast addr, filled in by sync_thread() before use */
static struct sockaddr_in mcast_addr;
  115. static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
  116. {
  117. spin_lock(&ip_vs_sync_lock);
  118. list_add_tail(&sb->list, &ip_vs_sync_queue);
  119. spin_unlock(&ip_vs_sync_lock);
  120. }
  121. static inline struct ip_vs_sync_buff * sb_dequeue(void)
  122. {
  123. struct ip_vs_sync_buff *sb;
  124. spin_lock_bh(&ip_vs_sync_lock);
  125. if (list_empty(&ip_vs_sync_queue)) {
  126. sb = NULL;
  127. } else {
  128. sb = list_entry(ip_vs_sync_queue.next,
  129. struct ip_vs_sync_buff,
  130. list);
  131. list_del(&sb->list);
  132. }
  133. spin_unlock_bh(&ip_vs_sync_lock);
  134. return sb;
  135. }
  136. static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
  137. {
  138. struct ip_vs_sync_buff *sb;
  139. if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
  140. return NULL;
  141. if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
  142. kfree(sb);
  143. return NULL;
  144. }
  145. sb->mesg->nr_conns = 0;
  146. sb->mesg->syncid = ip_vs_master_syncid;
  147. sb->mesg->size = 4;
  148. sb->head = (unsigned char *)sb->mesg + 4;
  149. sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
  150. sb->firstuse = jiffies;
  151. return sb;
  152. }
  153. static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
  154. {
  155. kfree(sb->mesg);
  156. kfree(sb);
  157. }
  158. /*
  159. * Get the current sync buffer if it has been created for more
  160. * than the specified time or the specified time is zero.
  161. */
  162. static inline struct ip_vs_sync_buff *
  163. get_curr_sync_buff(unsigned long time)
  164. {
  165. struct ip_vs_sync_buff *sb;
  166. spin_lock_bh(&curr_sb_lock);
  167. if (curr_sb && (time == 0 ||
  168. time_before(jiffies - curr_sb->firstuse, time))) {
  169. sb = curr_sb;
  170. curr_sb = NULL;
  171. } else
  172. sb = NULL;
  173. spin_unlock_bh(&curr_sb_lock);
  174. return sb;
  175. }
  176. /*
  177. * Add an ip_vs_conn information into the current sync_buff.
  178. * Called by ip_vs_in.
  179. */
  180. void ip_vs_sync_conn(struct ip_vs_conn *cp)
  181. {
  182. struct ip_vs_sync_mesg *m;
  183. struct ip_vs_sync_conn *s;
  184. int len;
  185. spin_lock(&curr_sb_lock);
  186. if (!curr_sb) {
  187. if (!(curr_sb=ip_vs_sync_buff_create())) {
  188. spin_unlock(&curr_sb_lock);
  189. IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
  190. return;
  191. }
  192. }
  193. len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
  194. SIMPLE_CONN_SIZE;
  195. m = curr_sb->mesg;
  196. s = (struct ip_vs_sync_conn *)curr_sb->head;
  197. /* copy members */
  198. s->protocol = cp->protocol;
  199. s->cport = cp->cport;
  200. s->vport = cp->vport;
  201. s->dport = cp->dport;
  202. s->caddr = cp->caddr;
  203. s->vaddr = cp->vaddr;
  204. s->daddr = cp->daddr;
  205. s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
  206. s->state = htons(cp->state);
  207. if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
  208. struct ip_vs_sync_conn_options *opt =
  209. (struct ip_vs_sync_conn_options *)&s[1];
  210. memcpy(opt, &cp->in_seq, sizeof(*opt));
  211. }
  212. m->nr_conns++;
  213. m->size += len;
  214. curr_sb->head += len;
  215. /* check if there is a space for next one */
  216. if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
  217. sb_queue_tail(curr_sb);
  218. curr_sb = NULL;
  219. }
  220. spin_unlock(&curr_sb_lock);
  221. /* synchronize its controller if it has */
  222. if (cp->control)
  223. ip_vs_sync_conn(cp->control);
  224. }
  225. /*
  226. * Process received multicast message and create the corresponding
  227. * ip_vs_conn entries.
  228. */
  229. static void ip_vs_process_message(const char *buffer, const size_t buflen)
  230. {
  231. struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
  232. struct ip_vs_sync_conn *s;
  233. struct ip_vs_sync_conn_options *opt;
  234. struct ip_vs_conn *cp;
  235. char *p;
  236. int i;
  237. /* Convert size back to host byte order */
  238. m->size = ntohs(m->size);
  239. if (buflen != m->size) {
  240. IP_VS_ERR("bogus message\n");
  241. return;
  242. }
  243. /* SyncID sanity check */
  244. if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
  245. IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
  246. m->syncid);
  247. return;
  248. }
  249. p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
  250. for (i=0; i<m->nr_conns; i++) {
  251. unsigned flags;
  252. s = (struct ip_vs_sync_conn *)p;
  253. flags = ntohs(s->flags);
  254. if (!(flags & IP_VS_CONN_F_TEMPLATE))
  255. cp = ip_vs_conn_in_get(s->protocol,
  256. s->caddr, s->cport,
  257. s->vaddr, s->vport);
  258. else
  259. cp = ip_vs_ct_in_get(s->protocol,
  260. s->caddr, s->cport,
  261. s->vaddr, s->vport);
  262. if (!cp) {
  263. cp = ip_vs_conn_new(s->protocol,
  264. s->caddr, s->cport,
  265. s->vaddr, s->vport,
  266. s->daddr, s->dport,
  267. flags, NULL);
  268. if (!cp) {
  269. IP_VS_ERR("ip_vs_conn_new failed\n");
  270. return;
  271. }
  272. cp->state = ntohs(s->state);
  273. } else if (!cp->dest) {
  274. /* it is an entry created by the synchronization */
  275. cp->state = ntohs(s->state);
  276. cp->flags = flags | IP_VS_CONN_F_HASHED;
  277. } /* Note that we don't touch its state and flags
  278. if it is a normal entry. */
  279. if (flags & IP_VS_CONN_F_SEQ_MASK) {
  280. opt = (struct ip_vs_sync_conn_options *)&s[1];
  281. memcpy(&cp->in_seq, opt, sizeof(*opt));
  282. p += FULL_CONN_SIZE;
  283. } else
  284. p += SIMPLE_CONN_SIZE;
  285. atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
  286. cp->timeout = IP_VS_SYNC_CONN_TIMEOUT;
  287. ip_vs_conn_put(cp);
  288. if (p > buffer+buflen) {
  289. IP_VS_ERR("bogus message\n");
  290. return;
  291. }
  292. }
  293. }
  294. /*
  295. * Setup loopback of outgoing multicasts on a sending socket
  296. */
  297. static void set_mcast_loop(struct sock *sk, u_char loop)
  298. {
  299. struct inet_sock *inet = inet_sk(sk);
  300. /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
  301. lock_sock(sk);
  302. inet->mc_loop = loop ? 1 : 0;
  303. release_sock(sk);
  304. }
  305. /*
  306. * Specify TTL for outgoing multicasts on a sending socket
  307. */
  308. static void set_mcast_ttl(struct sock *sk, u_char ttl)
  309. {
  310. struct inet_sock *inet = inet_sk(sk);
  311. /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
  312. lock_sock(sk);
  313. inet->mc_ttl = ttl;
  314. release_sock(sk);
  315. }
  316. /*
  317. * Specifiy default interface for outgoing multicasts
  318. */
  319. static int set_mcast_if(struct sock *sk, char *ifname)
  320. {
  321. struct net_device *dev;
  322. struct inet_sock *inet = inet_sk(sk);
  323. if ((dev = __dev_get_by_name(ifname)) == NULL)
  324. return -ENODEV;
  325. if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
  326. return -EINVAL;
  327. lock_sock(sk);
  328. inet->mc_index = dev->ifindex;
  329. /* inet->mc_addr = 0; */
  330. release_sock(sk);
  331. return 0;
  332. }
/*
 *	Set the maximum length of sync message according to the
 *	specified interface's MTU.
 *	Master: sized down to a whole number of SIMPLE_CONN_SIZE entries.
 *	Backup: full MTU payload, since received messages may be anything
 *	up to what a sender with the same MTU can emit.
 *	Returns -ENODEV if the configured interface does not exist.
 */
static int set_sync_mesg_maxlen(int sync_state)
{
	struct net_device *dev;
	int num;

	if (sync_state == IP_VS_STATE_MASTER) {
		if ((dev = __dev_get_by_name(ip_vs_master_mcast_ifn)) == NULL)
			return -ENODEV;

		/* NOTE(review): the extra '- 20' slack is unexplained --
		   presumably headroom for IP options; confirm before
		   changing */
		num = (dev->mtu - sizeof(struct iphdr) -
		       sizeof(struct udphdr) -
		       SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
		sync_send_mesg_maxlen =
			SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num;
		IP_VS_DBG(7, "setting the maximum length of sync sending "
			  "message %d.\n", sync_send_mesg_maxlen);
	} else if (sync_state == IP_VS_STATE_BACKUP) {
		if ((dev = __dev_get_by_name(ip_vs_backup_mcast_ifn)) == NULL)
			return -ENODEV;

		sync_recv_mesg_maxlen = dev->mtu -
			sizeof(struct iphdr) - sizeof(struct udphdr);
		IP_VS_DBG(7, "setting the maximum length of sync receiving "
			  "message %d.\n", sync_recv_mesg_maxlen);
	}

	return 0;
}
  361. /*
  362. * Join a multicast group.
  363. * the group is specified by a class D multicast address 224.0.0.0/8
  364. * in the in_addr structure passed in as a parameter.
  365. */
  366. static int
  367. join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
  368. {
  369. struct ip_mreqn mreq;
  370. struct net_device *dev;
  371. int ret;
  372. memset(&mreq, 0, sizeof(mreq));
  373. memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
  374. if ((dev = __dev_get_by_name(ifname)) == NULL)
  375. return -ENODEV;
  376. if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
  377. return -EINVAL;
  378. mreq.imr_ifindex = dev->ifindex;
  379. lock_sock(sk);
  380. ret = ip_mc_join_group(sk, &mreq);
  381. release_sock(sk);
  382. return ret;
  383. }
  384. static int bind_mcastif_addr(struct socket *sock, char *ifname)
  385. {
  386. struct net_device *dev;
  387. u32 addr;
  388. struct sockaddr_in sin;
  389. if ((dev = __dev_get_by_name(ifname)) == NULL)
  390. return -ENODEV;
  391. addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
  392. if (!addr)
  393. IP_VS_ERR("You probably need to specify IP address on "
  394. "multicast interface.\n");
  395. IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
  396. ifname, NIPQUAD(addr));
  397. /* Now bind the socket with the address of multicast interface */
  398. sin.sin_family = AF_INET;
  399. sin.sin_addr.s_addr = addr;
  400. sin.sin_port = 0;
  401. return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
  402. }
  403. /*
  404. * Set up sending multicast socket over UDP
  405. */
  406. static struct socket * make_send_sock(void)
  407. {
  408. struct socket *sock;
  409. /* First create a socket */
  410. if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
  411. IP_VS_ERR("Error during creation of socket; terminating\n");
  412. return NULL;
  413. }
  414. if (set_mcast_if(sock->sk, ip_vs_master_mcast_ifn) < 0) {
  415. IP_VS_ERR("Error setting outbound mcast interface\n");
  416. goto error;
  417. }
  418. set_mcast_loop(sock->sk, 0);
  419. set_mcast_ttl(sock->sk, 1);
  420. if (bind_mcastif_addr(sock, ip_vs_master_mcast_ifn) < 0) {
  421. IP_VS_ERR("Error binding address of the mcast interface\n");
  422. goto error;
  423. }
  424. if (sock->ops->connect(sock,
  425. (struct sockaddr*)&mcast_addr,
  426. sizeof(struct sockaddr), 0) < 0) {
  427. IP_VS_ERR("Error connecting to the multicast addr\n");
  428. goto error;
  429. }
  430. return sock;
  431. error:
  432. sock_release(sock);
  433. return NULL;
  434. }
  435. /*
  436. * Set up receiving multicast socket over UDP
  437. */
  438. static struct socket * make_receive_sock(void)
  439. {
  440. struct socket *sock;
  441. /* First create a socket */
  442. if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
  443. IP_VS_ERR("Error during creation of socket; terminating\n");
  444. return NULL;
  445. }
  446. /* it is equivalent to the REUSEADDR option in user-space */
  447. sock->sk->sk_reuse = 1;
  448. if (sock->ops->bind(sock,
  449. (struct sockaddr*)&mcast_addr,
  450. sizeof(struct sockaddr)) < 0) {
  451. IP_VS_ERR("Error binding to the multicast addr\n");
  452. goto error;
  453. }
  454. /* join the multicast group */
  455. if (join_mcast_group(sock->sk,
  456. (struct in_addr*)&mcast_addr.sin_addr,
  457. ip_vs_backup_mcast_ifn) < 0) {
  458. IP_VS_ERR("Error joining to the multicast group\n");
  459. goto error;
  460. }
  461. return sock;
  462. error:
  463. sock_release(sock);
  464. return NULL;
  465. }
  466. static int
  467. ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
  468. {
  469. struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
  470. struct kvec iov;
  471. int len;
  472. EnterFunction(7);
  473. iov.iov_base = (void *)buffer;
  474. iov.iov_len = length;
  475. len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
  476. LeaveFunction(7);
  477. return len;
  478. }
  479. static void
  480. ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
  481. {
  482. int msize;
  483. msize = msg->size;
  484. /* Put size in network byte order */
  485. msg->size = htons(msg->size);
  486. if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
  487. IP_VS_ERR("ip_vs_send_async error\n");
  488. }
  489. static int
  490. ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
  491. {
  492. struct msghdr msg = {NULL,};
  493. struct kvec iov;
  494. int len;
  495. EnterFunction(7);
  496. /* Receive a packet */
  497. iov.iov_base = buffer;
  498. iov.iov_len = (size_t)buflen;
  499. len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
  500. if (len < 0)
  501. return -1;
  502. LeaveFunction(7);
  503. return len;
  504. }
/* woken by stop_sync_thread() to nudge a sleeping sync daemon */
static DECLARE_WAIT_QUEUE_HEAD(sync_wait);
/* pids of the running daemons; 0 means "not running" */
static pid_t sync_master_pid = 0;
static pid_t sync_backup_pid = 0;

/* stop_sync_thread() sleeps here until the daemon confirms exit */
static DECLARE_WAIT_QUEUE_HEAD(stop_sync_wait);
/* stop requests; cleared by the daemon itself on exit */
static int stop_master_sync = 0;
static int stop_backup_sync = 0;
/*
 *	Master sync daemon main loop: multicast every completed sync
 *	buffer from the queue, flush the in-progress buffer once it has
 *	been collecting entries long enough, and sleep a second between
 *	passes.  On stop request, everything still pending (queued and
 *	current) is flushed before the socket is released.
 */
static void sync_master_loop(void)
{
	struct socket *sock;
	struct ip_vs_sync_buff *sb;

	/* create the sending multicast socket */
	sock = make_send_sock();
	if (!sock)
		return;

	IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
		   "syncid = %d\n",
		   ip_vs_master_mcast_ifn, ip_vs_master_syncid);

	for (;;) {
		/* drain all completed buffers */
		while ((sb = sb_dequeue())) {
			ip_vs_send_sync_msg(sock, sb->mesg);
			ip_vs_sync_buff_release(sb);
		}

		/* check if entries stay in curr_sb for 2 seconds */
		if ((sb = get_curr_sync_buff(2*HZ))) {
			ip_vs_send_sync_msg(sock, sb->mesg);
			ip_vs_sync_buff_release(sb);
		}

		if (stop_master_sync)
			break;

		ssleep(1);
	}

	/* clean up the sync_buff queue */
	while ((sb = sb_dequeue())) {
		ip_vs_sync_buff_release(sb);
	}

	/* clean up the current sync_buff */
	if ((sb = get_curr_sync_buff(0))) {
		ip_vs_sync_buff_release(sb);
	}

	/* release the sending multicast socket */
	sock_release(sock);
}
  547. static void sync_backup_loop(void)
  548. {
  549. struct socket *sock;
  550. char *buf;
  551. int len;
  552. if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) {
  553. IP_VS_ERR("sync_backup_loop: kmalloc error\n");
  554. return;
  555. }
  556. /* create the receiving multicast socket */
  557. sock = make_receive_sock();
  558. if (!sock)
  559. goto out;
  560. IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
  561. "syncid = %d\n",
  562. ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
  563. for (;;) {
  564. /* do you have data now? */
  565. while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) {
  566. if ((len =
  567. ip_vs_receive(sock, buf,
  568. sync_recv_mesg_maxlen)) <= 0) {
  569. IP_VS_ERR("receiving message error\n");
  570. break;
  571. }
  572. /* disable bottom half, because it accessed the data
  573. shared by softirq while getting/creating conns */
  574. local_bh_disable();
  575. ip_vs_process_message(buf, len);
  576. local_bh_enable();
  577. }
  578. if (stop_backup_sync)
  579. break;
  580. ssleep(1);
  581. }
  582. /* release the sending multicast socket */
  583. sock_release(sock);
  584. out:
  585. kfree(buf);
  586. }
  587. static void set_sync_pid(int sync_state, pid_t sync_pid)
  588. {
  589. if (sync_state == IP_VS_STATE_MASTER)
  590. sync_master_pid = sync_pid;
  591. else if (sync_state == IP_VS_STATE_BACKUP)
  592. sync_backup_pid = sync_pid;
  593. }
  594. static void set_stop_sync(int sync_state, int set)
  595. {
  596. if (sync_state == IP_VS_STATE_MASTER)
  597. stop_master_sync = set;
  598. else if (sync_state == IP_VS_STATE_BACKUP)
  599. stop_backup_sync = set;
  600. else {
  601. stop_master_sync = set;
  602. stop_backup_sync = set;
  603. }
  604. }
/*
 *	Body of both sync daemons.  Decides from ip_vs_sync_state which
 *	role (master or backup) this fresh kernel thread takes,
 *	daemonizes, blocks all signals, sizes the sync message for the
 *	interface MTU and runs the matching loop until told to stop.
 *	'startup' is a struct completion* signalled once the pid is
 *	registered so start_sync_thread() can return.
 */
static int sync_thread(void *startup)
{
	DECLARE_WAITQUEUE(wait, current);
	mm_segment_t oldmm;
	int state;
	const char *name;

	/* increase the module use count */
	ip_vs_use_count_inc();

	/* pick the role that was requested but has no thread yet */
	if (ip_vs_sync_state & IP_VS_STATE_MASTER && !sync_master_pid) {
		state = IP_VS_STATE_MASTER;
		name = "ipvs_syncmaster";
	} else if (ip_vs_sync_state & IP_VS_STATE_BACKUP && !sync_backup_pid) {
		state = IP_VS_STATE_BACKUP;
		name = "ipvs_syncbackup";
	} else {
		IP_VS_BUG();
		ip_vs_use_count_dec();
		return -EINVAL;
	}

	daemonize(name);

	oldmm = get_fs();
	set_fs(KERNEL_DS);	/* socket helpers below take kernel-space buffers */

	/* Block all signals */
	spin_lock_irq(&current->sighand->siglock);
	siginitsetinv(&current->blocked, 0);
	recalc_sigpending();
	spin_unlock_irq(&current->sighand->siglock);

	/* set the maximum length of sync message */
	set_sync_mesg_maxlen(state);

	/* set up multicast address */
	mcast_addr.sin_family = AF_INET;
	mcast_addr.sin_port = htons(IP_VS_SYNC_PORT);
	mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP);

	add_wait_queue(&sync_wait, &wait);

	set_sync_pid(state, current->pid);
	/* release start_sync_thread() now that the pid is recorded */
	complete((struct completion *)startup);

	/* processing master/backup loop here */
	if (state == IP_VS_STATE_MASTER)
		sync_master_loop();
	else if (state == IP_VS_STATE_BACKUP)
		sync_backup_loop();
	else IP_VS_BUG();

	remove_wait_queue(&sync_wait, &wait);

	/* thread exits */
	set_sync_pid(state, 0);
	IP_VS_INFO("sync thread stopped!\n");

	set_fs(oldmm);

	/* decrease the module use count */
	ip_vs_use_count_dec();

	/* clear the stop flag and wake any waiter in stop_sync_thread() */
	set_stop_sync(state, 0);
	wake_up(&stop_sync_wait);

	return 0;
}
  658. static int fork_sync_thread(void *startup)
  659. {
  660. pid_t pid;
  661. /* fork the sync thread here, then the parent process of the
  662. sync thread is the init process after this thread exits. */
  663. repeat:
  664. if ((pid = kernel_thread(sync_thread, startup, 0)) < 0) {
  665. IP_VS_ERR("could not create sync_thread due to %d... "
  666. "retrying.\n", pid);
  667. ssleep(1);
  668. goto repeat;
  669. }
  670. return 0;
  671. }
/*
 *	Start a sync daemon for the given role (master or backup) on the
 *	given multicast interface with the given syncid.
 *	Blocks on an on-stack completion until the daemon has registered
 *	its pid, so returning is safe even though the daemon outlives
 *	this stack frame.  Returns -EEXIST if the role already runs.
 */
int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
{
	DECLARE_COMPLETION(startup);
	pid_t pid;

	if ((state == IP_VS_STATE_MASTER && sync_master_pid) ||
	    (state == IP_VS_STATE_BACKUP && sync_backup_pid))
		return -EEXIST;

	IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
	IP_VS_DBG(7, "Each ip_vs_sync_conn entry need %Zd bytes\n",
		  sizeof(struct ip_vs_sync_conn));

	/* publish the configuration the daemon will read in sync_thread() */
	ip_vs_sync_state |= state;
	if (state == IP_VS_STATE_MASTER) {
		strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, sizeof(ip_vs_master_mcast_ifn));
		ip_vs_master_syncid = syncid;
	} else {
		strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, sizeof(ip_vs_backup_mcast_ifn));
		ip_vs_backup_syncid = syncid;
	}

  repeat:
	if ((pid = kernel_thread(fork_sync_thread, &startup, 0)) < 0) {
		IP_VS_ERR("could not create fork_sync_thread due to %d... "
			  "retrying.\n", pid);
		ssleep(1);
		goto repeat;
	}

	/* wait for the daemon to record its pid before returning */
	wait_for_completion(&startup);

	return 0;
}
/*
 *	Stop the sync daemon for the given role.
 *	Handshake: raise the stop flag, wake the daemon via sync_wait,
 *	then sleep uninterruptibly on stop_sync_wait until the daemon
 *	clears the flag and wakes us on its way out.
 *	Returns -ESRCH if no daemon runs for that role.
 */
int stop_sync_thread(int state)
{
	DECLARE_WAITQUEUE(wait, current);

	if ((state == IP_VS_STATE_MASTER && !sync_master_pid) ||
	    (state == IP_VS_STATE_BACKUP && !sync_backup_pid))
		return -ESRCH;

	IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
	IP_VS_INFO("stopping sync thread %d ...\n",
		   (state == IP_VS_STATE_MASTER) ? sync_master_pid : sync_backup_pid);

	/* queue ourselves BEFORE setting the stop flag, so the daemon's
	   wake_up(&stop_sync_wait) cannot be missed */
	__set_current_state(TASK_UNINTERRUPTIBLE);
	add_wait_queue(&stop_sync_wait, &wait);
	set_stop_sync(state, 1);
	ip_vs_sync_state -= state;
	wake_up(&sync_wait);
	schedule();
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&stop_sync_wait, &wait);

	/* Note: no need to reap the sync thread, because its parent
	   process is the init process */

	/* the daemon clears the flag on exit; a still-set flag means we
	   were woken without the daemon having stopped */
	if ((state == IP_VS_STATE_MASTER && stop_master_sync) ||
	    (state == IP_VS_STATE_BACKUP && stop_backup_sync))
		IP_VS_BUG();

	return 0;
}