ip_vs_sync.c 22 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900
  1. /*
  2. * IPVS An implementation of the IP virtual server support for the
  3. * LINUX operating system. IPVS is now implemented as a module
  4. * over the NetFilter framework. IPVS can be used to build a
  5. * high-performance and highly available server based on a
  6. * cluster of servers.
  7. *
  8. * Version: $Id: ip_vs_sync.c,v 1.13 2003/06/08 09:31:19 wensong Exp $
  9. *
  10. * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
  11. *
  12. * ip_vs_sync: sync connection info from master load balancer to backups
  13. * through multicast
  14. *
  15. * Changes:
  16. * Alexandre Cassen : Added master & backup support at a time.
  17. * Alexandre Cassen : Added SyncID support for incoming sync
  18. * messages filtering.
  19. * Justin Ossevoort : Fix endian problem on sync message size.
  20. */
  21. #include <linux/module.h>
  22. #include <linux/slab.h>
  23. #include <linux/net.h>
  24. #include <linux/completion.h>
  25. #include <linux/delay.h>
  26. #include <linux/skbuff.h>
  27. #include <linux/in.h>
  28. #include <linux/igmp.h> /* for ip_mc_join_group */
  29. #include <net/ip.h>
  30. #include <net/sock.h>
  31. #include <asm/uaccess.h> /* for get_fs and set_fs */
  32. #include <net/ip_vs.h>
  /*
   * Multicast group used for sync traffic: 224.0.0.81, kept in host byte
   * order (it is passed through htonl() before being put in a sockaddr).
   */
  #define IP_VS_SYNC_GROUP 0xe0000051
  /* UDP port of the sync multicast group, host byte order. */
  #define IP_VS_SYNC_PORT 8848
  35. /*
  36. * IPVS sync connection entry
  37. */
  38. struct ip_vs_sync_conn {
  39. __u8 reserved;
  40. /* Protocol, addresses and port numbers */
  41. __u8 protocol; /* Which protocol (TCP/UDP) */
  42. __u16 cport;
  43. __u16 vport;
  44. __u16 dport;
  45. __u32 caddr; /* client address */
  46. __u32 vaddr; /* virtual address */
  47. __u32 daddr; /* destination address */
  48. /* Flags and state transition */
  49. __u16 flags; /* status flags */
  50. __u16 state; /* state info */
  51. /* The sequence options start here */
  52. };
  53. struct ip_vs_sync_conn_options {
  54. struct ip_vs_seq in_seq; /* incoming seq. struct */
  55. struct ip_vs_seq out_seq; /* outgoing seq. struct */
  56. };
  /* Timeout assigned to every connection touched by a received sync entry. */
  #define IP_VS_SYNC_CONN_TIMEOUT (3*60*HZ)
  /* Bytes taken by a sync entry without the trailing sequence options... */
  #define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn))
  /* ...and with them. */
  #define FULL_CONN_SIZE \
      (SIMPLE_CONN_SIZE + sizeof(struct ip_vs_sync_conn_options))
  61. /*
  62. The master multicasts messages to the backup load balancers in the
  63. following format.
  64. 0 1 2 3
  65. 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  66. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  67. | Count Conns | SyncID | Size |
  68. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  69. | |
  70. | IPVS Sync Connection (1) |
  71. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  72. | . |
  73. | . |
  74. | . |
  75. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  76. | |
  77. | IPVS Sync Connection (n) |
  78. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  79. */
  80. #define SYNC_MESG_HEADER_LEN 4
  81. struct ip_vs_sync_mesg {
  82. __u8 nr_conns;
  83. __u8 syncid;
  84. __u16 size;
  85. /* ip_vs_sync_conn entries start here */
  86. };
  /*
   * Message size limits, computed by set_sync_mesg_maxlen() from the MTU of
   * the multicast interface.
   */
  static int sync_send_mesg_maxlen;   /* size of each buffer the master fills */
  static int sync_recv_mesg_maxlen;   /* size of the backup's receive buffer */
  90. struct ip_vs_sync_buff {
  91. struct list_head list;
  92. unsigned long firstuse;
  93. /* pointers for the message data */
  94. struct ip_vs_sync_mesg *mesg;
  95. unsigned char *head;
  96. unsigned char *end;
  97. };
  98. /* the sync_buff list head and the lock */
  99. static LIST_HEAD(ip_vs_sync_queue);
  100. static DEFINE_SPINLOCK(ip_vs_sync_lock);
  101. /* current sync_buff for accepting new conn entries */
  102. static struct ip_vs_sync_buff *curr_sb = NULL;
  103. static DEFINE_SPINLOCK(curr_sb_lock);
  104. /* ipvs sync daemon state */
  105. volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
  106. volatile int ip_vs_master_syncid = 0;
  107. volatile int ip_vs_backup_syncid = 0;
  108. /* multicast interface name */
  109. char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
  110. char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
  111. /* multicast addr */
  112. static struct sockaddr_in mcast_addr;
  113. static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
  114. {
  115. spin_lock(&ip_vs_sync_lock);
  116. list_add_tail(&sb->list, &ip_vs_sync_queue);
  117. spin_unlock(&ip_vs_sync_lock);
  118. }
  119. static inline struct ip_vs_sync_buff * sb_dequeue(void)
  120. {
  121. struct ip_vs_sync_buff *sb;
  122. spin_lock_bh(&ip_vs_sync_lock);
  123. if (list_empty(&ip_vs_sync_queue)) {
  124. sb = NULL;
  125. } else {
  126. sb = list_entry(ip_vs_sync_queue.next,
  127. struct ip_vs_sync_buff,
  128. list);
  129. list_del(&sb->list);
  130. }
  131. spin_unlock_bh(&ip_vs_sync_lock);
  132. return sb;
  133. }
  134. static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
  135. {
  136. struct ip_vs_sync_buff *sb;
  137. if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
  138. return NULL;
  139. if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
  140. kfree(sb);
  141. return NULL;
  142. }
  143. sb->mesg->nr_conns = 0;
  144. sb->mesg->syncid = ip_vs_master_syncid;
  145. sb->mesg->size = 4;
  146. sb->head = (unsigned char *)sb->mesg + 4;
  147. sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
  148. sb->firstuse = jiffies;
  149. return sb;
  150. }
  151. static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
  152. {
  153. kfree(sb->mesg);
  154. kfree(sb);
  155. }
  156. /*
  157. * Get the current sync buffer if it has been created for more
  158. * than the specified time or the specified time is zero.
  159. */
  160. static inline struct ip_vs_sync_buff *
  161. get_curr_sync_buff(unsigned long time)
  162. {
  163. struct ip_vs_sync_buff *sb;
  164. spin_lock_bh(&curr_sb_lock);
  165. if (curr_sb && (time == 0 ||
  166. time_before(jiffies - curr_sb->firstuse, time))) {
  167. sb = curr_sb;
  168. curr_sb = NULL;
  169. } else
  170. sb = NULL;
  171. spin_unlock_bh(&curr_sb_lock);
  172. return sb;
  173. }
  174. /*
  175. * Add an ip_vs_conn information into the current sync_buff.
  176. * Called by ip_vs_in.
  177. */
  178. void ip_vs_sync_conn(struct ip_vs_conn *cp)
  179. {
  180. struct ip_vs_sync_mesg *m;
  181. struct ip_vs_sync_conn *s;
  182. int len;
  183. spin_lock(&curr_sb_lock);
  184. if (!curr_sb) {
  185. if (!(curr_sb=ip_vs_sync_buff_create())) {
  186. spin_unlock(&curr_sb_lock);
  187. IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
  188. return;
  189. }
  190. }
  191. len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
  192. SIMPLE_CONN_SIZE;
  193. m = curr_sb->mesg;
  194. s = (struct ip_vs_sync_conn *)curr_sb->head;
  195. /* copy members */
  196. s->protocol = cp->protocol;
  197. s->cport = cp->cport;
  198. s->vport = cp->vport;
  199. s->dport = cp->dport;
  200. s->caddr = cp->caddr;
  201. s->vaddr = cp->vaddr;
  202. s->daddr = cp->daddr;
  203. s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
  204. s->state = htons(cp->state);
  205. if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
  206. struct ip_vs_sync_conn_options *opt =
  207. (struct ip_vs_sync_conn_options *)&s[1];
  208. memcpy(opt, &cp->in_seq, sizeof(*opt));
  209. }
  210. m->nr_conns++;
  211. m->size += len;
  212. curr_sb->head += len;
  213. /* check if there is a space for next one */
  214. if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
  215. sb_queue_tail(curr_sb);
  216. curr_sb = NULL;
  217. }
  218. spin_unlock(&curr_sb_lock);
  219. /* synchronize its controller if it has */
  220. if (cp->control)
  221. ip_vs_sync_conn(cp->control);
  222. }
  223. /*
  224. * Process received multicast message and create the corresponding
  225. * ip_vs_conn entries.
  226. */
  227. static void ip_vs_process_message(const char *buffer, const size_t buflen)
  228. {
  229. struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
  230. struct ip_vs_sync_conn *s;
  231. struct ip_vs_sync_conn_options *opt;
  232. struct ip_vs_conn *cp;
  233. char *p;
  234. int i;
  235. /* Convert size back to host byte order */
  236. m->size = ntohs(m->size);
  237. if (buflen != m->size) {
  238. IP_VS_ERR("bogus message\n");
  239. return;
  240. }
  241. /* SyncID sanity check */
  242. if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
  243. IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
  244. m->syncid);
  245. return;
  246. }
  247. p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
  248. for (i=0; i<m->nr_conns; i++) {
  249. unsigned flags;
  250. s = (struct ip_vs_sync_conn *)p;
  251. flags = ntohs(s->flags);
  252. if (!(flags & IP_VS_CONN_F_TEMPLATE))
  253. cp = ip_vs_conn_in_get(s->protocol,
  254. s->caddr, s->cport,
  255. s->vaddr, s->vport);
  256. else
  257. cp = ip_vs_ct_in_get(s->protocol,
  258. s->caddr, s->cport,
  259. s->vaddr, s->vport);
  260. if (!cp) {
  261. cp = ip_vs_conn_new(s->protocol,
  262. s->caddr, s->cport,
  263. s->vaddr, s->vport,
  264. s->daddr, s->dport,
  265. flags, NULL);
  266. if (!cp) {
  267. IP_VS_ERR("ip_vs_conn_new failed\n");
  268. return;
  269. }
  270. cp->state = ntohs(s->state);
  271. } else if (!cp->dest) {
  272. /* it is an entry created by the synchronization */
  273. cp->state = ntohs(s->state);
  274. cp->flags = flags | IP_VS_CONN_F_HASHED;
  275. } /* Note that we don't touch its state and flags
  276. if it is a normal entry. */
  277. if (flags & IP_VS_CONN_F_SEQ_MASK) {
  278. opt = (struct ip_vs_sync_conn_options *)&s[1];
  279. memcpy(&cp->in_seq, opt, sizeof(*opt));
  280. p += FULL_CONN_SIZE;
  281. } else
  282. p += SIMPLE_CONN_SIZE;
  283. atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
  284. cp->timeout = IP_VS_SYNC_CONN_TIMEOUT;
  285. ip_vs_conn_put(cp);
  286. if (p > buffer+buflen) {
  287. IP_VS_ERR("bogus message\n");
  288. return;
  289. }
  290. }
  291. }
  292. /*
  293. * Setup loopback of outgoing multicasts on a sending socket
  294. */
  295. static void set_mcast_loop(struct sock *sk, u_char loop)
  296. {
  297. struct inet_sock *inet = inet_sk(sk);
  298. /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
  299. lock_sock(sk);
  300. inet->mc_loop = loop ? 1 : 0;
  301. release_sock(sk);
  302. }
  303. /*
  304. * Specify TTL for outgoing multicasts on a sending socket
  305. */
  306. static void set_mcast_ttl(struct sock *sk, u_char ttl)
  307. {
  308. struct inet_sock *inet = inet_sk(sk);
  309. /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
  310. lock_sock(sk);
  311. inet->mc_ttl = ttl;
  312. release_sock(sk);
  313. }
  314. /*
  315. * Specifiy default interface for outgoing multicasts
  316. */
  317. static int set_mcast_if(struct sock *sk, char *ifname)
  318. {
  319. struct net_device *dev;
  320. struct inet_sock *inet = inet_sk(sk);
  321. if ((dev = __dev_get_by_name(ifname)) == NULL)
  322. return -ENODEV;
  323. if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
  324. return -EINVAL;
  325. lock_sock(sk);
  326. inet->mc_index = dev->ifindex;
  327. /* inet->mc_addr = 0; */
  328. release_sock(sk);
  329. return 0;
  330. }
  331. /*
  332. * Set the maximum length of sync message according to the
  333. * specified interface's MTU.
  334. */
  335. static int set_sync_mesg_maxlen(int sync_state)
  336. {
  337. struct net_device *dev;
  338. int num;
  339. if (sync_state == IP_VS_STATE_MASTER) {
  340. if ((dev = __dev_get_by_name(ip_vs_master_mcast_ifn)) == NULL)
  341. return -ENODEV;
  342. num = (dev->mtu - sizeof(struct iphdr) -
  343. sizeof(struct udphdr) -
  344. SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
  345. sync_send_mesg_maxlen =
  346. SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num;
  347. IP_VS_DBG(7, "setting the maximum length of sync sending "
  348. "message %d.\n", sync_send_mesg_maxlen);
  349. } else if (sync_state == IP_VS_STATE_BACKUP) {
  350. if ((dev = __dev_get_by_name(ip_vs_backup_mcast_ifn)) == NULL)
  351. return -ENODEV;
  352. sync_recv_mesg_maxlen = dev->mtu -
  353. sizeof(struct iphdr) - sizeof(struct udphdr);
  354. IP_VS_DBG(7, "setting the maximum length of sync receiving "
  355. "message %d.\n", sync_recv_mesg_maxlen);
  356. }
  357. return 0;
  358. }
  359. /*
  360. * Join a multicast group.
  361. * the group is specified by a class D multicast address 224.0.0.0/8
  362. * in the in_addr structure passed in as a parameter.
  363. */
  364. static int
  365. join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
  366. {
  367. struct ip_mreqn mreq;
  368. struct net_device *dev;
  369. int ret;
  370. memset(&mreq, 0, sizeof(mreq));
  371. memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
  372. if ((dev = __dev_get_by_name(ifname)) == NULL)
  373. return -ENODEV;
  374. if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
  375. return -EINVAL;
  376. mreq.imr_ifindex = dev->ifindex;
  377. lock_sock(sk);
  378. ret = ip_mc_join_group(sk, &mreq);
  379. release_sock(sk);
  380. return ret;
  381. }
  382. static int bind_mcastif_addr(struct socket *sock, char *ifname)
  383. {
  384. struct net_device *dev;
  385. u32 addr;
  386. struct sockaddr_in sin;
  387. if ((dev = __dev_get_by_name(ifname)) == NULL)
  388. return -ENODEV;
  389. addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
  390. if (!addr)
  391. IP_VS_ERR("You probably need to specify IP address on "
  392. "multicast interface.\n");
  393. IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
  394. ifname, NIPQUAD(addr));
  395. /* Now bind the socket with the address of multicast interface */
  396. sin.sin_family = AF_INET;
  397. sin.sin_addr.s_addr = addr;
  398. sin.sin_port = 0;
  399. return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
  400. }
  401. /*
  402. * Set up sending multicast socket over UDP
  403. */
  404. static struct socket * make_send_sock(void)
  405. {
  406. struct socket *sock;
  407. /* First create a socket */
  408. if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
  409. IP_VS_ERR("Error during creation of socket; terminating\n");
  410. return NULL;
  411. }
  412. if (set_mcast_if(sock->sk, ip_vs_master_mcast_ifn) < 0) {
  413. IP_VS_ERR("Error setting outbound mcast interface\n");
  414. goto error;
  415. }
  416. set_mcast_loop(sock->sk, 0);
  417. set_mcast_ttl(sock->sk, 1);
  418. if (bind_mcastif_addr(sock, ip_vs_master_mcast_ifn) < 0) {
  419. IP_VS_ERR("Error binding address of the mcast interface\n");
  420. goto error;
  421. }
  422. if (sock->ops->connect(sock,
  423. (struct sockaddr*)&mcast_addr,
  424. sizeof(struct sockaddr), 0) < 0) {
  425. IP_VS_ERR("Error connecting to the multicast addr\n");
  426. goto error;
  427. }
  428. return sock;
  429. error:
  430. sock_release(sock);
  431. return NULL;
  432. }
  433. /*
  434. * Set up receiving multicast socket over UDP
  435. */
  436. static struct socket * make_receive_sock(void)
  437. {
  438. struct socket *sock;
  439. /* First create a socket */
  440. if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
  441. IP_VS_ERR("Error during creation of socket; terminating\n");
  442. return NULL;
  443. }
  444. /* it is equivalent to the REUSEADDR option in user-space */
  445. sock->sk->sk_reuse = 1;
  446. if (sock->ops->bind(sock,
  447. (struct sockaddr*)&mcast_addr,
  448. sizeof(struct sockaddr)) < 0) {
  449. IP_VS_ERR("Error binding to the multicast addr\n");
  450. goto error;
  451. }
  452. /* join the multicast group */
  453. if (join_mcast_group(sock->sk,
  454. (struct in_addr*)&mcast_addr.sin_addr,
  455. ip_vs_backup_mcast_ifn) < 0) {
  456. IP_VS_ERR("Error joining to the multicast group\n");
  457. goto error;
  458. }
  459. return sock;
  460. error:
  461. sock_release(sock);
  462. return NULL;
  463. }
  464. static int
  465. ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
  466. {
  467. struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
  468. struct kvec iov;
  469. int len;
  470. EnterFunction(7);
  471. iov.iov_base = (void *)buffer;
  472. iov.iov_len = length;
  473. len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
  474. LeaveFunction(7);
  475. return len;
  476. }
  477. static void
  478. ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
  479. {
  480. int msize;
  481. msize = msg->size;
  482. /* Put size in network byte order */
  483. msg->size = htons(msg->size);
  484. if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
  485. IP_VS_ERR("ip_vs_send_async error\n");
  486. }
  487. static int
  488. ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
  489. {
  490. struct msghdr msg = {NULL,};
  491. struct kvec iov;
  492. int len;
  493. EnterFunction(7);
  494. /* Receive a packet */
  495. iov.iov_base = buffer;
  496. iov.iov_len = (size_t)buflen;
  497. len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
  498. if (len < 0)
  499. return -1;
  500. LeaveFunction(7);
  501. return len;
  502. }
  503. static DECLARE_WAIT_QUEUE_HEAD(sync_wait);
  504. static pid_t sync_master_pid = 0;
  505. static pid_t sync_backup_pid = 0;
  506. static DECLARE_WAIT_QUEUE_HEAD(stop_sync_wait);
  507. static int stop_master_sync = 0;
  508. static int stop_backup_sync = 0;
  509. static void sync_master_loop(void)
  510. {
  511. struct socket *sock;
  512. struct ip_vs_sync_buff *sb;
  513. /* create the sending multicast socket */
  514. sock = make_send_sock();
  515. if (!sock)
  516. return;
  517. IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
  518. "syncid = %d\n",
  519. ip_vs_master_mcast_ifn, ip_vs_master_syncid);
  520. for (;;) {
  521. while ((sb=sb_dequeue())) {
  522. ip_vs_send_sync_msg(sock, sb->mesg);
  523. ip_vs_sync_buff_release(sb);
  524. }
  525. /* check if entries stay in curr_sb for 2 seconds */
  526. if ((sb = get_curr_sync_buff(2*HZ))) {
  527. ip_vs_send_sync_msg(sock, sb->mesg);
  528. ip_vs_sync_buff_release(sb);
  529. }
  530. if (stop_master_sync)
  531. break;
  532. ssleep(1);
  533. }
  534. /* clean up the sync_buff queue */
  535. while ((sb=sb_dequeue())) {
  536. ip_vs_sync_buff_release(sb);
  537. }
  538. /* clean up the current sync_buff */
  539. if ((sb = get_curr_sync_buff(0))) {
  540. ip_vs_sync_buff_release(sb);
  541. }
  542. /* release the sending multicast socket */
  543. sock_release(sock);
  544. }
  545. static void sync_backup_loop(void)
  546. {
  547. struct socket *sock;
  548. char *buf;
  549. int len;
  550. if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) {
  551. IP_VS_ERR("sync_backup_loop: kmalloc error\n");
  552. return;
  553. }
  554. /* create the receiving multicast socket */
  555. sock = make_receive_sock();
  556. if (!sock)
  557. goto out;
  558. IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
  559. "syncid = %d\n",
  560. ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
  561. for (;;) {
  562. /* do you have data now? */
  563. while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) {
  564. if ((len =
  565. ip_vs_receive(sock, buf,
  566. sync_recv_mesg_maxlen)) <= 0) {
  567. IP_VS_ERR("receiving message error\n");
  568. break;
  569. }
  570. /* disable bottom half, because it accessed the data
  571. shared by softirq while getting/creating conns */
  572. local_bh_disable();
  573. ip_vs_process_message(buf, len);
  574. local_bh_enable();
  575. }
  576. if (stop_backup_sync)
  577. break;
  578. ssleep(1);
  579. }
  580. /* release the sending multicast socket */
  581. sock_release(sock);
  582. out:
  583. kfree(buf);
  584. }
  585. static void set_sync_pid(int sync_state, pid_t sync_pid)
  586. {
  587. if (sync_state == IP_VS_STATE_MASTER)
  588. sync_master_pid = sync_pid;
  589. else if (sync_state == IP_VS_STATE_BACKUP)
  590. sync_backup_pid = sync_pid;
  591. }
  592. static void set_stop_sync(int sync_state, int set)
  593. {
  594. if (sync_state == IP_VS_STATE_MASTER)
  595. stop_master_sync = set;
  596. else if (sync_state == IP_VS_STATE_BACKUP)
  597. stop_backup_sync = set;
  598. else {
  599. stop_master_sync = set;
  600. stop_backup_sync = set;
  601. }
  602. }
  603. static int sync_thread(void *startup)
  604. {
  605. DECLARE_WAITQUEUE(wait, current);
  606. mm_segment_t oldmm;
  607. int state;
  608. const char *name;
  609. /* increase the module use count */
  610. ip_vs_use_count_inc();
  611. if (ip_vs_sync_state & IP_VS_STATE_MASTER && !sync_master_pid) {
  612. state = IP_VS_STATE_MASTER;
  613. name = "ipvs_syncmaster";
  614. } else if (ip_vs_sync_state & IP_VS_STATE_BACKUP && !sync_backup_pid) {
  615. state = IP_VS_STATE_BACKUP;
  616. name = "ipvs_syncbackup";
  617. } else {
  618. IP_VS_BUG();
  619. ip_vs_use_count_dec();
  620. return -EINVAL;
  621. }
  622. daemonize(name);
  623. oldmm = get_fs();
  624. set_fs(KERNEL_DS);
  625. /* Block all signals */
  626. spin_lock_irq(&current->sighand->siglock);
  627. siginitsetinv(&current->blocked, 0);
  628. recalc_sigpending();
  629. spin_unlock_irq(&current->sighand->siglock);
  630. /* set the maximum length of sync message */
  631. set_sync_mesg_maxlen(state);
  632. /* set up multicast address */
  633. mcast_addr.sin_family = AF_INET;
  634. mcast_addr.sin_port = htons(IP_VS_SYNC_PORT);
  635. mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP);
  636. add_wait_queue(&sync_wait, &wait);
  637. set_sync_pid(state, current->pid);
  638. complete((struct completion *)startup);
  639. /* processing master/backup loop here */
  640. if (state == IP_VS_STATE_MASTER)
  641. sync_master_loop();
  642. else if (state == IP_VS_STATE_BACKUP)
  643. sync_backup_loop();
  644. else IP_VS_BUG();
  645. remove_wait_queue(&sync_wait, &wait);
  646. /* thread exits */
  647. set_sync_pid(state, 0);
  648. IP_VS_INFO("sync thread stopped!\n");
  649. set_fs(oldmm);
  650. /* decrease the module use count */
  651. ip_vs_use_count_dec();
  652. set_stop_sync(state, 0);
  653. wake_up(&stop_sync_wait);
  654. return 0;
  655. }
  656. static int fork_sync_thread(void *startup)
  657. {
  658. pid_t pid;
  659. /* fork the sync thread here, then the parent process of the
  660. sync thread is the init process after this thread exits. */
  661. repeat:
  662. if ((pid = kernel_thread(sync_thread, startup, 0)) < 0) {
  663. IP_VS_ERR("could not create sync_thread due to %d... "
  664. "retrying.\n", pid);
  665. ssleep(1);
  666. goto repeat;
  667. }
  668. return 0;
  669. }
  670. int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
  671. {
  672. DECLARE_COMPLETION(startup);
  673. pid_t pid;
  674. if ((state == IP_VS_STATE_MASTER && sync_master_pid) ||
  675. (state == IP_VS_STATE_BACKUP && sync_backup_pid))
  676. return -EEXIST;
  677. IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
  678. IP_VS_DBG(7, "Each ip_vs_sync_conn entry need %Zd bytes\n",
  679. sizeof(struct ip_vs_sync_conn));
  680. ip_vs_sync_state |= state;
  681. if (state == IP_VS_STATE_MASTER) {
  682. strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, sizeof(ip_vs_master_mcast_ifn));
  683. ip_vs_master_syncid = syncid;
  684. } else {
  685. strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, sizeof(ip_vs_backup_mcast_ifn));
  686. ip_vs_backup_syncid = syncid;
  687. }
  688. repeat:
  689. if ((pid = kernel_thread(fork_sync_thread, &startup, 0)) < 0) {
  690. IP_VS_ERR("could not create fork_sync_thread due to %d... "
  691. "retrying.\n", pid);
  692. ssleep(1);
  693. goto repeat;
  694. }
  695. wait_for_completion(&startup);
  696. return 0;
  697. }
  698. int stop_sync_thread(int state)
  699. {
  700. DECLARE_WAITQUEUE(wait, current);
  701. if ((state == IP_VS_STATE_MASTER && !sync_master_pid) ||
  702. (state == IP_VS_STATE_BACKUP && !sync_backup_pid))
  703. return -ESRCH;
  704. IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
  705. IP_VS_INFO("stopping sync thread %d ...\n",
  706. (state == IP_VS_STATE_MASTER) ? sync_master_pid : sync_backup_pid);
  707. __set_current_state(TASK_UNINTERRUPTIBLE);
  708. add_wait_queue(&stop_sync_wait, &wait);
  709. set_stop_sync(state, 1);
  710. ip_vs_sync_state -= state;
  711. wake_up(&sync_wait);
  712. schedule();
  713. __set_current_state(TASK_RUNNING);
  714. remove_wait_queue(&stop_sync_wait, &wait);
  715. /* Note: no need to reap the sync thread, because its parent
  716. process is the init process */
  717. if ((state == IP_VS_STATE_MASTER && stop_master_sync) ||
  718. (state == IP_VS_STATE_BACKUP && stop_backup_sync))
  719. IP_VS_BUG();
  720. return 0;
  721. }