drop_monitor.c

/*
 * Monitoring code for network dropped packet alerts
 *
 * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/string.h>
#include <linux/if_arp.h>
#include <linux/inetdevice.h>
#include <linux/inet.h>
#include <linux/interrupt.h>
#include <linux/netpoll.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/netlink.h>
#include <linux/net_dropmon.h>
#include <linux/percpu.h>
#include <linux/timer.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include <net/genetlink.h>
#include <net/netevent.h>

#include <trace/events/skb.h>
#include <trace/events/napi.h>

#include <asm/unaligned.h>

#define TRACE_ON 1
#define TRACE_OFF 0

static void send_dm_alert(struct work_struct *unused);

/*
 * Globals: the tracing state, the mutex guarding it, and the
 * per-cpu work handles that send up netlink alerts
 */
static int trace_state = TRACE_OFF;
static DEFINE_MUTEX(trace_state_mutex);

struct per_cpu_dm_data {
        struct work_struct dm_alert_work;       /* ships the batched alert */
        struct sk_buff __rcu *skb;              /* alert message being built */
        atomic_t dm_hit_count;                  /* free drop-point slots left */
        struct timer_list send_timer;           /* hysteresis timer */
        int cpu;                                /* cpu this data belongs to */
};

struct dm_hw_stat_delta {
        struct net_device *dev;
        unsigned long last_rx;                  /* jiffies of last recorded hit */
        struct list_head list;
        struct rcu_head rcu;
        unsigned long last_drop_val;            /* rx_dropped value last seen */
};

static struct genl_family net_drop_monitor_family = {
        .id = GENL_ID_GENERATE,
        .hdrsize = 0,
        .name = "NET_DM",
        .version = 2,
        .maxattr = NET_DM_CMD_MAX,
};
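
/*
 * GENL_ID_GENERATE means the family id is assigned dynamically at
 * registration time, so userspace must resolve the "NET_DM" name
 * through the generic netlink controller before sending any
 * NET_DM_CMD_* requests.
 */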

static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);

static int dm_hit_limit = 64;                   /* max drop points per alert */
static int dm_delay = 1;                        /* hysteresis, in seconds */
static unsigned long dm_hw_check_delta = 2*HZ;  /* min gap between hw checks */
static LIST_HEAD(hw_stats_list);
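
/*
 * Layout of the per-cpu alert skb built by reset_per_cpu_data() below,
 * sized up front for dm_hit_limit drop points:
 *
 *      struct nlmsghdr
 *      struct genlmsghdr              (cmd = NET_DM_CMD_ALERT)
 *      struct nlattr                  (type NLA_UNSPEC)
 *      struct net_dm_alert_msg        (entries counter)
 *      struct net_dm_drop_point[]     (pc/count pairs, appended one at a
 *                                      time by trace_drop_common())
 */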
static void reset_per_cpu_data(struct per_cpu_dm_data *data)
{
        size_t al;
        struct net_dm_alert_msg *msg;
        struct nlattr *nla;
        struct sk_buff *skb;
        struct sk_buff *oskb = rcu_dereference_protected(data->skb, 1);

        al = sizeof(struct net_dm_alert_msg);
        al += dm_hit_limit * sizeof(struct net_dm_drop_point);
        al += sizeof(struct nlattr);

        skb = genlmsg_new(al, GFP_KERNEL);

        if (skb) {
                genlmsg_put(skb, 0, 0, &net_drop_monitor_family,
                            0, NET_DM_CMD_ALERT);
                nla = nla_reserve(skb, NLA_UNSPEC,
                                  sizeof(struct net_dm_alert_msg));
                msg = nla_data(nla);
                memset(msg, 0, al);
        } else
                schedule_work_on(data->cpu, &data->dm_alert_work);

        /*
         * Don't need to lock this, since we are guaranteed to only
         * run this on a single cpu at a time.
         * Note also that we only update data->skb if the old and new skb
         * pointers don't match.  This ensures that we don't continually call
         * synchronize_rcu if we repeatedly fail to alloc a new netlink message.
         */
        if (skb != oskb) {
                rcu_assign_pointer(data->skb, skb);

                synchronize_rcu();

                atomic_set(&data->dm_hit_count, dm_hit_limit);
        }
}
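
/*
 * Work handler: swap in a fresh alert skb before shipping the old one.
 * reset_per_cpu_data() calls synchronize_rcu() after the swap, so no
 * tracepoint can still be writing into the skb we multicast below.
 */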
static void send_dm_alert(struct work_struct *unused)
{
        struct sk_buff *skb;
        struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);

        WARN_ON_ONCE(data->cpu != smp_processor_id());

        /*
         * Grab the skb we're about to send
         */
        skb = rcu_dereference_protected(data->skb, 1);

        /*
         * Replace it with a new one
         */
        reset_per_cpu_data(data);

        /*
         * Ship it!
         */
        if (skb)
                genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL);

        put_cpu_var(dm_cpu_data);
}

/*
 * This is the timer function to delay the sending of an alert
 * in the event that more drops will arrive during the
 * hysteresis period.  Note that it operates under the timer interrupt
 * so we don't need to disable preemption here
 */
static void sched_send_work(unsigned long unused)
{
        struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);

        schedule_work_on(smp_processor_id(), &data->dm_alert_work);

        put_cpu_var(dm_cpu_data);
}
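
/*
 * Record one drop hit.  dm_hit_count starts at dm_hit_limit and is
 * decremented per hit; a hit that matches an existing drop point
 * increments it right back, since aggregated hits don't consume a new
 * slot.  Once the count reaches zero the alert skb is full and further
 * hits are discarded until reset_per_cpu_data() re-arms it.
 */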
static void trace_drop_common(struct sk_buff *skb, void *location)
{
        struct net_dm_alert_msg *msg;
        struct nlmsghdr *nlh;
        struct nlattr *nla;
        int i;
        struct sk_buff *dskb;
        struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);

        rcu_read_lock();
        dskb = rcu_dereference(data->skb);

        if (!dskb)
                goto out;

        if (!atomic_add_unless(&data->dm_hit_count, -1, 0)) {
                /*
                 * we're already at zero, discard this hit
                 */
                goto out;
        }

        nlh = (struct nlmsghdr *)dskb->data;
        nla = genlmsg_data(nlmsg_data(nlh));
        msg = nla_data(nla);
        for (i = 0; i < msg->entries; i++) {
                if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {
                        msg->points[i].count++;
                        atomic_inc(&data->dm_hit_count);
                        goto out;
                }
        }

        /*
         * We need to create a new entry
         */
        __nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point));
        nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
        memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
        msg->points[msg->entries].count = 1;
        msg->entries++;

        if (!timer_pending(&data->send_timer)) {
                data->send_timer.expires = jiffies + dm_delay * HZ;
                add_timer_on(&data->send_timer, smp_processor_id());
        }

out:
        rcu_read_unlock();
        put_cpu_var(dm_cpu_data);
        return;
}

static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location)
{
        trace_drop_common(skb, location);
}

static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi)
{
        struct dm_hw_stat_delta *new_stat;

        /*
         * Don't check napi structures with no associated device
         */
        if (!napi->dev)
                return;

        rcu_read_lock();
        list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
                /*
                 * only add a note to our monitor buffer if:
                 * 1) this is the dev we received on
                 * 2) it's after the last_rx delta
                 * 3) our rx_dropped count has gone up
                 */
                if ((new_stat->dev == napi->dev) &&
                    (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) &&
                    (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
                        /*
                         * a NULL location records the hardware drop as a
                         * zeroed pc in the alert
                         */
                        trace_drop_common(NULL, NULL);
                        new_stat->last_drop_val = napi->dev->stats.rx_dropped;
                        new_stat->last_rx = jiffies;
                        break;
                }
        }
        rcu_read_unlock();
}

static int set_all_monitor_traces(int state)
{
        int rc = 0;
        struct dm_hw_stat_delta *new_stat = NULL;
        struct dm_hw_stat_delta *temp;

        mutex_lock(&trace_state_mutex);

        if (state == trace_state) {
                rc = -EAGAIN;
                goto out_unlock;
        }

        switch (state) {
        case TRACE_ON:
                rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL);
                rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL);
                break;
        case TRACE_OFF:
                rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL);
                rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL);

                tracepoint_synchronize_unregister();

                /*
                 * Clean the device list
                 */
                list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
                        if (new_stat->dev == NULL) {
                                list_del_rcu(&new_stat->list);
                                kfree_rcu(new_stat, rcu);
                        }
                }
                break;
        default:
                rc = 1;
                break;
        }

        if (!rc)
                trace_state = state;
        else
                rc = -EINPROGRESS;

out_unlock:
        mutex_unlock(&trace_state_mutex);

        return rc;
}

static int net_dm_cmd_config(struct sk_buff *skb,
                        struct genl_info *info)
{
        return -ENOTSUPP;
}

static int net_dm_cmd_trace(struct sk_buff *skb,
                        struct genl_info *info)
{
        switch (info->genlhdr->cmd) {
        case NET_DM_CMD_START:
                return set_all_monitor_traces(TRACE_ON);
        case NET_DM_CMD_STOP:
                return set_all_monitor_traces(TRACE_OFF);
        }

        return -ENOTSUPP;
}
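
/*
 * For reference, a minimal userspace sketch of driving these handlers.
 * The calls below are libnl-3 genl helpers (nl_socket_alloc, genl_connect,
 * genl_ctrl_resolve, ...), not part of this file:
 *
 *      struct nl_sock *sock = nl_socket_alloc();
 *      struct nl_msg *msg;
 *      int family;
 *
 *      genl_connect(sock);
 *      family = genl_ctrl_resolve(sock, "NET_DM");
 *      msg = nlmsg_alloc();
 *      genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
 *                  NET_DM_CMD_START, 2);
 *      nl_send_auto(sock, msg);
 *
 * Once NET_DM_CMD_START succeeds, alerts arrive on the NET_DM_GRP_ALERT
 * multicast group.
 */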

static int dropmon_net_event(struct notifier_block *ev_block,
                        unsigned long event, void *ptr)
{
        struct net_device *dev = ptr;
        struct dm_hw_stat_delta *new_stat = NULL;
        struct dm_hw_stat_delta *tmp;

        switch (event) {
        case NETDEV_REGISTER:
                new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);

                if (!new_stat)
                        goto out;

                new_stat->dev = dev;
                new_stat->last_rx = jiffies;
                mutex_lock(&trace_state_mutex);
                list_add_rcu(&new_stat->list, &hw_stats_list);
                mutex_unlock(&trace_state_mutex);
                break;
        case NETDEV_UNREGISTER:
                mutex_lock(&trace_state_mutex);
                list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {
                        if (new_stat->dev == dev) {
                                /*
                                 * while tracing is on, just mark the entry
                                 * dead; set_all_monitor_traces() reaps it
                                 * on TRACE_OFF
                                 */
                                new_stat->dev = NULL;
                                if (trace_state == TRACE_OFF) {
                                        list_del_rcu(&new_stat->list);
                                        kfree_rcu(new_stat, rcu);
                                        break;
                                }
                        }
                }
                mutex_unlock(&trace_state_mutex);
                break;
        }
out:
        return NOTIFY_DONE;
}

static struct genl_ops dropmon_ops[] = {
        {
                .cmd = NET_DM_CMD_CONFIG,
                .doit = net_dm_cmd_config,
        },
        {
                .cmd = NET_DM_CMD_START,
                .doit = net_dm_cmd_trace,
        },
        {
                .cmd = NET_DM_CMD_STOP,
                .doit = net_dm_cmd_trace,
        },
};

static struct notifier_block dropmon_net_notifier = {
        .notifier_call = dropmon_net_event
};

static int __init init_net_drop_monitor(void)
{
        struct per_cpu_dm_data *data;
        int cpu, rc;

        pr_info("Initializing network drop monitor service\n");

        if (sizeof(void *) > 8) {
                pr_err("Unable to store program counters on this arch, Drop monitor failed\n");
                return -ENOSPC;
        }

        rc = genl_register_family_with_ops(&net_drop_monitor_family,
                                           dropmon_ops,
                                           ARRAY_SIZE(dropmon_ops));
        if (rc) {
                pr_err("Could not create drop monitor netlink family\n");
                return rc;
        }

        rc = register_netdevice_notifier(&dropmon_net_notifier);
        if (rc < 0) {
                pr_crit("Failed to register netdevice notifier\n");
                goto out_unreg;
        }

        rc = 0;

        for_each_present_cpu(cpu) {
                data = &per_cpu(dm_cpu_data, cpu);
                data->cpu = cpu;
                INIT_WORK(&data->dm_alert_work, send_dm_alert);
                init_timer(&data->send_timer);
                data->send_timer.data = cpu;
                data->send_timer.function = sched_send_work;
                reset_per_cpu_data(data);
        }

        goto out;

out_unreg:
        genl_unregister_family(&net_drop_monitor_family);
out:
        return rc;
}

late_initcall(init_net_drop_monitor);
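
/*
 * Userspace consumers subscribe to the NET_DM_GRP_ALERT multicast group
 * to receive these alerts; the dropwatch(1) utility is one such client.
 */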