hangcheck-timer.c 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129
  1. /*
  2. * hangcheck-timer.c
  3. *
  4. * Driver for a little io fencing timer.
  5. *
  6. * Copyright (C) 2002 Oracle Corporation. All rights reserved.
  7. *
  8. * Author: Joel Becker <joel.becker@oracle.com>
  9. *
  10. * This program is free software; you can redistribute it and/or
  11. * modify it under the terms of the GNU General Public
  12. * License version 2 as published by the Free Software Foundation.
  13. *
  14. * This program is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. * General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU General Public
  20. * License along with this program; if not, write to the
  21. * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  22. * Boston, MA 02111-1307, USA.
  23. */
  24. /*
  25. * The hangcheck-timer driver uses the TSC to catch delays that
  26. * jiffies does not notice. A timer is set. When the timer fires, it
  27. * checks whether it was delayed and if that delay exceeds a given
  28. * margin of error. The hangcheck_tick module parameter takes the timer
  29. * duration in seconds. The hangcheck_margin parameter defines the
  30. * margin of error, in seconds. The defaults are 180 seconds for the
  31. * timer and 60 seconds for the margin of error. IOW, a timer is set
  32. * for 180 seconds. When the timer fires, the callback checks the
  33. * actual duration that the timer waited. If the duration exceeds the
  34. * allotted time and margin (here 180 + 60, or 240 seconds), the machine
  35. * is restarted. A healthy machine will have the duration match the
  36. * expected timeout very closely.
  37. */
  38. #include <linux/module.h>
  39. #include <linux/moduleparam.h>
  40. #include <linux/types.h>
  41. #include <linux/kernel.h>
  42. #include <linux/fs.h>
  43. #include <linux/mm.h>
  44. #include <linux/reboot.h>
  45. #include <linux/init.h>
  46. #include <asm/uaccess.h>
  47. #define VERSION_STR "0.5.0"
  48. #define DEFAULT_IOFENCE_MARGIN 60 /* Default fudge factor, in seconds */
  49. #define DEFAULT_IOFENCE_TICK 180 /* Default timer timeout, in seconds */
  50. static int hangcheck_tick = DEFAULT_IOFENCE_TICK;
  51. static int hangcheck_margin = DEFAULT_IOFENCE_MARGIN;
  52. static int hangcheck_reboot; /* Defaults to not reboot */
  53. /* Driver options */
  54. module_param(hangcheck_tick, int, 0);
  55. MODULE_PARM_DESC(hangcheck_tick, "Timer delay.");
  56. module_param(hangcheck_margin, int, 0);
  57. MODULE_PARM_DESC(hangcheck_margin, "If the hangcheck timer has been delayed more than hangcheck_margin seconds, the driver will fire.");
  58. module_param(hangcheck_reboot, int, 0);
  59. MODULE_PARM_DESC(hangcheck_reboot, "If nonzero, the machine will reboot when the timer margin is exceeded.");
  60. MODULE_AUTHOR("Joel Becker");
  61. MODULE_DESCRIPTION("Hangcheck-timer detects when the system has gone out to lunch past a certain margin.");
  62. MODULE_LICENSE("GPL");
  63. /* Last time scheduled */
  64. static unsigned long long hangcheck_tsc, hangcheck_tsc_margin;
  65. static void hangcheck_fire(unsigned long);
  66. static struct timer_list hangcheck_ticktock =
  67. TIMER_INITIALIZER(hangcheck_fire, 0, 0);
  68. extern unsigned long long monotonic_clock(void);
  69. static void hangcheck_fire(unsigned long data)
  70. {
  71. unsigned long long cur_tsc, tsc_diff;
  72. cur_tsc = monotonic_clock();
  73. if (cur_tsc > hangcheck_tsc)
  74. tsc_diff = cur_tsc - hangcheck_tsc;
  75. else
  76. tsc_diff = (cur_tsc + (~0ULL - hangcheck_tsc)); /* or something */
  77. if (tsc_diff > hangcheck_tsc_margin) {
  78. if (hangcheck_reboot) {
  79. printk(KERN_CRIT "Hangcheck: hangcheck is restarting the machine.\n");
  80. machine_restart(NULL);
  81. } else {
  82. printk(KERN_CRIT "Hangcheck: hangcheck value past margin!\n");
  83. }
  84. }
  85. mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ));
  86. hangcheck_tsc = monotonic_clock();
  87. }
  88. static int __init hangcheck_init(void)
  89. {
  90. printk("Hangcheck: starting hangcheck timer %s (tick is %d seconds, margin is %d seconds).\n",
  91. VERSION_STR, hangcheck_tick, hangcheck_margin);
  92. hangcheck_tsc_margin = hangcheck_margin + hangcheck_tick;
  93. hangcheck_tsc_margin *= 1000000000;
  94. hangcheck_tsc = monotonic_clock();
  95. mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ));
  96. return 0;
  97. }
  98. static void __exit hangcheck_exit(void)
  99. {
  100. del_timer_sync(&hangcheck_ticktock);
  101. }
  102. module_init(hangcheck_init);
  103. module_exit(hangcheck_exit);