/*
 * kernel/sched/proc.c
 *
 * Kernel load calculations, forked from sched/core.c
 */

#include <linux/export.h>

#include "sched.h"

unsigned long this_cpu_load(void)
{
        struct rq *this = this_rq();
        return this->cpu_load[0];
}

/*
 * Global load-average calculations
 *
 * We take a distributed and async approach to calculating the global load-avg
 * in order to minimize overhead.
 *
 * The global load average is an exponentially decaying average of nr_running +
 * nr_uninterruptible.
 *
 * Once every LOAD_FREQ:
 *
 *   nr_active = 0;
 *   for_each_possible_cpu(cpu)
 *           nr_active += cpu_rq(cpu)->nr_running + cpu_rq(cpu)->nr_uninterruptible;
 *
 *   avenrun[n] = avenrun[n] * exp_n + nr_active * (1 - exp_n)
 *
 * Due to a number of reasons the above turns into the mess below:
 *
 *  - for_each_possible_cpu() is prohibitively expensive on machines with a
 *    serious number of cpus, therefore we need to take a distributed approach
 *    to calculating nr_active.
 *
 *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
 *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
 *
 *    So assuming nr_active := 0 when we start out -- true per definition, we
 *    can simply take per-cpu deltas and fold those into a global accumulator
 *    to obtain the same result. See calc_load_fold_active().
 *
 *    Furthermore, in order to avoid synchronizing all per-cpu delta folding
 *    across the machine, we assume 10 ticks is sufficient time for every
 *    cpu to have completed this task.
 *
 *    This places an upper-bound on the IRQ-off latency of the machine. Then
 *    again, being late doesn't lose the delta, it just wrecks the sample.
 *
 *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
 *    this would add another cross-cpu cacheline miss and atomic operation
 *    to the wakeup path. Instead we increment on whatever cpu the task ran
 *    when it went into uninterruptible state and decrement on whatever cpu
 *    did the wakeup. This means that only the sum of nr_uninterruptible over
 *    all cpus yields the correct result.
 *
 *  This covers the NO_HZ=n code; for extra headaches, see the comment below.
 */
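
/*
 * Illustrative sketch, not part of the original file: the delta-folding
 * identity above in miniature.  Every "cpu" remembers the value it last
 * reported and only ever adds its delta to a shared accumulator; once all
 * cpus have folded, the accumulator equals the instantaneous sum of
 * nr_running + nr_uninterruptible without a global for-each.  The
 * __loadavg_fold_demo name and the sample values are made up.
 */
static inline long __loadavg_fold_demo(void)
{
        long now[3]  = { 2, 0, 5 };     /* current nr_running + nr_uninterruptible */
        long last[3] = { 0, 0, 0 };     /* per-cpu calc_load_active, starts at 0 */
        long folded = 0;                /* plays the role of calc_load_tasks */
        int i;

        for (i = 0; i < 3; i++) {
                long delta = now[i] - last[i];

                last[i] = now[i];
                folded += delta;
        }

        return folded;                  /* 2 + 0 + 5 == 7 */
}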

/* Variables and functions for calc_load */
atomic_long_t calc_load_tasks;
unsigned long calc_load_update;
unsigned long avenrun[3];
EXPORT_SYMBOL(avenrun); /* should be removed */

/**
 * get_avenrun - get the load average array
 * @loads:  pointer to dest load array
 * @offset: offset to add
 * @shift:  shift count to shift the result left
 *
 * These values are estimates at best, so no need for locking.
 */
void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
{
        loads[0] = (avenrun[0] + offset) << shift;
        loads[1] = (avenrun[1] + offset) << shift;
        loads[2] = (avenrun[2] + offset) << shift;
}
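
/*
 * Illustrative sketch, not part of the original file: how a reader such as
 * fs/proc/loadavg.c presents these fixed-point averages as the familiar
 * "0.42"-style numbers.  The two macros below are local stand-ins for the
 * LOAD_INT/LOAD_FRAC helpers that live in fs/proc/loadavg.c; the FIXED_1/200
 * offset rounds the fraction to two decimal places.  Names are made up.
 */
#define __DEMO_LOAD_INT(x)  ((x) >> FSHIFT)
#define __DEMO_LOAD_FRAC(x) __DEMO_LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

static inline void __loadavg_read_demo(unsigned long *ipart, unsigned long *fpart)
{
        unsigned long avnrun[3];

        get_avenrun(avnrun, FIXED_1 / 200, 0);

        *ipart = __DEMO_LOAD_INT(avnrun[0]);    /* e.g. 0  for a 1-min load of 0.42 */
        *fpart = __DEMO_LOAD_FRAC(avnrun[0]);   /* e.g. 42 for a 1-min load of 0.42 */
}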

long calc_load_fold_active(struct rq *this_rq)
{
        long nr_active, delta = 0;

        nr_active = this_rq->nr_running;
        nr_active += (long) this_rq->nr_uninterruptible;

        if (nr_active != this_rq->calc_load_active) {
                delta = nr_active - this_rq->calc_load_active;
                this_rq->calc_load_active = nr_active;
        }

        return delta;
}

/*
 * a1 = a0 * e + a * (1 - e)
 */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        load += 1UL << (FSHIFT - 1);
        return load >> FSHIFT;
}
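
/*
 * Illustrative sketch, not part of the original file: one step of the
 * fixed-point average above with concrete numbers.  With FSHIFT = 11,
 * FIXED_1 is 2048 (1.0) and EXP_1 is 1884 (~0.92 = 1/exp(5s/1min)).
 * Starting from an average of 1.0 with 2 active tasks, one LOAD_FREQ step
 * moves the 1-minute average to roughly 1.08.  The __calc_load_demo name
 * is made up.
 */
static inline unsigned long __calc_load_demo(void)
{
        unsigned long load   = 1 * FIXED_1;     /* current average: 1.0 */
        unsigned long active = 2 * FIXED_1;     /* 2 runnable + uninterruptible tasks */

        /* (1.0 * 1884 + 2.0 * (2048 - 1884)) / 2048 ~= 2212/2048 ~= 1.08 */
        return calc_load(load, EXP_1, active);
}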

#ifdef CONFIG_NO_HZ_COMMON
/*
 * Handle NO_HZ for the global load-average.
 *
 * Since the above described distributed algorithm to compute the global
 * load-average relies on per-cpu sampling from the tick, it is affected by
 * NO_HZ.
 *
 * The basic idea is to fold the nr_active delta into a global idle-delta upon
 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
 * when we read the global state.
 *
 * Obviously reality has to ruin such a delightfully simple scheme:
 *
 *  - When we go NO_HZ idle during the window, we can negate our sample
 *    contribution, causing under-accounting.
 *
 *    We avoid this by keeping two idle-delta counters and flipping them
 *    when the window starts, thus separating old and new NO_HZ load.
 *
 *    The only trick is the slight shift in index flip for read vs write.
 *
 *        0s            5s            10s           15s
 *          +10           +10           +10           +10
 *        |-|-----------|-|-----------|-|-----------|-|
 *    r:0 0 1           1 0           0 1           1 0
 *    w:0 1 1           0 0           1 1           0 0
 *
 *    This ensures we'll fold the old idle contribution in this window while
 *    accumulating the new one.
 *
 *  - When we wake up from NO_HZ idle during the window, we push up our
 *    contribution, since we effectively move our sample point to a known
 *    busy state.
 *
 *    This is solved by pushing the window forward, and thus skipping the
 *    sample, for this cpu (effectively using the idle-delta for this cpu which
 *    was in effect at the time the window opened).  This also solves the issue
 *    of having to deal with a cpu having been in NO_HZ idle for multiple
 *    LOAD_FREQ intervals.
 *
 *    When making the ILB scale, we should try to pull this in as well.
 */

static atomic_long_t calc_load_idle[2];
static int calc_load_idx;

static inline int calc_load_write_idx(void)
{
        int idx = calc_load_idx;

        /*
         * See calc_global_nohz(), if we observe the new index, we also
         * need to observe the new update time.
         */
        smp_rmb();

        /*
         * If the folding window started, make sure we start writing in the
         * next idle-delta.
         */
        if (!time_before(jiffies, calc_load_update))
                idx++;

        return idx & 1;
}

static inline int calc_load_read_idx(void)
{
        return calc_load_idx & 1;
}

void calc_load_enter_idle(void)
{
        struct rq *this_rq = this_rq();
        long delta;

        /*
         * We're going into NOHZ mode, if there's any pending delta, fold it
         * into the pending idle delta.
         */
        delta = calc_load_fold_active(this_rq);
        if (delta) {
                int idx = calc_load_write_idx();
                atomic_long_add(delta, &calc_load_idle[idx]);
        }
}

void calc_load_exit_idle(void)
{
        struct rq *this_rq = this_rq();

        /*
         * If we're still before the sample window, we're done.
         */
        if (time_before(jiffies, this_rq->calc_load_update))
                return;

        /*
         * We woke inside or after the sample window, this means we're already
         * accounted through the nohz accounting, so skip the entire deal and
         * sync up for the next window.
         */
        this_rq->calc_load_update = calc_load_update;
        if (time_before(jiffies, this_rq->calc_load_update + 10))
                this_rq->calc_load_update += LOAD_FREQ;
}

static long calc_load_fold_idle(void)
{
        int idx = calc_load_read_idx();
        long delta = 0;

        if (atomic_long_read(&calc_load_idle[idx]))
                delta = atomic_long_xchg(&calc_load_idle[idx], 0);

        return delta;
}
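
/*
 * Illustrative sketch, not part of the original file: the two-slot idle
 * buffer in miniature, with plain longs standing in for the atomics.
 * A delta written after the window has opened lands in the "next" slot,
 * so the reader, which still folds the "old" slot for this window, never
 * consumes contributions that belong to the following window.  The
 * __idle_buffer_demo name and the values are made up.
 */
static inline long __idle_buffer_demo(void)
{
        long idle[2] = { 0, 0 };
        int idx = 0;                    /* stands in for calc_load_idx */

        idle[idx & 1] += -3;            /* cpu went idle before the window opened */
        idle[(idx + 1) & 1] += -1;      /* cpu went idle after it opened */

        return idle[idx & 1];           /* the reader folds only the -3 */
}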

/**
 * fixed_power_int - compute: x^n, in O(log n) time
 *
 * @x:         base of the power
 * @frac_bits: fractional bits of @x
 * @n:         power to raise @x to.
 *
 * By exploiting the relation between the definition of the natural power
 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
 * (where: n_i \elem {0, 1}, the binary vector representing n),
 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
 * of course trivially computable in O(log_2 n), the length of our binary
 * vector.
 */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
        unsigned long result = 1UL << frac_bits;

        if (n) {
                for (;;) {
                        if (n & 1) {
                                result *= x;
                                result += 1UL << (frac_bits - 1);
                                result >>= frac_bits;
                        }
                        n >>= 1;
                        if (!n)
                                break;
                        x *= x;
                        x += 1UL << (frac_bits - 1);
                        x >>= frac_bits;
                }
        }

        return result;
}
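
/*
 * Illustrative sketch, not part of the original file: fixed_power_int() with
 * concrete numbers.  Raising EXP_1 (1884/2048 ~= 0.92) to the 12th power
 * walks the bits of n (12 == 0b1100): three squarings plus two result
 * multiplies instead of the eleven multiplies of the naive product.  With
 * the intermediate rounding this returns 751, i.e. roughly 0.367 in fixed
 * point.  The __fixed_power_demo name is made up.
 */
static inline unsigned long __fixed_power_demo(void)
{
        /* (1884/2048)^12 ~= 0.367 ~= 751/2048 */
        return fixed_power_int(EXP_1, FSHIFT, 12);
}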

/*
 * a1 = a0 * e + a * (1 - e)
 *
 * a2 = a1 * e + a * (1 - e)
 *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
 *    = a0 * e^2 + a * (1 - e) * (1 + e)
 *
 * a3 = a2 * e + a * (1 - e)
 *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
 *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
 *
 *  ...
 *
 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
 *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
 *    = a0 * e^n + a * (1 - e^n)
 *
 * [1] application of the geometric series:
 *
 *              n         1 - x^(n+1)
 *     S_n := \Sum x^i = -------------
 *             i=0           1 - x
 */
static unsigned long
calc_load_n(unsigned long load, unsigned long exp,
            unsigned long active, unsigned int n)
{
        return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}
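
/*
 * Illustrative sketch, not part of the original file: catching up after a
 * long idle stretch in one go.  If the whole machine was idle (active == 0)
 * for three missed LOAD_FREQ windows, the 1-minute average simply decays by
 * EXP_1^3, e.g. from 1.0 (2048) to roughly 0.78 (1594 with the fixed-point
 * rounding).  The __calc_load_n_demo name is made up.
 */
static inline unsigned long __calc_load_n_demo(void)
{
        /* 2048 * (1884/2048)^3 ~= 1594 */
        return calc_load_n(1 * FIXED_1, EXP_1, 0, 3);
}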

/*
 * NO_HZ can leave us missing all per-cpu ticks calling
 * calc_load_account_active(), but since an idle CPU folds its delta into
 * calc_load_idle[] per calc_load_enter_idle(), all we need to do is fold
 * in the pending idle delta if our idle period crossed a load cycle boundary.
 *
 * Once we've updated the global active value, we need to apply the exponential
 * weights adjusted to the number of cycles missed.
 */
static void calc_global_nohz(void)
{
        long delta, active, n;

        if (!time_before(jiffies, calc_load_update + 10)) {
                /*
                 * Catch up, folding however many windows we are still behind.
                 */
                delta = jiffies - calc_load_update - 10;
                n = 1 + (delta / LOAD_FREQ);

                active = atomic_long_read(&calc_load_tasks);
                active = active > 0 ? active * FIXED_1 : 0;

                avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
                avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
                avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);

                calc_load_update += n * LOAD_FREQ;
        }

        /*
         * Flip the idle index...
         *
         * Make sure we first write the new time then flip the index, so that
         * calc_load_write_idx() will see the new time when it reads the new
         * index, this avoids a double flip messing things up.
         */
        smp_wmb();
        calc_load_idx++;
}
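
/*
 * Illustrative sketch, not part of the original file: the catch-up count in
 * calc_global_nohz() with concrete numbers.  With HZ == 1000, LOAD_FREQ is
 * 5001 ticks; waking up 12000 ticks past the last window start means we are
 * 11990 ticks past the sample point, i.e. n == 1 + 11990/5001 == 3 missed
 * windows folded in one go.  The __nohz_catchup_demo name and the jiffies
 * stand-ins are made up.
 */
static inline long __nohz_catchup_demo(void)
{
        unsigned long last_update = 0, now = 12000;     /* stand-ins for jiffies values */
        long delta = now - last_update - 10;

        return 1 + (delta / LOAD_FREQ);                 /* == 3 when LOAD_FREQ == 5001 */
}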

#else /* !CONFIG_NO_HZ_COMMON */

static inline long calc_load_fold_idle(void) { return 0; }
static inline void calc_global_nohz(void) { }

#endif /* CONFIG_NO_HZ_COMMON */

/*
 * calc_global_load - update the avenrun load estimates 10 ticks after the
 * CPUs have updated calc_load_tasks.
 */
void calc_global_load(unsigned long ticks)
{
        long active, delta;

        if (time_before(jiffies, calc_load_update + 10))
                return;

        /*
         * Fold the 'old' idle-delta to include all NO_HZ cpus.
         */
        delta = calc_load_fold_idle();
        if (delta)
                atomic_long_add(delta, &calc_load_tasks);

        active = atomic_long_read(&calc_load_tasks);
        active = active > 0 ? active * FIXED_1 : 0;

        avenrun[0] = calc_load(avenrun[0], EXP_1, active);
        avenrun[1] = calc_load(avenrun[1], EXP_5, active);
        avenrun[2] = calc_load(avenrun[2], EXP_15, active);

        calc_load_update += LOAD_FREQ;

        /*
         * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
         */
        calc_global_nohz();
}

/*
 * Called from update_cpu_load_active() to periodically update this CPU's
 * active count.
 */
static void calc_load_account_active(struct rq *this_rq)
{
        long delta;

        if (time_before(jiffies, this_rq->calc_load_update))
                return;

        delta = calc_load_fold_active(this_rq);
        if (delta)
                atomic_long_add(delta, &calc_load_tasks);

        this_rq->calc_load_update += LOAD_FREQ;
}

/*
 * End of global load-average stuff
 */

/*
 * The exact cpuload at various idx values, calculated at every tick, would be
 *   load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
 *
 * If a cpu misses updates for n-1 ticks (as it was idle) and the update gets
 * called on the nth tick, when the cpu may be busy, then we have:
 *   load = ((2^idx - 1) / 2^idx)^(n-1) * load
 *   load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
 *
 * decay_load_missed() below does an efficient calculation of
 *   load = ((2^idx - 1) / 2^idx)^(n-1) * load
 * avoiding the 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
 *
 * The calculation is approximated on a 128 point scale.
 * degrade_zero_ticks is the number of ticks after which load at any
 * particular idx is approximated to be zero.
 * degrade_factor is a precomputed table, a row for each load idx.
 * Each column corresponds to the degradation factor for a power of two ticks,
 * based on the 128 point scale.
 * Example:
 *   row 2, col 3 (=12) says that the degradation at load idx 2 after
 *   8 ticks is 12/128 (which is an approximation of the exact factor 3^8/4^8).
 *
 * With these power of two load factors, we can degrade the load n times
 * by looking at the 1 bits in n and doing as many mult/shifts instead of
 * the n mult/shifts needed by the exact degradation.
 */

#define DEGRADE_SHIFT   7
static const unsigned char
                degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
static const unsigned char
                degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
                                        {0, 0, 0, 0, 0, 0, 0, 0},
                                        {64, 32, 8, 0, 0, 0, 0, 0},
                                        {96, 72, 40, 12, 1, 0, 0},
                                        {112, 98, 75, 43, 15, 1, 0},
                                        {120, 112, 98, 76, 45, 16, 2} };

/*
 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
 * would be when CPU is idle and so we just decay the old load without
 * adding any new load.
 */
static unsigned long
decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
{
        int j = 0;

        if (!missed_updates)
                return load;

        if (missed_updates >= degrade_zero_ticks[idx])
                return 0;

        if (idx == 1)
                return load >> missed_updates;

        while (missed_updates) {
                if (missed_updates % 2)
                        load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;

                missed_updates >>= 1;
                j++;
        }
        return load;
}
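
/*
 * Illustrative sketch, not part of the original file: decay_load_missed()
 * with concrete numbers.  Missing 8 ticks at idx 2 applies the row-2, col-3
 * factor 12/128 (an approximation of (3/4)^8 ~= 0.1), so a load of 1024
 * decays to (1024 * 12) >> 7 == 96 in a single mult/shift.  The
 * __decay_demo name is made up.
 */
static inline unsigned long __decay_demo(void)
{
        return decay_load_missed(1024, 8, 2);   /* == 96 */
}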

/*
 * Update rq->cpu_load[] statistics. This function is usually called every
 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
 * every tick. We fix it up based on jiffies.
 */
static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
                              unsigned long pending_updates)
{
        int i, scale;

        this_rq->nr_load_updates++;

        /* Update our load: */
        this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
        for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
                unsigned long old_load, new_load;

                /* scale is effectively 1 << i now, and >> i divides by scale */

                old_load = this_rq->cpu_load[i];
                old_load = decay_load_missed(old_load, pending_updates - 1, i);
                new_load = this_load;
                /*
                 * Round up the averaging division if load is increasing. This
                 * prevents us from getting stuck on 9 if the load is 10, for
                 * example.
                 */
                if (new_load > old_load)
                        new_load += scale - 1;

                this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
        }

        sched_avg_update(this_rq);
}
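
/*
 * Illustrative sketch, not part of the original file: the rounding tweak
 * above with the comment's own example numbers.  Without the "+= scale - 1",
 * an old load of 9 chasing a new load of 10 at idx 1 would compute
 * (9 + 10) >> 1 == 9 and stay stuck there forever; with it, the average
 * reaches 10.  The __cpu_load_step_demo name is made up.
 */
static inline unsigned long __cpu_load_step_demo(void)
{
        unsigned long old_load = 9, new_load = 10;
        int i = 1, scale = 1 << i;

        if (new_load > old_load)
                new_load += scale - 1;          /* 10 -> 11: round the division up */

        return (old_load * (scale - 1) + new_load) >> i;  /* (9 + 11) >> 1 == 10 */
}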

#ifdef CONFIG_SMP
static inline unsigned long get_rq_runnable_load(struct rq *rq)
{
        return rq->cfs.runnable_load_avg;
}
#else
static inline unsigned long get_rq_runnable_load(struct rq *rq)
{
        return rq->load.weight;
}
#endif

#ifdef CONFIG_NO_HZ_COMMON
/*
 * There is no sane way to deal with nohz on smp when using jiffies because the
 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
 *
 * Therefore we cannot use the delta approach from the regular tick since that
 * would seriously skew the load calculation. However we'll make do for those
 * updates happening while idle (nohz_idle_balance) or coming out of idle
 * (tick_nohz_idle_exit).
 *
 * This means we might still be one tick off for nohz periods.
 */

/*
 * Called from nohz_idle_balance() to update the load ratings before doing the
 * idle balance.
 */
void update_idle_cpu_load(struct rq *this_rq)
{
        unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
        unsigned long load = get_rq_runnable_load(this_rq);
        unsigned long pending_updates;

        /*
         * bail if there's load or we're actually up-to-date.
         */
        if (load || curr_jiffies == this_rq->last_load_update_tick)
                return;

        pending_updates = curr_jiffies - this_rq->last_load_update_tick;
        this_rq->last_load_update_tick = curr_jiffies;

        __update_cpu_load(this_rq, load, pending_updates);
}

/*
 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
 */
void update_cpu_load_nohz(void)
{
        struct rq *this_rq = this_rq();
        unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
        unsigned long pending_updates;

        if (curr_jiffies == this_rq->last_load_update_tick)
                return;

        raw_spin_lock(&this_rq->lock);
        pending_updates = curr_jiffies - this_rq->last_load_update_tick;
        if (pending_updates) {
                this_rq->last_load_update_tick = curr_jiffies;
                /*
                 * We were idle, this means load 0, the current load might be
                 * !0 due to remote wakeups and the like.
                 */
                __update_cpu_load(this_rq, 0, pending_updates);
        }
        raw_spin_unlock(&this_rq->lock);
}
#endif /* CONFIG_NO_HZ_COMMON */

/*
 * Called from scheduler_tick()
 */
void update_cpu_load_active(struct rq *this_rq)
{
        unsigned long load = get_rq_runnable_load(this_rq);

        /*
         * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
         */
        this_rq->last_load_update_tick = jiffies;
        __update_cpu_load(this_rq, load, 1);

        calc_load_account_active(this_rq);
}