sched_rt.c 16 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706
  1. /*
  2. * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
  3. * policies)
  4. */
  5. #ifdef CONFIG_SMP
  6. static cpumask_t rt_overload_mask;
  7. static atomic_t rto_count;
  8. static inline int rt_overloaded(void)
  9. {
  10. return atomic_read(&rto_count);
  11. }
  12. static inline cpumask_t *rt_overload(void)
  13. {
  14. return &rt_overload_mask;
  15. }
  16. static inline void rt_set_overload(struct rq *rq)
  17. {
  18. cpu_set(rq->cpu, rt_overload_mask);
  19. /*
  20. * Make sure the mask is visible before we set
  21. * the overload count. That is checked to determine
  22. * if we should look at the mask. It would be a shame
  23. * if we looked at the mask, but the mask was not
  24. * updated yet.
  25. */
  26. wmb();
  27. atomic_inc(&rto_count);
  28. }
  29. static inline void rt_clear_overload(struct rq *rq)
  30. {
  31. /* the order here really doesn't matter */
  32. atomic_dec(&rto_count);
  33. cpu_clear(rq->cpu, rt_overload_mask);
  34. }
  35. static void update_rt_migration(struct rq *rq)
  36. {
  37. if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1))
  38. rt_set_overload(rq);
  39. else
  40. rt_clear_overload(rq);
  41. }
  42. #endif /* CONFIG_SMP */
  43. /*
  44. * Update the current task's runtime statistics. Skip current tasks that
  45. * are not in our scheduling class.
  46. */
  47. static void update_curr_rt(struct rq *rq)
  48. {
  49. struct task_struct *curr = rq->curr;
  50. u64 delta_exec;
  51. if (!task_has_rt_policy(curr))
  52. return;
  53. delta_exec = rq->clock - curr->se.exec_start;
  54. if (unlikely((s64)delta_exec < 0))
  55. delta_exec = 0;
  56. schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
  57. curr->se.sum_exec_runtime += delta_exec;
  58. curr->se.exec_start = rq->clock;
  59. cpuacct_charge(curr, delta_exec);
  60. }
  61. static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
  62. {
  63. WARN_ON(!rt_task(p));
  64. rq->rt.rt_nr_running++;
  65. #ifdef CONFIG_SMP
  66. if (p->prio < rq->rt.highest_prio)
  67. rq->rt.highest_prio = p->prio;
  68. if (p->nr_cpus_allowed > 1)
  69. rq->rt.rt_nr_migratory++;
  70. update_rt_migration(rq);
  71. #endif /* CONFIG_SMP */
  72. }
  73. static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
  74. {
  75. WARN_ON(!rt_task(p));
  76. WARN_ON(!rq->rt.rt_nr_running);
  77. rq->rt.rt_nr_running--;
  78. #ifdef CONFIG_SMP
  79. if (rq->rt.rt_nr_running) {
  80. struct rt_prio_array *array;
  81. WARN_ON(p->prio < rq->rt.highest_prio);
  82. if (p->prio == rq->rt.highest_prio) {
  83. /* recalculate */
  84. array = &rq->rt.active;
  85. rq->rt.highest_prio =
  86. sched_find_first_bit(array->bitmap);
  87. } /* otherwise leave rq->highest prio alone */
  88. } else
  89. rq->rt.highest_prio = MAX_RT_PRIO;
  90. if (p->nr_cpus_allowed > 1)
  91. rq->rt.rt_nr_migratory--;
  92. update_rt_migration(rq);
  93. #endif /* CONFIG_SMP */
  94. }
  95. static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
  96. {
  97. struct rt_prio_array *array = &rq->rt.active;
  98. list_add_tail(&p->run_list, array->queue + p->prio);
  99. __set_bit(p->prio, array->bitmap);
  100. inc_cpu_load(rq, p->se.load.weight);
  101. inc_rt_tasks(p, rq);
  102. }
  103. /*
  104. * Adding/removing a task to/from a priority array:
  105. */
  106. static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
  107. {
  108. struct rt_prio_array *array = &rq->rt.active;
  109. update_curr_rt(rq);
  110. list_del(&p->run_list);
  111. if (list_empty(array->queue + p->prio))
  112. __clear_bit(p->prio, array->bitmap);
  113. dec_cpu_load(rq, p->se.load.weight);
  114. dec_rt_tasks(p, rq);
  115. }
  116. /*
  117. * Put task to the end of the run list without the overhead of dequeue
  118. * followed by enqueue.
  119. */
  120. static void requeue_task_rt(struct rq *rq, struct task_struct *p)
  121. {
  122. struct rt_prio_array *array = &rq->rt.active;
  123. list_move_tail(&p->run_list, array->queue + p->prio);
  124. }
  125. static void
  126. yield_task_rt(struct rq *rq)
  127. {
  128. requeue_task_rt(rq, rq->curr);
  129. }
  130. #ifdef CONFIG_SMP
  /*
   * CPU selection hook: RT tasks simply stay on the CPU they are already
   * assigned to.  @sync is ignored.
   */
  static int select_task_rq_rt(struct task_struct *p, int sync)
  {
      int cpu = task_cpu(p);

      return cpu;
  }
  135. #endif /* CONFIG_SMP */
  136. /*
  137. * Preempt the current task with a newly woken task if needed:
  138. */
  139. static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
  140. {
  141. if (p->prio < rq->curr->prio)
  142. resched_task(rq->curr);
  143. }
  144. static struct task_struct *pick_next_task_rt(struct rq *rq)
  145. {
  146. struct rt_prio_array *array = &rq->rt.active;
  147. struct task_struct *next;
  148. struct list_head *queue;
  149. int idx;
  150. idx = sched_find_first_bit(array->bitmap);
  151. if (idx >= MAX_RT_PRIO)
  152. return NULL;
  153. queue = array->queue + idx;
  154. next = list_entry(queue->next, struct task_struct, run_list);
  155. next->se.exec_start = rq->clock;
  156. return next;
  157. }
  158. static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
  159. {
  160. update_curr_rt(rq);
  161. p->se.exec_start = 0;
  162. }
  163. #ifdef CONFIG_SMP
  /*
   * Upper bound on retries in the push path: used as the loop limit in
   * find_lock_lowest_rq() and as the retry budget in push_rt_task().
   */
  #define RT_MAX_TRIES 3

  /* Forward declarations; the definitions are not in this part of the file. */
  static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
  static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
  168. static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
  169. {
  170. if (!task_running(rq, p) &&
  171. (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
  172. (p->nr_cpus_allowed > 1))
  173. return 1;
  174. return 0;
  175. }
  176. /* Return the second highest RT task, NULL otherwise */
  177. static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
  178. int cpu)
  179. {
  180. struct rt_prio_array *array = &rq->rt.active;
  181. struct task_struct *next;
  182. struct list_head *queue;
  183. int idx;
  184. assert_spin_locked(&rq->lock);
  185. if (likely(rq->rt.rt_nr_running < 2))
  186. return NULL;
  187. idx = sched_find_first_bit(array->bitmap);
  188. if (unlikely(idx >= MAX_RT_PRIO)) {
  189. WARN_ON(1); /* rt_nr_running is bad */
  190. return NULL;
  191. }
  192. queue = array->queue + idx;
  193. BUG_ON(list_empty(queue));
  194. next = list_entry(queue->next, struct task_struct, run_list);
  195. if (unlikely(pick_rt_task(rq, next, cpu)))
  196. goto out;
  197. if (queue->next->next != queue) {
  198. /* same prio task */
  199. next = list_entry(queue->next->next, struct task_struct, run_list);
  200. if (pick_rt_task(rq, next, cpu))
  201. goto out;
  202. }
  203. retry:
  204. /* slower, but more flexible */
  205. idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
  206. if (unlikely(idx >= MAX_RT_PRIO))
  207. return NULL;
  208. queue = array->queue + idx;
  209. BUG_ON(list_empty(queue));
  210. list_for_each_entry(next, queue, run_list) {
  211. if (pick_rt_task(rq, next, cpu))
  212. goto out;
  213. }
  214. goto retry;
  215. out:
  216. return next;
  217. }
  218. static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
  219. static int find_lowest_rq(struct task_struct *task)
  220. {
  221. int cpu;
  222. cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask);
  223. struct rq *lowest_rq = NULL;
  224. cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed);
  225. /*
  226. * Scan each rq for the lowest prio.
  227. */
  228. for_each_cpu_mask(cpu, *cpu_mask) {
  229. struct rq *rq = cpu_rq(cpu);
  230. if (cpu == rq->cpu)
  231. continue;
  232. /* We look for lowest RT prio or non-rt CPU */
  233. if (rq->rt.highest_prio >= MAX_RT_PRIO) {
  234. lowest_rq = rq;
  235. break;
  236. }
  237. /* no locking for now */
  238. if (rq->rt.highest_prio > task->prio &&
  239. (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) {
  240. lowest_rq = rq;
  241. }
  242. }
  243. return lowest_rq ? lowest_rq->cpu : -1;
  244. }
  245. /* Will lock the rq it finds */
  246. static struct rq *find_lock_lowest_rq(struct task_struct *task,
  247. struct rq *rq)
  248. {
  249. struct rq *lowest_rq = NULL;
  250. int cpu;
  251. int tries;
  252. for (tries = 0; tries < RT_MAX_TRIES; tries++) {
  253. cpu = find_lowest_rq(task);
  254. if (cpu == -1)
  255. break;
  256. lowest_rq = cpu_rq(cpu);
  257. /* if the prio of this runqueue changed, try again */
  258. if (double_lock_balance(rq, lowest_rq)) {
  259. /*
  260. * We had to unlock the run queue. In
  261. * the mean time, task could have
  262. * migrated already or had its affinity changed.
  263. * Also make sure that it wasn't scheduled on its rq.
  264. */
  265. if (unlikely(task_rq(task) != rq ||
  266. !cpu_isset(lowest_rq->cpu, task->cpus_allowed) ||
  267. task_running(rq, task) ||
  268. !task->se.on_rq)) {
  269. spin_unlock(&lowest_rq->lock);
  270. lowest_rq = NULL;
  271. break;
  272. }
  273. }
  274. /* If this rq is still suitable use it. */
  275. if (lowest_rq->rt.highest_prio > task->prio)
  276. break;
  277. /* try again */
  278. spin_unlock(&lowest_rq->lock);
  279. lowest_rq = NULL;
  280. }
  281. return lowest_rq;
  282. }
  283. /*
  284. * If the current CPU has more than one RT task, see if the non
  285. * running task can migrate over to a CPU that is running a task
  286. * of lesser priority.
  287. */
  288. static int push_rt_task(struct rq *rq)
  289. {
  290. struct task_struct *next_task;
  291. struct rq *lowest_rq;
  292. int ret = 0;
  293. int paranoid = RT_MAX_TRIES;
  294. assert_spin_locked(&rq->lock);
  295. next_task = pick_next_highest_task_rt(rq, -1);
  296. if (!next_task)
  297. return 0;
  298. retry:
  299. if (unlikely(next_task == rq->curr)) {
  300. WARN_ON(1);
  301. return 0;
  302. }
  303. /*
  304. * It's possible that the next_task slipped in of
  305. * higher priority than current. If that's the case
  306. * just reschedule current.
  307. */
  308. if (unlikely(next_task->prio < rq->curr->prio)) {
  309. resched_task(rq->curr);
  310. return 0;
  311. }
  312. /* We might release rq lock */
  313. get_task_struct(next_task);
  314. /* find_lock_lowest_rq locks the rq if found */
  315. lowest_rq = find_lock_lowest_rq(next_task, rq);
  316. if (!lowest_rq) {
  317. struct task_struct *task;
  318. /*
  319. * find lock_lowest_rq releases rq->lock
  320. * so it is possible that next_task has changed.
  321. * If it has, then try again.
  322. */
  323. task = pick_next_highest_task_rt(rq, -1);
  324. if (unlikely(task != next_task) && task && paranoid--) {
  325. put_task_struct(next_task);
  326. next_task = task;
  327. goto retry;
  328. }
  329. goto out;
  330. }
  331. assert_spin_locked(&lowest_rq->lock);
  332. deactivate_task(rq, next_task, 0);
  333. set_task_cpu(next_task, lowest_rq->cpu);
  334. activate_task(lowest_rq, next_task, 0);
  335. resched_task(lowest_rq->curr);
  336. spin_unlock(&lowest_rq->lock);
  337. ret = 1;
  338. out:
  339. put_task_struct(next_task);
  340. return ret;
  341. }
  /*
   * Keep pushing RT tasks off @rq until push_rt_task() reports that it
   * could not move another one.
   *
   * TODO: Currently we just use the second highest prio task on
   *       the queue, and stop when it can't migrate (or there's
   *       no more RT tasks).  There may be a case where a lower
   *       priority RT task has a different affinity than the
   *       higher RT task. In this case the lower RT task could
   *       possibly be able to migrate where as the higher priority
   *       RT task could not.  We currently ignore this issue.
   *       Enhancements are welcome!
   */
  static void push_rt_tasks(struct rq *rq)
  {
      int moved;

      /* push_rt_task will return true if it moved an RT */
      do {
          moved = push_rt_task(rq);
      } while (moved);
  }
  358. static int pull_rt_task(struct rq *this_rq)
  359. {
  360. struct task_struct *next;
  361. struct task_struct *p;
  362. struct rq *src_rq;
  363. cpumask_t *rto_cpumask;
  364. int this_cpu = this_rq->cpu;
  365. int cpu;
  366. int ret = 0;
  367. assert_spin_locked(&this_rq->lock);
  368. /*
  369. * If cpusets are used, and we have overlapping
  370. * run queue cpusets, then this algorithm may not catch all.
  371. * This is just the price you pay on trying to keep
  372. * dirtying caches down on large SMP machines.
  373. */
  374. if (likely(!rt_overloaded()))
  375. return 0;
  376. next = pick_next_task_rt(this_rq);
  377. rto_cpumask = rt_overload();
  378. for_each_cpu_mask(cpu, *rto_cpumask) {
  379. if (this_cpu == cpu)
  380. continue;
  381. src_rq = cpu_rq(cpu);
  382. if (unlikely(src_rq->rt.rt_nr_running <= 1)) {
  383. /*
  384. * It is possible that overlapping cpusets
  385. * will miss clearing a non overloaded runqueue.
  386. * Clear it now.
  387. */
  388. if (double_lock_balance(this_rq, src_rq)) {
  389. /* unlocked our runqueue lock */
  390. struct task_struct *old_next = next;
  391. next = pick_next_task_rt(this_rq);
  392. if (next != old_next)
  393. ret = 1;
  394. }
  395. if (likely(src_rq->rt.rt_nr_running <= 1))
  396. /*
  397. * Small chance that this_rq->curr changed
  398. * but it's really harmless here.
  399. */
  400. rt_clear_overload(this_rq);
  401. else
  402. /*
  403. * Heh, the src_rq is now overloaded, since
  404. * we already have the src_rq lock, go straight
  405. * to pulling tasks from it.
  406. */
  407. goto try_pulling;
  408. spin_unlock(&src_rq->lock);
  409. continue;
  410. }
  411. /*
  412. * We can potentially drop this_rq's lock in
  413. * double_lock_balance, and another CPU could
  414. * steal our next task - hence we must cause
  415. * the caller to recalculate the next task
  416. * in that case:
  417. */
  418. if (double_lock_balance(this_rq, src_rq)) {
  419. struct task_struct *old_next = next;
  420. next = pick_next_task_rt(this_rq);
  421. if (next != old_next)
  422. ret = 1;
  423. }
  424. /*
  425. * Are there still pullable RT tasks?
  426. */
  427. if (src_rq->rt.rt_nr_running <= 1) {
  428. spin_unlock(&src_rq->lock);
  429. continue;
  430. }
  431. try_pulling:
  432. p = pick_next_highest_task_rt(src_rq, this_cpu);
  433. /*
  434. * Do we have an RT task that preempts
  435. * the to-be-scheduled task?
  436. */
  437. if (p && (!next || (p->prio < next->prio))) {
  438. WARN_ON(p == src_rq->curr);
  439. WARN_ON(!p->se.on_rq);
  440. /*
  441. * There's a chance that p is higher in priority
  442. * than what's currently running on its cpu.
  443. * This is just that p is wakeing up and hasn't
  444. * had a chance to schedule. We only pull
  445. * p if it is lower in priority than the
  446. * current task on the run queue or
  447. * this_rq next task is lower in prio than
  448. * the current task on that rq.
  449. */
  450. if (p->prio < src_rq->curr->prio ||
  451. (next && next->prio < src_rq->curr->prio))
  452. goto bail;
  453. ret = 1;
  454. deactivate_task(src_rq, p, 0);
  455. set_task_cpu(p, this_cpu);
  456. activate_task(this_rq, p, 0);
  457. /*
  458. * We continue with the search, just in
  459. * case there's an even higher prio task
  460. * in another runqueue. (low likelyhood
  461. * but possible)
  462. */
  463. /*
  464. * Update next so that we won't pick a task
  465. * on another cpu with a priority lower (or equal)
  466. * than the one we just picked.
  467. */
  468. next = p;
  469. }
  470. bail:
  471. spin_unlock(&src_rq->lock);
  472. }
  473. return ret;
  474. }
  475. static void schedule_balance_rt(struct rq *rq,
  476. struct task_struct *prev)
  477. {
  478. /* Try to pull RT tasks here if we lower this rq's prio */
  479. if (unlikely(rt_task(prev)) &&
  480. rq->rt.highest_prio > prev->prio)
  481. pull_rt_task(rq);
  482. }
  483. static void schedule_tail_balance_rt(struct rq *rq)
  484. {
  485. /*
  486. * If we have more than one rt_task queued, then
  487. * see if we can push the other rt_tasks off to other CPUS.
  488. * Note we may release the rq lock, and since
  489. * the lock was owned by prev, we need to release it
  490. * first via finish_lock_switch and then reaquire it here.
  491. */
  492. if (unlikely(rq->rt.rt_nr_running > 1)) {
  493. spin_lock_irq(&rq->lock);
  494. push_rt_tasks(rq);
  495. spin_unlock_irq(&rq->lock);
  496. }
  497. }
  498. static void wakeup_balance_rt(struct rq *rq, struct task_struct *p)
  499. {
  500. if (unlikely(rt_task(p)) &&
  501. !task_running(rq, p) &&
  502. (p->prio >= rq->curr->prio))
  503. push_rt_tasks(rq);
  504. }
  505. static unsigned long
  506. load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
  507. unsigned long max_load_move,
  508. struct sched_domain *sd, enum cpu_idle_type idle,
  509. int *all_pinned, int *this_best_prio)
  510. {
  511. /* don't touch RT tasks */
  512. return 0;
  513. }
  514. static int
  515. move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
  516. struct sched_domain *sd, enum cpu_idle_type idle)
  517. {
  518. /* don't touch RT tasks */
  519. return 0;
  520. }
  521. static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
  522. {
  523. int weight = cpus_weight(*new_mask);
  524. BUG_ON(!rt_task(p));
  525. /*
  526. * Update the migration status of the RQ if we have an RT task
  527. * which is running AND changing its weight value.
  528. */
  529. if (p->se.on_rq && (weight != p->nr_cpus_allowed)) {
  530. struct rq *rq = task_rq(p);
  531. if ((p->nr_cpus_allowed <= 1) && (weight > 1))
  532. rq->rt.rt_nr_migratory++;
  533. else if((p->nr_cpus_allowed > 1) && (weight <= 1)) {
  534. BUG_ON(!rq->rt.rt_nr_migratory);
  535. rq->rt.rt_nr_migratory--;
  536. }
  537. update_rt_migration(rq);
  538. }
  539. p->cpus_allowed = *new_mask;
  540. p->nr_cpus_allowed = weight;
  541. }
  542. #else /* CONFIG_SMP */
  /*
   * !CONFIG_SMP build: the three balancing hooks expand to empty
   * statements, so call sites need no #ifdef of their own.
   */
  # define schedule_tail_balance_rt(rq)    do { } while (0)
  # define schedule_balance_rt(rq, prev)   do { } while (0)
  # define wakeup_balance_rt(rq, p)        do { } while (0)
  546. #endif /* CONFIG_SMP */
  547. static void task_tick_rt(struct rq *rq, struct task_struct *p)
  548. {
  549. update_curr_rt(rq);
  550. /*
  551. * RR tasks need a special form of timeslice management.
  552. * FIFO tasks have no timeslices.
  553. */
  554. if (p->policy != SCHED_RR)
  555. return;
  556. if (--p->time_slice)
  557. return;
  558. p->time_slice = DEF_TIMESLICE;
  559. /*
  560. * Requeue to the end of queue if we are not the only element
  561. * on the queue:
  562. */
  563. if (p->run_list.prev != p->run_list.next) {
  564. requeue_task_rt(rq, p);
  565. set_tsk_need_resched(p);
  566. }
  567. }
  568. static void set_curr_task_rt(struct rq *rq)
  569. {
  570. struct task_struct *p = rq->curr;
  571. p->se.exec_start = rq->clock;
  572. }
  573. const struct sched_class rt_sched_class = {
  574. .next = &fair_sched_class,
  575. .enqueue_task = enqueue_task_rt,
  576. .dequeue_task = dequeue_task_rt,
  577. .yield_task = yield_task_rt,
  578. #ifdef CONFIG_SMP
  579. .select_task_rq = select_task_rq_rt,
  580. #endif /* CONFIG_SMP */
  581. .check_preempt_curr = check_preempt_curr_rt,
  582. .pick_next_task = pick_next_task_rt,
  583. .put_prev_task = put_prev_task_rt,
  584. #ifdef CONFIG_SMP
  585. .load_balance = load_balance_rt,
  586. .move_one_task = move_one_task_rt,
  587. .set_cpus_allowed = set_cpus_allowed_rt,
  588. #endif
  589. .set_curr_task = set_curr_task_rt,
  590. .task_tick = task_tick_rt,
  591. };