sched_rt.c

/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */
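/*
 * For reference: a task enters this class from user space by requesting
 * SCHED_FIFO or SCHED_RR through sched_setscheduler(). A minimal sketch,
 * assuming an illustrative static priority of 10 (any value in 1..99
 * would do):
 *
 *        #include <sched.h>
 *
 *        static int make_fifo(pid_t pid)
 *        {
 *                struct sched_param sp = { .sched_priority = 10 };
 *
 *                // pid 0 means the calling thread; SCHED_RR works the same way
 *                return sched_setscheduler(pid, SCHED_FIFO, &sp);
 *        }
 */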
#ifdef CONFIG_SMP

static inline int rt_overloaded(struct rq *rq)
{
        return atomic_read(&rq->rd->rto_count);
}

static inline void rt_set_overload(struct rq *rq)
{
        cpu_set(rq->cpu, rq->rd->rto_mask);
        /*
         * Make sure the mask is visible before we set
         * the overload count. That is checked to determine
         * if we should look at the mask. It would be a shame
         * if we looked at the mask, but the mask was not
         * updated yet.
         */
        wmb();
        atomic_inc(&rq->rd->rto_count);
}

static inline void rt_clear_overload(struct rq *rq)
{
        /* the order here really doesn't matter */
        atomic_dec(&rq->rd->rto_count);
        cpu_clear(rq->cpu, rq->rd->rto_mask);
}

static void update_rt_migration(struct rq *rq)
{
        if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
                if (!rq->rt.overloaded) {
                        rt_set_overload(rq);
                        rq->rt.overloaded = 1;
                }
        } else if (rq->rt.overloaded) {
                rt_clear_overload(rq);
                rq->rt.overloaded = 0;
        }
}
#endif /* CONFIG_SMP */

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
        return container_of(rt_se, struct task_struct, rt);
}

static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
        return !list_empty(&rt_se->run_list);
}

#ifdef CONFIG_FAIR_GROUP_SCHED

static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
{
        if (!rt_rq->tg)
                return SCHED_RT_FRAC;

        return rt_rq->tg->rt_ratio;
}

#define for_each_leaf_rt_rq(rt_rq, rq) \
        list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
        return rt_rq->rq;
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
        return rt_se->rt_rq;
}

#define for_each_sched_rt_entity(rt_se) \
        for (; rt_se; rt_se = rt_se->parent)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
        return rt_se->my_q;
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se);

static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
{
        struct sched_rt_entity *rt_se = rt_rq->rt_se;

        if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
                enqueue_rt_entity(rt_se);
                resched_task(rq_of_rt_rq(rt_rq)->curr);
        }
}

static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
{
        struct sched_rt_entity *rt_se = rt_rq->rt_se;

        if (rt_se && on_rt_rq(rt_se))
                dequeue_rt_entity(rt_se);
}

#else

static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
{
        return sysctl_sched_rt_ratio;
}

#define for_each_leaf_rt_rq(rt_rq, rq) \
        for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
        return container_of(rt_rq, struct rq, rt);
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
        struct task_struct *p = rt_task_of(rt_se);
        struct rq *rq = task_rq(p);

        return &rq->rt;
}

#define for_each_sched_rt_entity(rt_se) \
        for (; rt_se; rt_se = NULL)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
        return NULL;
}

static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
{
}

static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
{
}

#endif

static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
        struct rt_rq *rt_rq = group_rt_rq(rt_se);

        if (rt_rq)
                return rt_rq->highest_prio;
#endif

        return rt_task_of(rt_se)->prio;
}

static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
{
        unsigned int rt_ratio = sched_rt_ratio(rt_rq);
        u64 period, ratio;

        if (rt_ratio == SCHED_RT_FRAC)
                return 0;

        if (rt_rq->rt_throttled)
                return 1;

        period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
        ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;

        if (rt_rq->rt_time > ratio) {
                rt_rq->rt_throttled = 1;
                sched_rt_ratio_dequeue(rt_rq);
                return 1;
        }

        return 0;
}
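/*
 * Worked example of the threshold computed above, with illustrative
 * values (these particular numbers are assumptions, not taken from this
 * file): SCHED_RT_FRAC_SHIFT == 16, sysctl_sched_rt_period == 1000 (ms)
 * and rt_ratio == 62259, i.e. roughly 95% of the period:
 *
 *        period = 1000 * NSEC_PER_MSEC      = 1,000,000,000 ns
 *        ratio  = (period * 62259) >> 16   ~=   950,000,000 ns
 *
 * so the rt_rq may accumulate about 950 ms of rt_time per 1 s period
 * before sched_rt_ratio_exceeded() throttles it until the next
 * update_sched_rt_period() refresh.
 */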
static void __update_sched_rt_period(struct rt_rq *rt_rq, u64 period)
{
        unsigned long rt_ratio = sched_rt_ratio(rt_rq);
        u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;

        rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
        if (rt_rq->rt_throttled) {
                rt_rq->rt_throttled = 0;
                sched_rt_ratio_enqueue(rt_rq);
        }
}

static void update_sched_rt_period(struct rq *rq)
{
        struct rt_rq *rt_rq;
        u64 period;

        while (rq->clock > rq->rt_period_expire) {
                period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
                rq->rt_period_expire += period;

                for_each_leaf_rt_rq(rt_rq, rq)
                        __update_sched_rt_period(rt_rq, period);
        }
}

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static void update_curr_rt(struct rq *rq)
{
        struct task_struct *curr = rq->curr;
        struct sched_rt_entity *rt_se = &curr->rt;
        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
        u64 delta_exec;

        if (!task_has_rt_policy(curr))
                return;

        delta_exec = rq->clock - curr->se.exec_start;
        if (unlikely((s64)delta_exec < 0))
                delta_exec = 0;

        schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));

        curr->se.sum_exec_runtime += delta_exec;
        curr->se.exec_start = rq->clock;
        cpuacct_charge(curr, delta_exec);

        rt_rq->rt_time += delta_exec;
        /*
         * might make it a tad more accurate:
         *
         * update_sched_rt_period(rq);
         */
        if (sched_rt_ratio_exceeded(rt_rq))
                resched_task(curr);
}

static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
        rt_rq->rt_nr_running++;
#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
        if (rt_se_prio(rt_se) < rt_rq->highest_prio)
                rt_rq->highest_prio = rt_se_prio(rt_se);
#endif
#ifdef CONFIG_SMP
        if (rt_se->nr_cpus_allowed > 1) {
                struct rq *rq = rq_of_rt_rq(rt_rq);
                rq->rt.rt_nr_migratory++;
        }

        update_rt_migration(rq_of_rt_rq(rt_rq));
#endif
}

static inline
void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
        WARN_ON(!rt_rq->rt_nr_running);
        rt_rq->rt_nr_running--;
#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
        if (rt_rq->rt_nr_running) {
                struct rt_prio_array *array;

                WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
                if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
                        /* recalculate */
                        array = &rt_rq->active;
                        rt_rq->highest_prio =
                                sched_find_first_bit(array->bitmap);
                } /* otherwise leave rt_rq->highest_prio alone */
        } else
                rt_rq->highest_prio = MAX_RT_PRIO;
#endif
#ifdef CONFIG_SMP
        if (rt_se->nr_cpus_allowed > 1) {
                struct rq *rq = rq_of_rt_rq(rt_rq);
                rq->rt.rt_nr_migratory--;
        }

        update_rt_migration(rq_of_rt_rq(rt_rq));
#endif /* CONFIG_SMP */
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
{
        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
        struct rt_prio_array *array = &rt_rq->active;
        struct rt_rq *group_rq = group_rt_rq(rt_se);

        if (group_rq && group_rq->rt_throttled)
                return;

        list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
        __set_bit(rt_se_prio(rt_se), array->bitmap);

        inc_rt_tasks(rt_se, rt_rq);
}

static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
        struct rt_prio_array *array = &rt_rq->active;

        list_del_init(&rt_se->run_list);
        if (list_empty(array->queue + rt_se_prio(rt_se)))
                __clear_bit(rt_se_prio(rt_se), array->bitmap);

        dec_rt_tasks(rt_se, rt_rq);
}

/*
 * Because the prio of an upper entry depends on the lower
 * entries, we must remove entries top - down.
 *
 * XXX: O(1/2 h^2) because we can only walk up, not down the chain.
 * Doesn't matter much for now, as h=2 for GROUP_SCHED.
 */
static void dequeue_rt_stack(struct task_struct *p)
{
        struct sched_rt_entity *rt_se, *top_se;

        /*
         * dequeue all, top - down.
         */
        do {
                rt_se = &p->rt;
                top_se = NULL;
                for_each_sched_rt_entity(rt_se) {
                        if (on_rt_rq(rt_se))
                                top_se = rt_se;
                }
                if (top_se)
                        dequeue_rt_entity(top_se);
        } while (top_se);
}

/*
 * Adding/removing a task to/from a priority array:
 */
static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
{
        struct sched_rt_entity *rt_se = &p->rt;

        if (wakeup)
                rt_se->timeout = 0;

        dequeue_rt_stack(p);

        /*
         * enqueue everybody, bottom - up.
         */
        for_each_sched_rt_entity(rt_se)
                enqueue_rt_entity(rt_se);

        inc_cpu_load(rq, p->se.load.weight);
}

static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
{
        struct sched_rt_entity *rt_se = &p->rt;
        struct rt_rq *rt_rq;

        update_curr_rt(rq);

        dequeue_rt_stack(p);

        /*
         * re-enqueue all non-empty rt_rq entities.
         */
        for_each_sched_rt_entity(rt_se) {
                rt_rq = group_rt_rq(rt_se);
                if (rt_rq && rt_rq->rt_nr_running)
                        enqueue_rt_entity(rt_se);
        }

        dec_cpu_load(rq, p->se.load.weight);
}

/*
 * Put task to the end of the run list without the overhead of dequeue
 * followed by enqueue.
 */
static
void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
{
        struct rt_prio_array *array = &rt_rq->active;

        list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
}

static void requeue_task_rt(struct rq *rq, struct task_struct *p)
{
        struct sched_rt_entity *rt_se = &p->rt;
        struct rt_rq *rt_rq;

        for_each_sched_rt_entity(rt_se) {
                rt_rq = rt_rq_of_se(rt_se);
                requeue_rt_entity(rt_rq, rt_se);
        }
}

static void yield_task_rt(struct rq *rq)
{
        requeue_task_rt(rq, rq->curr);
}

#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);

static int select_task_rq_rt(struct task_struct *p, int sync)
{
        struct rq *rq = task_rq(p);

        /*
         * If the current task is an RT task, then
         * try to see if we can wake this RT task up on another
         * runqueue. Otherwise simply start this RT task
         * on its current runqueue.
         *
         * We want to avoid overloading runqueues. Even if
         * the RT task is of higher priority than the current RT task.
         * RT tasks behave differently than other tasks. If
         * one gets preempted, we try to push it off to another queue.
         * So trying to keep a preempting RT task on the same
         * cache hot CPU will force the running RT task to
         * a cold CPU. So we waste all the cache for the lower
         * RT task in hopes of saving some of an RT task
         * that is just being woken and probably will have
         * cold cache anyway.
         */
        if (unlikely(rt_task(rq->curr)) &&
            (p->rt.nr_cpus_allowed > 1)) {
                int cpu = find_lowest_rq(p);

                return (cpu == -1) ? task_cpu(p) : cpu;
        }

        /*
         * Otherwise, just let it ride on the affined RQ and the
         * post-schedule router will push the preempted task away
         */
        return task_cpu(p);
}
#endif /* CONFIG_SMP */

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
{
        if (p->prio < rq->curr->prio)
                resched_task(rq->curr);
}

static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
                                                   struct rt_rq *rt_rq)
{
        struct rt_prio_array *array = &rt_rq->active;
        struct sched_rt_entity *next = NULL;
        struct list_head *queue;
        int idx;

        if (sched_rt_ratio_exceeded(rt_rq))
                goto out;

        idx = sched_find_first_bit(array->bitmap);
        BUG_ON(idx >= MAX_RT_PRIO);

        queue = array->queue + idx;
        next = list_entry(queue->next, struct sched_rt_entity, run_list);
out:
        return next;
}

static struct task_struct *pick_next_task_rt(struct rq *rq)
{
        struct sched_rt_entity *rt_se;
        struct task_struct *p;
        struct rt_rq *rt_rq;

retry:
        rt_rq = &rq->rt;

        if (unlikely(!rt_rq->rt_nr_running))
                return NULL;

        if (sched_rt_ratio_exceeded(rt_rq))
                return NULL;

        do {
                rt_se = pick_next_rt_entity(rq, rt_rq);
                if (unlikely(!rt_se))
                        goto retry;
                rt_rq = group_rt_rq(rt_se);
        } while (rt_rq);

        p = rt_task_of(rt_se);
        p->se.exec_start = rq->clock;

        return p;
}

static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
        update_curr_rt(rq);
        p->se.exec_start = 0;
}
#ifdef CONFIG_SMP

/* Only try algorithms three times */
#define RT_MAX_TRIES 3

static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);

static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
        if (!task_running(rq, p) &&
            (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
            (p->rt.nr_cpus_allowed > 1))
                return 1;
        return 0;
}

/* Return the second highest RT task, NULL otherwise */
static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
{
        struct task_struct *next = NULL;
        struct sched_rt_entity *rt_se;
        struct rt_prio_array *array;
        struct rt_rq *rt_rq;
        int idx;

        for_each_leaf_rt_rq(rt_rq, rq) {
                array = &rt_rq->active;
                idx = sched_find_first_bit(array->bitmap);
next_idx:
                if (idx >= MAX_RT_PRIO)
                        continue;
                if (next && next->prio < idx)
                        continue;
                list_for_each_entry(rt_se, array->queue + idx, run_list) {
                        struct task_struct *p = rt_task_of(rt_se);
                        if (pick_rt_task(rq, p, cpu)) {
                                next = p;
                                break;
                        }
                }
                if (!next) {
                        idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
                        goto next_idx;
                }
        }

        return next;
}

static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);

static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
{
        int lowest_prio = -1;
        int lowest_cpu = -1;
        int count = 0;
        int cpu;

        cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);

        /*
         * Scan each rq for the lowest prio.
         */
        for_each_cpu_mask(cpu, *lowest_mask) {
                struct rq *rq = cpu_rq(cpu);

                /* We look for lowest RT prio or non-rt CPU */
                if (rq->rt.highest_prio >= MAX_RT_PRIO) {
                        /*
                         * if we already found a low RT queue
                         * and now we found this non-rt queue
                         * clear the mask and set our bit.
                         * Otherwise just return the queue as is
                         * and the count==1 will cause the algorithm
                         * to use the first bit found.
                         */
                        if (lowest_cpu != -1) {
                                cpus_clear(*lowest_mask);
                                cpu_set(rq->cpu, *lowest_mask);
                        }
                        return 1;
                }

                /* no locking for now */
                if ((rq->rt.highest_prio > task->prio)
                    && (rq->rt.highest_prio >= lowest_prio)) {
                        if (rq->rt.highest_prio > lowest_prio) {
                                /* new low - clear old data */
                                lowest_prio = rq->rt.highest_prio;
                                lowest_cpu = cpu;
                                count = 0;
                        }
                        count++;
                } else
                        cpu_clear(cpu, *lowest_mask);
        }

        /*
         * Clear out all the set bits that represent
         * runqueues that were of higher prio than
         * the lowest_prio.
         */
        if (lowest_cpu > 0) {
                /*
                 * Perhaps we could add another cpumask op to
                 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
                 * Then that could be optimized to use memset and such.
                 */
                for_each_cpu_mask(cpu, *lowest_mask) {
                        if (cpu >= lowest_cpu)
                                break;
                        cpu_clear(cpu, *lowest_mask);
                }
        }

        return count;
}

static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
{
        int first;

        /* "this_cpu" is cheaper to preempt than a remote processor */
        if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
                return this_cpu;

        first = first_cpu(*mask);
        if (first != NR_CPUS)
                return first;

        return -1;
}

static int find_lowest_rq(struct task_struct *task)
{
        struct sched_domain *sd;
        cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
        int this_cpu = smp_processor_id();
        int cpu = task_cpu(task);
        int count = find_lowest_cpus(task, lowest_mask);

        if (!count)
                return -1; /* No targets found */

        /*
         * There is no sense in performing an optimal search if only one
         * target is found.
         */
        if (count == 1)
                return first_cpu(*lowest_mask);

        /*
         * At this point we have built a mask of cpus representing the
         * lowest priority tasks in the system.  Now we want to elect
         * the best one based on our affinity and topology.
         *
         * We prioritize the last cpu that the task executed on since
         * it is most likely cache-hot in that location.
         */
        if (cpu_isset(cpu, *lowest_mask))
                return cpu;

        /*
         * Otherwise, we consult the sched_domains span maps to figure
         * out which cpu is logically closest to our hot cache data.
         */
        if (this_cpu == cpu)
                this_cpu = -1; /* Skip this_cpu opt if the same */

        for_each_domain(cpu, sd) {
                if (sd->flags & SD_WAKE_AFFINE) {
                        cpumask_t domain_mask;
                        int best_cpu;

                        cpus_and(domain_mask, sd->span, *lowest_mask);

                        best_cpu = pick_optimal_cpu(this_cpu,
                                                    &domain_mask);
                        if (best_cpu != -1)
                                return best_cpu;
                }
        }

        /*
         * And finally, if there were no matches within the domains
         * just give the caller *something* to work with from the compatible
         * locations.
         */
        return pick_optimal_cpu(this_cpu, lowest_mask);
}
/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
        struct rq *lowest_rq = NULL;
        int tries;
        int cpu;

        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
                cpu = find_lowest_rq(task);

                if ((cpu == -1) || (cpu == rq->cpu))
                        break;

                lowest_rq = cpu_rq(cpu);

                /* if the prio of this runqueue changed, try again */
                if (double_lock_balance(rq, lowest_rq)) {
                        /*
                         * We had to unlock the run queue. In
                         * the mean time, task could have
                         * migrated already or had its affinity changed.
                         * Also make sure that it wasn't scheduled on its rq.
                         */
                        if (unlikely(task_rq(task) != rq ||
                                     !cpu_isset(lowest_rq->cpu,
                                                task->cpus_allowed) ||
                                     task_running(rq, task) ||
                                     !task->se.on_rq)) {
                                spin_unlock(&lowest_rq->lock);
                                lowest_rq = NULL;
                                break;
                        }
                }

                /* If this rq is still suitable use it. */
                if (lowest_rq->rt.highest_prio > task->prio)
                        break;

                /* try again */
                spin_unlock(&lowest_rq->lock);
                lowest_rq = NULL;
        }

        return lowest_rq;
}

/*
 * If the current CPU has more than one RT task, see if the non-running
 * task can migrate over to a CPU that is running a task of lesser
 * priority.
 */
static int push_rt_task(struct rq *rq)
{
        struct task_struct *next_task;
        struct rq *lowest_rq;
        int ret = 0;
        int paranoid = RT_MAX_TRIES;

        if (!rq->rt.overloaded)
                return 0;

        next_task = pick_next_highest_task_rt(rq, -1);
        if (!next_task)
                return 0;

retry:
        if (unlikely(next_task == rq->curr)) {
                WARN_ON(1);
                return 0;
        }

        /*
         * It's possible that the next_task slipped in of
         * higher priority than current. If that's the case
         * just reschedule current.
         */
        if (unlikely(next_task->prio < rq->curr->prio)) {
                resched_task(rq->curr);
                return 0;
        }

        /* We might release rq lock */
        get_task_struct(next_task);

        /* find_lock_lowest_rq locks the rq if found */
        lowest_rq = find_lock_lowest_rq(next_task, rq);
        if (!lowest_rq) {
                struct task_struct *task;
                /*
                 * find_lock_lowest_rq releases rq->lock
                 * so it is possible that next_task has changed.
                 * If it has, then try again.
                 */
                task = pick_next_highest_task_rt(rq, -1);
                if (unlikely(task != next_task) && task && paranoid--) {
                        put_task_struct(next_task);
                        next_task = task;
                        goto retry;
                }
                goto out;
        }

        deactivate_task(rq, next_task, 0);
        set_task_cpu(next_task, lowest_rq->cpu);
        activate_task(lowest_rq, next_task, 0);

        resched_task(lowest_rq->curr);

        spin_unlock(&lowest_rq->lock);

        ret = 1;
out:
        put_task_struct(next_task);

        return ret;
}

/*
 * TODO: Currently we just use the second highest prio task on
 *       the queue, and stop when it can't migrate (or there's
 *       no more RT tasks).  There may be a case where a lower
 *       priority RT task has a different affinity than the
 *       higher RT task. In this case the lower RT task could
 *       possibly be able to migrate whereas the higher priority
 *       RT task could not.  We currently ignore this issue.
 *       Enhancements are welcome!
 */
static void push_rt_tasks(struct rq *rq)
{
        /* push_rt_task will return true if it moved an RT */
        while (push_rt_task(rq))
                ;
}

static int pull_rt_task(struct rq *this_rq)
{
        int this_cpu = this_rq->cpu, ret = 0, cpu;
        struct task_struct *p, *next;
        struct rq *src_rq;

        if (likely(!rt_overloaded(this_rq)))
                return 0;

        next = pick_next_task_rt(this_rq);

        for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
                if (this_cpu == cpu)
                        continue;

                src_rq = cpu_rq(cpu);
                /*
                 * We can potentially drop this_rq's lock in
                 * double_lock_balance, and another CPU could
                 * steal our next task - hence we must cause
                 * the caller to recalculate the next task
                 * in that case:
                 */
                if (double_lock_balance(this_rq, src_rq)) {
                        struct task_struct *old_next = next;

                        next = pick_next_task_rt(this_rq);
                        if (next != old_next)
                                ret = 1;
                }

                /*
                 * Are there still pullable RT tasks?
                 */
                if (src_rq->rt.rt_nr_running <= 1) {
                        spin_unlock(&src_rq->lock);
                        continue;
                }

                p = pick_next_highest_task_rt(src_rq, this_cpu);

                /*
                 * Do we have an RT task that preempts
                 * the to-be-scheduled task?
                 */
                if (p && (!next || (p->prio < next->prio))) {
                        WARN_ON(p == src_rq->curr);
                        WARN_ON(!p->se.on_rq);

                        /*
                         * There's a chance that p is higher in priority
                         * than what's currently running on its cpu.
                         * This is just that p is waking up and hasn't
                         * had a chance to schedule. We only pull
                         * p if it is lower in priority than the
                         * current task on the run queue or
                         * this_rq's next task is lower in prio than
                         * the current task on that rq.
                         */
                        if (p->prio < src_rq->curr->prio ||
                            (next && next->prio < src_rq->curr->prio))
                                goto out;

                        ret = 1;

                        deactivate_task(src_rq, p, 0);
                        set_task_cpu(p, this_cpu);
                        activate_task(this_rq, p, 0);
                        /*
                         * We continue with the search, just in
                         * case there's an even higher prio task
                         * in another runqueue. (low likelihood
                         * but possible)
                         *
                         * Update next so that we won't pick a task
                         * on another cpu with a priority lower (or equal)
                         * than the one we just picked.
                         */
                        next = p;
                }
out:
                spin_unlock(&src_rq->lock);
        }

        return ret;
}
static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
{
        /* Try to pull RT tasks here if we lower this rq's prio */
        if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
                pull_rt_task(rq);
}

static void post_schedule_rt(struct rq *rq)
{
        /*
         * If we have more than one rt_task queued, then
         * see if we can push the other rt_tasks off to other CPUs.
         * Note we may release the rq lock, and since
         * the lock was owned by prev, we need to release it
         * first via finish_lock_switch and then reacquire it here.
         */
        if (unlikely(rq->rt.overloaded)) {
                spin_lock_irq(&rq->lock);
                push_rt_tasks(rq);
                spin_unlock_irq(&rq->lock);
        }
}

static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
{
        if (!task_running(rq, p) &&
            (p->prio >= rq->rt.highest_prio) &&
            rq->rt.overloaded)
                push_rt_tasks(rq);
}

static unsigned long
load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
                unsigned long max_load_move,
                struct sched_domain *sd, enum cpu_idle_type idle,
                int *all_pinned, int *this_best_prio)
{
        /* don't touch RT tasks */
        return 0;
}

static int
move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
                 struct sched_domain *sd, enum cpu_idle_type idle)
{
        /* don't touch RT tasks */
        return 0;
}

static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
{
        int weight = cpus_weight(*new_mask);

        BUG_ON(!rt_task(p));

        /*
         * Update the migration status of the RQ if we have an RT task
         * which is running AND changing its weight value.
         */
        if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
                struct rq *rq = task_rq(p);

                if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
                        rq->rt.rt_nr_migratory++;
                } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
                        BUG_ON(!rq->rt.rt_nr_migratory);
                        rq->rt.rt_nr_migratory--;
                }

                update_rt_migration(rq);
        }

        p->cpus_allowed = *new_mask;
        p->rt.nr_cpus_allowed = weight;
}
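/*
 * The cpumask handed to set_cpus_allowed_rt() typically originates in
 * user space, for example via sched_setaffinity(). A minimal sketch that
 * pins the calling thread to CPU 0 (the CPU number is purely
 * illustrative):
 *
 *        #define _GNU_SOURCE
 *        #include <sched.h>
 *
 *        static int pin_to_cpu0(void)
 *        {
 *                cpu_set_t mask;
 *
 *                CPU_ZERO(&mask);
 *                CPU_SET(0, &mask);
 *                // pid 0 means the calling thread
 *                return sched_setaffinity(0, sizeof(mask), &mask);
 *        }
 */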
/* Assumes rq->lock is held */
static void join_domain_rt(struct rq *rq)
{
        if (rq->rt.overloaded)
                rt_set_overload(rq);
}

/* Assumes rq->lock is held */
static void leave_domain_rt(struct rq *rq)
{
        if (rq->rt.overloaded)
                rt_clear_overload(rq);
}

/*
 * When we switch away from the rt queue, we bring ourselves to a position
 * where we might want to pull RT tasks from other runqueues.
 */
static void switched_from_rt(struct rq *rq, struct task_struct *p,
                             int running)
{
        /*
         * If there are other RT tasks then we will reschedule
         * and the scheduling of the other RT tasks will handle
         * the balancing. But if we are the last RT task
         * we may need to handle the pulling of RT tasks
         * now.
         */
        if (!rq->rt.rt_nr_running)
                pull_rt_task(rq);
}
#endif /* CONFIG_SMP */

/*
 * When switching a task to RT, we may overload the runqueue
 * with RT tasks. In this case we try to push them off to
 * other runqueues.
 */
static void switched_to_rt(struct rq *rq, struct task_struct *p,
                           int running)
{
        int check_resched = 1;

        /*
         * If we are already running, then there's nothing
         * that needs to be done. But if we are not running
         * we may need to preempt the current running task.
         * If that current running task is also an RT task
         * then see if we can move to another run queue.
         */
        if (!running) {
#ifdef CONFIG_SMP
                if (rq->rt.overloaded && push_rt_task(rq) &&
                    /* Don't resched if we changed runqueues */
                    rq != task_rq(p))
                        check_resched = 0;
#endif /* CONFIG_SMP */
                if (check_resched && p->prio < rq->curr->prio)
                        resched_task(rq->curr);
        }
}

/*
 * Priority of the task has changed. This may cause
 * us to initiate a push or pull.
 */
static void prio_changed_rt(struct rq *rq, struct task_struct *p,
                            int oldprio, int running)
{
        if (running) {
#ifdef CONFIG_SMP
                /*
                 * If our priority decreases while running, we
                 * may need to pull tasks to this runqueue.
                 */
                if (oldprio < p->prio)
                        pull_rt_task(rq);
                /*
                 * If there's a higher priority task waiting to run
                 * then reschedule.
                 */
                if (p->prio > rq->rt.highest_prio)
                        resched_task(p);
#else
                /* For UP simply resched on drop of prio */
                if (oldprio < p->prio)
                        resched_task(p);
#endif /* CONFIG_SMP */
        } else {
                /*
                 * This task is not running, but if it is
                 * greater than the current running task
                 * then reschedule.
                 */
                if (p->prio < rq->curr->prio)
                        resched_task(rq->curr);
        }
}

static void watchdog(struct rq *rq, struct task_struct *p)
{
        unsigned long soft, hard;

        if (!p->signal)
                return;

        soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur;
        hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max;

        if (soft != RLIM_INFINITY) {
                unsigned long next;

                p->rt.timeout++;
                next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
                if (next > p->rt.timeout) {
                        u64 next_time = p->se.sum_exec_runtime;

                        next_time += next * (NSEC_PER_SEC/HZ);
                        if (p->it_sched_expires > next_time)
                                p->it_sched_expires = next_time;
                } else
                        p->it_sched_expires = p->se.sum_exec_runtime;
        }
}
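/*
 * RLIMIT_RTTIME, consumed by watchdog() above, is the amount of CPU time
 * (in microseconds) a SCHED_FIFO/SCHED_RR task may burn without blocking;
 * watchdog() turns min(soft, hard) into ticks with
 * DIV_ROUND_UP(..., USEC_PER_SEC/HZ). A minimal user-space sketch that
 * sets the limit, assuming an illustrative budget of 2 seconds:
 *
 *        #include <sys/resource.h>
 *
 *        static int limit_rt_runtime(void)
 *        {
 *                struct rlimit rl = {
 *                        .rlim_cur = 2000000,   // soft limit, in microseconds
 *                        .rlim_max = 2000000,   // hard limit
 *                };
 *
 *                return setrlimit(RLIMIT_RTTIME, &rl);
 *        }
 */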
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
        update_curr_rt(rq);

        watchdog(rq, p);

        /*
         * RR tasks need a special form of timeslice management.
         * FIFO tasks have no timeslices.
         */
        if (p->policy != SCHED_RR)
                return;

        if (--p->rt.time_slice)
                return;

        p->rt.time_slice = DEF_TIMESLICE;

        /*
         * Requeue to the end of queue if we are not the only element
         * on the queue:
         */
        if (p->rt.run_list.prev != p->rt.run_list.next) {
                requeue_task_rt(rq, p);
                set_tsk_need_resched(p);
        }
}
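/*
 * A SCHED_RR task's round-robin quantum, which the tick handler above
 * refills from DEF_TIMESLICE, can be queried from user space with
 * sched_rr_get_interval(). A minimal sketch (pid 0 means the calling
 * thread):
 *
 *        #include <sched.h>
 *        #include <stdio.h>
 *        #include <time.h>
 *
 *        static void print_rr_quantum(void)
 *        {
 *                struct timespec ts;
 *
 *                if (sched_rr_get_interval(0, &ts) == 0)
 *                        printf("RR quantum: %ld.%09ld s\n",
 *                               (long)ts.tv_sec, ts.tv_nsec);
 *        }
 */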
static void set_curr_task_rt(struct rq *rq)
{
        struct task_struct *p = rq->curr;

        p->se.exec_start = rq->clock;
}

const struct sched_class rt_sched_class = {
        .next = &fair_sched_class,
        .enqueue_task = enqueue_task_rt,
        .dequeue_task = dequeue_task_rt,
        .yield_task = yield_task_rt,
#ifdef CONFIG_SMP
        .select_task_rq = select_task_rq_rt,
#endif /* CONFIG_SMP */

        .check_preempt_curr = check_preempt_curr_rt,

        .pick_next_task = pick_next_task_rt,
        .put_prev_task = put_prev_task_rt,

#ifdef CONFIG_SMP
        .load_balance = load_balance_rt,
        .move_one_task = move_one_task_rt,
        .set_cpus_allowed = set_cpus_allowed_rt,
        .join_domain = join_domain_rt,
        .leave_domain = leave_domain_rt,
        .pre_schedule = pre_schedule_rt,
        .post_schedule = post_schedule_rt,
        .task_wake_up = task_wake_up_rt,
        .switched_from = switched_from_rt,
#endif

        .set_curr_task = set_curr_task_rt,
        .task_tick = task_tick_rt,

        .prio_changed = prio_changed_rt,
        .switched_to = switched_to_rt,
};