device_cgroup.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593
  1. /*
  2. * device_cgroup.c - device cgroup subsystem
  3. *
  4. * Copyright 2007 IBM Corp
  5. */
  6. #include <linux/device_cgroup.h>
  7. #include <linux/cgroup.h>
  8. #include <linux/ctype.h>
  9. #include <linux/list.h>
  10. #include <linux/uaccess.h>
  11. #include <linux/seq_file.h>
  12. #include <linux/slab.h>
  13. #include <linux/rcupdate.h>
  14. #include <linux/mutex.h>
  /* Access bits stored in dev_exception_item.access; they may be OR-ed. */
  #define ACC_MKNOD	0x1
  #define ACC_READ	0x2
  #define ACC_WRITE	0x4
  #define ACC_MASK	(ACC_MKNOD | ACC_READ | ACC_WRITE)

  /* Device classes stored in dev_exception_item.type. */
  #define DEV_BLOCK	0x1
  #define DEV_CHAR	0x2
  #define DEV_ALL		0x4	/* this represents all devices */
  22. static DEFINE_MUTEX(devcgroup_mutex);
  23. /*
  24. * exception list locking rules:
  25. * hold devcgroup_mutex for update/read.
  26. * hold rcu_read_lock() for read.
  27. */
  28. struct dev_exception_item {
  29. u32 major, minor;
  30. short type;
  31. short access;
  32. struct list_head list;
  33. struct rcu_head rcu;
  34. };
  35. struct dev_cgroup {
  36. struct cgroup_subsys_state css;
  37. struct list_head exceptions;
  38. bool deny_all;
  39. };
  40. static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
  41. {
  42. return container_of(s, struct dev_cgroup, css);
  43. }
  44. static inline struct dev_cgroup *cgroup_to_devcgroup(struct cgroup *cgroup)
  45. {
  46. return css_to_devcgroup(cgroup_subsys_state(cgroup, devices_subsys_id));
  47. }
  48. static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
  49. {
  50. return css_to_devcgroup(task_subsys_state(task, devices_subsys_id));
  51. }
  52. struct cgroup_subsys devices_subsys;
  53. static int devcgroup_can_attach(struct cgroup *new_cgrp,
  54. struct cgroup_taskset *set)
  55. {
  56. struct task_struct *task = cgroup_taskset_first(set);
  57. if (current != task && !capable(CAP_SYS_ADMIN))
  58. return -EPERM;
  59. return 0;
  60. }
  61. /*
  62. * called under devcgroup_mutex
  63. */
  64. static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig)
  65. {
  66. struct dev_exception_item *ex, *tmp, *new;
  67. list_for_each_entry(ex, orig, list) {
  68. new = kmemdup(ex, sizeof(*ex), GFP_KERNEL);
  69. if (!new)
  70. goto free_and_exit;
  71. list_add_tail(&new->list, dest);
  72. }
  73. return 0;
  74. free_and_exit:
  75. list_for_each_entry_safe(ex, tmp, dest, list) {
  76. list_del(&ex->list);
  77. kfree(ex);
  78. }
  79. return -ENOMEM;
  80. }
  81. /*
  82. * called under devcgroup_mutex
  83. */
  84. static int dev_exception_add(struct dev_cgroup *dev_cgroup,
  85. struct dev_exception_item *ex)
  86. {
  87. struct dev_exception_item *excopy, *walk;
  88. excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL);
  89. if (!excopy)
  90. return -ENOMEM;
  91. list_for_each_entry(walk, &dev_cgroup->exceptions, list) {
  92. if (walk->type != ex->type)
  93. continue;
  94. if (walk->major != ex->major)
  95. continue;
  96. if (walk->minor != ex->minor)
  97. continue;
  98. walk->access |= ex->access;
  99. kfree(excopy);
  100. excopy = NULL;
  101. }
  102. if (excopy != NULL)
  103. list_add_tail_rcu(&excopy->list, &dev_cgroup->exceptions);
  104. return 0;
  105. }
  106. /*
  107. * called under devcgroup_mutex
  108. */
  109. static void dev_exception_rm(struct dev_cgroup *dev_cgroup,
  110. struct dev_exception_item *ex)
  111. {
  112. struct dev_exception_item *walk, *tmp;
  113. list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) {
  114. if (walk->type != ex->type)
  115. continue;
  116. if (walk->major != ex->major)
  117. continue;
  118. if (walk->minor != ex->minor)
  119. continue;
  120. walk->access &= ~ex->access;
  121. if (!walk->access) {
  122. list_del_rcu(&walk->list);
  123. kfree_rcu(walk, rcu);
  124. }
  125. }
  126. }
  127. /**
  128. * dev_exception_clean - frees all entries of the exception list
  129. * @dev_cgroup: dev_cgroup with the exception list to be cleaned
  130. *
  131. * called under devcgroup_mutex
  132. */
  133. static void dev_exception_clean(struct dev_cgroup *dev_cgroup)
  134. {
  135. struct dev_exception_item *ex, *tmp;
  136. list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) {
  137. list_del(&ex->list);
  138. kfree(ex);
  139. }
  140. }
  141. /*
  142. * called from kernel/cgroup.c with cgroup_lock() held.
  143. */
  144. static struct cgroup_subsys_state *devcgroup_create(struct cgroup *cgroup)
  145. {
  146. struct dev_cgroup *dev_cgroup, *parent_dev_cgroup;
  147. struct cgroup *parent_cgroup;
  148. int ret;
  149. dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL);
  150. if (!dev_cgroup)
  151. return ERR_PTR(-ENOMEM);
  152. INIT_LIST_HEAD(&dev_cgroup->exceptions);
  153. parent_cgroup = cgroup->parent;
  154. if (parent_cgroup == NULL)
  155. dev_cgroup->deny_all = false;
  156. else {
  157. parent_dev_cgroup = cgroup_to_devcgroup(parent_cgroup);
  158. mutex_lock(&devcgroup_mutex);
  159. ret = dev_exceptions_copy(&dev_cgroup->exceptions,
  160. &parent_dev_cgroup->exceptions);
  161. dev_cgroup->deny_all = parent_dev_cgroup->deny_all;
  162. mutex_unlock(&devcgroup_mutex);
  163. if (ret) {
  164. kfree(dev_cgroup);
  165. return ERR_PTR(ret);
  166. }
  167. }
  168. return &dev_cgroup->css;
  169. }
  /* Release the exception list of @cgroup and then its dev_cgroup. */
  static void devcgroup_destroy(struct cgroup *cgroup)
  {
  	struct dev_cgroup *devcg = cgroup_to_devcgroup(cgroup);

  	dev_exception_clean(devcg);
  	kfree(devcg);
  }
  /* Identifiers stored in cftype.private to tell the control files apart. */
  #define DEVCG_ALLOW	1
  #define DEVCG_DENY	2
  #define DEVCG_LIST	3

  /* Sizes of the scratch buffers used when printing an exception. */
  #define MAJMINLEN	13	/* a major/minor in decimal, or "*" */
  #define ACCLEN		4	/* up to "rwm" plus the terminating NUL */
  182. static void set_access(char *acc, short access)
  183. {
  184. int idx = 0;
  185. memset(acc, 0, ACCLEN);
  186. if (access & ACC_READ)
  187. acc[idx++] = 'r';
  188. if (access & ACC_WRITE)
  189. acc[idx++] = 'w';
  190. if (access & ACC_MKNOD)
  191. acc[idx++] = 'm';
  192. }
  193. static char type_to_char(short type)
  194. {
  195. if (type == DEV_ALL)
  196. return 'a';
  197. if (type == DEV_CHAR)
  198. return 'c';
  199. if (type == DEV_BLOCK)
  200. return 'b';
  201. return 'X';
  202. }
  /*
   * Format a major or minor number into @str: "*" for the wildcard value
   * ~0, the decimal representation of @m otherwise.
   */
  static void set_majmin(char *str, unsigned m)
  {
  	if (m != ~0) {
  		sprintf(str, "%u", m);
  		return;
  	}

  	strcpy(str, "*");
  }
  210. static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft,
  211. struct seq_file *m)
  212. {
  213. struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup);
  214. struct dev_exception_item *ex;
  215. char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN];
  216. rcu_read_lock();
  217. /*
  218. * To preserve the compatibility:
  219. * - Only show the "all devices" when the default policy is to allow
  220. * - List the exceptions in case the default policy is to deny
  221. * This way, the file remains as a "whitelist of devices"
  222. */
  223. if (devcgroup->deny_all == false) {
  224. set_access(acc, ACC_MASK);
  225. set_majmin(maj, ~0);
  226. set_majmin(min, ~0);
  227. seq_printf(m, "%c %s:%s %s\n", type_to_char(DEV_ALL),
  228. maj, min, acc);
  229. } else {
  230. list_for_each_entry_rcu(ex, &devcgroup->exceptions, list) {
  231. set_access(acc, ex->access);
  232. set_majmin(maj, ex->major);
  233. set_majmin(min, ex->minor);
  234. seq_printf(m, "%c %s:%s %s\n", type_to_char(ex->type),
  235. maj, min, acc);
  236. }
  237. }
  238. rcu_read_unlock();
  239. return 0;
  240. }
  241. /**
  242. * may_access - verifies if a new exception is part of what is allowed
  243. * by a dev cgroup based on the default policy +
  244. * exceptions. This is used to make sure a child cgroup
  245. * won't have more privileges than its parent or to
  246. * verify if a certain access is allowed.
  247. * @dev_cgroup: dev cgroup to be tested against
  248. * @refex: new exception
  249. */
  250. static int may_access(struct dev_cgroup *dev_cgroup,
  251. struct dev_exception_item *refex)
  252. {
  253. struct dev_exception_item *ex;
  254. bool match = false;
  255. list_for_each_entry(ex, &dev_cgroup->exceptions, list) {
  256. if ((refex->type & DEV_BLOCK) && !(ex->type & DEV_BLOCK))
  257. continue;
  258. if ((refex->type & DEV_CHAR) && !(ex->type & DEV_CHAR))
  259. continue;
  260. if (ex->major != ~0 && ex->major != refex->major)
  261. continue;
  262. if (ex->minor != ~0 && ex->minor != refex->minor)
  263. continue;
  264. if (refex->access & (~ex->access))
  265. continue;
  266. match = true;
  267. break;
  268. }
  269. /*
  270. * In two cases we'll consider this new exception valid:
  271. * - the dev cgroup has its default policy to allow + exception list:
  272. * the new exception should *not* match any of the exceptions
  273. * (!deny_all, !match)
  274. * - the dev cgroup has its default policy to deny + exception list:
  275. * the new exception *should* match the exceptions
  276. * (deny_all, match)
  277. */
  278. if (dev_cgroup->deny_all == match)
  279. return 1;
  280. return 0;
  281. }
  282. /*
  283. * parent_has_perm:
  284. * when adding a new allow rule to a device exception list, the rule
  285. * must be allowed in the parent device
  286. */
  287. static int parent_has_perm(struct dev_cgroup *childcg,
  288. struct dev_exception_item *ex)
  289. {
  290. struct cgroup *pcg = childcg->css.cgroup->parent;
  291. struct dev_cgroup *parent;
  292. if (!pcg)
  293. return 1;
  294. parent = cgroup_to_devcgroup(pcg);
  295. return may_access(parent, ex);
  296. }
  297. /*
  298. * Modify the exception list using allow/deny rules.
  299. * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD
  300. * so we can give a container CAP_MKNOD to let it create devices but not
  301. * modify the exception list.
  302. * It seems likely we'll want to add a CAP_CONTAINER capability to allow
  303. * us to also grant CAP_SYS_ADMIN to containers without giving away the
  304. * device exception list controls, but for now we'll stick with CAP_SYS_ADMIN
  305. *
  306. * Taking rules away is always allowed (given CAP_SYS_ADMIN). Granting
  307. * new access is only allowed if you're in the top-level cgroup, or your
  308. * parent cgroup has the access you're asking for.
  309. */
  310. static int devcgroup_update_access(struct dev_cgroup *devcgroup,
  311. int filetype, const char *buffer)
  312. {
  313. const char *b;
  314. char *endp;
  315. int count;
  316. struct dev_exception_item ex;
  317. if (!capable(CAP_SYS_ADMIN))
  318. return -EPERM;
  319. memset(&ex, 0, sizeof(ex));
  320. b = buffer;
  321. switch (*b) {
  322. case 'a':
  323. switch (filetype) {
  324. case DEVCG_ALLOW:
  325. if (!parent_has_perm(devcgroup, &ex))
  326. return -EPERM;
  327. dev_exception_clean(devcgroup);
  328. devcgroup->deny_all = false;
  329. break;
  330. case DEVCG_DENY:
  331. dev_exception_clean(devcgroup);
  332. devcgroup->deny_all = true;
  333. break;
  334. default:
  335. return -EINVAL;
  336. }
  337. return 0;
  338. case 'b':
  339. ex.type = DEV_BLOCK;
  340. break;
  341. case 'c':
  342. ex.type = DEV_CHAR;
  343. break;
  344. default:
  345. return -EINVAL;
  346. }
  347. b++;
  348. if (!isspace(*b))
  349. return -EINVAL;
  350. b++;
  351. if (*b == '*') {
  352. ex.major = ~0;
  353. b++;
  354. } else if (isdigit(*b)) {
  355. ex.major = simple_strtoul(b, &endp, 10);
  356. b = endp;
  357. } else {
  358. return -EINVAL;
  359. }
  360. if (*b != ':')
  361. return -EINVAL;
  362. b++;
  363. /* read minor */
  364. if (*b == '*') {
  365. ex.minor = ~0;
  366. b++;
  367. } else if (isdigit(*b)) {
  368. ex.minor = simple_strtoul(b, &endp, 10);
  369. b = endp;
  370. } else {
  371. return -EINVAL;
  372. }
  373. if (!isspace(*b))
  374. return -EINVAL;
  375. for (b++, count = 0; count < 3; count++, b++) {
  376. switch (*b) {
  377. case 'r':
  378. ex.access |= ACC_READ;
  379. break;
  380. case 'w':
  381. ex.access |= ACC_WRITE;
  382. break;
  383. case 'm':
  384. ex.access |= ACC_MKNOD;
  385. break;
  386. case '\n':
  387. case '\0':
  388. count = 3;
  389. break;
  390. default:
  391. return -EINVAL;
  392. }
  393. }
  394. switch (filetype) {
  395. case DEVCG_ALLOW:
  396. if (!parent_has_perm(devcgroup, &ex))
  397. return -EPERM;
  398. /*
  399. * If the default policy is to allow by default, try to remove
  400. * an matching exception instead. And be silent about it: we
  401. * don't want to break compatibility
  402. */
  403. if (devcgroup->deny_all == false) {
  404. dev_exception_rm(devcgroup, &ex);
  405. return 0;
  406. }
  407. return dev_exception_add(devcgroup, &ex);
  408. case DEVCG_DENY:
  409. /*
  410. * If the default policy is to deny by default, try to remove
  411. * an matching exception instead. And be silent about it: we
  412. * don't want to break compatibility
  413. */
  414. if (devcgroup->deny_all == true) {
  415. dev_exception_rm(devcgroup, &ex);
  416. return 0;
  417. }
  418. return dev_exception_add(devcgroup, &ex);
  419. default:
  420. return -EINVAL;
  421. }
  422. return 0;
  423. }
  424. static int devcgroup_access_write(struct cgroup *cgrp, struct cftype *cft,
  425. const char *buffer)
  426. {
  427. int retval;
  428. mutex_lock(&devcgroup_mutex);
  429. retval = devcgroup_update_access(cgroup_to_devcgroup(cgrp),
  430. cft->private, buffer);
  431. mutex_unlock(&devcgroup_mutex);
  432. return retval;
  433. }
  434. static struct cftype dev_cgroup_files[] = {
  435. {
  436. .name = "allow",
  437. .write_string = devcgroup_access_write,
  438. .private = DEVCG_ALLOW,
  439. },
  440. {
  441. .name = "deny",
  442. .write_string = devcgroup_access_write,
  443. .private = DEVCG_DENY,
  444. },
  445. {
  446. .name = "list",
  447. .read_seq_string = devcgroup_seq_read,
  448. .private = DEVCG_LIST,
  449. },
  450. { } /* terminate */
  451. };
  452. struct cgroup_subsys devices_subsys = {
  453. .name = "devices",
  454. .can_attach = devcgroup_can_attach,
  455. .create = devcgroup_create,
  456. .destroy = devcgroup_destroy,
  457. .subsys_id = devices_subsys_id,
  458. .base_cftypes = dev_cgroup_files,
  459. /*
  460. * While devices cgroup has the rudimentary hierarchy support which
  461. * checks the parent's restriction, it doesn't properly propagates
  462. * config changes in ancestors to their descendents. A child
  463. * should only be allowed to add more restrictions to the parent's
  464. * configuration. Fix it and remove the following.
  465. */
  466. .broken_hierarchy = true,
  467. };
  468. /**
  469. * __devcgroup_check_permission - checks if an inode operation is permitted
  470. * @dev_cgroup: the dev cgroup to be tested against
  471. * @type: device type
  472. * @major: device major number
  473. * @minor: device minor number
  474. * @access: combination of ACC_WRITE, ACC_READ and ACC_MKNOD
  475. *
  476. * returns 0 on success, -EPERM case the operation is not permitted
  477. */
  478. static int __devcgroup_check_permission(struct dev_cgroup *dev_cgroup,
  479. short type, u32 major, u32 minor,
  480. short access)
  481. {
  482. struct dev_exception_item ex;
  483. int rc;
  484. memset(&ex, 0, sizeof(ex));
  485. ex.type = type;
  486. ex.major = major;
  487. ex.minor = minor;
  488. ex.access = access;
  489. rcu_read_lock();
  490. rc = may_access(dev_cgroup, &ex);
  491. rcu_read_unlock();
  492. if (!rc)
  493. return -EPERM;
  494. return 0;
  495. }
  496. int __devcgroup_inode_permission(struct inode *inode, int mask)
  497. {
  498. struct dev_cgroup *dev_cgroup = task_devcgroup(current);
  499. short type, access = 0;
  500. if (S_ISBLK(inode->i_mode))
  501. type = DEV_BLOCK;
  502. if (S_ISCHR(inode->i_mode))
  503. type = DEV_CHAR;
  504. if (mask & MAY_WRITE)
  505. access |= ACC_WRITE;
  506. if (mask & MAY_READ)
  507. access |= ACC_READ;
  508. return __devcgroup_check_permission(dev_cgroup, type, imajor(inode),
  509. iminor(inode), access);
  510. }
  511. int devcgroup_inode_mknod(int mode, dev_t dev)
  512. {
  513. struct dev_cgroup *dev_cgroup = task_devcgroup(current);
  514. short type;
  515. if (!S_ISBLK(mode) && !S_ISCHR(mode))
  516. return 0;
  517. if (S_ISBLK(mode))
  518. type = DEV_BLOCK;
  519. else
  520. type = DEV_CHAR;
  521. return __devcgroup_check_permission(dev_cgroup, type, MAJOR(dev),
  522. MINOR(dev), ACC_MKNOD);
  523. }