/*
 *  linux/fs/file.c
 *
 *  Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
 *
 *  Manage the dynamic fd arrays in the process files_struct.
 */
  8. #include <linux/syscalls.h>
  9. #include <linux/export.h>
  10. #include <linux/fs.h>
  11. #include <linux/mm.h>
  12. #include <linux/mmzone.h>
  13. #include <linux/time.h>
  14. #include <linux/sched.h>
  15. #include <linux/slab.h>
  16. #include <linux/vmalloc.h>
  17. #include <linux/file.h>
  18. #include <linux/fdtable.h>
  19. #include <linux/bitops.h>
  20. #include <linux/interrupt.h>
  21. #include <linux/spinlock.h>
  22. #include <linux/rcupdate.h>
  23. #include <linux/workqueue.h>
  24. struct fdtable_defer {
  25. spinlock_t lock;
  26. struct work_struct wq;
  27. struct fdtable *next;
  28. };
  29. int sysctl_nr_open __read_mostly = 1024*1024;
  30. int sysctl_nr_open_min = BITS_PER_LONG;
  31. int sysctl_nr_open_max = 1024 * 1024; /* raised later */
  32. /*
  33. * We use this list to defer free fdtables that have vmalloced
  34. * sets/arrays. By keeping a per-cpu list, we avoid having to embed
  35. * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in
  36. * this per-task structure.
  37. */
  38. static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
  39. static void *alloc_fdmem(size_t size)
  40. {
  41. /*
  42. * Very large allocations can stress page reclaim, so fall back to
  43. * vmalloc() if the allocation size will be considered "large" by the VM.
  44. */
  45. if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
  46. void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN);
  47. if (data != NULL)
  48. return data;
  49. }
  50. return vmalloc(size);
  51. }
  /*
   * Release memory obtained from alloc_fdmem(), choosing vfree() or kfree()
   * according to whether @ptr is a vmalloc address.
   */
  static void free_fdmem(void *ptr)
  {
      if (is_vmalloc_addr(ptr))
          vfree(ptr);
      else
          kfree(ptr);
  }
  56. static void __free_fdtable(struct fdtable *fdt)
  57. {
  58. free_fdmem(fdt->fd);
  59. free_fdmem(fdt->open_fds);
  60. kfree(fdt);
  61. }
  62. static void free_fdtable_work(struct work_struct *work)
  63. {
  64. struct fdtable_defer *f =
  65. container_of(work, struct fdtable_defer, wq);
  66. struct fdtable *fdt;
  67. spin_lock_bh(&f->lock);
  68. fdt = f->next;
  69. f->next = NULL;
  70. spin_unlock_bh(&f->lock);
  71. while(fdt) {
  72. struct fdtable *next = fdt->next;
  73. __free_fdtable(fdt);
  74. fdt = next;
  75. }
  76. }
  77. static void free_fdtable_rcu(struct rcu_head *rcu)
  78. {
  79. struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
  80. struct fdtable_defer *fddef;
  81. BUG_ON(!fdt);
  82. BUG_ON(fdt->max_fds <= NR_OPEN_DEFAULT);
  83. if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) {
  84. kfree(fdt->fd);
  85. kfree(fdt->open_fds);
  86. kfree(fdt);
  87. } else {
  88. fddef = &get_cpu_var(fdtable_defer_list);
  89. spin_lock(&fddef->lock);
  90. fdt->next = fddef->next;
  91. fddef->next = fdt;
  92. /* vmallocs are handled from the workqueue context */
  93. schedule_work(&fddef->wq);
  94. spin_unlock(&fddef->lock);
  95. put_cpu_var(fdtable_defer_list);
  96. }
  97. }
  98. /*
  99. * Expand the fdset in the files_struct. Called with the files spinlock
  100. * held for write.
  101. */
  102. static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
  103. {
  104. unsigned int cpy, set;
  105. BUG_ON(nfdt->max_fds < ofdt->max_fds);
  106. cpy = ofdt->max_fds * sizeof(struct file *);
  107. set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
  108. memcpy(nfdt->fd, ofdt->fd, cpy);
  109. memset((char *)(nfdt->fd) + cpy, 0, set);
  110. cpy = ofdt->max_fds / BITS_PER_BYTE;
  111. set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;
  112. memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
  113. memset((char *)(nfdt->open_fds) + cpy, 0, set);
  114. memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
  115. memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
  116. }
  117. static struct fdtable * alloc_fdtable(unsigned int nr)
  118. {
  119. struct fdtable *fdt;
  120. void *data;
  121. /*
  122. * Figure out how many fds we actually want to support in this fdtable.
  123. * Allocation steps are keyed to the size of the fdarray, since it
  124. * grows far faster than any of the other dynamic data. We try to fit
  125. * the fdarray into comfortable page-tuned chunks: starting at 1024B
  126. * and growing in powers of two from there on.
  127. */
  128. nr /= (1024 / sizeof(struct file *));
  129. nr = roundup_pow_of_two(nr + 1);
  130. nr *= (1024 / sizeof(struct file *));
  131. /*
  132. * Note that this can drive nr *below* what we had passed if sysctl_nr_open
  133. * had been set lower between the check in expand_files() and here. Deal
  134. * with that in caller, it's cheaper that way.
  135. *
  136. * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
  137. * bitmaps handling below becomes unpleasant, to put it mildly...
  138. */
  139. if (unlikely(nr > sysctl_nr_open))
  140. nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
  141. fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
  142. if (!fdt)
  143. goto out;
  144. fdt->max_fds = nr;
  145. data = alloc_fdmem(nr * sizeof(struct file *));
  146. if (!data)
  147. goto out_fdt;
  148. fdt->fd = data;
  149. data = alloc_fdmem(max_t(size_t,
  150. 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
  151. if (!data)
  152. goto out_arr;
  153. fdt->open_fds = data;
  154. data += nr / BITS_PER_BYTE;
  155. fdt->close_on_exec = data;
  156. fdt->next = NULL;
  157. return fdt;
  158. out_arr:
  159. free_fdmem(fdt->fd);
  160. out_fdt:
  161. kfree(fdt);
  162. out:
  163. return NULL;
  164. }
  165. /*
  166. * Expand the file descriptor table.
  167. * This function will allocate a new fdtable and both fd array and fdset, of
  168. * the given size.
  169. * Return <0 error code on error; 1 on successful completion.
  170. * The files->file_lock should be held on entry, and will be held on exit.
  171. */
  172. static int expand_fdtable(struct files_struct *files, int nr)
  173. __releases(files->file_lock)
  174. __acquires(files->file_lock)
  175. {
  176. struct fdtable *new_fdt, *cur_fdt;
  177. spin_unlock(&files->file_lock);
  178. new_fdt = alloc_fdtable(nr);
  179. spin_lock(&files->file_lock);
  180. if (!new_fdt)
  181. return -ENOMEM;
  182. /*
  183. * extremely unlikely race - sysctl_nr_open decreased between the check in
  184. * caller and alloc_fdtable(). Cheaper to catch it here...
  185. */
  186. if (unlikely(new_fdt->max_fds <= nr)) {
  187. __free_fdtable(new_fdt);
  188. return -EMFILE;
  189. }
  190. /*
  191. * Check again since another task may have expanded the fd table while
  192. * we dropped the lock
  193. */
  194. cur_fdt = files_fdtable(files);
  195. if (nr >= cur_fdt->max_fds) {
  196. /* Continue as planned */
  197. copy_fdtable(new_fdt, cur_fdt);
  198. rcu_assign_pointer(files->fdt, new_fdt);
  199. if (cur_fdt->max_fds > NR_OPEN_DEFAULT)
  200. call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
  201. } else {
  202. /* Somebody else expanded, so undo our attempt */
  203. __free_fdtable(new_fdt);
  204. }
  205. return 1;
  206. }
  207. /*
  208. * Expand files.
  209. * This function will expand the file structures, if the requested size exceeds
  210. * the current capacity and there is room for expansion.
  211. * Return <0 error code on error; 0 when nothing done; 1 when files were
  212. * expanded and execution may have blocked.
  213. * The files->file_lock should be held on entry, and will be held on exit.
  214. */
  215. static int expand_files(struct files_struct *files, int nr)
  216. {
  217. struct fdtable *fdt;
  218. fdt = files_fdtable(files);
  219. /* Do we need to expand? */
  220. if (nr < fdt->max_fds)
  221. return 0;
  222. /* Can we expand? */
  223. if (nr >= sysctl_nr_open)
  224. return -EMFILE;
  225. /* All good, so we try */
  226. return expand_fdtable(files, nr);
  227. }
  228. static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
  229. {
  230. __set_bit(fd, fdt->close_on_exec);
  231. }
  232. static inline void __clear_close_on_exec(int fd, struct fdtable *fdt)
  233. {
  234. __clear_bit(fd, fdt->close_on_exec);
  235. }
  236. static inline void __set_open_fd(int fd, struct fdtable *fdt)
  237. {
  238. __set_bit(fd, fdt->open_fds);
  239. }
  240. static inline void __clear_open_fd(int fd, struct fdtable *fdt)
  241. {
  242. __clear_bit(fd, fdt->open_fds);
  243. }
  244. static int count_open_files(struct fdtable *fdt)
  245. {
  246. int size = fdt->max_fds;
  247. int i;
  248. /* Find the last open fd */
  249. for (i = size / BITS_PER_LONG; i > 0; ) {
  250. if (fdt->open_fds[--i])
  251. break;
  252. }
  253. i = (i + 1) * BITS_PER_LONG;
  254. return i;
  255. }
  256. /*
  257. * Allocate a new files structure and copy contents from the
  258. * passed in files structure.
  259. * errorp will be valid only when the returned files_struct is NULL.
  260. */
  261. struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
  262. {
  263. struct files_struct *newf;
  264. struct file **old_fds, **new_fds;
  265. int open_files, size, i;
  266. struct fdtable *old_fdt, *new_fdt;
  267. *errorp = -ENOMEM;
  268. newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
  269. if (!newf)
  270. goto out;
  271. atomic_set(&newf->count, 1);
  272. spin_lock_init(&newf->file_lock);
  273. newf->next_fd = 0;
  274. new_fdt = &newf->fdtab;
  275. new_fdt->max_fds = NR_OPEN_DEFAULT;
  276. new_fdt->close_on_exec = newf->close_on_exec_init;
  277. new_fdt->open_fds = newf->open_fds_init;
  278. new_fdt->fd = &newf->fd_array[0];
  279. new_fdt->next = NULL;
  280. spin_lock(&oldf->file_lock);
  281. old_fdt = files_fdtable(oldf);
  282. open_files = count_open_files(old_fdt);
  283. /*
  284. * Check whether we need to allocate a larger fd array and fd set.
  285. */
  286. while (unlikely(open_files > new_fdt->max_fds)) {
  287. spin_unlock(&oldf->file_lock);
  288. if (new_fdt != &newf->fdtab)
  289. __free_fdtable(new_fdt);
  290. new_fdt = alloc_fdtable(open_files - 1);
  291. if (!new_fdt) {
  292. *errorp = -ENOMEM;
  293. goto out_release;
  294. }
  295. /* beyond sysctl_nr_open; nothing to do */
  296. if (unlikely(new_fdt->max_fds < open_files)) {
  297. __free_fdtable(new_fdt);
  298. *errorp = -EMFILE;
  299. goto out_release;
  300. }
  301. /*
  302. * Reacquire the oldf lock and a pointer to its fd table
  303. * who knows it may have a new bigger fd table. We need
  304. * the latest pointer.
  305. */
  306. spin_lock(&oldf->file_lock);
  307. old_fdt = files_fdtable(oldf);
  308. open_files = count_open_files(old_fdt);
  309. }
  310. old_fds = old_fdt->fd;
  311. new_fds = new_fdt->fd;
  312. memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8);
  313. memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8);
  314. for (i = open_files; i != 0; i--) {
  315. struct file *f = *old_fds++;
  316. if (f) {
  317. get_file(f);
  318. } else {
  319. /*
  320. * The fd may be claimed in the fd bitmap but not yet
  321. * instantiated in the files array if a sibling thread
  322. * is partway through open(). So make sure that this
  323. * fd is available to the new process.
  324. */
  325. __clear_open_fd(open_files - i, new_fdt);
  326. }
  327. rcu_assign_pointer(*new_fds++, f);
  328. }
  329. spin_unlock(&oldf->file_lock);
  330. /* compute the remainder to be cleared */
  331. size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
  332. /* This is long word aligned thus could use a optimized version */
  333. memset(new_fds, 0, size);
  334. if (new_fdt->max_fds > open_files) {
  335. int left = (new_fdt->max_fds - open_files) / 8;
  336. int start = open_files / BITS_PER_LONG;
  337. memset(&new_fdt->open_fds[start], 0, left);
  338. memset(&new_fdt->close_on_exec[start], 0, left);
  339. }
  340. rcu_assign_pointer(newf->fdt, new_fdt);
  341. return newf;
  342. out_release:
  343. kmem_cache_free(files_cachep, newf);
  344. out:
  345. return NULL;
  346. }
  347. static void close_files(struct files_struct * files)
  348. {
  349. int i, j;
  350. struct fdtable *fdt;
  351. j = 0;
  352. /*
  353. * It is safe to dereference the fd table without RCU or
  354. * ->file_lock because this is the last reference to the
  355. * files structure. But use RCU to shut RCU-lockdep up.
  356. */
  357. rcu_read_lock();
  358. fdt = files_fdtable(files);
  359. rcu_read_unlock();
  360. for (;;) {
  361. unsigned long set;
  362. i = j * BITS_PER_LONG;
  363. if (i >= fdt->max_fds)
  364. break;
  365. set = fdt->open_fds[j++];
  366. while (set) {
  367. if (set & 1) {
  368. struct file * file = xchg(&fdt->fd[i], NULL);
  369. if (file) {
  370. filp_close(file, files);
  371. cond_resched();
  372. }
  373. }
  374. i++;
  375. set >>= 1;
  376. }
  377. }
  378. }
  379. struct files_struct *get_files_struct(struct task_struct *task)
  380. {
  381. struct files_struct *files;
  382. task_lock(task);
  383. files = task->files;
  384. if (files)
  385. atomic_inc(&files->count);
  386. task_unlock(task);
  387. return files;
  388. }
  389. void put_files_struct(struct files_struct *files)
  390. {
  391. struct fdtable *fdt;
  392. if (atomic_dec_and_test(&files->count)) {
  393. close_files(files);
  394. /* not really needed, since nobody can see us */
  395. rcu_read_lock();
  396. fdt = files_fdtable(files);
  397. rcu_read_unlock();
  398. /* free the arrays if they are not embedded */
  399. if (fdt != &files->fdtab)
  400. __free_fdtable(fdt);
  401. kmem_cache_free(files_cachep, files);
  402. }
  403. }
  404. void reset_files_struct(struct files_struct *files)
  405. {
  406. struct task_struct *tsk = current;
  407. struct files_struct *old;
  408. old = tsk->files;
  409. task_lock(tsk);
  410. tsk->files = files;
  411. task_unlock(tsk);
  412. put_files_struct(old);
  413. }
  414. void exit_files(struct task_struct *tsk)
  415. {
  416. struct files_struct * files = tsk->files;
  417. if (files) {
  418. task_lock(tsk);
  419. tsk->files = NULL;
  420. task_unlock(tsk);
  421. put_files_struct(files);
  422. }
  423. }
  424. static void __devinit fdtable_defer_list_init(int cpu)
  425. {
  426. struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
  427. spin_lock_init(&fddef->lock);
  428. INIT_WORK(&fddef->wq, free_fdtable_work);
  429. fddef->next = NULL;
  430. }
  431. void __init files_defer_init(void)
  432. {
  433. int i;
  434. for_each_possible_cpu(i)
  435. fdtable_defer_list_init(i);
  436. sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) &
  437. -BITS_PER_LONG;
  438. }
  439. struct files_struct init_files = {
  440. .count = ATOMIC_INIT(1),
  441. .fdt = &init_files.fdtab,
  442. .fdtab = {
  443. .max_fds = NR_OPEN_DEFAULT,
  444. .fd = &init_files.fd_array[0],
  445. .close_on_exec = init_files.close_on_exec_init,
  446. .open_fds = init_files.open_fds_init,
  447. },
  448. .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
  449. };
  450. void daemonize_descriptors(void)
  451. {
  452. atomic_inc(&init_files.count);
  453. reset_files_struct(&init_files);
  454. }
  455. /*
  456. * allocate a file descriptor, mark it busy.
  457. */
  458. int __alloc_fd(struct files_struct *files,
  459. unsigned start, unsigned end, unsigned flags)
  460. {
  461. unsigned int fd;
  462. int error;
  463. struct fdtable *fdt;
  464. spin_lock(&files->file_lock);
  465. repeat:
  466. fdt = files_fdtable(files);
  467. fd = start;
  468. if (fd < files->next_fd)
  469. fd = files->next_fd;
  470. if (fd < fdt->max_fds)
  471. fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd);
  472. /*
  473. * N.B. For clone tasks sharing a files structure, this test
  474. * will limit the total number of files that can be opened.
  475. */
  476. error = -EMFILE;
  477. if (fd >= end)
  478. goto out;
  479. error = expand_files(files, fd);
  480. if (error < 0)
  481. goto out;
  482. /*
  483. * If we needed to expand the fs array we
  484. * might have blocked - try again.
  485. */
  486. if (error)
  487. goto repeat;
  488. if (start <= files->next_fd)
  489. files->next_fd = fd + 1;
  490. __set_open_fd(fd, fdt);
  491. if (flags & O_CLOEXEC)
  492. __set_close_on_exec(fd, fdt);
  493. else
  494. __clear_close_on_exec(fd, fdt);
  495. error = fd;
  496. #if 1
  497. /* Sanity check */
  498. if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {
  499. printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
  500. rcu_assign_pointer(fdt->fd[fd], NULL);
  501. }
  502. #endif
  503. out:
  504. spin_unlock(&files->file_lock);
  505. return error;
  506. }
  507. static int alloc_fd(unsigned start, unsigned flags)
  508. {
  509. return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags);
  510. }
  511. int get_unused_fd_flags(unsigned flags)
  512. {
  513. return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags);
  514. }
  515. EXPORT_SYMBOL(get_unused_fd_flags);
  516. static void __put_unused_fd(struct files_struct *files, unsigned int fd)
  517. {
  518. struct fdtable *fdt = files_fdtable(files);
  519. __clear_open_fd(fd, fdt);
  520. if (fd < files->next_fd)
  521. files->next_fd = fd;
  522. }
  523. void put_unused_fd(unsigned int fd)
  524. {
  525. struct files_struct *files = current->files;
  526. spin_lock(&files->file_lock);
  527. __put_unused_fd(files, fd);
  528. spin_unlock(&files->file_lock);
  529. }
  530. EXPORT_SYMBOL(put_unused_fd);
  531. /*
  532. * Install a file pointer in the fd array.
  533. *
  534. * The VFS is full of places where we drop the files lock between
  535. * setting the open_fds bitmap and installing the file in the file
  536. * array. At any such point, we are vulnerable to a dup2() race
  537. * installing a file in the array before us. We need to detect this and
  538. * fput() the struct file we are about to overwrite in this case.
  539. *
  540. * It should never happen - if we allow dup2() do it, _really_ bad things
  541. * will follow.
  542. *
  543. * NOTE: __fd_install() variant is really, really low-level; don't
  544. * use it unless you are forced to by truly lousy API shoved down
  545. * your throat. 'files' *MUST* be either current->files or obtained
  546. * by get_files_struct(current) done by whoever had given it to you,
  547. * or really bad things will happen. Normally you want to use
  548. * fd_install() instead.
  549. */
  550. void __fd_install(struct files_struct *files, unsigned int fd,
  551. struct file *file)
  552. {
  553. struct fdtable *fdt;
  554. spin_lock(&files->file_lock);
  555. fdt = files_fdtable(files);
  556. BUG_ON(fdt->fd[fd] != NULL);
  557. rcu_assign_pointer(fdt->fd[fd], file);
  558. spin_unlock(&files->file_lock);
  559. }
  560. void fd_install(unsigned int fd, struct file *file)
  561. {
  562. __fd_install(current->files, fd, file);
  563. }
  564. EXPORT_SYMBOL(fd_install);
  565. /*
  566. * The same warnings as for __alloc_fd()/__fd_install() apply here...
  567. */
  568. int __close_fd(struct files_struct *files, unsigned fd)
  569. {
  570. struct file *file;
  571. struct fdtable *fdt;
  572. spin_lock(&files->file_lock);
  573. fdt = files_fdtable(files);
  574. if (fd >= fdt->max_fds)
  575. goto out_unlock;
  576. file = fdt->fd[fd];
  577. if (!file)
  578. goto out_unlock;
  579. rcu_assign_pointer(fdt->fd[fd], NULL);
  580. __clear_close_on_exec(fd, fdt);
  581. __put_unused_fd(files, fd);
  582. spin_unlock(&files->file_lock);
  583. return filp_close(file, files);
  584. out_unlock:
  585. spin_unlock(&files->file_lock);
  586. return -EBADF;
  587. }
  588. void do_close_on_exec(struct files_struct *files)
  589. {
  590. unsigned i;
  591. struct fdtable *fdt;
  592. /* exec unshares first */
  593. BUG_ON(atomic_read(&files->count) != 1);
  594. spin_lock(&files->file_lock);
  595. for (i = 0; ; i++) {
  596. unsigned long set;
  597. unsigned fd = i * BITS_PER_LONG;
  598. fdt = files_fdtable(files);
  599. if (fd >= fdt->max_fds)
  600. break;
  601. set = fdt->close_on_exec[i];
  602. if (!set)
  603. continue;
  604. fdt->close_on_exec[i] = 0;
  605. for ( ; set ; fd++, set >>= 1) {
  606. struct file *file;
  607. if (!(set & 1))
  608. continue;
  609. file = fdt->fd[fd];
  610. if (!file)
  611. continue;
  612. rcu_assign_pointer(fdt->fd[fd], NULL);
  613. __put_unused_fd(files, fd);
  614. spin_unlock(&files->file_lock);
  615. filp_close(file, files);
  616. cond_resched();
  617. spin_lock(&files->file_lock);
  618. }
  619. }
  620. spin_unlock(&files->file_lock);
  621. }
  622. struct file *fget(unsigned int fd)
  623. {
  624. struct file *file;
  625. struct files_struct *files = current->files;
  626. rcu_read_lock();
  627. file = fcheck_files(files, fd);
  628. if (file) {
  629. /* File object ref couldn't be taken */
  630. if (file->f_mode & FMODE_PATH ||
  631. !atomic_long_inc_not_zero(&file->f_count))
  632. file = NULL;
  633. }
  634. rcu_read_unlock();
  635. return file;
  636. }
  637. EXPORT_SYMBOL(fget);
  638. struct file *fget_raw(unsigned int fd)
  639. {
  640. struct file *file;
  641. struct files_struct *files = current->files;
  642. rcu_read_lock();
  643. file = fcheck_files(files, fd);
  644. if (file) {
  645. /* File object ref couldn't be taken */
  646. if (!atomic_long_inc_not_zero(&file->f_count))
  647. file = NULL;
  648. }
  649. rcu_read_unlock();
  650. return file;
  651. }
  652. EXPORT_SYMBOL(fget_raw);
  653. /*
  654. * Lightweight file lookup - no refcnt increment if fd table isn't shared.
  655. *
  656. * You can use this instead of fget if you satisfy all of the following
  657. * conditions:
  658. * 1) You must call fput_light before exiting the syscall and returning control
  659. * to userspace (i.e. you cannot remember the returned struct file * after
  660. * returning to userspace).
  661. * 2) You must not call filp_close on the returned struct file * in between
  662. * calls to fget_light and fput_light.
  663. * 3) You must not clone the current task in between the calls to fget_light
  664. * and fput_light.
  665. *
  666. * The fput_needed flag returned by fget_light should be passed to the
  667. * corresponding fput_light.
  668. */
  669. struct file *fget_light(unsigned int fd, int *fput_needed)
  670. {
  671. struct file *file;
  672. struct files_struct *files = current->files;
  673. *fput_needed = 0;
  674. if (atomic_read(&files->count) == 1) {
  675. file = fcheck_files(files, fd);
  676. if (file && (file->f_mode & FMODE_PATH))
  677. file = NULL;
  678. } else {
  679. rcu_read_lock();
  680. file = fcheck_files(files, fd);
  681. if (file) {
  682. if (!(file->f_mode & FMODE_PATH) &&
  683. atomic_long_inc_not_zero(&file->f_count))
  684. *fput_needed = 1;
  685. else
  686. /* Didn't get the reference, someone's freed */
  687. file = NULL;
  688. }
  689. rcu_read_unlock();
  690. }
  691. return file;
  692. }
  693. EXPORT_SYMBOL(fget_light);
  694. struct file *fget_raw_light(unsigned int fd, int *fput_needed)
  695. {
  696. struct file *file;
  697. struct files_struct *files = current->files;
  698. *fput_needed = 0;
  699. if (atomic_read(&files->count) == 1) {
  700. file = fcheck_files(files, fd);
  701. } else {
  702. rcu_read_lock();
  703. file = fcheck_files(files, fd);
  704. if (file) {
  705. if (atomic_long_inc_not_zero(&file->f_count))
  706. *fput_needed = 1;
  707. else
  708. /* Didn't get the reference, someone's freed */
  709. file = NULL;
  710. }
  711. rcu_read_unlock();
  712. }
  713. return file;
  714. }
  715. void set_close_on_exec(unsigned int fd, int flag)
  716. {
  717. struct files_struct *files = current->files;
  718. struct fdtable *fdt;
  719. spin_lock(&files->file_lock);
  720. fdt = files_fdtable(files);
  721. if (flag)
  722. __set_close_on_exec(fd, fdt);
  723. else
  724. __clear_close_on_exec(fd, fdt);
  725. spin_unlock(&files->file_lock);
  726. }
  727. bool get_close_on_exec(unsigned int fd)
  728. {
  729. struct files_struct *files = current->files;
  730. struct fdtable *fdt;
  731. bool res;
  732. rcu_read_lock();
  733. fdt = files_fdtable(files);
  734. res = close_on_exec(fd, fdt);
  735. rcu_read_unlock();
  736. return res;
  737. }
  738. static int do_dup2(struct files_struct *files,
  739. struct file *file, unsigned fd, unsigned flags)
  740. {
  741. struct file *tofree;
  742. struct fdtable *fdt;
  743. /*
  744. * We need to detect attempts to do dup2() over allocated but still
  745. * not finished descriptor. NB: OpenBSD avoids that at the price of
  746. * extra work in their equivalent of fget() - they insert struct
  747. * file immediately after grabbing descriptor, mark it larval if
  748. * more work (e.g. actual opening) is needed and make sure that
  749. * fget() treats larval files as absent. Potentially interesting,
  750. * but while extra work in fget() is trivial, locking implications
  751. * and amount of surgery on open()-related paths in VFS are not.
  752. * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
  753. * deadlocks in rather amusing ways, AFAICS. All of that is out of
  754. * scope of POSIX or SUS, since neither considers shared descriptor
  755. * tables and this condition does not arise without those.
  756. */
  757. fdt = files_fdtable(files);
  758. tofree = fdt->fd[fd];
  759. if (!tofree && fd_is_open(fd, fdt))
  760. goto Ebusy;
  761. get_file(file);
  762. rcu_assign_pointer(fdt->fd[fd], file);
  763. __set_open_fd(fd, fdt);
  764. if (flags & O_CLOEXEC)
  765. __set_close_on_exec(fd, fdt);
  766. else
  767. __clear_close_on_exec(fd, fdt);
  768. spin_unlock(&files->file_lock);
  769. if (tofree)
  770. filp_close(tofree, files);
  771. return fd;
  772. Ebusy:
  773. spin_unlock(&files->file_lock);
  774. return -EBUSY;
  775. }
  776. int replace_fd(unsigned fd, struct file *file, unsigned flags)
  777. {
  778. int err;
  779. struct files_struct *files = current->files;
  780. if (!file)
  781. return __close_fd(files, fd);
  782. if (fd >= rlimit(RLIMIT_NOFILE))
  783. return -EMFILE;
  784. spin_lock(&files->file_lock);
  785. err = expand_files(files, fd);
  786. if (unlikely(err < 0))
  787. goto out_unlock;
  788. return do_dup2(files, file, fd, flags);
  789. out_unlock:
  790. spin_unlock(&files->file_lock);
  791. return err;
  792. }
  793. SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
  794. {
  795. int err = -EBADF;
  796. struct file *file;
  797. struct files_struct *files = current->files;
  798. if ((flags & ~O_CLOEXEC) != 0)
  799. return -EINVAL;
  800. if (unlikely(oldfd == newfd))
  801. return -EINVAL;
  802. if (newfd >= rlimit(RLIMIT_NOFILE))
  803. return -EMFILE;
  804. spin_lock(&files->file_lock);
  805. err = expand_files(files, newfd);
  806. file = fcheck(oldfd);
  807. if (unlikely(!file))
  808. goto Ebadf;
  809. if (unlikely(err < 0)) {
  810. if (err == -EMFILE)
  811. goto Ebadf;
  812. goto out_unlock;
  813. }
  814. return do_dup2(files, file, newfd, flags);
  815. Ebadf:
  816. err = -EBADF;
  817. out_unlock:
  818. spin_unlock(&files->file_lock);
  819. return err;
  820. }
  821. SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
  822. {
  823. if (unlikely(newfd == oldfd)) { /* corner case */
  824. struct files_struct *files = current->files;
  825. int retval = oldfd;
  826. rcu_read_lock();
  827. if (!fcheck_files(files, oldfd))
  828. retval = -EBADF;
  829. rcu_read_unlock();
  830. return retval;
  831. }
  832. return sys_dup3(oldfd, newfd, 0);
  833. }
  834. SYSCALL_DEFINE1(dup, unsigned int, fildes)
  835. {
  836. int ret = -EBADF;
  837. struct file *file = fget_raw(fildes);
  838. if (file) {
  839. ret = get_unused_fd();
  840. if (ret >= 0)
  841. fd_install(ret, file);
  842. else
  843. fput(file);
  844. }
  845. return ret;
  846. }
  847. int f_dupfd(unsigned int from, struct file *file, unsigned flags)
  848. {
  849. int err;
  850. if (from >= rlimit(RLIMIT_NOFILE))
  851. return -EINVAL;
  852. err = alloc_fd(from, flags);
  853. if (err >= 0) {
  854. get_file(file);
  855. fd_install(err, file);
  856. }
  857. return err;
  858. }
  859. int iterate_fd(struct files_struct *files, unsigned n,
  860. int (*f)(const void *, struct file *, unsigned),
  861. const void *p)
  862. {
  863. struct fdtable *fdt;
  864. struct file *file;
  865. int res = 0;
  866. if (!files)
  867. return 0;
  868. spin_lock(&files->file_lock);
  869. fdt = files_fdtable(files);
  870. while (!res && n < fdt->max_fds) {
  871. file = rcu_dereference_check_fdtable(files, fdt->fd[n++]);
  872. if (file)
  873. res = f(p, file, n);
  874. }
  875. spin_unlock(&files->file_lock);
  876. return res;
  877. }
  878. EXPORT_SYMBOL(iterate_fd);