file.c 23 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007
  1. /*
  2. * linux/fs/file.c
  3. *
  4. * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
  5. *
  6. * Manage the dynamic fd arrays in the process files_struct.
  7. */
  8. #include <linux/syscalls.h>
  9. #include <linux/export.h>
  10. #include <linux/fs.h>
  11. #include <linux/mm.h>
  12. #include <linux/mmzone.h>
  13. #include <linux/time.h>
  14. #include <linux/sched.h>
  15. #include <linux/slab.h>
  16. #include <linux/vmalloc.h>
  17. #include <linux/file.h>
  18. #include <linux/fdtable.h>
  19. #include <linux/bitops.h>
  20. #include <linux/interrupt.h>
  21. #include <linux/spinlock.h>
  22. #include <linux/rcupdate.h>
  23. #include <linux/workqueue.h>
  24. struct fdtable_defer {
  25. spinlock_t lock;
  26. struct work_struct wq;
  27. struct fdtable *next;
  28. };
  29. int sysctl_nr_open __read_mostly = 1024*1024;
  30. int sysctl_nr_open_min = BITS_PER_LONG;
  31. int sysctl_nr_open_max = 1024 * 1024; /* raised later */
  32. /*
  33. * We use this list to defer free fdtables that have vmalloced
  34. * sets/arrays. By keeping a per-cpu list, we avoid having to embed
  35. * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in
  36. * this per-task structure.
  37. */
  38. static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
  39. static void *alloc_fdmem(size_t size)
  40. {
  41. /*
  42. * Very large allocations can stress page reclaim, so fall back to
  43. * vmalloc() if the allocation size will be considered "large" by the VM.
  44. */
  45. if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
  46. void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN);
  47. if (data != NULL)
  48. return data;
  49. }
  50. return vmalloc(size);
  51. }
  /*
   * Release memory obtained from alloc_fdmem(), picking vfree() or kfree()
   * depending on whether @ptr is a vmalloc address.
   */
  static void free_fdmem(void *ptr)
  {
      if (is_vmalloc_addr(ptr))
          vfree(ptr);
      else
          kfree(ptr);
  }
  56. static void __free_fdtable(struct fdtable *fdt)
  57. {
  58. free_fdmem(fdt->fd);
  59. free_fdmem(fdt->open_fds);
  60. kfree(fdt);
  61. }
  62. static void free_fdtable_work(struct work_struct *work)
  63. {
  64. struct fdtable_defer *f =
  65. container_of(work, struct fdtable_defer, wq);
  66. struct fdtable *fdt;
  67. spin_lock_bh(&f->lock);
  68. fdt = f->next;
  69. f->next = NULL;
  70. spin_unlock_bh(&f->lock);
  71. while(fdt) {
  72. struct fdtable *next = fdt->next;
  73. __free_fdtable(fdt);
  74. fdt = next;
  75. }
  76. }
  77. static void free_fdtable_rcu(struct rcu_head *rcu)
  78. {
  79. struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
  80. struct fdtable_defer *fddef;
  81. BUG_ON(!fdt);
  82. BUG_ON(fdt->max_fds <= NR_OPEN_DEFAULT);
  83. if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) {
  84. kfree(fdt->fd);
  85. kfree(fdt->open_fds);
  86. kfree(fdt);
  87. } else {
  88. fddef = &get_cpu_var(fdtable_defer_list);
  89. spin_lock(&fddef->lock);
  90. fdt->next = fddef->next;
  91. fddef->next = fdt;
  92. /* vmallocs are handled from the workqueue context */
  93. schedule_work(&fddef->wq);
  94. spin_unlock(&fddef->lock);
  95. put_cpu_var(fdtable_defer_list);
  96. }
  97. }
  98. /*
  99. * Expand the fdset in the files_struct. Called with the files spinlock
  100. * held for write.
  101. */
  102. static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
  103. {
  104. unsigned int cpy, set;
  105. BUG_ON(nfdt->max_fds < ofdt->max_fds);
  106. cpy = ofdt->max_fds * sizeof(struct file *);
  107. set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
  108. memcpy(nfdt->fd, ofdt->fd, cpy);
  109. memset((char *)(nfdt->fd) + cpy, 0, set);
  110. cpy = ofdt->max_fds / BITS_PER_BYTE;
  111. set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;
  112. memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
  113. memset((char *)(nfdt->open_fds) + cpy, 0, set);
  114. memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
  115. memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
  116. }
  117. static struct fdtable * alloc_fdtable(unsigned int nr)
  118. {
  119. struct fdtable *fdt;
  120. void *data;
  121. /*
  122. * Figure out how many fds we actually want to support in this fdtable.
  123. * Allocation steps are keyed to the size of the fdarray, since it
  124. * grows far faster than any of the other dynamic data. We try to fit
  125. * the fdarray into comfortable page-tuned chunks: starting at 1024B
  126. * and growing in powers of two from there on.
  127. */
  128. nr /= (1024 / sizeof(struct file *));
  129. nr = roundup_pow_of_two(nr + 1);
  130. nr *= (1024 / sizeof(struct file *));
  131. /*
  132. * Note that this can drive nr *below* what we had passed if sysctl_nr_open
  133. * had been set lower between the check in expand_files() and here. Deal
  134. * with that in caller, it's cheaper that way.
  135. *
  136. * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
  137. * bitmaps handling below becomes unpleasant, to put it mildly...
  138. */
  139. if (unlikely(nr > sysctl_nr_open))
  140. nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
  141. fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
  142. if (!fdt)
  143. goto out;
  144. fdt->max_fds = nr;
  145. data = alloc_fdmem(nr * sizeof(struct file *));
  146. if (!data)
  147. goto out_fdt;
  148. fdt->fd = data;
  149. data = alloc_fdmem(max_t(size_t,
  150. 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
  151. if (!data)
  152. goto out_arr;
  153. fdt->open_fds = data;
  154. data += nr / BITS_PER_BYTE;
  155. fdt->close_on_exec = data;
  156. fdt->next = NULL;
  157. return fdt;
  158. out_arr:
  159. free_fdmem(fdt->fd);
  160. out_fdt:
  161. kfree(fdt);
  162. out:
  163. return NULL;
  164. }
  165. /*
  166. * Expand the file descriptor table.
  167. * This function will allocate a new fdtable and both fd array and fdset, of
  168. * the given size.
  169. * Return <0 error code on error; 1 on successful completion.
  170. * The files->file_lock should be held on entry, and will be held on exit.
  171. */
  172. static int expand_fdtable(struct files_struct *files, int nr)
  173. __releases(files->file_lock)
  174. __acquires(files->file_lock)
  175. {
  176. struct fdtable *new_fdt, *cur_fdt;
  177. spin_unlock(&files->file_lock);
  178. new_fdt = alloc_fdtable(nr);
  179. spin_lock(&files->file_lock);
  180. if (!new_fdt)
  181. return -ENOMEM;
  182. /*
  183. * extremely unlikely race - sysctl_nr_open decreased between the check in
  184. * caller and alloc_fdtable(). Cheaper to catch it here...
  185. */
  186. if (unlikely(new_fdt->max_fds <= nr)) {
  187. __free_fdtable(new_fdt);
  188. return -EMFILE;
  189. }
  190. /*
  191. * Check again since another task may have expanded the fd table while
  192. * we dropped the lock
  193. */
  194. cur_fdt = files_fdtable(files);
  195. if (nr >= cur_fdt->max_fds) {
  196. /* Continue as planned */
  197. copy_fdtable(new_fdt, cur_fdt);
  198. rcu_assign_pointer(files->fdt, new_fdt);
  199. if (cur_fdt->max_fds > NR_OPEN_DEFAULT)
  200. call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
  201. } else {
  202. /* Somebody else expanded, so undo our attempt */
  203. __free_fdtable(new_fdt);
  204. }
  205. return 1;
  206. }
  207. /*
  208. * Expand files.
  209. * This function will expand the file structures, if the requested size exceeds
  210. * the current capacity and there is room for expansion.
  211. * Return <0 error code on error; 0 when nothing done; 1 when files were
  212. * expanded and execution may have blocked.
  213. * The files->file_lock should be held on entry, and will be held on exit.
  214. */
  215. static int expand_files(struct files_struct *files, int nr)
  216. {
  217. struct fdtable *fdt;
  218. fdt = files_fdtable(files);
  219. /* Do we need to expand? */
  220. if (nr < fdt->max_fds)
  221. return 0;
  222. /* Can we expand? */
  223. if (nr >= sysctl_nr_open)
  224. return -EMFILE;
  225. /* All good, so we try */
  226. return expand_fdtable(files, nr);
  227. }
  228. static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
  229. {
  230. __set_bit(fd, fdt->close_on_exec);
  231. }
  232. static inline void __clear_close_on_exec(int fd, struct fdtable *fdt)
  233. {
  234. __clear_bit(fd, fdt->close_on_exec);
  235. }
  236. static inline void __set_open_fd(int fd, struct fdtable *fdt)
  237. {
  238. __set_bit(fd, fdt->open_fds);
  239. }
  240. static inline void __clear_open_fd(int fd, struct fdtable *fdt)
  241. {
  242. __clear_bit(fd, fdt->open_fds);
  243. }
  244. static int count_open_files(struct fdtable *fdt)
  245. {
  246. int size = fdt->max_fds;
  247. int i;
  248. /* Find the last open fd */
  249. for (i = size / BITS_PER_LONG; i > 0; ) {
  250. if (fdt->open_fds[--i])
  251. break;
  252. }
  253. i = (i + 1) * BITS_PER_LONG;
  254. return i;
  255. }
  256. /*
  257. * Allocate a new files structure and copy contents from the
  258. * passed in files structure.
  259. * errorp will be valid only when the returned files_struct is NULL.
  260. */
  261. struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
  262. {
  263. struct files_struct *newf;
  264. struct file **old_fds, **new_fds;
  265. int open_files, size, i;
  266. struct fdtable *old_fdt, *new_fdt;
  267. *errorp = -ENOMEM;
  268. newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
  269. if (!newf)
  270. goto out;
  271. atomic_set(&newf->count, 1);
  272. spin_lock_init(&newf->file_lock);
  273. newf->next_fd = 0;
  274. new_fdt = &newf->fdtab;
  275. new_fdt->max_fds = NR_OPEN_DEFAULT;
  276. new_fdt->close_on_exec = newf->close_on_exec_init;
  277. new_fdt->open_fds = newf->open_fds_init;
  278. new_fdt->fd = &newf->fd_array[0];
  279. new_fdt->next = NULL;
  280. spin_lock(&oldf->file_lock);
  281. old_fdt = files_fdtable(oldf);
  282. open_files = count_open_files(old_fdt);
  283. /*
  284. * Check whether we need to allocate a larger fd array and fd set.
  285. */
  286. while (unlikely(open_files > new_fdt->max_fds)) {
  287. spin_unlock(&oldf->file_lock);
  288. if (new_fdt != &newf->fdtab)
  289. __free_fdtable(new_fdt);
  290. new_fdt = alloc_fdtable(open_files - 1);
  291. if (!new_fdt) {
  292. *errorp = -ENOMEM;
  293. goto out_release;
  294. }
  295. /* beyond sysctl_nr_open; nothing to do */
  296. if (unlikely(new_fdt->max_fds < open_files)) {
  297. __free_fdtable(new_fdt);
  298. *errorp = -EMFILE;
  299. goto out_release;
  300. }
  301. /*
  302. * Reacquire the oldf lock and a pointer to its fd table
  303. * who knows it may have a new bigger fd table. We need
  304. * the latest pointer.
  305. */
  306. spin_lock(&oldf->file_lock);
  307. old_fdt = files_fdtable(oldf);
  308. open_files = count_open_files(old_fdt);
  309. }
  310. old_fds = old_fdt->fd;
  311. new_fds = new_fdt->fd;
  312. memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8);
  313. memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8);
  314. for (i = open_files; i != 0; i--) {
  315. struct file *f = *old_fds++;
  316. if (f) {
  317. get_file(f);
  318. } else {
  319. /*
  320. * The fd may be claimed in the fd bitmap but not yet
  321. * instantiated in the files array if a sibling thread
  322. * is partway through open(). So make sure that this
  323. * fd is available to the new process.
  324. */
  325. __clear_open_fd(open_files - i, new_fdt);
  326. }
  327. rcu_assign_pointer(*new_fds++, f);
  328. }
  329. spin_unlock(&oldf->file_lock);
  330. /* compute the remainder to be cleared */
  331. size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
  332. /* This is long word aligned thus could use a optimized version */
  333. memset(new_fds, 0, size);
  334. if (new_fdt->max_fds > open_files) {
  335. int left = (new_fdt->max_fds - open_files) / 8;
  336. int start = open_files / BITS_PER_LONG;
  337. memset(&new_fdt->open_fds[start], 0, left);
  338. memset(&new_fdt->close_on_exec[start], 0, left);
  339. }
  340. rcu_assign_pointer(newf->fdt, new_fdt);
  341. return newf;
  342. out_release:
  343. kmem_cache_free(files_cachep, newf);
  344. out:
  345. return NULL;
  346. }
  347. static void close_files(struct files_struct * files)
  348. {
  349. int i, j;
  350. struct fdtable *fdt;
  351. j = 0;
  352. /*
  353. * It is safe to dereference the fd table without RCU or
  354. * ->file_lock because this is the last reference to the
  355. * files structure. But use RCU to shut RCU-lockdep up.
  356. */
  357. rcu_read_lock();
  358. fdt = files_fdtable(files);
  359. rcu_read_unlock();
  360. for (;;) {
  361. unsigned long set;
  362. i = j * BITS_PER_LONG;
  363. if (i >= fdt->max_fds)
  364. break;
  365. set = fdt->open_fds[j++];
  366. while (set) {
  367. if (set & 1) {
  368. struct file * file = xchg(&fdt->fd[i], NULL);
  369. if (file) {
  370. filp_close(file, files);
  371. cond_resched();
  372. }
  373. }
  374. i++;
  375. set >>= 1;
  376. }
  377. }
  378. }
  379. struct files_struct *get_files_struct(struct task_struct *task)
  380. {
  381. struct files_struct *files;
  382. task_lock(task);
  383. files = task->files;
  384. if (files)
  385. atomic_inc(&files->count);
  386. task_unlock(task);
  387. return files;
  388. }
  389. void put_files_struct(struct files_struct *files)
  390. {
  391. struct fdtable *fdt;
  392. if (atomic_dec_and_test(&files->count)) {
  393. close_files(files);
  394. /* not really needed, since nobody can see us */
  395. rcu_read_lock();
  396. fdt = files_fdtable(files);
  397. rcu_read_unlock();
  398. /* free the arrays if they are not embedded */
  399. if (fdt != &files->fdtab)
  400. __free_fdtable(fdt);
  401. kmem_cache_free(files_cachep, files);
  402. }
  403. }
  404. void reset_files_struct(struct files_struct *files)
  405. {
  406. struct task_struct *tsk = current;
  407. struct files_struct *old;
  408. old = tsk->files;
  409. task_lock(tsk);
  410. tsk->files = files;
  411. task_unlock(tsk);
  412. put_files_struct(old);
  413. }
  414. void exit_files(struct task_struct *tsk)
  415. {
  416. struct files_struct * files = tsk->files;
  417. if (files) {
  418. task_lock(tsk);
  419. tsk->files = NULL;
  420. task_unlock(tsk);
  421. put_files_struct(files);
  422. }
  423. }
  424. static void fdtable_defer_list_init(int cpu)
  425. {
  426. struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
  427. spin_lock_init(&fddef->lock);
  428. INIT_WORK(&fddef->wq, free_fdtable_work);
  429. fddef->next = NULL;
  430. }
  431. void __init files_defer_init(void)
  432. {
  433. int i;
  434. for_each_possible_cpu(i)
  435. fdtable_defer_list_init(i);
  436. sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) &
  437. -BITS_PER_LONG;
  438. }
  439. struct files_struct init_files = {
  440. .count = ATOMIC_INIT(1),
  441. .fdt = &init_files.fdtab,
  442. .fdtab = {
  443. .max_fds = NR_OPEN_DEFAULT,
  444. .fd = &init_files.fd_array[0],
  445. .close_on_exec = init_files.close_on_exec_init,
  446. .open_fds = init_files.open_fds_init,
  447. },
  448. .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
  449. };
  450. /*
  451. * allocate a file descriptor, mark it busy.
  452. */
  453. int __alloc_fd(struct files_struct *files,
  454. unsigned start, unsigned end, unsigned flags)
  455. {
  456. unsigned int fd;
  457. int error;
  458. struct fdtable *fdt;
  459. spin_lock(&files->file_lock);
  460. repeat:
  461. fdt = files_fdtable(files);
  462. fd = start;
  463. if (fd < files->next_fd)
  464. fd = files->next_fd;
  465. if (fd < fdt->max_fds)
  466. fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd);
  467. /*
  468. * N.B. For clone tasks sharing a files structure, this test
  469. * will limit the total number of files that can be opened.
  470. */
  471. error = -EMFILE;
  472. if (fd >= end)
  473. goto out;
  474. error = expand_files(files, fd);
  475. if (error < 0)
  476. goto out;
  477. /*
  478. * If we needed to expand the fs array we
  479. * might have blocked - try again.
  480. */
  481. if (error)
  482. goto repeat;
  483. if (start <= files->next_fd)
  484. files->next_fd = fd + 1;
  485. __set_open_fd(fd, fdt);
  486. if (flags & O_CLOEXEC)
  487. __set_close_on_exec(fd, fdt);
  488. else
  489. __clear_close_on_exec(fd, fdt);
  490. error = fd;
  491. #if 1
  492. /* Sanity check */
  493. if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {
  494. printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
  495. rcu_assign_pointer(fdt->fd[fd], NULL);
  496. }
  497. #endif
  498. out:
  499. spin_unlock(&files->file_lock);
  500. return error;
  501. }
  502. static int alloc_fd(unsigned start, unsigned flags)
  503. {
  504. return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags);
  505. }
  506. int get_unused_fd_flags(unsigned flags)
  507. {
  508. return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags);
  509. }
  510. EXPORT_SYMBOL(get_unused_fd_flags);
  511. static void __put_unused_fd(struct files_struct *files, unsigned int fd)
  512. {
  513. struct fdtable *fdt = files_fdtable(files);
  514. __clear_open_fd(fd, fdt);
  515. if (fd < files->next_fd)
  516. files->next_fd = fd;
  517. }
  518. void put_unused_fd(unsigned int fd)
  519. {
  520. struct files_struct *files = current->files;
  521. spin_lock(&files->file_lock);
  522. __put_unused_fd(files, fd);
  523. spin_unlock(&files->file_lock);
  524. }
  525. EXPORT_SYMBOL(put_unused_fd);
  526. /*
  527. * Install a file pointer in the fd array.
  528. *
  529. * The VFS is full of places where we drop the files lock between
  530. * setting the open_fds bitmap and installing the file in the file
  531. * array. At any such point, we are vulnerable to a dup2() race
  532. * installing a file in the array before us. We need to detect this and
  533. * fput() the struct file we are about to overwrite in this case.
  534. *
  535. * It should never happen - if we allow dup2() do it, _really_ bad things
  536. * will follow.
  537. *
  538. * NOTE: __fd_install() variant is really, really low-level; don't
  539. * use it unless you are forced to by truly lousy API shoved down
  540. * your throat. 'files' *MUST* be either current->files or obtained
  541. * by get_files_struct(current) done by whoever had given it to you,
  542. * or really bad things will happen. Normally you want to use
  543. * fd_install() instead.
  544. */
  545. void __fd_install(struct files_struct *files, unsigned int fd,
  546. struct file *file)
  547. {
  548. struct fdtable *fdt;
  549. spin_lock(&files->file_lock);
  550. fdt = files_fdtable(files);
  551. BUG_ON(fdt->fd[fd] != NULL);
  552. rcu_assign_pointer(fdt->fd[fd], file);
  553. spin_unlock(&files->file_lock);
  554. }
  555. void fd_install(unsigned int fd, struct file *file)
  556. {
  557. __fd_install(current->files, fd, file);
  558. }
  559. EXPORT_SYMBOL(fd_install);
  560. /*
  561. * The same warnings as for __alloc_fd()/__fd_install() apply here...
  562. */
  563. int __close_fd(struct files_struct *files, unsigned fd)
  564. {
  565. struct file *file;
  566. struct fdtable *fdt;
  567. spin_lock(&files->file_lock);
  568. fdt = files_fdtable(files);
  569. if (fd >= fdt->max_fds)
  570. goto out_unlock;
  571. file = fdt->fd[fd];
  572. if (!file)
  573. goto out_unlock;
  574. rcu_assign_pointer(fdt->fd[fd], NULL);
  575. __clear_close_on_exec(fd, fdt);
  576. __put_unused_fd(files, fd);
  577. spin_unlock(&files->file_lock);
  578. return filp_close(file, files);
  579. out_unlock:
  580. spin_unlock(&files->file_lock);
  581. return -EBADF;
  582. }
  583. void do_close_on_exec(struct files_struct *files)
  584. {
  585. unsigned i;
  586. struct fdtable *fdt;
  587. /* exec unshares first */
  588. spin_lock(&files->file_lock);
  589. for (i = 0; ; i++) {
  590. unsigned long set;
  591. unsigned fd = i * BITS_PER_LONG;
  592. fdt = files_fdtable(files);
  593. if (fd >= fdt->max_fds)
  594. break;
  595. set = fdt->close_on_exec[i];
  596. if (!set)
  597. continue;
  598. fdt->close_on_exec[i] = 0;
  599. for ( ; set ; fd++, set >>= 1) {
  600. struct file *file;
  601. if (!(set & 1))
  602. continue;
  603. file = fdt->fd[fd];
  604. if (!file)
  605. continue;
  606. rcu_assign_pointer(fdt->fd[fd], NULL);
  607. __put_unused_fd(files, fd);
  608. spin_unlock(&files->file_lock);
  609. filp_close(file, files);
  610. cond_resched();
  611. spin_lock(&files->file_lock);
  612. }
  613. }
  614. spin_unlock(&files->file_lock);
  615. }
  616. struct file *fget(unsigned int fd)
  617. {
  618. struct file *file;
  619. struct files_struct *files = current->files;
  620. rcu_read_lock();
  621. file = fcheck_files(files, fd);
  622. if (file) {
  623. /* File object ref couldn't be taken */
  624. if (file->f_mode & FMODE_PATH ||
  625. !atomic_long_inc_not_zero(&file->f_count))
  626. file = NULL;
  627. }
  628. rcu_read_unlock();
  629. return file;
  630. }
  631. EXPORT_SYMBOL(fget);
  632. struct file *fget_raw(unsigned int fd)
  633. {
  634. struct file *file;
  635. struct files_struct *files = current->files;
  636. rcu_read_lock();
  637. file = fcheck_files(files, fd);
  638. if (file) {
  639. /* File object ref couldn't be taken */
  640. if (!atomic_long_inc_not_zero(&file->f_count))
  641. file = NULL;
  642. }
  643. rcu_read_unlock();
  644. return file;
  645. }
  646. EXPORT_SYMBOL(fget_raw);
  647. /*
  648. * Lightweight file lookup - no refcnt increment if fd table isn't shared.
  649. *
  650. * You can use this instead of fget if you satisfy all of the following
  651. * conditions:
  652. * 1) You must call fput_light before exiting the syscall and returning control
  653. * to userspace (i.e. you cannot remember the returned struct file * after
  654. * returning to userspace).
  655. * 2) You must not call filp_close on the returned struct file * in between
  656. * calls to fget_light and fput_light.
  657. * 3) You must not clone the current task in between the calls to fget_light
  658. * and fput_light.
  659. *
  660. * The fput_needed flag returned by fget_light should be passed to the
  661. * corresponding fput_light.
  662. */
  663. struct file *fget_light(unsigned int fd, int *fput_needed)
  664. {
  665. struct file *file;
  666. struct files_struct *files = current->files;
  667. *fput_needed = 0;
  668. if (atomic_read(&files->count) == 1) {
  669. file = fcheck_files(files, fd);
  670. if (file && (file->f_mode & FMODE_PATH))
  671. file = NULL;
  672. } else {
  673. rcu_read_lock();
  674. file = fcheck_files(files, fd);
  675. if (file) {
  676. if (!(file->f_mode & FMODE_PATH) &&
  677. atomic_long_inc_not_zero(&file->f_count))
  678. *fput_needed = 1;
  679. else
  680. /* Didn't get the reference, someone's freed */
  681. file = NULL;
  682. }
  683. rcu_read_unlock();
  684. }
  685. return file;
  686. }
  687. EXPORT_SYMBOL(fget_light);
  688. struct file *fget_raw_light(unsigned int fd, int *fput_needed)
  689. {
  690. struct file *file;
  691. struct files_struct *files = current->files;
  692. *fput_needed = 0;
  693. if (atomic_read(&files->count) == 1) {
  694. file = fcheck_files(files, fd);
  695. } else {
  696. rcu_read_lock();
  697. file = fcheck_files(files, fd);
  698. if (file) {
  699. if (atomic_long_inc_not_zero(&file->f_count))
  700. *fput_needed = 1;
  701. else
  702. /* Didn't get the reference, someone's freed */
  703. file = NULL;
  704. }
  705. rcu_read_unlock();
  706. }
  707. return file;
  708. }
  709. void set_close_on_exec(unsigned int fd, int flag)
  710. {
  711. struct files_struct *files = current->files;
  712. struct fdtable *fdt;
  713. spin_lock(&files->file_lock);
  714. fdt = files_fdtable(files);
  715. if (flag)
  716. __set_close_on_exec(fd, fdt);
  717. else
  718. __clear_close_on_exec(fd, fdt);
  719. spin_unlock(&files->file_lock);
  720. }
  721. bool get_close_on_exec(unsigned int fd)
  722. {
  723. struct files_struct *files = current->files;
  724. struct fdtable *fdt;
  725. bool res;
  726. rcu_read_lock();
  727. fdt = files_fdtable(files);
  728. res = close_on_exec(fd, fdt);
  729. rcu_read_unlock();
  730. return res;
  731. }
  732. static int do_dup2(struct files_struct *files,
  733. struct file *file, unsigned fd, unsigned flags)
  734. {
  735. struct file *tofree;
  736. struct fdtable *fdt;
  737. /*
  738. * We need to detect attempts to do dup2() over allocated but still
  739. * not finished descriptor. NB: OpenBSD avoids that at the price of
  740. * extra work in their equivalent of fget() - they insert struct
  741. * file immediately after grabbing descriptor, mark it larval if
  742. * more work (e.g. actual opening) is needed and make sure that
  743. * fget() treats larval files as absent. Potentially interesting,
  744. * but while extra work in fget() is trivial, locking implications
  745. * and amount of surgery on open()-related paths in VFS are not.
  746. * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
  747. * deadlocks in rather amusing ways, AFAICS. All of that is out of
  748. * scope of POSIX or SUS, since neither considers shared descriptor
  749. * tables and this condition does not arise without those.
  750. */
  751. fdt = files_fdtable(files);
  752. tofree = fdt->fd[fd];
  753. if (!tofree && fd_is_open(fd, fdt))
  754. goto Ebusy;
  755. get_file(file);
  756. rcu_assign_pointer(fdt->fd[fd], file);
  757. __set_open_fd(fd, fdt);
  758. if (flags & O_CLOEXEC)
  759. __set_close_on_exec(fd, fdt);
  760. else
  761. __clear_close_on_exec(fd, fdt);
  762. spin_unlock(&files->file_lock);
  763. if (tofree)
  764. filp_close(tofree, files);
  765. return fd;
  766. Ebusy:
  767. spin_unlock(&files->file_lock);
  768. return -EBUSY;
  769. }
  770. int replace_fd(unsigned fd, struct file *file, unsigned flags)
  771. {
  772. int err;
  773. struct files_struct *files = current->files;
  774. if (!file)
  775. return __close_fd(files, fd);
  776. if (fd >= rlimit(RLIMIT_NOFILE))
  777. return -EBADF;
  778. spin_lock(&files->file_lock);
  779. err = expand_files(files, fd);
  780. if (unlikely(err < 0))
  781. goto out_unlock;
  782. return do_dup2(files, file, fd, flags);
  783. out_unlock:
  784. spin_unlock(&files->file_lock);
  785. return err;
  786. }
  787. SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
  788. {
  789. int err = -EBADF;
  790. struct file *file;
  791. struct files_struct *files = current->files;
  792. if ((flags & ~O_CLOEXEC) != 0)
  793. return -EINVAL;
  794. if (unlikely(oldfd == newfd))
  795. return -EINVAL;
  796. if (newfd >= rlimit(RLIMIT_NOFILE))
  797. return -EBADF;
  798. spin_lock(&files->file_lock);
  799. err = expand_files(files, newfd);
  800. file = fcheck(oldfd);
  801. if (unlikely(!file))
  802. goto Ebadf;
  803. if (unlikely(err < 0)) {
  804. if (err == -EMFILE)
  805. goto Ebadf;
  806. goto out_unlock;
  807. }
  808. return do_dup2(files, file, newfd, flags);
  809. Ebadf:
  810. err = -EBADF;
  811. out_unlock:
  812. spin_unlock(&files->file_lock);
  813. return err;
  814. }
  815. SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
  816. {
  817. if (unlikely(newfd == oldfd)) { /* corner case */
  818. struct files_struct *files = current->files;
  819. int retval = oldfd;
  820. rcu_read_lock();
  821. if (!fcheck_files(files, oldfd))
  822. retval = -EBADF;
  823. rcu_read_unlock();
  824. return retval;
  825. }
  826. return sys_dup3(oldfd, newfd, 0);
  827. }
  828. SYSCALL_DEFINE1(dup, unsigned int, fildes)
  829. {
  830. int ret = -EBADF;
  831. struct file *file = fget_raw(fildes);
  832. if (file) {
  833. ret = get_unused_fd();
  834. if (ret >= 0)
  835. fd_install(ret, file);
  836. else
  837. fput(file);
  838. }
  839. return ret;
  840. }
  841. int f_dupfd(unsigned int from, struct file *file, unsigned flags)
  842. {
  843. int err;
  844. if (from >= rlimit(RLIMIT_NOFILE))
  845. return -EINVAL;
  846. err = alloc_fd(from, flags);
  847. if (err >= 0) {
  848. get_file(file);
  849. fd_install(err, file);
  850. }
  851. return err;
  852. }
  853. int iterate_fd(struct files_struct *files, unsigned n,
  854. int (*f)(const void *, struct file *, unsigned),
  855. const void *p)
  856. {
  857. struct fdtable *fdt;
  858. int res = 0;
  859. if (!files)
  860. return 0;
  861. spin_lock(&files->file_lock);
  862. for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
  863. struct file *file;
  864. file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
  865. if (!file)
  866. continue;
  867. res = f(p, file, n);
  868. if (res)
  869. break;
  870. }
  871. spin_unlock(&files->file_lock);
  872. return res;
  873. }
  874. EXPORT_SYMBOL(iterate_fd);