/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/slab.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
#include "compat.h"

/* simple helper to fault in pages and copy.  This should go away
 * and be replaced with calls into generic code.
 */
static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
                                         int write_bytes,
                                         struct page **prepared_pages,
                                         struct iov_iter *i)
{
        size_t copied;
        int pg = 0;
        int offset = pos & (PAGE_CACHE_SIZE - 1);

        while (write_bytes > 0) {
                size_t count = min_t(size_t,
                                     PAGE_CACHE_SIZE - offset, write_bytes);
                struct page *page = prepared_pages[pg];
again:
                if (unlikely(iov_iter_fault_in_readable(i, count)))
                        return -EFAULT;

                /* Copy data from userspace to the current page */
                copied = iov_iter_copy_from_user(page, i, offset, count);

                /* Flush processor's dcache for this page */
                flush_dcache_page(page);
                iov_iter_advance(i, copied);
                write_bytes -= copied;
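
                /*
                 * a zero-length copy means the source pages were faulted
                 * out again after iov_iter_fault_in_readable checked them.
                 * Shrink the next attempt to the first iovec segment,
                 * which fault_in_readable can actually guarantee, so we
                 * are sure to make forward progress on the retry.
                 */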
                if (unlikely(copied == 0)) {
                        count = min_t(size_t, PAGE_CACHE_SIZE - offset,
                                      iov_iter_single_seg_count(i));
                        goto again;
                }

                if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
                        offset += copied;
                } else {
                        pg++;
                        offset = 0;
                }
        }
        return 0;
}

/*
 * unlocks pages after btrfs_file_write is done with them
 */
static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
{
        size_t i;
        for (i = 0; i < num_pages; i++) {
                if (!pages[i])
                        break;
                /* page checked is some magic around finding pages that
                 * have been modified without going through btrfs_set_page_dirty
                 * clear it here
                 */
                ClearPageChecked(pages[i]);
                unlock_page(pages[i]);
                mark_page_accessed(pages[i]);
                page_cache_release(pages[i]);
        }
}

/*
 * after copy_from_user, pages need to be dirtied and we need to make
 * sure holes are created between the current EOF and the start of
 * any next extents (if required).
 *
 * this also makes the decision about creating an inline extent vs
 * doing real data extents, marking pages dirty and delalloc as required.
 */
static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
                                            struct btrfs_root *root,
                                            struct file *file,
                                            struct page **pages,
                                            size_t num_pages,
                                            loff_t pos,
                                            size_t write_bytes)
{
        int err = 0;
        int i;
        struct inode *inode = fdentry(file)->d_inode;
        u64 num_bytes;
        u64 start_pos;
        u64 end_of_last_block;
        u64 end_pos = pos + write_bytes;
        loff_t isize = i_size_read(inode);

        start_pos = pos & ~((u64)root->sectorsize - 1);
        num_bytes = (write_bytes + pos - start_pos +
                     root->sectorsize - 1) & ~((u64)root->sectorsize - 1);

        end_of_last_block = start_pos + num_bytes - 1;
        err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
                                        NULL);
        BUG_ON(err);

        for (i = 0; i < num_pages; i++) {
                struct page *p = pages[i];
                SetPageUptodate(p);
                ClearPageChecked(p);
                set_page_dirty(p);
        }
        if (end_pos > isize) {
                i_size_write(inode, end_pos);
                /* we've only changed i_size in ram, and we haven't updated
                 * the disk i_size.  There is no need to log the inode
                 * at this time.
                 */
        }
        return 0;
}

/*
 * this drops all the extents in the cache that intersect the range
 * [start, end].  Existing extents are split as required.
 */
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                            int skip_pinned)
{
        struct extent_map *em;
        struct extent_map *split = NULL;
        struct extent_map *split2 = NULL;
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        u64 len = end - start + 1;
        int ret;
        int testend = 1;
        unsigned long flags;
        int compressed = 0;

        WARN_ON(end < start);
        if (end == (u64)-1) {
                len = (u64)-1;
                testend = 0;
        }
        while (1) {
                if (!split)
                        split = alloc_extent_map(GFP_NOFS);
                if (!split2)
                        split2 = alloc_extent_map(GFP_NOFS);

                write_lock(&em_tree->lock);
                em = lookup_extent_mapping(em_tree, start, len);
                if (!em) {
                        write_unlock(&em_tree->lock);
                        break;
                }
                flags = em->flags;
                if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
                        if (testend && em->start + em->len >= start + len) {
                                free_extent_map(em);
                                write_unlock(&em_tree->lock);
                                break;
                        }
                        start = em->start + em->len;
                        if (testend)
                                len = start + len - (em->start + em->len);
                        free_extent_map(em);
                        write_unlock(&em_tree->lock);
                        continue;
                }
                compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
                clear_bit(EXTENT_FLAG_PINNED, &em->flags);
                remove_extent_mapping(em_tree, em);
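
                /*
                 * the mapping extends in front of the range being
                 * dropped: re-insert the piece before 'start' as a
                 * new extent map
                 */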
                if (em->block_start < EXTENT_MAP_LAST_BYTE &&
                    em->start < start) {
                        split->start = em->start;
                        split->len = start - em->start;
                        split->orig_start = em->orig_start;
                        split->block_start = em->block_start;

                        if (compressed)
                                split->block_len = em->block_len;
                        else
                                split->block_len = split->len;

                        split->bdev = em->bdev;
                        split->flags = flags;
                        ret = add_extent_mapping(em_tree, split);
                        BUG_ON(ret);
                        free_extent_map(split);
                        split = split2;
                        split2 = NULL;
                }
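
                /*
                 * the mapping extends past the end of the range being
                 * dropped: re-insert the tail piece.  For uncompressed
                 * extents block_start must be shifted by 'diff' since
                 * only part of the on-disk extent is still referenced
                 */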
                if (em->block_start < EXTENT_MAP_LAST_BYTE &&
                    testend && em->start + em->len > start + len) {
                        u64 diff = start + len - em->start;

                        split->start = start + len;
                        split->len = em->start + em->len - (start + len);
                        split->bdev = em->bdev;
                        split->flags = flags;

                        if (compressed) {
                                split->block_len = em->block_len;
                                split->block_start = em->block_start;
                                split->orig_start = em->orig_start;
                        } else {
                                split->block_len = split->len;
                                split->block_start = em->block_start + diff;
                                split->orig_start = split->start;
                        }

                        ret = add_extent_mapping(em_tree, split);
                        BUG_ON(ret);
                        free_extent_map(split);
                        split = NULL;
                }
                write_unlock(&em_tree->lock);

                /* once for us */
                free_extent_map(em);
                /* once for the tree */
                free_extent_map(em);
        }
        if (split)
                free_extent_map(split);
        if (split2)
                free_extent_map(split2);
        return 0;
}

/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range [start, end).  hint_byte is filled in with a byte number
 * that would be a good hint to the block allocator for this file.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split.  Anything entirely inside the range
 * is deleted from the tree.
 */
int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
                       u64 start, u64 end, u64 *hint_byte, int drop_cache)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *fi;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_key new_key;
        u64 search_start = start;
        u64 disk_bytenr = 0;
        u64 num_bytes = 0;
        u64 extent_offset = 0;
        u64 extent_end = 0;
        int del_nr = 0;
        int del_slot = 0;
        int extent_type;
        int recow;
        int ret;

        if (drop_cache)
                btrfs_drop_extent_cache(inode, start, end - 1, 0);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        while (1) {
                recow = 0;
                ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
                                               search_start, -1);
                if (ret < 0)
                        break;
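
                /*
                 * on the first pass the previous item may be an extent
                 * that begins before 'start' but reaches into the drop
                 * range; back up one slot so it gets processed too
                 */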
                if (ret > 0 && path->slots[0] > 0 && search_start == start) {
                        leaf = path->nodes[0];
                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
                        if (key.objectid == inode->i_ino &&
                            key.type == BTRFS_EXTENT_DATA_KEY)
                                path->slots[0]--;
                }
                ret = 0;
next_slot:
                leaf = path->nodes[0];
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        BUG_ON(del_nr > 0);
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                break;
                        if (ret > 0) {
                                ret = 0;
                                break;
                        }
                        leaf = path->nodes[0];
                        recow = 1;
                }

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.objectid > inode->i_ino ||
                    key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
                        break;

                fi = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
                extent_type = btrfs_file_extent_type(leaf, fi);

                if (extent_type == BTRFS_FILE_EXTENT_REG ||
                    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
                        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
                        num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
                        extent_offset = btrfs_file_extent_offset(leaf, fi);
                        extent_end = key.offset +
                                btrfs_file_extent_num_bytes(leaf, fi);
                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                        extent_end = key.offset +
                                btrfs_file_extent_inline_len(leaf, fi);
                } else {
                        WARN_ON(1);
                        extent_end = search_start;
                }

                if (extent_end <= search_start) {
                        path->slots[0]++;
                        goto next_slot;
                }

                search_start = max(key.offset, start);
                if (recow) {
                        btrfs_release_path(root, path);
                        continue;
                }

                /*
                 *     | - range to drop - |
                 *  | -------- extent -------- |
                 */
                if (start > key.offset && end < extent_end) {
                        BUG_ON(del_nr > 0);
                        BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

                        memcpy(&new_key, &key, sizeof(new_key));
                        new_key.offset = start;
                        ret = btrfs_duplicate_item(trans, root, path,
                                                   &new_key);
                        if (ret == -EAGAIN) {
                                btrfs_release_path(root, path);
                                continue;
                        }
                        if (ret < 0)
                                break;

                        leaf = path->nodes[0];
                        fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
                                            struct btrfs_file_extent_item);
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        start - key.offset);

                        fi = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_file_extent_item);

                        extent_offset += start - key.offset;
                        btrfs_set_file_extent_offset(leaf, fi, extent_offset);
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        extent_end - start);
                        btrfs_mark_buffer_dirty(leaf);

                        if (disk_bytenr > 0) {
                                ret = btrfs_inc_extent_ref(trans, root,
                                                disk_bytenr, num_bytes, 0,
                                                root->root_key.objectid,
                                                new_key.objectid,
                                                start - extent_offset);
                                BUG_ON(ret);
                                *hint_byte = disk_bytenr;
                        }
                        key.offset = start;
                }
                /*
                 *  | ---- range to drop ----- |
                 *      | -------- extent -------- |
                 */
                if (start <= key.offset && end < extent_end) {
                        BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

                        memcpy(&new_key, &key, sizeof(new_key));
                        new_key.offset = end;
                        btrfs_set_item_key_safe(trans, root, path, &new_key);

                        extent_offset += end - key.offset;
                        btrfs_set_file_extent_offset(leaf, fi, extent_offset);
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        extent_end - end);
                        btrfs_mark_buffer_dirty(leaf);
                        if (disk_bytenr > 0) {
                                inode_sub_bytes(inode, end - key.offset);
                                *hint_byte = disk_bytenr;
                        }
                        break;
                }

                search_start = extent_end;
                /*
                 *       | ---- range to drop ----- |
                 *  | -------- extent -------- |
                 */
                if (start > key.offset && end >= extent_end) {
                        BUG_ON(del_nr > 0);
                        BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        start - key.offset);
                        btrfs_mark_buffer_dirty(leaf);
                        if (disk_bytenr > 0) {
                                inode_sub_bytes(inode, extent_end - start);
                                *hint_byte = disk_bytenr;
                        }
                        if (end == extent_end)
                                break;

                        path->slots[0]++;
                        goto next_slot;
                }

                /*
                 *  | ---- range to drop ----- |
                 *    | ------ extent ------ |
                 */
                if (start <= key.offset && end >= extent_end) {
                        if (del_nr == 0) {
                                del_slot = path->slots[0];
                                del_nr = 1;
                        } else {
                                BUG_ON(del_slot + del_nr != path->slots[0]);
                                del_nr++;
                        }

                        if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                                inode_sub_bytes(inode,
                                                extent_end - key.offset);
                                extent_end = ALIGN(extent_end,
                                                   root->sectorsize);
                        } else if (disk_bytenr > 0) {
                                ret = btrfs_free_extent(trans, root,
                                                disk_bytenr, num_bytes, 0,
                                                root->root_key.objectid,
                                                key.objectid, key.offset -
                                                extent_offset);
                                BUG_ON(ret);
                                inode_sub_bytes(inode,
                                                extent_end - key.offset);
                                *hint_byte = disk_bytenr;
                        }

                        if (end == extent_end)
                                break;

                        if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
                                path->slots[0]++;
                                goto next_slot;
                        }

                        ret = btrfs_del_items(trans, root, path, del_slot,
                                              del_nr);
                        BUG_ON(ret);

                        del_nr = 0;
                        del_slot = 0;
                        btrfs_release_path(root, path);
                        continue;
                }

                BUG_ON(1);
        }

        if (del_nr > 0) {
                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
                BUG_ON(ret);
        }

        btrfs_free_path(path);
        return ret;
}
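
/*
 * helper used by btrfs_mark_extent_written: returns 1 if the extent
 * item in 'slot' points at the same disk extent (same bytenr and
 * orig_offset) with no compression, encryption or other encoding,
 * meaning the newly written range can be merged with this neighbor.
 * *start and *end return the neighbor's file range.
 */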
static int extent_mergeable(struct extent_buffer *leaf, int slot,
                            u64 objectid, u64 bytenr, u64 orig_offset,
                            u64 *start, u64 *end)
{
        struct btrfs_file_extent_item *fi;
        struct btrfs_key key;
        u64 extent_end;

        if (slot < 0 || slot >= btrfs_header_nritems(leaf))
                return 0;

        btrfs_item_key_to_cpu(leaf, &key, slot);
        if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
                return 0;

        fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
        if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
            btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
            btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
            btrfs_file_extent_compression(leaf, fi) ||
            btrfs_file_extent_encryption(leaf, fi) ||
            btrfs_file_extent_other_encoding(leaf, fi))
                return 0;

        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
        if ((*start && *start != key.offset) || (*end && *end != extent_end))
                return 0;

        *start = key.offset;
        *end = extent_end;
        return 1;
}

/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'. If only
 * part of extent is marked as written, the extent will be split into
 * two or three.
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
                              struct inode *inode, u64 start, u64 end)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_buffer *leaf;
        struct btrfs_path *path;
        struct btrfs_file_extent_item *fi;
        struct btrfs_key key;
        struct btrfs_key new_key;
        u64 bytenr;
        u64 num_bytes;
        u64 extent_end;
        u64 orig_offset;
        u64 other_start;
        u64 other_end;
        u64 split;
        int del_nr = 0;
        int del_slot = 0;
        int recow;
        int ret;

        btrfs_drop_extent_cache(inode, start, end - 1, 0);

        path = btrfs_alloc_path();
        BUG_ON(!path);
again:
        recow = 0;
        split = start;
        key.objectid = inode->i_ino;
        key.type = BTRFS_EXTENT_DATA_KEY;
        key.offset = split;

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0 && path->slots[0] > 0)
                path->slots[0]--;

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        BUG_ON(key.objectid != inode->i_ino ||
               key.type != BTRFS_EXTENT_DATA_KEY);
        fi = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        BUG_ON(btrfs_file_extent_type(leaf, fi) !=
               BTRFS_FILE_EXTENT_PREALLOC);
        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
        BUG_ON(key.offset > start || extent_end < end);

        bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
        num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
        orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
        memcpy(&new_key, &key, sizeof(new_key));

        if (start == key.offset && end < extent_end) {
                other_start = 0;
                other_end = start;
                if (extent_mergeable(leaf, path->slots[0] - 1,
                                     inode->i_ino, bytenr, orig_offset,
                                     &other_start, &other_end)) {
                        new_key.offset = end;
                        btrfs_set_item_key_safe(trans, root, path, &new_key);
                        fi = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_file_extent_item);
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        extent_end - end);
                        btrfs_set_file_extent_offset(leaf, fi,
                                                     end - orig_offset);
                        fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
                                            struct btrfs_file_extent_item);
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        end - other_start);
                        btrfs_mark_buffer_dirty(leaf);
                        goto out;
                }
        }

        if (start > key.offset && end == extent_end) {
                other_start = end;
                other_end = 0;
                if (extent_mergeable(leaf, path->slots[0] + 1,
                                     inode->i_ino, bytenr, orig_offset,
                                     &other_start, &other_end)) {
                        fi = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_file_extent_item);
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        start - key.offset);
                        path->slots[0]++;
                        new_key.offset = start;
                        btrfs_set_item_key_safe(trans, root, path, &new_key);

                        fi = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_file_extent_item);
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        other_end - start);
                        btrfs_set_file_extent_offset(leaf, fi,
                                                     start - orig_offset);
                        btrfs_mark_buffer_dirty(leaf);
                        goto out;
                }
        }
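
        /*
         * no neighbor to merge with: split the prealloc extent at
         * 'split' (and at 'end' on a second pass through the loop) so
         * the written range ends up in its own extent item
         */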
        while (start > key.offset || end < extent_end) {
                if (key.offset == start)
                        split = end;

                new_key.offset = split;
                ret = btrfs_duplicate_item(trans, root, path, &new_key);
                if (ret == -EAGAIN) {
                        btrfs_release_path(root, path);
                        goto again;
                }
                BUG_ON(ret < 0);

                leaf = path->nodes[0];
                fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
                                    struct btrfs_file_extent_item);
                btrfs_set_file_extent_num_bytes(leaf, fi,
                                                split - key.offset);

                fi = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);

                btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
                btrfs_set_file_extent_num_bytes(leaf, fi,
                                                extent_end - split);
                btrfs_mark_buffer_dirty(leaf);

                ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
                                           root->root_key.objectid,
                                           inode->i_ino, orig_offset);
                BUG_ON(ret);

                if (split == start) {
                        key.offset = start;
                } else {
                        BUG_ON(start != key.offset);
                        path->slots[0]--;
                        extent_end = end;
                }
                recow = 1;
        }

        other_start = end;
        other_end = 0;
        if (extent_mergeable(leaf, path->slots[0] + 1,
                             inode->i_ino, bytenr, orig_offset,
                             &other_start, &other_end)) {
                if (recow) {
                        btrfs_release_path(root, path);
                        goto again;
                }
                extent_end = other_end;
                del_slot = path->slots[0] + 1;
                del_nr++;
                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                                        0, root->root_key.objectid,
                                        inode->i_ino, orig_offset);
                BUG_ON(ret);
        }
        other_start = 0;
        other_end = start;
        if (extent_mergeable(leaf, path->slots[0] - 1,
                             inode->i_ino, bytenr, orig_offset,
                             &other_start, &other_end)) {
                if (recow) {
                        btrfs_release_path(root, path);
                        goto again;
                }
                key.offset = other_start;
                del_slot = path->slots[0];
                del_nr++;
                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                                        0, root->root_key.objectid,
                                        inode->i_ino, orig_offset);
                BUG_ON(ret);
        }
        if (del_nr == 0) {
                fi = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
                btrfs_set_file_extent_type(leaf, fi,
                                           BTRFS_FILE_EXTENT_REG);
                btrfs_mark_buffer_dirty(leaf);
        } else {
                fi = btrfs_item_ptr(leaf, del_slot - 1,
                                    struct btrfs_file_extent_item);
                btrfs_set_file_extent_type(leaf, fi,
                                           BTRFS_FILE_EXTENT_REG);
                btrfs_set_file_extent_num_bytes(leaf, fi,
                                                extent_end - key.offset);
                btrfs_mark_buffer_dirty(leaf);

                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
                BUG_ON(ret);
        }
out:
        btrfs_free_path(path);
        return 0;
}

/*
 * this gets pages into the page cache and locks them down, it also properly
 * waits for data=ordered extents to finish before allowing the pages to be
 * modified.
 */
static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
                                  struct page **pages, size_t num_pages,
                                  loff_t pos, unsigned long first_index,
                                  unsigned long last_index, size_t write_bytes)
{
        struct extent_state *cached_state = NULL;
        int i;
        unsigned long index = pos >> PAGE_CACHE_SHIFT;
        struct inode *inode = fdentry(file)->d_inode;
        int err = 0;
        u64 start_pos;
        u64 last_pos;

        start_pos = pos & ~((u64)root->sectorsize - 1);
        last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;

        if (start_pos > inode->i_size) {
                err = btrfs_cont_expand(inode, start_pos);
                if (err)
                        return err;
        }

        memset(pages, 0, num_pages * sizeof(struct page *));
again:
        for (i = 0; i < num_pages; i++) {
                pages[i] = grab_cache_page(inode->i_mapping, index + i);
                if (!pages[i]) {
                        err = -ENOMEM;
                        BUG_ON(1);
                }
                wait_on_page_writeback(pages[i]);
        }
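
        /*
         * if an ordered extent is still running against this range,
         * drop the pages, wait for it to finish and start over.
         * Otherwise the ordered io completion could race with the
         * data we are about to write
         */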
        if (start_pos < inode->i_size) {
                struct btrfs_ordered_extent *ordered;
                lock_extent_bits(&BTRFS_I(inode)->io_tree,
                                 start_pos, last_pos - 1, 0, &cached_state,
                                 GFP_NOFS);
                ordered = btrfs_lookup_first_ordered_extent(inode,
                                                            last_pos - 1);
                if (ordered &&
                    ordered->file_offset + ordered->len > start_pos &&
                    ordered->file_offset < last_pos) {
                        btrfs_put_ordered_extent(ordered);
                        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
                                             start_pos, last_pos - 1,
                                             &cached_state, GFP_NOFS);
                        for (i = 0; i < num_pages; i++) {
                                unlock_page(pages[i]);
                                page_cache_release(pages[i]);
                        }
                        btrfs_wait_ordered_range(inode, start_pos,
                                                 last_pos - start_pos);
                        goto again;
                }
                if (ordered)
                        btrfs_put_ordered_extent(ordered);

                clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
                                 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
                                 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
                                 GFP_NOFS);
                unlock_extent_cached(&BTRFS_I(inode)->io_tree,
                                     start_pos, last_pos - 1, &cached_state,
                                     GFP_NOFS);
        }
        for (i = 0; i < num_pages; i++) {
                clear_page_dirty_for_io(pages[i]);
                set_page_extent_mapped(pages[i]);
                WARN_ON(!PageLocked(pages[i]));
        }
        return 0;
}
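
/*
 * buffered and O_DIRECT write path.  O_DIRECT writes go through
 * generic_file_direct_write first, and anything that could not be
 * written directly falls back to the buffered loop below, which
 * reserves delalloc space, prepares and copies into page cache pages
 * one batch at a time, and marks them dirty
 */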
static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
                                    const struct iovec *iov,
                                    unsigned long nr_segs, loff_t pos)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = fdentry(file)->d_inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct page *pinned[2];
        struct page **pages = NULL;
        struct iov_iter i;
        loff_t *ppos = &iocb->ki_pos;
        loff_t start_pos;
        ssize_t num_written = 0;
        ssize_t err = 0;
        size_t count;
        size_t ocount;
        int ret = 0;
        int nrptrs;
        unsigned long first_index;
        unsigned long last_index;
        int will_write;
        int buffered = 0;

        will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
                      (file->f_flags & O_DIRECT));

        pinned[0] = NULL;
        pinned[1] = NULL;

        start_pos = pos;

        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

        mutex_lock(&inode->i_mutex);

        err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
        if (err)
                goto out;
        count = ocount;

        current->backing_dev_info = inode->i_mapping->backing_dev_info;
        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
        if (err)
                goto out;

        if (count == 0)
                goto out;

        err = file_remove_suid(file);
        if (err)
                goto out;

        file_update_time(file);
        BTRFS_I(inode)->sequence++;

        if (unlikely(file->f_flags & O_DIRECT)) {
                num_written = generic_file_direct_write(iocb, iov, &nr_segs,
                                                        pos, ppos, count,
                                                        ocount);
                /*
                 * the generic O_DIRECT will update in-memory i_size after the
                 * DIOs are done.  But our endio handlers that update the on
                 * disk i_size never update past the in memory i_size.  So we
                 * need one more update here to catch any additions to the
                 * file
                 */
                if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
                        btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
                        mark_inode_dirty(inode);
                }

                if (num_written < 0) {
                        ret = num_written;
                        num_written = 0;
                        goto out;
                } else if (num_written == count) {
                        /* pick up pos changes done by the generic code */
                        pos = *ppos;
                        goto out;
                }
                /*
                 * We are going to do buffered for the rest of the range, so we
                 * need to make sure to invalidate the buffered pages when we're
                 * done.
                 */
                buffered = 1;
                pos += num_written;
        }

        iov_iter_init(&i, iov, nr_segs, count, num_written);
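
        /*
         * size the pages array to cover the whole write, but cap it
         * at one page worth of pointers so large writes are copied
         * in manageable batches
         */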
        nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
                     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
                     (sizeof(struct page *)));
        pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);

        /* generic_write_checks can change our pos */
        start_pos = pos;

        first_index = pos >> PAGE_CACHE_SHIFT;
        last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;

        /*
         * there are lots of better ways to do this, but this code
         * makes sure the first and last page in the file range are
         * up to date and ready for cow
         */
        if ((pos & (PAGE_CACHE_SIZE - 1))) {
                pinned[0] = grab_cache_page(inode->i_mapping, first_index);
                if (!PageUptodate(pinned[0])) {
                        ret = btrfs_readpage(NULL, pinned[0]);
                        BUG_ON(ret);
                        wait_on_page_locked(pinned[0]);
                } else {
                        unlock_page(pinned[0]);
                }
        }
        if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
                pinned[1] = grab_cache_page(inode->i_mapping, last_index);
                if (!PageUptodate(pinned[1])) {
                        ret = btrfs_readpage(NULL, pinned[1]);
                        BUG_ON(ret);
                        wait_on_page_locked(pinned[1]);
                } else {
                        unlock_page(pinned[1]);
                }
        }

        while (iov_iter_count(&i) > 0) {
                size_t offset = pos & (PAGE_CACHE_SIZE - 1);
                size_t write_bytes = min(iov_iter_count(&i),
                                         nrptrs * (size_t)PAGE_CACHE_SIZE -
                                         offset);
                size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
                                        PAGE_CACHE_SHIFT;

                WARN_ON(num_pages > nrptrs);
                memset(pages, 0, sizeof(struct page *) * nrptrs);

                ret = btrfs_delalloc_reserve_space(inode, write_bytes);
                if (ret)
                        goto out;

                ret = prepare_pages(root, file, pages, num_pages,
                                    pos, first_index, last_index,
                                    write_bytes);
                if (ret) {
                        btrfs_delalloc_release_space(inode, write_bytes);
                        goto out;
                }

                ret = btrfs_copy_from_user(pos, num_pages,
                                           write_bytes, pages, &i);
                if (ret == 0) {
                        dirty_and_release_pages(NULL, root, file, pages,
                                                num_pages, pos, write_bytes);
                }

                btrfs_drop_pages(pages, num_pages);
                if (ret) {
                        btrfs_delalloc_release_space(inode, write_bytes);
                        goto out;
                }

                if (will_write) {
                        filemap_fdatawrite_range(inode->i_mapping, pos,
                                                 pos + write_bytes - 1);
                } else {
                        balance_dirty_pages_ratelimited_nr(inode->i_mapping,
                                                           num_pages);
                        if (num_pages <
                            (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
                                btrfs_btree_balance_dirty(root, 1);
                        btrfs_throttle(root);
                }

                pos += write_bytes;
                num_written += write_bytes;

                cond_resched();
        }
out:
        mutex_unlock(&inode->i_mutex);
        if (ret)
                err = ret;

        kfree(pages);
        if (pinned[0])
                page_cache_release(pinned[0]);
        if (pinned[1])
                page_cache_release(pinned[1]);
        *ppos = pos;

        /*
         * we want to make sure fsync finds this change
         * but we haven't joined a transaction running right now.
         *
         * Later on, someone is sure to update the inode and get the
         * real transid recorded.
         *
         * We set last_trans now to the fs_info generation + 1,
         * this will either be one more than the running transaction
         * or the generation used for the next transaction if there isn't
         * one running right now.
         */
        BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;

        if (num_written > 0 && will_write) {
                struct btrfs_trans_handle *trans;

                err = btrfs_wait_ordered_range(inode, start_pos, num_written);
                if (err)
                        num_written = err;

                if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
                        trans = btrfs_start_transaction(root, 0);
                        if (IS_ERR(trans)) {
                                num_written = PTR_ERR(trans);
                                goto done;
                        }
                        mutex_lock(&inode->i_mutex);
                        ret = btrfs_log_dentry_safe(trans, root,
                                                    file->f_dentry);
                        mutex_unlock(&inode->i_mutex);
                        if (ret == 0) {
                                ret = btrfs_sync_log(trans, root);
                                if (ret == 0)
                                        btrfs_end_transaction(trans, root);
                                else
                                        btrfs_commit_transaction(trans, root);
                        } else if (ret != BTRFS_NO_LOG_SYNC) {
                                btrfs_commit_transaction(trans, root);
                        } else {
                                btrfs_end_transaction(trans, root);
                        }
                }
                if (file->f_flags & O_DIRECT && buffered) {
                        invalidate_mapping_pages(inode->i_mapping,
                              start_pos >> PAGE_CACHE_SHIFT,
                              (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
                }
        }
done:
        current->backing_dev_info = NULL;
        return num_written ? num_written : err;
}

int btrfs_release_file(struct inode *inode, struct file *filp)
{
        /*
         * ordered_data_close is set by setattr when we are about to truncate
         * a file from a non-zero size to a zero size.  This tries to
         * flush down new bytes that may have been written if the
         * application were using truncate to replace a file in place.
         */
        if (BTRFS_I(inode)->ordered_data_close) {
                BTRFS_I(inode)->ordered_data_close = 0;
                btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
                if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
                        filemap_flush(inode->i_mapping);
        }
        if (filp->private_data)
                btrfs_ioctl_trans_end(filp);
        return 0;
}

/*
 * fsync call for both files and directories.  This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all ordered extent updates
 * in the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit.  This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
int btrfs_sync_file(struct file *file, int datasync)
{
        struct dentry *dentry = file->f_path.dentry;
        struct inode *inode = dentry->d_inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret = 0;
        struct btrfs_trans_handle *trans;

        /* we wait first, since the writeback may change the inode */
        root->log_batch++;
        /* the VFS called filemap_fdatawrite for us */
        btrfs_wait_ordered_range(inode, 0, (u64)-1);
        root->log_batch++;

        /*
         * check the transaction that last modified this inode
         * and see if it's already been committed
         */
        if (!BTRFS_I(inode)->last_trans)
                goto out;

        /*
         * if the last transaction that changed this file was before
         * the current transaction, we can bail out now without any
         * syncing
         */
        mutex_lock(&root->fs_info->trans_mutex);
        if (BTRFS_I(inode)->last_trans <=
            root->fs_info->last_trans_committed) {
                BTRFS_I(inode)->last_trans = 0;
                mutex_unlock(&root->fs_info->trans_mutex);
                goto out;
        }
        mutex_unlock(&root->fs_info->trans_mutex);

        /*
         * ok we haven't committed the transaction yet, let's do a commit
         */
        if (file->private_data)
                btrfs_ioctl_trans_end(file);

        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto out;
        }

        ret = btrfs_log_dentry_safe(trans, root, dentry);
        if (ret < 0)
                goto out;

        /* we've logged all the items and now have a consistent
         * version of the file in the log.  It is possible that
         * someone will come in and modify the file, but that's
         * fine because the log is consistent on disk, and we
         * have references to all of the file's extents
         *
         * It is possible that someone will come in and log the
         * file again, but that will end up using the synchronization
         * inside btrfs_sync_log to keep things safe.
         */
        mutex_unlock(&dentry->d_inode->i_mutex);

        if (ret != BTRFS_NO_LOG_SYNC) {
                if (ret > 0) {
                        ret = btrfs_commit_transaction(trans, root);
                } else {
                        ret = btrfs_sync_log(trans, root);
                        if (ret == 0)
                                ret = btrfs_end_transaction(trans, root);
                        else
                                ret = btrfs_commit_transaction(trans, root);
                }
        } else {
                ret = btrfs_end_transaction(trans, root);
        }
        mutex_lock(&dentry->d_inode->i_mutex);
out:
        return ret > 0 ? -EIO : ret;
}

static const struct vm_operations_struct btrfs_file_vm_ops = {
        .fault          = filemap_fault,
        .page_mkwrite   = btrfs_page_mkwrite,
};
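
/*
 * btrfs provides its own mmap so that writable faults go through
 * btrfs_page_mkwrite, which sets up delalloc accounting for the
 * faulted page before it can be dirtied through the mapping
 */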
static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
        struct address_space *mapping = filp->f_mapping;

        if (!mapping->a_ops->readpage)
                return -ENOEXEC;

        file_accessed(filp);
        vma->vm_ops = &btrfs_file_vm_ops;
        vma->vm_flags |= VM_CAN_NONLINEAR;

        return 0;
}

const struct file_operations btrfs_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
        .aio_read       = generic_file_aio_read,
        .splice_read    = generic_file_splice_read,
        .aio_write      = btrfs_file_aio_write,
        .mmap           = btrfs_file_mmap,
        .open           = generic_file_open,
        .release        = btrfs_release_file,
        .fsync          = btrfs_sync_file,
        .unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = btrfs_ioctl,
#endif
};