/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/smp_lock.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
#include "compat.h"

/*
 * simple helper to fault in pages and copy.  This should go away
 * and be replaced with calls into generic code.
 */
static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
					 int write_bytes,
					 struct page **prepared_pages,
					 const char __user *buf)
{
	long page_fault = 0;
	int i;
	int offset = pos & (PAGE_CACHE_SIZE - 1);

	for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
		size_t count = min_t(size_t,
				     PAGE_CACHE_SIZE - offset, write_bytes);
		struct page *page = prepared_pages[i];

		fault_in_pages_readable(buf, count);

		/* Copy data from userspace to the current page */
		kmap(page);
		page_fault = __copy_from_user(page_address(page) + offset,
					      buf, count);
		/* Flush processor's dcache for this page */
		flush_dcache_page(page);
		kunmap(page);
		buf += count;
		write_bytes -= count;
		if (page_fault)
			break;
	}
	return page_fault ? -EFAULT : 0;
}
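
/*
 * Illustration (editor's sketch, assuming 4k pages): writing 6000 bytes
 * at pos 5000 starts at offset 5000 & 4095 = 904 inside the first
 * prepared page, so the loop copies 4096 - 904 = 3192 bytes on the first
 * pass and the remaining 2808 bytes at offset 0 on the second.
 */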
/*
 * unlocks pages after btrfs_file_write is done with them
 */
static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
{
	size_t i;

	for (i = 0; i < num_pages; i++) {
		if (!pages[i])
			break;
		/* page checked is some magic around finding pages that
		 * have been modified without going through
		 * btrfs_set_page_dirty; clear it here
		 */
		ClearPageChecked(pages[i]);
		unlock_page(pages[i]);
		mark_page_accessed(pages[i]);
		page_cache_release(pages[i]);
	}
}
/*
 * after copy_from_user, pages need to be dirtied and we need to make
 * sure holes are created between the current EOF and the start of
 * any next extents (if required).
 *
 * this also makes the decision about creating an inline extent vs
 * doing real data extents, marking pages dirty and delalloc as required.
 */
static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct file *file,
					    struct page **pages,
					    size_t num_pages,
					    loff_t pos,
					    size_t write_bytes)
{
	int err = 0;
	int i;
	struct inode *inode = fdentry(file)->d_inode;
	u64 num_bytes;
	u64 start_pos;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(inode);
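
	/*
	 * Round the dirty range out to sector boundaries.  Illustrative
	 * numbers, assuming a 4k sectorsize: pos 5000 and write_bytes 3000
	 * give start_pos 4096, num_bytes 4096 and end_of_last_block 8191.
	 */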
	start_pos = pos & ~((u64)root->sectorsize - 1);
	num_bytes = (write_bytes + pos - start_pos +
		     root->sectorsize - 1) & ~((u64)root->sectorsize - 1);

	end_of_last_block = start_pos + num_bytes - 1;
	btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
	for (i = 0; i < num_pages; i++) {
		struct page *p = pages[i];
		SetPageUptodate(p);
		ClearPageChecked(p);
		set_page_dirty(p);
	}
	if (end_pos > isize) {
		i_size_write(inode, end_pos);
		/* we've only changed i_size in ram, and we haven't updated
		 * the disk i_size.  There is no need to log the inode
		 * at this time.
		 */
	}
	return err;
}
/*
 * this drops all the extents in the cache that intersect the range
 * [start, end].  Existing extents are split as required.
 */
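/*
 * Illustration: dropping [4k, 8k) from a cached mapping covering
 * [0, 12k) re-adds two mappings, [0, 4k) built in 'split' and
 * [8k, 12k) built in 'split2'.  (Offsets are illustrative only.)
 */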
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
			    int skip_pinned)
{
	struct extent_map *em;
	struct extent_map *split = NULL;
	struct extent_map *split2 = NULL;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	u64 len = end - start + 1;
	int ret;
	int testend = 1;
	unsigned long flags;
	int compressed = 0;

	WARN_ON(end < start);
	if (end == (u64)-1) {
		len = (u64)-1;
		testend = 0;
	}
	while (1) {
		if (!split)
			split = alloc_extent_map(GFP_NOFS);
		if (!split2)
			split2 = alloc_extent_map(GFP_NOFS);

		write_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, len);
		if (!em) {
			write_unlock(&em_tree->lock);
			break;
		}
		flags = em->flags;
		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
			write_unlock(&em_tree->lock);
			if (em->start <= start &&
			    (!testend || em->start + em->len >= start + len)) {
				free_extent_map(em);
				break;
			}
			if (start < em->start) {
				len = em->start - start;
			} else {
				len = start + len - (em->start + em->len);
				start = em->start + em->len;
			}
			free_extent_map(em);
			continue;
		}
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		remove_extent_mapping(em_tree, em);

		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    em->start < start) {
			split->start = em->start;
			split->len = start - em->start;
			split->orig_start = em->orig_start;
			split->block_start = em->block_start;

			if (compressed)
				split->block_len = em->block_len;
			else
				split->block_len = split->len;

			split->bdev = em->bdev;
			split->flags = flags;
			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret);
			free_extent_map(split);
			split = split2;
			split2 = NULL;
		}
		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    testend && em->start + em->len > start + len) {
			u64 diff = start + len - em->start;

			split->start = start + len;
			split->len = em->start + em->len - (start + len);
			split->bdev = em->bdev;
			split->flags = flags;

			if (compressed) {
				split->block_len = em->block_len;
				split->block_start = em->block_start;
				split->orig_start = em->orig_start;
			} else {
				split->block_len = split->len;
				split->block_start = em->block_start + diff;
				split->orig_start = split->start;
			}

			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret);
			free_extent_map(split);
			split = NULL;
		}
		write_unlock(&em_tree->lock);

		/* once for us */
		free_extent_map(em);
		/* once for the tree */
		free_extent_map(em);
	}
	if (split)
		free_extent_map(split);
	if (split2)
		free_extent_map(split2);
	return 0;
}
/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end.  hint_byte is filled in with a block number
 * that would be a good hint to the block allocator for this file.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split.  Anything entirely inside the range
 * is deleted from the tree.
 *
 * inline_limit is used to tell this code which offsets in the file to keep
 * if they contain inline extents.
 */
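/*
 * Illustration: dropping [4k, 8k) from a regular extent item covering
 * [0, 12k) truncates the item to [0, 4k) and inserts a "bookend" item
 * for [8k, 12k) that points into the same disk extent at a larger
 * extent offset.  (Offsets are illustrative only.)
 */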
noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, struct inode *inode,
				u64 start, u64 end, u64 locked_end,
				u64 inline_limit, u64 *hint_byte)
{
	u64 extent_end = 0;
	u64 search_start = start;
	u64 ram_bytes = 0;
	u64 disk_bytenr = 0;
	u64 orig_locked_end = locked_end;
	u8 compression;
	u8 encryption;
	u16 other_encoding = 0;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *extent;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_file_extent_item old;
	int keep;
	int slot;
	int bookend;
	int found_type = 0;
	int found_extent;
	int found_inline;
	int recow;
	int ret;

	inline_limit = 0;
	btrfs_drop_extent_cache(inode, start, end - 1, 0);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	while (1) {
		recow = 0;
		btrfs_release_path(root, path);
		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
					       search_start, -1);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			if (path->slots[0] == 0) {
				ret = 0;
				goto out;
			}
			path->slots[0]--;
		}
next_slot:
		keep = 0;
		bookend = 0;
		found_extent = 0;
		found_inline = 0;
		compression = 0;
		encryption = 0;
		extent = NULL;
		leaf = path->nodes[0];
		slot = path->slots[0];
		ret = 0;
		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
		    key.offset >= end) {
			goto out;
		}
		if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
		    key.objectid != inode->i_ino) {
			goto out;
		}
		if (recow) {
			search_start = max(key.offset, start);
			continue;
		}
		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
			extent = btrfs_item_ptr(leaf, slot,
						struct btrfs_file_extent_item);
			found_type = btrfs_file_extent_type(leaf, extent);
			compression = btrfs_file_extent_compression(leaf,
								    extent);
			encryption = btrfs_file_extent_encryption(leaf,
								  extent);
			other_encoding = btrfs_file_extent_other_encoding(leaf,
								      extent);
			if (found_type == BTRFS_FILE_EXTENT_REG ||
			    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
				extent_end =
				     btrfs_file_extent_disk_bytenr(leaf,
								   extent);
				if (extent_end)
					*hint_byte = extent_end;

				extent_end = key.offset +
				     btrfs_file_extent_num_bytes(leaf, extent);
				ram_bytes = btrfs_file_extent_ram_bytes(leaf,
									extent);
				found_extent = 1;
			} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
				found_inline = 1;
				extent_end = key.offset +
				     btrfs_file_extent_inline_len(leaf, extent);
			}
		} else {
			extent_end = search_start;
		}

		/* we found nothing we can drop */
		if ((!found_extent && !found_inline) ||
		    search_start >= extent_end) {
			int nextret;
			u32 nritems;
			nritems = btrfs_header_nritems(leaf);
			if (slot >= nritems - 1) {
				nextret = btrfs_next_leaf(root, path);
				if (nextret)
					goto out;
				recow = 1;
			} else {
				path->slots[0]++;
			}
			goto next_slot;
		}

		if (end <= extent_end && start >= key.offset && found_inline)
			*hint_byte = EXTENT_MAP_INLINE;

		if (found_extent) {
			read_extent_buffer(leaf, &old, (unsigned long)extent,
					   sizeof(old));
		}

		if (end < extent_end && end >= key.offset) {
			bookend = 1;
			if (found_inline && start <= key.offset)
				keep = 1;
		}

		if (bookend && found_extent) {
			if (locked_end < extent_end) {
				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
						locked_end, extent_end - 1,
						GFP_NOFS);
				if (!ret) {
					btrfs_release_path(root, path);
					lock_extent(&BTRFS_I(inode)->io_tree,
						locked_end, extent_end - 1,
						GFP_NOFS);
					locked_end = extent_end;
					continue;
				}
				locked_end = extent_end;
			}
			disk_bytenr = le64_to_cpu(old.disk_bytenr);
			if (disk_bytenr != 0) {
				ret = btrfs_inc_extent_ref(trans, root,
					   disk_bytenr,
					   le64_to_cpu(old.disk_num_bytes), 0,
					   root->root_key.objectid,
					   key.objectid, key.offset -
					   le64_to_cpu(old.offset));
				BUG_ON(ret);
			}
		}

		if (found_inline) {
			u64 mask = root->sectorsize - 1;
			search_start = (extent_end + mask) & ~mask;
		} else
			search_start = extent_end;

		/* truncate existing extent */
		if (start > key.offset) {
			u64 new_num;
			u64 old_num;
			keep = 1;
			WARN_ON(start & (root->sectorsize - 1));
			if (found_extent) {
				new_num = start - key.offset;
				old_num = btrfs_file_extent_num_bytes(leaf,
								      extent);
				*hint_byte =
					btrfs_file_extent_disk_bytenr(leaf,
								      extent);
				if (btrfs_file_extent_disk_bytenr(leaf,
								  extent)) {
					inode_sub_bytes(inode, old_num -
							new_num);
				}
				btrfs_set_file_extent_num_bytes(leaf,
							extent, new_num);
				btrfs_mark_buffer_dirty(leaf);
			} else if (key.offset < inline_limit &&
				   (end > extent_end) &&
				   (inline_limit < extent_end)) {
				u32 new_size;
				new_size = btrfs_file_extent_calc_inline_size(
						   inline_limit - key.offset);
				inode_sub_bytes(inode, extent_end -
						inline_limit);
				btrfs_set_file_extent_ram_bytes(leaf, extent,
								new_size);
				if (!compression && !encryption) {
					btrfs_truncate_item(trans, root, path,
							    new_size, 1);
				}
			}
		}
		/* delete the entire extent */
		if (!keep) {
			if (found_inline)
				inode_sub_bytes(inode, extent_end -
						key.offset);
			ret = btrfs_del_item(trans, root, path);
			/* TODO update progress marker and return */
			BUG_ON(ret);
			extent = NULL;
			btrfs_release_path(root, path);
			/* the extent will be freed later */
		}
		if (bookend && found_inline && start <= key.offset) {
			u32 new_size;
			new_size = btrfs_file_extent_calc_inline_size(
						   extent_end - end);
			inode_sub_bytes(inode, end - key.offset);
			btrfs_set_file_extent_ram_bytes(leaf, extent,
							new_size);
			if (!compression && !encryption)
				ret = btrfs_truncate_item(trans, root, path,
							  new_size, 0);
			BUG_ON(ret);
		}
		/* create bookend, splitting the extent in two */
		if (bookend && found_extent) {
			struct btrfs_key ins;
			ins.objectid = inode->i_ino;
			ins.offset = end;
			btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);

			btrfs_release_path(root, path);
			path->leave_spinning = 1;
			ret = btrfs_insert_empty_item(trans, root, path, &ins,
						      sizeof(*extent));
			BUG_ON(ret);

			leaf = path->nodes[0];
			extent = btrfs_item_ptr(leaf, path->slots[0],
						struct btrfs_file_extent_item);
			write_extent_buffer(leaf, &old,
					    (unsigned long)extent, sizeof(old));

			btrfs_set_file_extent_compression(leaf, extent,
							  compression);
			btrfs_set_file_extent_encryption(leaf, extent,
							 encryption);
			btrfs_set_file_extent_other_encoding(leaf, extent,
							     other_encoding);
			btrfs_set_file_extent_offset(leaf, extent,
				    le64_to_cpu(old.offset) + end - key.offset);
			WARN_ON(le64_to_cpu(old.num_bytes) <
				(extent_end - end));
			btrfs_set_file_extent_num_bytes(leaf, extent,
							extent_end - end);
			/*
			 * set the ram bytes to the size of the full extent
			 * before splitting.  This is a worst case flag,
			 * but it's the best we can do because we don't know
			 * how splitting affects compression
			 */
			btrfs_set_file_extent_ram_bytes(leaf, extent,
							ram_bytes);
			btrfs_set_file_extent_type(leaf, extent, found_type);

			btrfs_unlock_up_safe(path, 1);
			btrfs_mark_buffer_dirty(path->nodes[0]);
			btrfs_set_lock_blocking(path->nodes[0]);

			path->leave_spinning = 0;
			btrfs_release_path(root, path);
			if (disk_bytenr != 0)
				inode_add_bytes(inode, extent_end - end);
		}

		if (found_extent && !keep) {
			u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr);

			if (old_disk_bytenr != 0) {
				inode_sub_bytes(inode,
						le64_to_cpu(old.num_bytes));
				ret = btrfs_free_extent(trans, root,
						old_disk_bytenr,
						le64_to_cpu(old.disk_num_bytes),
						0, root->root_key.objectid,
						key.objectid, key.offset -
						le64_to_cpu(old.offset));
				BUG_ON(ret);
				*hint_byte = old_disk_bytenr;
			}
		}

		if (search_start >= end) {
			ret = 0;
			goto out;
		}
	}
out:
	btrfs_free_path(path);
	if (locked_end > orig_locked_end) {
		unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end,
			      locked_end - 1, GFP_NOFS);
	}
	return ret;
}
static int extent_mergeable(struct extent_buffer *leaf, int slot,
			    u64 objectid, u64 bytenr, u64 *start, u64 *end)
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 extent_end;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return 0;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
		return 0;

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
	    btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		return 0;

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if ((*start && *start != key.offset) || (*end && *end != extent_end))
		return 0;

	*start = key.offset;
	*end = extent_end;
	return 1;
}
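
/*
 * Illustration: called with *start = 8k and *end = 0, a plain REG item
 * at [8k, 12k) backed by the same 'bytenr' matches and returns its
 * bounds in *start and *end; compressed, encrypted or mismatched items
 * return 0.  (Offsets are illustrative only.)
 */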
/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'.  If only
 * part of the extent is marked as written, the extent will be split into
 * two or three.
 */
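/*
 * Illustration: marking [4k, 8k) written inside a preallocated extent
 * [0, 12k) leaves [0, 4k) prealloc, [4k, 8k) regular and [8k, 12k)
 * prealloc, all sharing the original disk extent.  (Offsets are
 * illustrative only.)
 */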
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct inode *inode, u64 start, u64 end)
{
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 bytenr;
	u64 num_bytes;
	u64 extent_end;
	u64 orig_offset;
	u64 other_start;
	u64 other_end;
	u64 split = start;
	u64 locked_end = end;
	int extent_type;
	int split_end = 1;
	int ret;

	btrfs_drop_extent_cache(inode, start, end - 1, 0);

	path = btrfs_alloc_path();
	BUG_ON(!path);
again:
	key.objectid = inode->i_ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	if (split == start)
		key.offset = split;
	else
		key.offset = split - 1;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	BUG_ON(key.objectid != inode->i_ino ||
	       key.type != BTRFS_EXTENT_DATA_KEY);
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	extent_type = btrfs_file_extent_type(leaf, fi);
	BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC);
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	BUG_ON(key.offset > start || extent_end < end);

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
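
	/*
	 * orig_offset is the file position this disk extent would map to
	 * at extent offset zero; each piece created by the splits below
	 * keeps its extent offset as (piece start - orig_offset).
	 */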
	if (key.offset == start)
		split = end;

	if (key.offset == start && extent_end == end) {
		int del_nr = 0;
		int del_slot = 0;
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
				     bytenr, &other_start, &other_end)) {
			extent_end = other_end;
			del_slot = path->slots[0] + 1;
			del_nr++;
			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
						0, root->root_key.objectid,
						inode->i_ino, orig_offset);
			BUG_ON(ret);
		}
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
				     bytenr, &other_start, &other_end)) {
			key.offset = other_start;
			del_slot = path->slots[0];
			del_nr++;
			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
						0, root->root_key.objectid,
						inode->i_ino, orig_offset);
			BUG_ON(ret);
		}
		split_end = 0;
		if (del_nr == 0) {
			btrfs_set_file_extent_type(leaf, fi,
						   BTRFS_FILE_EXTENT_REG);
			goto done;
		}

		fi = btrfs_item_ptr(leaf, del_slot - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - key.offset);
		btrfs_mark_buffer_dirty(leaf);

		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		BUG_ON(ret);
		goto release;
	} else if (split == start) {
		if (locked_end < extent_end) {
			ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
					locked_end, extent_end - 1, GFP_NOFS);
			if (!ret) {
				btrfs_release_path(root, path);
				lock_extent(&BTRFS_I(inode)->io_tree,
					locked_end, extent_end - 1, GFP_NOFS);
				locked_end = extent_end;
				goto again;
			}
			locked_end = extent_end;
		}
		btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
	} else {
		BUG_ON(key.offset != start);
		key.offset = split;
		btrfs_set_file_extent_offset(leaf, fi, key.offset -
					     orig_offset);
		btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
		btrfs_set_item_key_safe(trans, root, path, &key);
		extent_end = split;
	}

	if (extent_end == end) {
		split_end = 0;
		extent_type = BTRFS_FILE_EXTENT_REG;
	}
	if (extent_end == end && split == start) {
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
				     bytenr, &other_start, &other_end)) {
			path->slots[0]++;
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			key.offset = split;
			btrfs_set_item_key_safe(trans, root, path, &key);
			btrfs_set_file_extent_offset(leaf, fi, key.offset -
						     orig_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							other_end - split);
			goto done;
		}
	}
	if (extent_end == end && split == end) {
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
				     bytenr, &other_start, &other_end)) {
			path->slots[0]--;
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
							other_start);
			goto done;
		}
	}

	btrfs_mark_buffer_dirty(leaf);

	ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
				   root->root_key.objectid,
				   inode->i_ino, orig_offset);
	BUG_ON(ret);
	btrfs_release_path(root, path);

	key.offset = start;
	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
	BUG_ON(ret);

	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
	btrfs_set_file_extent_type(leaf, fi, extent_type);
	btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
	btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
	btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset);
	btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
	btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
	btrfs_set_file_extent_compression(leaf, fi, 0);
	btrfs_set_file_extent_encryption(leaf, fi, 0);
	btrfs_set_file_extent_other_encoding(leaf, fi, 0);
done:
	btrfs_mark_buffer_dirty(leaf);
release:
	btrfs_release_path(root, path);
	if (split_end && split == start) {
		split = end;
		goto again;
	}
	if (locked_end > end) {
		unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
			      GFP_NOFS);
	}
	btrfs_free_path(path);
	return 0;
}
/*
 * this gets pages into the page cache and locks them down, it also properly
 * waits for data=ordered extents to finish before allowing the pages to be
 * modified.
 */
static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
				  struct page **pages, size_t num_pages,
				  loff_t pos, unsigned long first_index,
				  unsigned long last_index, size_t write_bytes)
{
	int i;
	unsigned long index = pos >> PAGE_CACHE_SHIFT;
	struct inode *inode = fdentry(file)->d_inode;
	int err = 0;
	u64 start_pos;
	u64 last_pos;

	start_pos = pos & ~((u64)root->sectorsize - 1);
	last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;

	if (start_pos > inode->i_size) {
		err = btrfs_cont_expand(inode, start_pos);
		if (err)
			return err;
	}

	memset(pages, 0, num_pages * sizeof(struct page *));
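
	/*
	 * If an ordered extent still overlaps the range, the code below
	 * unlocks every page, waits for the ordered extent to finish and
	 * retries from 'again', so the pages are never modified underneath
	 * a running ordered write.
	 */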
again:
	for (i = 0; i < num_pages; i++) {
		pages[i] = grab_cache_page(inode->i_mapping, index + i);
		if (!pages[i]) {
			err = -ENOMEM;
			BUG_ON(1);
		}
		wait_on_page_writeback(pages[i]);
	}
	if (start_pos < inode->i_size) {
		struct btrfs_ordered_extent *ordered;
		lock_extent(&BTRFS_I(inode)->io_tree,
			    start_pos, last_pos - 1, GFP_NOFS);
		ordered = btrfs_lookup_first_ordered_extent(inode,
							    last_pos - 1);
		if (ordered &&
		    ordered->file_offset + ordered->len > start_pos &&
		    ordered->file_offset < last_pos) {
			btrfs_put_ordered_extent(ordered);
			unlock_extent(&BTRFS_I(inode)->io_tree,
				      start_pos, last_pos - 1, GFP_NOFS);
			for (i = 0; i < num_pages; i++) {
				unlock_page(pages[i]);
				page_cache_release(pages[i]);
			}
			btrfs_wait_ordered_range(inode, start_pos,
						 last_pos - start_pos);
			goto again;
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);

		clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
				  GFP_NOFS);
		unlock_extent(&BTRFS_I(inode)->io_tree,
			      start_pos, last_pos - 1, GFP_NOFS);
	}
	for (i = 0; i < num_pages; i++) {
		clear_page_dirty_for_io(pages[i]);
		set_page_extent_mapped(pages[i]);
		WARN_ON(!PageLocked(pages[i]));
	}
	return 0;
}
static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
				size_t count, loff_t *ppos)
{
	loff_t pos;
	loff_t start_pos;
	ssize_t num_written = 0;
	ssize_t err = 0;
	int ret = 0;
	struct inode *inode = fdentry(file)->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct page **pages = NULL;
	int nrptrs;
	struct page *pinned[2];
	unsigned long first_index;
	unsigned long last_index;
	int will_write;

	will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
		      (file->f_flags & O_DIRECT));

	nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
		     PAGE_CACHE_SIZE / (sizeof(struct page *)));
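
	/*
	 * nrptrs caps how many pages a single pass of the copy loop can
	 * handle: one page worth of page pointers.  Illustratively, with
	 * 4k pages and 8-byte pointers that is 512 pages, i.e. up to 2MB
	 * per trip around the loop.
	 */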
	pinned[0] = NULL;
	pinned[1] = NULL;

	pos = *ppos;
	start_pos = pos;

	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
	current->backing_dev_info = inode->i_mapping->backing_dev_info;
	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
	if (err)
		goto out_nolock;
	if (count == 0)
		goto out_nolock;

	err = file_remove_suid(file);
	if (err)
		goto out_nolock;
	file_update_time(file);

	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);

	mutex_lock(&inode->i_mutex);
	BTRFS_I(inode)->sequence++;
	first_index = pos >> PAGE_CACHE_SHIFT;
	last_index = (pos + count) >> PAGE_CACHE_SHIFT;

	/*
	 * there are lots of better ways to do this, but this code
	 * makes sure the first and last page in the file range are
	 * up to date and ready for cow
	 */
	if ((pos & (PAGE_CACHE_SIZE - 1))) {
		pinned[0] = grab_cache_page(inode->i_mapping, first_index);
		if (!PageUptodate(pinned[0])) {
			ret = btrfs_readpage(NULL, pinned[0]);
			BUG_ON(ret);
			wait_on_page_locked(pinned[0]);
		} else {
			unlock_page(pinned[0]);
		}
	}
	if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
		pinned[1] = grab_cache_page(inode->i_mapping, last_index);
		if (!PageUptodate(pinned[1])) {
			ret = btrfs_readpage(NULL, pinned[1]);
			BUG_ON(ret);
			wait_on_page_locked(pinned[1]);
		} else {
			unlock_page(pinned[1]);
		}
	}

	while (count > 0) {
		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
		size_t write_bytes = min(count, nrptrs *
					 (size_t)PAGE_CACHE_SIZE -
					 offset);
		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
					PAGE_CACHE_SHIFT;

		WARN_ON(num_pages > nrptrs);
		memset(pages, 0, sizeof(struct page *) * nrptrs);

		ret = btrfs_check_data_free_space(root, inode, write_bytes);
		if (ret)
			goto out;

		ret = prepare_pages(root, file, pages, num_pages,
				    pos, first_index, last_index,
				    write_bytes);
		if (ret) {
			btrfs_free_reserved_data_space(root, inode,
						       write_bytes);
			goto out;
		}

		ret = btrfs_copy_from_user(pos, num_pages,
					   write_bytes, pages, buf);
		if (ret) {
			btrfs_free_reserved_data_space(root, inode,
						       write_bytes);
			btrfs_drop_pages(pages, num_pages);
			goto out;
		}

		ret = dirty_and_release_pages(NULL, root, file, pages,
					      num_pages, pos, write_bytes);
		btrfs_drop_pages(pages, num_pages);
		if (ret) {
			btrfs_free_reserved_data_space(root, inode,
						       write_bytes);
			goto out;
		}

		if (will_write) {
			btrfs_fdatawrite_range(inode->i_mapping, pos,
					       pos + write_bytes - 1,
					       WB_SYNC_ALL);
		} else {
			balance_dirty_pages_ratelimited_nr(inode->i_mapping,
							   num_pages);
			if (num_pages <
			    (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
				btrfs_btree_balance_dirty(root, 1);
			btrfs_throttle(root);
		}

		buf += write_bytes;
		count -= write_bytes;
		pos += write_bytes;
		num_written += write_bytes;

		cond_resched();
	}
out:
	mutex_unlock(&inode->i_mutex);
	if (ret)
		err = ret;

out_nolock:
	kfree(pages);
	if (pinned[0])
		page_cache_release(pinned[0]);
	if (pinned[1])
		page_cache_release(pinned[1]);
	*ppos = pos;

	/*
	 * we want to make sure fsync finds this change
	 * but we haven't joined a transaction running right now.
	 *
	 * Later on, someone is sure to update the inode and get the
	 * real transid recorded.
	 *
	 * We set last_trans now to the fs_info generation + 1,
	 * this will either be one more than the running transaction
	 * or the generation used for the next transaction if there isn't
	 * one running right now.
	 */
	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;

	if (num_written > 0 && will_write) {
		struct btrfs_trans_handle *trans;

		err = btrfs_wait_ordered_range(inode, start_pos, num_written);
		if (err)
			num_written = err;

		if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
			trans = btrfs_start_transaction(root, 1);
			ret = btrfs_log_dentry_safe(trans, root,
						    file->f_dentry);
			if (ret == 0) {
				ret = btrfs_sync_log(trans, root);
				if (ret == 0)
					btrfs_end_transaction(trans, root);
				else
					btrfs_commit_transaction(trans, root);
			} else {
				btrfs_commit_transaction(trans, root);
			}
		}
		if (file->f_flags & O_DIRECT) {
			invalidate_mapping_pages(inode->i_mapping,
			      start_pos >> PAGE_CACHE_SHIFT,
			     (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
		}
	}
	current->backing_dev_info = NULL;
	return num_written ? num_written : err;
}
int btrfs_release_file(struct inode *inode, struct file *filp)
{
	/*
	 * ordered_data_close is set by setattr when we are about to truncate
	 * a file from a non-zero size to a zero size.  This tries to
	 * flush down new bytes that may have been written if the
	 * application were using truncate to replace a file in place.
	 */
	if (BTRFS_I(inode)->ordered_data_close) {
		BTRFS_I(inode)->ordered_data_close = 0;
		btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
		if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
			filemap_flush(inode->i_mapping);
	}
	if (filp->private_data)
		btrfs_ioctl_trans_end(filp);
	return 0;
}
/*
 * fsync call for both files and directories.  This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all ordered extent updates
 * in the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit.  This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
{
	struct inode *inode = dentry->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	struct btrfs_trans_handle *trans;

	/*
	 * check the transaction that last modified this inode
	 * and see if it's already been committed
	 */
	if (!BTRFS_I(inode)->last_trans)
		goto out;

	mutex_lock(&root->fs_info->trans_mutex);
	if (BTRFS_I(inode)->last_trans <=
	    root->fs_info->last_trans_committed) {
		BTRFS_I(inode)->last_trans = 0;
		mutex_unlock(&root->fs_info->trans_mutex);
		goto out;
	}
	mutex_unlock(&root->fs_info->trans_mutex);

	root->log_batch++;
	filemap_fdatawrite(inode->i_mapping);
	btrfs_wait_ordered_range(inode, 0, (u64)-1);
	root->log_batch++;

	if (datasync && !(inode->i_state & I_DIRTY_PAGES))
		goto out;
	/*
	 * ok we haven't committed the transaction yet, let's do a commit
	 */
	if (file && file->private_data)
		btrfs_ioctl_trans_end(file);

	trans = btrfs_start_transaction(root, 1);
	if (!trans) {
		ret = -ENOMEM;
		goto out;
	}

	ret = btrfs_log_dentry_safe(trans, root, dentry);
	if (ret < 0)
		goto out;

	/* we've logged all the items and now have a consistent
	 * version of the file in the log.  It is possible that
	 * someone will come in and modify the file, but that's
	 * fine because the log is consistent on disk, and we
	 * have references to all of the file's extents
	 *
	 * It is possible that someone will come in and log the
	 * file again, but that will end up using the synchronization
	 * inside btrfs_sync_log to keep things safe.
	 */
	mutex_unlock(&dentry->d_inode->i_mutex);

	if (ret > 0) {
		ret = btrfs_commit_transaction(trans, root);
	} else {
		ret = btrfs_sync_log(trans, root);
		if (ret == 0)
			ret = btrfs_end_transaction(trans, root);
		else
			ret = btrfs_commit_transaction(trans, root);
	}
	mutex_lock(&dentry->d_inode->i_mutex);
out:
	return ret > 0 ? -EIO : ret;
}
static struct vm_operations_struct btrfs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= btrfs_page_mkwrite,
};

static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
	vma->vm_ops = &btrfs_file_vm_ops;
	file_accessed(filp);
	return 0;
}
struct file_operations btrfs_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.aio_read	= generic_file_aio_read,
	.splice_read	= generic_file_splice_read,
	.write		= btrfs_file_write,
	.mmap		= btrfs_file_mmap,
	.open		= generic_file_open,
	.release	= btrfs_release_file,
	.fsync		= btrfs_sync_file,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_ioctl,
#endif
};