/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/slab.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
#include "compat.h"

/* simple helper to fault in pages and copy.  This should go away
 * and be replaced with calls into generic code.
 */
static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
                                         int write_bytes,
                                         struct page **prepared_pages,
                                         struct iov_iter *i)
{
        size_t copied;
        int pg = 0;
        int offset = pos & (PAGE_CACHE_SIZE - 1);

        while (write_bytes > 0) {
                size_t count = min_t(size_t,
                                     PAGE_CACHE_SIZE - offset, write_bytes);
                struct page *page = prepared_pages[pg];
again:
                if (unlikely(iov_iter_fault_in_readable(i, count)))
                        return -EFAULT;

                /* Copy data from userspace to the current page */
                copied = iov_iter_copy_from_user(page, i, offset, count);

                /* Flush processor's dcache for this page */
                flush_dcache_page(page);
                iov_iter_advance(i, copied);
                write_bytes -= copied;

                if (unlikely(copied == 0)) {
                        count = min_t(size_t, PAGE_CACHE_SIZE - offset,
                                      iov_iter_single_seg_count(i));
                        goto again;
                }

                if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
                        offset += copied;
                } else {
                        pg++;
                        offset = 0;
                }
        }
        return 0;
}
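
/*
 * A worked example of the copy loop above, assuming the common case of
 * 4KB pages (PAGE_CACHE_SIZE == 4096); the values are illustrative only:
 *
 *      pos = 4094, write_bytes = 10
 *      iteration 1: offset = 4094, count = 2  -> fills the tail of page 0
 *      iteration 2: offset = 0,    count = 8  -> starts page 1
 *
 * A short copy (copied < count) leaves offset mid-page, so the next
 * iteration retries the remainder of the same page rather than advancing.
 */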

/*
 * unlocks pages after btrfs_file_write is done with them
 */
static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
{
        size_t i;
        for (i = 0; i < num_pages; i++) {
                if (!pages[i])
                        break;
                /* page checked is some magic around finding pages that
                 * have been modified without going through
                 * btrfs_set_page_dirty; clear it here
                 */
                ClearPageChecked(pages[i]);
                unlock_page(pages[i]);
                mark_page_accessed(pages[i]);
                page_cache_release(pages[i]);
        }
}

/*
 * after copy_from_user, pages need to be dirtied and we need to make
 * sure holes are created between the current EOF and the start of
 * any next extents (if required).
 *
 * this also makes the decision about creating an inline extent vs
 * doing real data extents, marking pages dirty and delalloc as required.
 */
static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
                                            struct btrfs_root *root,
                                            struct file *file,
                                            struct page **pages,
                                            size_t num_pages,
                                            loff_t pos,
                                            size_t write_bytes)
{
        int err = 0;
        int i;
        struct inode *inode = fdentry(file)->d_inode;
        u64 num_bytes;
        u64 start_pos;
        u64 end_of_last_block;
        u64 end_pos = pos + write_bytes;
        loff_t isize = i_size_read(inode);

        start_pos = pos & ~((u64)root->sectorsize - 1);
        num_bytes = (write_bytes + pos - start_pos +
                     root->sectorsize - 1) & ~((u64)root->sectorsize - 1);

        end_of_last_block = start_pos + num_bytes - 1;
        err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
                                        NULL);
        BUG_ON(err);

        for (i = 0; i < num_pages; i++) {
                struct page *p = pages[i];
                SetPageUptodate(p);
                ClearPageChecked(p);
                set_page_dirty(p);
        }
        if (end_pos > isize) {
                i_size_write(inode, end_pos);
                /* we've only changed i_size in ram, and we haven't updated
                 * the disk i_size. There is no need to log the inode
                 * at this time.
                 */
        }
        return 0;
}
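
/*
 * The alignment math above rounds the dirtied range out to sector
 * boundaries.  With a hypothetical sectorsize of 4096:
 *
 *      pos = 5000, write_bytes = 100
 *      start_pos         = 5000 & ~4095                        = 4096
 *      num_bytes         = (100 + 5000 - 4096 + 4095) & ~4095  = 4096
 *      end_of_last_block = 4096 + 4096 - 1                     = 8191
 *
 * so delalloc is set on the whole sector containing the write.
 */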

/*
 * this drops all the extents in the cache that intersect the range
 * [start, end].  Existing extents are split as required.
 */
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                            int skip_pinned)
{
        struct extent_map *em;
        struct extent_map *split = NULL;
        struct extent_map *split2 = NULL;
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        u64 len = end - start + 1;
        int ret;
        int testend = 1;
        unsigned long flags;
        int compressed = 0;

        WARN_ON(end < start);
        if (end == (u64)-1) {
                len = (u64)-1;
                testend = 0;
        }
        while (1) {
                if (!split)
                        split = alloc_extent_map(GFP_NOFS);
                if (!split2)
                        split2 = alloc_extent_map(GFP_NOFS);

                write_lock(&em_tree->lock);
                em = lookup_extent_mapping(em_tree, start, len);
                if (!em) {
                        write_unlock(&em_tree->lock);
                        break;
                }
                flags = em->flags;
                if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
                        if (testend && em->start + em->len >= start + len) {
                                free_extent_map(em);
                                write_unlock(&em_tree->lock);
                                break;
                        }
                        start = em->start + em->len;
                        if (testend)
                                len = start + len - (em->start + em->len);
                        free_extent_map(em);
                        write_unlock(&em_tree->lock);
                        continue;
                }
                compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
                clear_bit(EXTENT_FLAG_PINNED, &em->flags);
                remove_extent_mapping(em_tree, em);

                if (em->block_start < EXTENT_MAP_LAST_BYTE &&
                    em->start < start) {
                        split->start = em->start;
                        split->len = start - em->start;
                        split->orig_start = em->orig_start;
                        split->block_start = em->block_start;

                        if (compressed)
                                split->block_len = em->block_len;
                        else
                                split->block_len = split->len;

                        split->bdev = em->bdev;
                        split->flags = flags;
                        ret = add_extent_mapping(em_tree, split);
                        BUG_ON(ret);
                        free_extent_map(split);
                        split = split2;
                        split2 = NULL;
                }
                if (em->block_start < EXTENT_MAP_LAST_BYTE &&
                    testend && em->start + em->len > start + len) {
                        u64 diff = start + len - em->start;

                        split->start = start + len;
                        split->len = em->start + em->len - (start + len);
                        split->bdev = em->bdev;
                        split->flags = flags;

                        if (compressed) {
                                split->block_len = em->block_len;
                                split->block_start = em->block_start;
                                split->orig_start = em->orig_start;
                        } else {
                                split->block_len = split->len;
                                split->block_start = em->block_start + diff;
                                split->orig_start = split->start;
                        }

                        ret = add_extent_mapping(em_tree, split);
                        BUG_ON(ret);
                        free_extent_map(split);
                        split = NULL;
                }
                write_unlock(&em_tree->lock);

                /* once for us */
                free_extent_map(em);
                /* once for the tree */
                free_extent_map(em);
        }
        if (split)
                free_extent_map(split);
        if (split2)
                free_extent_map(split2);
        return 0;
}
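
/*
 * A sketch of the split cases above, with illustrative offsets: dropping
 * [2048, 6143] from a cached (uncompressed) mapping covering [0, 8191]
 * removes the original extent_map and inserts two new ones:
 *
 *      before:  [0 ................ 8191]   block_start = B
 *      after:   [0 .. 2047]                 block_start = B
 *               [6144 .. 8191]              block_start = B + 6144
 *
 * Compressed extents keep the full block_start/block_len on both halves,
 * since the compressed data on disk cannot be sliced byte-for-byte.
 */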

/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end.  hint_byte is filled in with a byte number
 * that would be a good hint to the block allocator for this file.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split.  Anything entirely inside the range
 * is deleted from the tree.
 */
int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
                       u64 start, u64 end, u64 *hint_byte, int drop_cache)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *fi;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_key new_key;
        u64 search_start = start;
        u64 disk_bytenr = 0;
        u64 num_bytes = 0;
        u64 extent_offset = 0;
        u64 extent_end = 0;
        int del_nr = 0;
        int del_slot = 0;
        int extent_type;
        int recow;
        int ret;

        if (drop_cache)
                btrfs_drop_extent_cache(inode, start, end - 1, 0);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        while (1) {
                recow = 0;
                ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
                                               search_start, -1);
                if (ret < 0)
                        break;
                if (ret > 0 && path->slots[0] > 0 && search_start == start) {
                        leaf = path->nodes[0];
                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
                        if (key.objectid == inode->i_ino &&
                            key.type == BTRFS_EXTENT_DATA_KEY)
                                path->slots[0]--;
                }
                ret = 0;
next_slot:
                leaf = path->nodes[0];
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        BUG_ON(del_nr > 0);
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                break;
                        if (ret > 0) {
                                ret = 0;
                                break;
                        }
                        leaf = path->nodes[0];
                        recow = 1;
                }

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.objectid > inode->i_ino ||
                    key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
                        break;

                fi = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
                extent_type = btrfs_file_extent_type(leaf, fi);

                if (extent_type == BTRFS_FILE_EXTENT_REG ||
                    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
                        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
                        num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
                        extent_offset = btrfs_file_extent_offset(leaf, fi);
                        extent_end = key.offset +
                                btrfs_file_extent_num_bytes(leaf, fi);
                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                        extent_end = key.offset +
                                btrfs_file_extent_inline_len(leaf, fi);
                } else {
                        WARN_ON(1);
                        extent_end = search_start;
                }

                if (extent_end <= search_start) {
                        path->slots[0]++;
                        goto next_slot;
                }

                search_start = max(key.offset, start);
                if (recow) {
                        btrfs_release_path(root, path);
                        continue;
                }

                /*
                 *     | - range to drop - |
                 *  | -------- extent -------- |
                 */
                if (start > key.offset && end < extent_end) {
                        BUG_ON(del_nr > 0);
                        BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

                        memcpy(&new_key, &key, sizeof(new_key));
                        new_key.offset = start;
                        ret = btrfs_duplicate_item(trans, root, path,
                                                   &new_key);
                        if (ret == -EAGAIN) {
                                btrfs_release_path(root, path);
                                continue;
                        }
                        if (ret < 0)
                                break;

                        leaf = path->nodes[0];
                        fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
                                            struct btrfs_file_extent_item);
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        start - key.offset);

                        fi = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_file_extent_item);

                        extent_offset += start - key.offset;
                        btrfs_set_file_extent_offset(leaf, fi, extent_offset);
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        extent_end - start);
                        btrfs_mark_buffer_dirty(leaf);

                        if (disk_bytenr > 0) {
                                ret = btrfs_inc_extent_ref(trans, root,
                                                disk_bytenr, num_bytes, 0,
                                                root->root_key.objectid,
                                                new_key.objectid,
                                                start - extent_offset);
                                BUG_ON(ret);
                                *hint_byte = disk_bytenr;
                        }
                        key.offset = start;
                }
                /*
                 *  | ---- range to drop ----- |
                 *      | -------- extent -------- |
                 */
                if (start <= key.offset && end < extent_end) {
                        BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

                        memcpy(&new_key, &key, sizeof(new_key));
                        new_key.offset = end;
                        btrfs_set_item_key_safe(trans, root, path, &new_key);

                        extent_offset += end - key.offset;
                        btrfs_set_file_extent_offset(leaf, fi, extent_offset);
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        extent_end - end);
                        btrfs_mark_buffer_dirty(leaf);
                        if (disk_bytenr > 0) {
                                inode_sub_bytes(inode, end - key.offset);
                                *hint_byte = disk_bytenr;
                        }
                        break;
                }

                search_start = extent_end;
                /*
                 *       | ---- range to drop ----- |
                 *  | -------- extent -------- |
                 */
                if (start > key.offset && end >= extent_end) {
                        BUG_ON(del_nr > 0);
                        BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        start - key.offset);
                        btrfs_mark_buffer_dirty(leaf);
                        if (disk_bytenr > 0) {
                                inode_sub_bytes(inode, extent_end - start);
                                *hint_byte = disk_bytenr;
                        }
                        if (end == extent_end)
                                break;

                        path->slots[0]++;
                        goto next_slot;
                }

                /*
                 *  | ---- range to drop ----- |
                 *    | ------ extent ------ |
                 */
                if (start <= key.offset && end >= extent_end) {
                        if (del_nr == 0) {
                                del_slot = path->slots[0];
                                del_nr = 1;
                        } else {
                                BUG_ON(del_slot + del_nr != path->slots[0]);
                                del_nr++;
                        }

                        if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                                inode_sub_bytes(inode,
                                                extent_end - key.offset);
                                extent_end = ALIGN(extent_end,
                                                   root->sectorsize);
                        } else if (disk_bytenr > 0) {
                                ret = btrfs_free_extent(trans, root,
                                                disk_bytenr, num_bytes, 0,
                                                root->root_key.objectid,
                                                key.objectid, key.offset -
                                                extent_offset);
                                BUG_ON(ret);
                                inode_sub_bytes(inode,
                                                extent_end - key.offset);
                                *hint_byte = disk_bytenr;
                        }

                        if (end == extent_end)
                                break;

                        if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
                                path->slots[0]++;
                                goto next_slot;
                        }

                        ret = btrfs_del_items(trans, root, path, del_slot,
                                              del_nr);
                        BUG_ON(ret);

                        del_nr = 0;
                        del_slot = 0;
                        btrfs_release_path(root, path);
                        continue;
                }

                BUG_ON(1);
        }

        if (del_nr > 0) {
                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
                BUG_ON(ret);
        }

        btrfs_free_path(path);
        return ret;
}
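
/*
 * Example of the first two cases above, with illustrative offsets:
 * dropping [4096, 8192) from a file extent item at key.offset 0 covering
 * [0, 12288).  Case one duplicates the item and trims the copy at
 * slot - 1 down to [0, 4096); case two then shifts the surviving item's
 * key to 8192, bumping extent_offset by 8192 in total, leaving
 * [8192, 12288).  Both items still point at the same disk extent, so
 * btrfs_inc_extent_ref takes an extra reference for the new item.
 */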

static int extent_mergeable(struct extent_buffer *leaf, int slot,
                            u64 objectid, u64 bytenr, u64 orig_offset,
                            u64 *start, u64 *end)
{
        struct btrfs_file_extent_item *fi;
        struct btrfs_key key;
        u64 extent_end;

        if (slot < 0 || slot >= btrfs_header_nritems(leaf))
                return 0;

        btrfs_item_key_to_cpu(leaf, &key, slot);
        if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
                return 0;

        fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
        if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
            btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
            btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
            btrfs_file_extent_compression(leaf, fi) ||
            btrfs_file_extent_encryption(leaf, fi) ||
            btrfs_file_extent_other_encoding(leaf, fi))
                return 0;

        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
        if ((*start && *start != key.offset) || (*end && *end != extent_end))
                return 0;

        *start = key.offset;
        *end = extent_end;
        return 1;
}
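
/*
 * Note the in/out convention above: callers pass the boundary they expect
 * in *start or *end (0 means "don't care"), and on success the neighbour's
 * actual [key.offset, extent_end) range is written back.  For instance, a
 * caller checking the previous slot typically passes *start = 0 and
 * *end = <start of its own extent>, so a hit means the neighbour ends
 * exactly where this extent begins.
 */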

/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'.  If only
 * part of the extent is marked as written, the extent will be split into
 * two or three.
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
                              struct inode *inode, u64 start, u64 end)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_buffer *leaf;
        struct btrfs_path *path;
        struct btrfs_file_extent_item *fi;
        struct btrfs_key key;
        struct btrfs_key new_key;
        u64 bytenr;
        u64 num_bytes;
        u64 extent_end;
        u64 orig_offset;
        u64 other_start;
        u64 other_end;
        u64 split;
        int del_nr = 0;
        int del_slot = 0;
        int recow;
        int ret;

        btrfs_drop_extent_cache(inode, start, end - 1, 0);

        path = btrfs_alloc_path();
        BUG_ON(!path);
again:
        recow = 0;
        split = start;
        key.objectid = inode->i_ino;
        key.type = BTRFS_EXTENT_DATA_KEY;
        key.offset = split;

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0 && path->slots[0] > 0)
                path->slots[0]--;

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        BUG_ON(key.objectid != inode->i_ino ||
               key.type != BTRFS_EXTENT_DATA_KEY);
        fi = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        BUG_ON(btrfs_file_extent_type(leaf, fi) !=
               BTRFS_FILE_EXTENT_PREALLOC);
        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
        BUG_ON(key.offset > start || extent_end < end);

        bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
        num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
        orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
        memcpy(&new_key, &key, sizeof(new_key));

        if (start == key.offset && end < extent_end) {
                other_start = 0;
                other_end = start;
                if (extent_mergeable(leaf, path->slots[0] - 1,
                                     inode->i_ino, bytenr, orig_offset,
                                     &other_start, &other_end)) {
                        new_key.offset = end;
                        btrfs_set_item_key_safe(trans, root, path, &new_key);
                        fi = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_file_extent_item);
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        extent_end - end);
                        btrfs_set_file_extent_offset(leaf, fi,
                                                     end - orig_offset);
                        fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
                                            struct btrfs_file_extent_item);
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        end - other_start);
                        btrfs_mark_buffer_dirty(leaf);
                        goto out;
                }
        }

        if (start > key.offset && end == extent_end) {
                other_start = end;
                other_end = 0;
                if (extent_mergeable(leaf, path->slots[0] + 1,
                                     inode->i_ino, bytenr, orig_offset,
                                     &other_start, &other_end)) {
                        fi = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_file_extent_item);
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        start - key.offset);
                        path->slots[0]++;
                        new_key.offset = start;
                        btrfs_set_item_key_safe(trans, root, path, &new_key);

                        fi = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_file_extent_item);
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        other_end - start);
                        btrfs_set_file_extent_offset(leaf, fi,
                                                     start - orig_offset);
                        btrfs_mark_buffer_dirty(leaf);
                        goto out;
                }
        }

        while (start > key.offset || end < extent_end) {
                if (key.offset == start)
                        split = end;

                new_key.offset = split;
                ret = btrfs_duplicate_item(trans, root, path, &new_key);
                if (ret == -EAGAIN) {
                        btrfs_release_path(root, path);
                        goto again;
                }
                BUG_ON(ret < 0);

                leaf = path->nodes[0];
                fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
                                    struct btrfs_file_extent_item);
                btrfs_set_file_extent_num_bytes(leaf, fi,
                                                split - key.offset);

                fi = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);

                btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
                btrfs_set_file_extent_num_bytes(leaf, fi,
                                                extent_end - split);
                btrfs_mark_buffer_dirty(leaf);

                ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
                                           root->root_key.objectid,
                                           inode->i_ino, orig_offset);
                BUG_ON(ret);

                if (split == start) {
                        key.offset = start;
                } else {
                        BUG_ON(start != key.offset);
                        path->slots[0]--;
                        extent_end = end;
                }
                recow = 1;
        }

        other_start = end;
        other_end = 0;
        if (extent_mergeable(leaf, path->slots[0] + 1,
                             inode->i_ino, bytenr, orig_offset,
                             &other_start, &other_end)) {
                if (recow) {
                        btrfs_release_path(root, path);
                        goto again;
                }
                extent_end = other_end;
                del_slot = path->slots[0] + 1;
                del_nr++;
                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                                        0, root->root_key.objectid,
                                        inode->i_ino, orig_offset);
                BUG_ON(ret);
        }
        other_start = 0;
        other_end = start;
        if (extent_mergeable(leaf, path->slots[0] - 1,
                             inode->i_ino, bytenr, orig_offset,
                             &other_start, &other_end)) {
                if (recow) {
                        btrfs_release_path(root, path);
                        goto again;
                }
                key.offset = other_start;
                del_slot = path->slots[0];
                del_nr++;
                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                                        0, root->root_key.objectid,
                                        inode->i_ino, orig_offset);
                BUG_ON(ret);
        }
        if (del_nr == 0) {
                fi = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
                btrfs_set_file_extent_type(leaf, fi,
                                           BTRFS_FILE_EXTENT_REG);
                btrfs_mark_buffer_dirty(leaf);
        } else {
                fi = btrfs_item_ptr(leaf, del_slot - 1,
                                    struct btrfs_file_extent_item);
                btrfs_set_file_extent_type(leaf, fi,
                                           BTRFS_FILE_EXTENT_REG);
                btrfs_set_file_extent_num_bytes(leaf, fi,
                                                extent_end - key.offset);
                btrfs_mark_buffer_dirty(leaf);

                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
                BUG_ON(ret);
        }
out:
        btrfs_free_path(path);
        return 0;
}
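
/*
 * The split loop above runs at most twice.  With illustrative offsets,
 * marking [4096, 8192) written inside a preallocated extent [0, 12288):
 *
 *      pass 1: start > key.offset, so split = 4096; the item is
 *              duplicated into [0, 4096) and [4096, 12288), and
 *              key.offset becomes 4096
 *      pass 2: key.offset == start, so split = end = 8192; the item is
 *              duplicated again into [4096, 8192) and [8192, 12288)
 *
 * The middle piece then has its type flipped to BTRFS_FILE_EXTENT_REG,
 * and each duplication takes an extra reference on the disk extent.
 */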

/*
 * this gets pages into the page cache and locks them down, it also properly
 * waits for data=ordered extents to finish before allowing the pages to be
 * modified.
 */
static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
                                  struct page **pages, size_t num_pages,
                                  loff_t pos, unsigned long first_index,
                                  unsigned long last_index, size_t write_bytes)
{
        struct extent_state *cached_state = NULL;
        int i;
        unsigned long index = pos >> PAGE_CACHE_SHIFT;
        struct inode *inode = fdentry(file)->d_inode;
        int err = 0;
        u64 start_pos;
        u64 last_pos;

        start_pos = pos & ~((u64)root->sectorsize - 1);
        last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;

        if (start_pos > inode->i_size) {
                err = btrfs_cont_expand(inode, start_pos);
                if (err)
                        return err;
        }

        memset(pages, 0, num_pages * sizeof(struct page *));
again:
        for (i = 0; i < num_pages; i++) {
                pages[i] = grab_cache_page(inode->i_mapping, index + i);
                if (!pages[i]) {
                        err = -ENOMEM;
                        BUG_ON(1);
                }
                wait_on_page_writeback(pages[i]);
        }
        if (start_pos < inode->i_size) {
                struct btrfs_ordered_extent *ordered;
                lock_extent_bits(&BTRFS_I(inode)->io_tree,
                                 start_pos, last_pos - 1, 0, &cached_state,
                                 GFP_NOFS);
                ordered = btrfs_lookup_first_ordered_extent(inode,
                                                            last_pos - 1);
                if (ordered &&
                    ordered->file_offset + ordered->len > start_pos &&
                    ordered->file_offset < last_pos) {
                        btrfs_put_ordered_extent(ordered);
                        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
                                             start_pos, last_pos - 1,
                                             &cached_state, GFP_NOFS);
                        for (i = 0; i < num_pages; i++) {
                                unlock_page(pages[i]);
                                page_cache_release(pages[i]);
                        }
                        btrfs_wait_ordered_range(inode, start_pos,
                                                 last_pos - start_pos);
                        goto again;
                }
                if (ordered)
                        btrfs_put_ordered_extent(ordered);

                clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
                                 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
                                 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
                                 GFP_NOFS);
                unlock_extent_cached(&BTRFS_I(inode)->io_tree,
                                     start_pos, last_pos - 1, &cached_state,
                                     GFP_NOFS);
        }
        for (i = 0; i < num_pages; i++) {
                clear_page_dirty_for_io(pages[i]);
                set_page_extent_mapped(pages[i]);
                WARN_ON(!PageLocked(pages[i]));
        }
        return 0;
}
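
/*
 * The retry above matters: the extent range in the io_tree is locked
 * while checking for an overlapping ordered extent, but completing that
 * ordered extent must itself lock the same range.  Waiting with the
 * locks held would deadlock, so the pages and the extent range are
 * released first, then everything is reacquired from scratch after the
 * wait.
 */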

static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
                                    const struct iovec *iov,
                                    unsigned long nr_segs, loff_t pos)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = fdentry(file)->d_inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct page *pinned[2];
        struct page **pages = NULL;
        struct iov_iter i;
        loff_t *ppos = &iocb->ki_pos;
        loff_t start_pos;
        ssize_t num_written = 0;
        ssize_t err = 0;
        size_t count;
        size_t ocount;
        int ret = 0;
        int nrptrs;
        unsigned long first_index;
        unsigned long last_index;
        int will_write;
        int buffered = 0;

        will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
                      (file->f_flags & O_DIRECT));

        pinned[0] = NULL;
        pinned[1] = NULL;

        start_pos = pos;

        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

        mutex_lock(&inode->i_mutex);

        err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
        if (err)
                goto out;
        count = ocount;

        current->backing_dev_info = inode->i_mapping->backing_dev_info;
        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
        if (err)
                goto out;

        if (count == 0)
                goto out;

        err = file_remove_suid(file);
        if (err)
                goto out;

        file_update_time(file);
        BTRFS_I(inode)->sequence++;

        if (unlikely(file->f_flags & O_DIRECT)) {
                num_written = generic_file_direct_write(iocb, iov, &nr_segs,
                                                        pos, ppos, count,
                                                        ocount);
                /*
                 * the generic O_DIRECT will update in-memory i_size after the
                 * DIOs are done.  But our endio handlers that update the on
                 * disk i_size never update past the in memory i_size.  So we
                 * need one more update here to catch any additions to the
                 * file
                 */
                if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
                        btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
                        mark_inode_dirty(inode);
                }

                if (num_written < 0) {
                        ret = num_written;
                        num_written = 0;
                        goto out;
                } else if (num_written == count) {
                        /* pick up pos changes done by the generic code */
                        pos = *ppos;
                        goto out;
                }
                /*
                 * We are going to do buffered for the rest of the range, so we
                 * need to make sure to invalidate the buffered pages when we're
                 * done.
                 */
                buffered = 1;
                pos += num_written;
        }

        iov_iter_init(&i, iov, nr_segs, count, num_written);
        nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
                     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
                     (sizeof(struct page *)));
        pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);

        /* generic_write_checks can change our pos */
        start_pos = pos;

        first_index = pos >> PAGE_CACHE_SHIFT;
        last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;

        /*
         * there are lots of better ways to do this, but this code
         * makes sure the first and last page in the file range are
         * up to date and ready for cow
         */
        if ((pos & (PAGE_CACHE_SIZE - 1))) {
                pinned[0] = grab_cache_page(inode->i_mapping, first_index);
                if (!PageUptodate(pinned[0])) {
                        ret = btrfs_readpage(NULL, pinned[0]);
                        BUG_ON(ret);
                        wait_on_page_locked(pinned[0]);
                } else {
                        unlock_page(pinned[0]);
                }
        }
        if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
                pinned[1] = grab_cache_page(inode->i_mapping, last_index);
                if (!PageUptodate(pinned[1])) {
                        ret = btrfs_readpage(NULL, pinned[1]);
                        BUG_ON(ret);
                        wait_on_page_locked(pinned[1]);
                } else {
                        unlock_page(pinned[1]);
                }
        }

        while (iov_iter_count(&i) > 0) {
                size_t offset = pos & (PAGE_CACHE_SIZE - 1);
                size_t write_bytes = min(iov_iter_count(&i),
                                         nrptrs * (size_t)PAGE_CACHE_SIZE -
                                         offset);
                size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
                                        PAGE_CACHE_SHIFT;

                WARN_ON(num_pages > nrptrs);
                memset(pages, 0, sizeof(struct page *) * nrptrs);

                ret = btrfs_delalloc_reserve_space(inode, write_bytes);
                if (ret)
                        goto out;

                ret = prepare_pages(root, file, pages, num_pages,
                                    pos, first_index, last_index,
                                    write_bytes);
                if (ret) {
                        btrfs_delalloc_release_space(inode, write_bytes);
                        goto out;
                }

                ret = btrfs_copy_from_user(pos, num_pages,
                                           write_bytes, pages, &i);
                if (ret == 0) {
                        dirty_and_release_pages(NULL, root, file, pages,
                                                num_pages, pos, write_bytes);
                }

                btrfs_drop_pages(pages, num_pages);
                if (ret) {
                        btrfs_delalloc_release_space(inode, write_bytes);
                        goto out;
                }

                if (will_write) {
                        filemap_fdatawrite_range(inode->i_mapping, pos,
                                                 pos + write_bytes - 1);
                } else {
                        balance_dirty_pages_ratelimited_nr(inode->i_mapping,
                                                           num_pages);
                        if (num_pages <
                            (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
                                btrfs_btree_balance_dirty(root, 1);
                        btrfs_throttle(root);
                }

                pos += write_bytes;
                num_written += write_bytes;

                cond_resched();
        }
out:
        mutex_unlock(&inode->i_mutex);
        if (ret)
                err = ret;

        kfree(pages);
        if (pinned[0])
                page_cache_release(pinned[0]);
        if (pinned[1])
                page_cache_release(pinned[1]);
        *ppos = pos;

        /*
         * we want to make sure fsync finds this change
         * but we haven't joined a transaction running right now.
         *
         * Later on, someone is sure to update the inode and get the
         * real transid recorded.
         *
         * We set last_trans now to the fs_info generation + 1,
         * this will either be one more than the running transaction
         * or the generation used for the next transaction if there isn't
         * one running right now.
         */
        BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;

        if (num_written > 0 && will_write) {
                struct btrfs_trans_handle *trans;

                err = btrfs_wait_ordered_range(inode, start_pos, num_written);
                if (err)
                        num_written = err;

                if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
                        trans = btrfs_start_transaction(root, 0);
                        ret = btrfs_log_dentry_safe(trans, root,
                                                    file->f_dentry);
                        if (ret == 0) {
                                ret = btrfs_sync_log(trans, root);
                                if (ret == 0)
                                        btrfs_end_transaction(trans, root);
                                else
                                        btrfs_commit_transaction(trans, root);
                        } else if (ret != BTRFS_NO_LOG_SYNC) {
                                btrfs_commit_transaction(trans, root);
                        } else {
                                btrfs_end_transaction(trans, root);
                        }
                }
                if (file->f_flags & O_DIRECT && buffered) {
                        invalidate_mapping_pages(inode->i_mapping,
                              start_pos >> PAGE_CACHE_SHIFT,
                             (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
                }
        }
        current->backing_dev_info = NULL;
        return num_written ? num_written : err;
}
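
/*
 * Per-iteration flow of the buffered loop above, in order:
 *
 *      btrfs_delalloc_reserve_space()  reserve space for the write
 *      prepare_pages()                 grab + lock page cache pages
 *      btrfs_copy_from_user()          fill them from the iovec
 *      dirty_and_release_pages()       mark delalloc, extend i_size
 *      btrfs_drop_pages()              unlock + release the pages
 *
 * The reservation is released again on any failure, so a short copy
 * never leaks reserved space.
 */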

int btrfs_release_file(struct inode *inode, struct file *filp)
{
        /*
         * ordered_data_close is set by setattr when we are about to truncate
         * a file from a non-zero size to a zero size.  This tries to
         * flush down new bytes that may have been written if the
         * application were using truncate to replace a file in place.
         */
        if (BTRFS_I(inode)->ordered_data_close) {
                BTRFS_I(inode)->ordered_data_close = 0;
                btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
                if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
                        filemap_flush(inode->i_mapping);
        }
        if (filp->private_data)
                btrfs_ioctl_trans_end(filp);
        return 0;
}

/*
 * fsync call for both files and directories.  This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all ordered extent updates
 * in the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit.  This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
int btrfs_sync_file(struct file *file, int datasync)
{
        struct dentry *dentry = file->f_path.dentry;
        struct inode *inode = dentry->d_inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret = 0;
        struct btrfs_trans_handle *trans;

        /* we wait first, since the writeback may change the inode */
        root->log_batch++;
        /* the VFS called filemap_fdatawrite for us */
        btrfs_wait_ordered_range(inode, 0, (u64)-1);
        root->log_batch++;

        /*
         * check the transaction that last modified this inode
         * and see if it's already been committed
         */
        if (!BTRFS_I(inode)->last_trans)
                goto out;

        /*
         * if the last transaction that changed this file was before
         * the current transaction, we can bail out now without any
         * syncing
         */
        mutex_lock(&root->fs_info->trans_mutex);
        if (BTRFS_I(inode)->last_trans <=
            root->fs_info->last_trans_committed) {
                BTRFS_I(inode)->last_trans = 0;
                mutex_unlock(&root->fs_info->trans_mutex);
                goto out;
        }
        mutex_unlock(&root->fs_info->trans_mutex);

        /*
         * ok we haven't committed the transaction yet, let's do a commit
         */
        if (file->private_data)
                btrfs_ioctl_trans_end(file);

        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto out;
        }

        ret = btrfs_log_dentry_safe(trans, root, dentry);
        if (ret < 0)
                goto out;

        /* we've logged all the items and now have a consistent
         * version of the file in the log.  It is possible that
         * someone will come in and modify the file, but that's
         * fine because the log is consistent on disk, and we
         * have references to all of the file's extents
         *
         * It is possible that someone will come in and log the
         * file again, but that will end up using the synchronization
         * inside btrfs_sync_log to keep things safe.
         */
        mutex_unlock(&dentry->d_inode->i_mutex);

        if (ret != BTRFS_NO_LOG_SYNC) {
                if (ret > 0) {
                        ret = btrfs_commit_transaction(trans, root);
                } else {
                        ret = btrfs_sync_log(trans, root);
                        if (ret == 0)
                                ret = btrfs_end_transaction(trans, root);
                        else
                                ret = btrfs_commit_transaction(trans, root);
                }
        } else {
                ret = btrfs_end_transaction(trans, root);
        }
        mutex_lock(&dentry->d_inode->i_mutex);
out:
        return ret > 0 ? -EIO : ret;
}
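
/*
 * Return-value convention above: btrfs_log_dentry_safe can return
 * BTRFS_NO_LOG_SYNC, meaning the tree log does not need syncing and the
 * transaction is simply ended; any other positive value means the inode
 * could not be logged safely, so a full transaction commit is done
 * instead.  Whatever positive value survives to the end is reported to
 * the caller as -EIO.
 */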

static const struct vm_operations_struct btrfs_file_vm_ops = {
        .fault          = filemap_fault,
        .page_mkwrite   = btrfs_page_mkwrite,
};

static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
        struct address_space *mapping = filp->f_mapping;

        if (!mapping->a_ops->readpage)
                return -ENOEXEC;

        file_accessed(filp);
        vma->vm_ops = &btrfs_file_vm_ops;
        vma->vm_flags |= VM_CAN_NONLINEAR;

        return 0;
}

const struct file_operations btrfs_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
        .aio_read       = generic_file_aio_read,
        .splice_read    = generic_file_splice_read,
        .aio_write      = btrfs_file_aio_write,
        .mmap           = btrfs_file_mmap,
        .open           = generic_file_open,
        .release        = btrfs_release_file,
        .fsync          = btrfs_sync_file,
        .unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = btrfs_ioctl,
#endif
};
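
/*
 * These ops wire btrfs into the VFS.  A hedged sketch of the common call
 * paths on kernels of this era (the generic wrappers are VFS plumbing,
 * not part of this file):
 *
 *      write(fd, buf, len)
 *              -> do_sync_write()              (.write)
 *              -> btrfs_file_aio_write()       (.aio_write)
 *      fsync(fd)
 *              -> btrfs_sync_file()            (.fsync)
 */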