#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include "extent_map.h"

static struct kmem_cache *extent_map_cache;
static struct kmem_cache *extent_state_cache;

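/*
 * tree_entry mirrors the header that struct extent_map and struct
 * extent_state are assumed to share (start, end, in_tree, rb_node), so the
 * generic rb-tree helpers below can walk either tree through this view.
 */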
struct tree_entry {
	u64 start;
	u64 end;
	int in_tree;
	struct rb_node rb_node;
};

/* bits for the extent state */
#define EXTENT_DIRTY 1
#define EXTENT_WRITEBACK (1 << 1)
#define EXTENT_UPTODATE (1 << 2)
#define EXTENT_LOCKED (1 << 3)
#define EXTENT_NEW (1 << 4)
#define EXTENT_DELALLOC (1 << 5)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)

static LIST_HEAD(all_states);
spinlock_t state_lock = SPIN_LOCK_UNLOCKED;

void __init extent_map_init(void)
{
	extent_map_cache = kmem_cache_create("extent_map",
					     sizeof(struct extent_map), 0,
					     SLAB_RECLAIM_ACCOUNT |
					     SLAB_DESTROY_BY_RCU,
					     NULL);
	extent_state_cache = kmem_cache_create("extent_state",
					       sizeof(struct extent_state), 0,
					       SLAB_RECLAIM_ACCOUNT |
					       SLAB_DESTROY_BY_RCU,
					       NULL);
}

void __exit extent_map_exit(void)
{
	while (!list_empty(&all_states)) {
		struct extent_state *state;
		struct list_head *cur = all_states.next;
		state = list_entry(cur, struct extent_state, list);
		printk("found leaked state %Lu %Lu state %d in_tree %d\n",
		       state->start, state->end, state->state, state->in_tree);
		list_del(&state->list);
		kmem_cache_free(extent_state_cache, state);
	}
	if (extent_map_cache)
		kmem_cache_destroy(extent_map_cache);
	if (extent_state_cache)
		kmem_cache_destroy(extent_state_cache);
}

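/*
 * set up an empty extent map tree and extent state tree for 'mapping'.
 * The gfp mask argument is currently unused here.
 */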
void extent_map_tree_init(struct extent_map_tree *tree,
			  struct address_space *mapping, gfp_t mask)
{
	tree->map.rb_node = NULL;
	tree->state.rb_node = NULL;
	rwlock_init(&tree->lock);
	tree->mapping = mapping;
}
EXPORT_SYMBOL(extent_map_tree_init);

struct extent_map *alloc_extent_map(gfp_t mask)
{
	struct extent_map *em;
	em = kmem_cache_alloc(extent_map_cache, mask);
	if (!em || IS_ERR(em))
		return em;
	em->in_tree = 0;
	atomic_set(&em->refs, 1);
	return em;
}
EXPORT_SYMBOL(alloc_extent_map);

void free_extent_map(struct extent_map *em)
{
	if (atomic_dec_and_test(&em->refs)) {
		WARN_ON(em->in_tree);
		kmem_cache_free(extent_map_cache, em);
	}
}
EXPORT_SYMBOL(free_extent_map);

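/*
 * extent_state allocations are tracked on the global all_states list (under
 * state_lock) so extent_map_exit() can report any state structs that leaked.
 */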
struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;
	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state || IS_ERR(state))
		return state;
	state->state = 0;
	state->in_tree = 0;
	atomic_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	spin_lock_irq(&state_lock);
	list_add(&state->list, &all_states);
	spin_unlock_irq(&state_lock);
	return state;
}
EXPORT_SYMBOL(alloc_extent_state);

void free_extent_state(struct extent_state *state)
{
	if (atomic_dec_and_test(&state->refs)) {
		WARN_ON(state->in_tree);
		spin_lock_irq(&state_lock);
		list_del_init(&state->list);
		spin_unlock_irq(&state_lock);
		kmem_cache_free(extent_state_cache, state);
	}
}
EXPORT_SYMBOL(free_extent_state);

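/*
 * insert 'node' into 'root', keyed so that 'offset' is the end of the range
 * it covers.  If an existing entry already covers 'offset', that entry's
 * rb_node is returned instead and nothing is inserted.
 */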
static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);
		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}
	entry = rb_entry(node, struct tree_entry, rb_node);
	entry->in_tree = 1;
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

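/*
 * search 'root' for an entry that contains 'offset'.  If none is found,
 * NULL is returned and, when 'prev_ret' is non-NULL, *prev_ret is set to the
 * first entry that ends at or after 'offset' (or NULL past the end of the
 * tree).
 */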
static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
				     struct rb_node **prev_ret)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;
	while (n) {
		entry = rb_entry(n, struct tree_entry, rb_node);
		prev = n;
		prev_entry = entry;
		if (offset < entry->start)
			n = n->rb_left;
		else if (offset > entry->end)
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;
	while (prev && offset > prev_entry->end) {
		prev = rb_next(prev);
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
	}
	*prev_ret = prev;
	return NULL;
}

static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
{
	struct rb_node *prev;
	struct rb_node *ret;
	ret = __tree_search(root, offset, &prev);
	if (!ret)
		return prev;
	return ret;
}

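/*
 * remove the entry that contains 'offset' from the tree, if there is one.
 */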
static int tree_delete(struct rb_root *root, u64 offset)
{
	struct rb_node *node;
	struct tree_entry *entry;
	node = __tree_search(root, offset, NULL);
	if (!node)
		return -ENOENT;
	entry = rb_entry(node, struct tree_entry, rb_node);
	entry->in_tree = 0;
	rb_erase(node, root);
	return 0;
}

/*
 * add_extent_mapping tries a simple backward merge with existing
 * mappings.  The extent_map struct passed in will be inserted into
 * the tree directly (no copies made, just a reference taken).
 */
int add_extent_mapping(struct extent_map_tree *tree,
		       struct extent_map *em)
{
	int ret = 0;
	struct extent_map *prev = NULL;
	struct rb_node *rb;
	write_lock_irq(&tree->lock);
	rb = tree_insert(&tree->map, em->end, &em->rb_node);
	if (rb) {
		prev = rb_entry(rb, struct extent_map, rb_node);
		printk("found extent map %Lu %Lu on insert of %Lu %Lu\n",
		       prev->start, prev->end, em->start, em->end);
		ret = -EEXIST;
		goto out;
	}
	atomic_inc(&em->refs);
	if (em->start != 0) {
		rb = rb_prev(&em->rb_node);
		if (rb)
			prev = rb_entry(rb, struct extent_map, rb_node);
		if (prev && prev->end + 1 == em->start &&
		    ((em->block_start == 0 && prev->block_start == 0) ||
		     (em->block_start == prev->block_end + 1))) {
			em->start = prev->start;
			em->block_start = prev->block_start;
			rb_erase(&prev->rb_node, &tree->map);
			prev->in_tree = 0;
			free_extent_map(prev);
		}
	}
out:
	write_unlock_irq(&tree->lock);
	return ret;
}
EXPORT_SYMBOL(add_extent_mapping);

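/*
 * Minimal usage sketch for the mapping half of the tree (hypothetical
 * caller, not part of this file).  Field names follow the accesses made
 * above; block_start == 0 is treated as a hole.  add_extent_mapping()
 * returns -EEXIST when an overlapping mapping is already present, and the
 * tree takes its own reference, so the caller drops its ref afterwards:
 *
 *	struct extent_map *em = alloc_extent_map(GFP_NOFS);
 *	em->start = start;
 *	em->end = start + len - 1;
 *	em->block_start = disk_start;
 *	em->block_end = disk_start + len - 1;
 *	em->bdev = bdev;
 *	ret = add_extent_mapping(tree, em);
 *	free_extent_map(em);
 */
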
/*
 * lookup_extent_mapping returns the first extent_map struct in the
 * tree that intersects the [start, end] (inclusive) range.  There may
 * be additional objects in the tree that intersect, so check the object
 * returned carefully to make sure you don't need additional lookups.
 */
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
					 u64 start, u64 end)
{
	struct extent_map *em;
	struct rb_node *rb_node;
	read_lock_irq(&tree->lock);
	rb_node = tree_search(&tree->map, start);
	if (!rb_node) {
		em = NULL;
		goto out;
	}
	if (IS_ERR(rb_node)) {
		em = ERR_PTR(PTR_ERR(rb_node));
		goto out;
	}
	em = rb_entry(rb_node, struct extent_map, rb_node);
	if (em->end < start || em->start > end) {
		em = NULL;
		goto out;
	}
	atomic_inc(&em->refs);
out:
	read_unlock_irq(&tree->lock);
	return em;
}
EXPORT_SYMBOL(lookup_extent_mapping);

/*
 * removes an extent_map struct from the tree.  No reference counts are
 * dropped, and no checks are done to see if the range is in use.
 */
int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
	int ret;
	write_lock_irq(&tree->lock);
	ret = tree_delete(&tree->map, em->end);
	write_unlock_irq(&tree->lock);
	return ret;
}
EXPORT_SYMBOL(remove_extent_mapping);

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with EXTENT_IOBITS set (locked or under
 * writeback) are not merged because the end_io handlers need to be able
 * to do operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static int merge_state(struct extent_map_tree *tree,
		       struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;
	if (state->state & EXTENT_IOBITS)
		return 0;
	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			state->start = other->start;
			other->in_tree = 0;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			other->start = state->start;
			state->in_tree = 0;
			rb_erase(&state->rb_node, &tree->state);
			free_extent_state(state);
		}
	}
	return 0;
}

/*
 * insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_map_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			int bits)
{
	struct rb_node *node;
	if (end < start) {
		printk("end < start %Lu %Lu\n", end, start);
		WARN_ON(1);
	}
	state->state |= bits;
	state->start = start;
	state->end = end;
	if ((end & 4095) == 0) {
		printk("insert state %Lu %Lu strange end\n", start, end);
		WARN_ON(1);
	}
	node = tree_insert(&tree->state, end, &state->rb_node);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		printk("found node %Lu %Lu on insert of %Lu %Lu\n",
		       found->start, found->end, start, end);
		free_extent_state(state);
		return -EEXIST;
	}
	merge_state(tree, state);
	return 0;
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling, the tree has 'orig' at [orig->start, orig->end].  After
 * calling, there are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [split, orig->end]
 *
 * The tree locks are not taken by this function.  They need to be held
 * by the caller.
 */
static int split_state(struct extent_map_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;
	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;
	if ((prealloc->end & 4095) == 0) {
		printk("insert state %Lu %Lu strange end\n", prealloc->start,
		       prealloc->end);
		WARN_ON(1);
	}
	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		printk("found node %Lu %Lu on insert of %Lu %Lu\n",
		       found->start, found->end, prealloc->start,
		       prealloc->end);
		free_extent_state(prealloc);
		return -EEXIST;
	}
	return 0;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up anyone waiting on this state (wake == 1), or
 * forcibly remove the state from the tree (delete == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree.
 */
static int clear_state_bit(struct extent_map_tree *tree,
			   struct extent_state *state, int bits, int wake,
			   int delete)
{
	int ret = state->state & bits;
	state->state &= ~bits;
	if (wake)
		wake_up(&state->wq);
	if (delete || state->state == 0) {
		if (state->in_tree) {
			rb_erase(&state->rb_node, &tree->state);
			state->in_tree = 0;
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
	}
	return ret;
}

/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
 * bits were already set, or zero if none of the bits were already set.
 */
int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end,
		     int bits, int wake, int delete, gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	int err;
	int set = 0;
again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}
	write_lock_irq(&tree->lock);
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(&tree->state, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	/*
	 * | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */
	if (state->start < start) {
		err = split_state(tree, state, prealloc, start);
		BUG_ON(err == -EEXIST);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			start = state->end + 1;
			set |= clear_state_bit(tree, state, bits,
					       wake, delete);
		} else {
			start = state->start;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		err = split_state(tree, state, prealloc, end + 1);
		BUG_ON(err == -EEXIST);
		if (wake)
			wake_up(&state->wq);
		set |= clear_state_bit(tree, prealloc, bits,
				       wake, delete);
		prealloc = NULL;
		goto out;
	}
	start = state->end + 1;
	set |= clear_state_bit(tree, state, bits, wake, delete);
	goto search_again;
out:
	write_unlock_irq(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);
	return set;
search_again:
	if (start >= end)
		goto out;
	write_unlock_irq(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}
EXPORT_SYMBOL(clear_extent_bit);

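/*
 * helper that drops the read lock and sleeps on a state's waitqueue until it
 * is woken by a bit being cleared.  The caller must hold a reference on
 * 'state' so it can't be freed while we sleep.
 */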
static int wait_on_state(struct extent_map_tree *tree,
			 struct extent_state *state)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	read_unlock_irq(&tree->lock);
	schedule();
	read_lock_irq(&tree->lock);
	finish_wait(&state->wq, &wait);
	return 0;
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function.
 */
int wait_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits)
{
	struct extent_state *state;
	struct rb_node *node;
	read_lock_irq(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(&tree->state, start);
		if (!node)
			break;
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->start > end)
			goto out;
		if (state->state & bits) {
			start = state->start;
			atomic_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;
		if (start > end)
			break;
		if (need_resched()) {
			read_unlock_irq(&tree->lock);
			cond_resched();
			read_lock_irq(&tree->lock);
		}
	}
out:
	read_unlock_irq(&tree->lock);
	return 0;
}
EXPORT_SYMBOL(wait_extent_bit);

/*
 * set some bits on a range in the tree.  This may require allocations
 * or sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
 * range already has the desired bits set.  The start of the existing
 * range is returned in failed_start in this case.
 *
 * [start, end] is inclusive.
 * This takes the tree lock.
 */
int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits,
		   int exclusive, u64 *failed_start, gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	int err = 0;
	int set;
	u64 last_start;
	u64 last_end;
again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}
	write_lock_irq(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(&tree->state, start);
	if (!node) {
		err = insert_state(tree, prealloc, start, end, bits);
		prealloc = NULL;
		BUG_ON(err == -EEXIST);
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
	last_start = state->start;
	last_end = state->end;
	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		set = state->state & bits;
		if (set && exclusive) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}
		state->state |= bits;
		start = state->end + 1;
		merge_state(tree, state);
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *  | state |
	 *   or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		set = state->state & bits;
		if (exclusive && set) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		BUG_ON(err == -EEXIST);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state->state |= bits;
			start = state->end + 1;
			merge_state(tree, state);
		} else {
			start = state->start;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		set = state->state & bits;
		if (exclusive && set) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}
		err = split_state(tree, state, prealloc, end + 1);
		BUG_ON(err == -EEXIST);
		prealloc->state |= bits;
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;
		err = insert_state(tree, prealloc, start, this_end,
				   bits);
		prealloc = NULL;
		BUG_ON(err == -EEXIST);
		if (err)
			goto out;
		start = this_end + 1;
		goto search_again;
	}
	goto search_again;
out:
	write_unlock_irq(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);
	return err;
search_again:
	if (start > end)
		goto out;
	write_unlock_irq(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}
EXPORT_SYMBOL(set_extent_bit);

/* wrappers around set/clear extent bit */
int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
		     gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
			      mask);
}
EXPORT_SYMBOL(set_extent_dirty);

int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
		       gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_DIRTY, 0, 0, mask);
}
EXPORT_SYMBOL(clear_extent_dirty);

int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
		   gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
			      mask);
}
EXPORT_SYMBOL(set_extent_new);

int clear_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
		     gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
}
EXPORT_SYMBOL(clear_extent_new);

int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
			gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
			      mask);
}
EXPORT_SYMBOL(set_extent_uptodate);

int clear_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
			  gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
}
EXPORT_SYMBOL(clear_extent_uptodate);

int set_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
			 gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
			      0, NULL, mask);
}
EXPORT_SYMBOL(set_extent_writeback);

int clear_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
			   gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
}
EXPORT_SYMBOL(clear_extent_writeback);

int wait_on_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end)
{
	return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
}
EXPORT_SYMBOL(wait_on_extent_writeback);

/*
 * locks a range in ascending order, waiting for any locked regions
 * it hits on the way.  [start, end] are inclusive, and this will sleep.
 */
int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask)
{
	int err;
	u64 failed_start;
	while (1) {
		err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
				     &failed_start, mask);
		if (err == -EEXIST && (mask & __GFP_WAIT)) {
			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
			start = failed_start;
		} else {
			break;
		}
		WARN_ON(start > end);
	}
	return err;
}
EXPORT_SYMBOL(lock_extent);

int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end,
		  gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
}
EXPORT_SYMBOL(unlock_extent);

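/*
 * Typical locking pattern for a byte range (a sketch of how the helpers
 * above are meant to be combined, not a verbatim caller from this file):
 *
 *	lock_extent(tree, start, end, GFP_NOFS);
 *	... set up and submit IO, or update the range ...
 *	unlock_extent(tree, start, end, GFP_NOFS);
 *
 * End_io handlers may instead unlock with GFP_ATOMIC, as the read and write
 * completion callbacks below do.
 */
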
/*
 * helper function to set pages and extents in the tree dirty
 */
int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;
	while (index <= end_index) {
		page = find_get_page(tree->mapping, index);
		BUG_ON(!page);
		__set_page_dirty_nobuffers(page);
		page_cache_release(page);
		index++;
	}
	set_extent_dirty(tree, start, end, GFP_NOFS);
	return 0;
}
EXPORT_SYMBOL(set_range_dirty);

/*
 * helper function to set both pages and extents in the tree writeback
 */
int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;
	while (index <= end_index) {
		page = find_get_page(tree->mapping, index);
		BUG_ON(!page);
		set_page_writeback(page);
		page_cache_release(page);
		index++;
	}
	set_extent_writeback(tree, start, end, GFP_NOFS);
	return 0;
}
EXPORT_SYMBOL(set_range_writeback);

/*
 * helper function to lock both pages and extents in the tree.
 * pages must be locked first.
 */
int lock_range(struct extent_map_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;
	int err;
	while (index <= end_index) {
		page = grab_cache_page(tree->mapping, index);
		if (!page) {
			err = -ENOMEM;
			goto failed;
		}
		if (IS_ERR(page)) {
			err = PTR_ERR(page);
			goto failed;
		}
		index++;
	}
	lock_extent(tree, start, end, GFP_NOFS);
	return 0;
failed:
	/*
	 * we failed above in getting the page at 'index', so we undo here
	 * up to but not including the page at 'index'
	 */
	end_index = index;
	index = start >> PAGE_CACHE_SHIFT;
	while (index < end_index) {
		page = find_get_page(tree->mapping, index);
		unlock_page(page);
		page_cache_release(page);
		index++;
	}
	return err;
}
EXPORT_SYMBOL(lock_range);

/*
 * helper function to unlock both pages and extents in the tree.
 */
int unlock_range(struct extent_map_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;
	while (index <= end_index) {
		page = find_get_page(tree->mapping, index);
		unlock_page(page);
		page_cache_release(page);
		index++;
	}
	unlock_extent(tree, start, end, GFP_NOFS);
	return 0;
}
EXPORT_SYMBOL(unlock_range);

/*
 * searches a range in the state tree for a given mask.
 * If 'filled' == 1, this returns 1 only if every extent in the range
 * has the bits set.  Otherwise, 1 is returned if any bit in the
 * range is found set.
 */
static int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
			  int bits, int filled)
{
	struct extent_state *state = NULL;
	struct rb_node *node;
	int bitset = 0;
	read_lock_irq(&tree->lock);
	node = tree_search(&tree->state, start);
	while (node && start <= end) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->start > end)
			break;
		if (filled && state->start > start) {
			bitset = 0;
			break;
		}
		if (state->state & bits) {
			bitset = 1;
			if (!filled)
				break;
		} else if (filled) {
			bitset = 0;
			break;
		}
		start = state->end + 1;
		if (start > end)
			break;
		node = rb_next(node);
	}
	read_unlock_irq(&tree->lock);
	return bitset;
}

/*
 * helper function to set a given page up to date if all the
 * extents in the tree for that page are up to date
 */
static int check_page_uptodate(struct extent_map_tree *tree,
			       struct page *page)
{
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
		SetPageUptodate(page);
	return 0;
}

/*
 * helper function to unlock a page if all the extents in the tree
 * for that page are unlocked
 */
static int check_page_locked(struct extent_map_tree *tree,
			     struct page *page)
{
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
		unlock_page(page);
	return 0;
}

/*
 * helper function to end page writeback if all the extents
 * in the tree for that page are done with writeback
 */
static int check_page_writeback(struct extent_map_tree *tree,
				struct page *page)
{
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
		end_page_writeback(page);
	return 0;
}

/* lots and lots of room for performance fixes in the end_bio funcs */

/*
 * after a writepage IO is done, we need to:
 * clear the uptodate bits on error
 * clear the writeback bits in the extent tree for this IO
 * end_page_writeback if the page has no more pending IO
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static int end_bio_extent_writepage(struct bio *bio,
				    unsigned int bytes_done, int err)
{
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct extent_map_tree *tree = bio->bi_private;
	u64 start;
	u64 end;
	int whole_page;
	if (bio->bi_size)
		return 1;
	do {
		struct page *page = bvec->bv_page;
		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
			bvec->bv_offset;
		end = start + bvec->bv_len - 1;
		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
			whole_page = 1;
		else
			whole_page = 0;
		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);
		if (!uptodate) {
			clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
			ClearPageUptodate(page);
			SetPageError(page);
		}
		clear_extent_writeback(tree, start, end, GFP_ATOMIC);
		if (whole_page)
			end_page_writeback(page);
		else
			check_page_writeback(tree, page);
	} while (bvec >= bio->bi_io_vec);
	bio_put(bio);
	return 0;
}

/*
 * after a readpage IO is done, we need to:
 * clear the uptodate bits on error
 * set the uptodate bits if things worked
 * set the page up to date if all extents in the tree are uptodate
 * clear the lock bit in the extent tree
 * unlock the page if there are no other extents locked for it
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static int end_bio_extent_readpage(struct bio *bio,
				   unsigned int bytes_done, int err)
{
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct extent_map_tree *tree = bio->bi_private;
	u64 start;
	u64 end;
	int whole_page;
	if (bio->bi_size)
		return 1;
	do {
		struct page *page = bvec->bv_page;
		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
			bvec->bv_offset;
		end = start + bvec->bv_len - 1;
		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
			whole_page = 1;
		else
			whole_page = 0;
		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);
		if (uptodate) {
			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
			if (whole_page)
				SetPageUptodate(page);
			else
				check_page_uptodate(tree, page);
		} else {
			ClearPageUptodate(page);
			SetPageError(page);
		}
		unlock_extent(tree, start, end, GFP_ATOMIC);
		if (whole_page)
			unlock_page(page);
		else
			check_page_locked(tree, page);
	} while (bvec >= bio->bi_io_vec);
	bio_put(bio);
	return 0;
}

/*
 * IO done from prepare_write is pretty simple, we just unlock
 * the structs in the extent tree when done, and set the uptodate bits
 * as appropriate.
 */
static int end_bio_extent_preparewrite(struct bio *bio,
				       unsigned int bytes_done, int err)
{
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct extent_map_tree *tree = bio->bi_private;
	u64 start;
	u64 end;
	if (bio->bi_size)
		return 1;
	do {
		struct page *page = bvec->bv_page;
		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
			bvec->bv_offset;
		end = start + bvec->bv_len - 1;
		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);
		if (uptodate) {
			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
		} else {
			ClearPageUptodate(page);
			SetPageError(page);
		}
		unlock_extent(tree, start, end, GFP_ATOMIC);
	} while (bvec >= bio->bi_io_vec);
	bio_put(bio);
	return 0;
}

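/*
 * build and submit a one-segment bio for [offset, offset + size) of 'page'
 * at the given disk sector.  The extra bio_get/bio_put pair keeps the bio
 * alive so BIO_EOPNOTSUPP can be checked after submit_bio(); completion
 * work happens in the end_io handler that was passed in.
 */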
static int submit_extent_page(int rw, struct extent_map_tree *tree,
			      struct page *page, sector_t sector,
			      size_t size, unsigned long offset,
			      struct block_device *bdev,
			      bio_end_io_t end_io_func)
{
	struct bio *bio;
	int ret = 0;
	bio = bio_alloc(GFP_NOIO, 1);
	bio->bi_sector = sector;
	bio->bi_bdev = bdev;
	bio->bi_io_vec[0].bv_page = page;
	bio->bi_io_vec[0].bv_len = size;
	bio->bi_io_vec[0].bv_offset = offset;
	bio->bi_vcnt = 1;
	bio->bi_idx = 0;
	bio->bi_size = size;
	bio->bi_end_io = end_io_func;
	bio->bi_private = tree;
	bio_get(bio);
	submit_bio(rw, bio);
	if (bio_flagged(bio, BIO_EOPNOTSUPP))
		ret = -EOPNOTSUPP;
	bio_put(bio);
	return ret;
}

/*
 * basic readpage implementation.  Locked extent state structs are inserted
 * into the tree that are removed when the IO is done (by the end_io
 * handlers)
 */
int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
			  get_extent_t *get_extent)
{
	struct inode *inode = page->mapping->host;
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 page_end = start + PAGE_CACHE_SIZE - 1;
	u64 end;
	u64 cur = start;
	u64 extent_offset;
	u64 last_byte = i_size_read(inode);
	u64 block_start;
	u64 cur_end;
	sector_t sector;
	struct extent_map *em;
	struct block_device *bdev;
	int ret;
	int nr = 0;
	size_t page_offset = 0;
	size_t iosize;
	size_t blocksize = inode->i_sb->s_blocksize;
	if (!PagePrivate(page)) {
		SetPagePrivate(page);
		set_page_private(page, 1);
		page_cache_get(page);
	}
	end = page_end;
	lock_extent(tree, start, end, GFP_NOFS);
	while (cur <= end) {
		if (cur >= last_byte) {
			iosize = PAGE_CACHE_SIZE - page_offset;
			zero_user_page(page, page_offset, iosize, KM_USER0);
			set_extent_uptodate(tree, cur, cur + iosize - 1,
					    GFP_NOFS);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			break;
		}
		em = get_extent(inode, page, page_offset, cur, end, 0);
		if (IS_ERR(em) || !em) {
			SetPageError(page);
			unlock_extent(tree, cur, end, GFP_NOFS);
			break;
		}
		extent_offset = cur - em->start;
		BUG_ON(em->end < cur);
		BUG_ON(end < cur);
		iosize = min(em->end - cur, end - cur) + 1;
		cur_end = min(em->end, end);
		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
		sector = (em->block_start + extent_offset) >> 9;
		bdev = em->bdev;
		block_start = em->block_start;
		free_extent_map(em);
		em = NULL;
		/* we've found a hole, just zero and go on */
		if (block_start == 0) {
			zero_user_page(page, page_offset, iosize, KM_USER0);
			set_extent_uptodate(tree, cur, cur + iosize - 1,
					    GFP_NOFS);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}
		/* the get_extent function already copied into the page */
		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}
		ret = submit_extent_page(READ, tree, page,
					 sector, iosize, page_offset, bdev,
					 end_bio_extent_readpage);
		if (ret)
			SetPageError(page);
		cur = cur + iosize;
		page_offset += iosize;
		nr++;
	}
	if (!nr) {
		if (!PageError(page))
			SetPageUptodate(page);
		unlock_page(page);
	}
	return 0;
}
EXPORT_SYMBOL(extent_read_full_page);

/*
 * the writepage semantics are similar to regular writepage.  extent
 * records are inserted to lock ranges in the tree, and as dirty areas
 * are found, they are marked writeback.  Then the lock bits are removed
 * and the end_io handler clears the writeback ranges
 */
int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
			   get_extent_t *get_extent,
			   struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 page_end = start + PAGE_CACHE_SIZE - 1;
	u64 end;
	u64 cur = start;
	u64 extent_offset;
	u64 last_byte = i_size_read(inode);
	u64 block_start;
	sector_t sector;
	struct extent_map *em;
	struct block_device *bdev;
	int ret;
	int nr = 0;
	size_t page_offset = 0;
	size_t iosize;
	size_t blocksize;
	loff_t i_size = i_size_read(inode);
	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
	if (page->index > end_index) {
		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
		unlock_page(page);
		return 0;
	}
	if (page->index == end_index) {
		size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
		zero_user_page(page, offset,
			       PAGE_CACHE_SIZE - offset, KM_USER0);
	}
	if (!PagePrivate(page)) {
		SetPagePrivate(page);
		set_page_private(page, 1);
		page_cache_get(page);
	}
	end = page_end;
	lock_extent(tree, start, page_end, GFP_NOFS);
	if (last_byte <= start) {
		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
		goto done;
	}
	set_extent_uptodate(tree, start, page_end, GFP_NOFS);
	blocksize = inode->i_sb->s_blocksize;
	while (cur <= end) {
		if (cur >= last_byte) {
			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
			break;
		}
		em = get_extent(inode, page, page_offset, cur, end, 1);
		if (IS_ERR(em) || !em) {
			SetPageError(page);
			break;
		}
		extent_offset = cur - em->start;
		BUG_ON(em->end < cur);
		BUG_ON(end < cur);
		iosize = min(em->end - cur, end - cur) + 1;
		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
		sector = (em->block_start + extent_offset) >> 9;
		bdev = em->bdev;
		block_start = em->block_start;
		free_extent_map(em);
		em = NULL;
		if (block_start == 0 || block_start == EXTENT_MAP_INLINE) {
			clear_extent_dirty(tree, cur,
					   cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}
		/* leave this out until we have a page_mkwrite call */
		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
					 EXTENT_DIRTY, 0)) {
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}
		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
		set_range_writeback(tree, cur, cur + iosize - 1);
		ret = submit_extent_page(WRITE, tree, page,
					 sector, iosize, page_offset, bdev,
					 end_bio_extent_writepage);
		if (ret)
			SetPageError(page);
		cur = cur + iosize;
		page_offset += iosize;
		nr++;
	}
done:
	WARN_ON(test_range_bit(tree, start, page_end, EXTENT_DIRTY, 0));
	unlock_extent(tree, start, page_end, GFP_NOFS);
	unlock_page(page);
	return 0;
}
EXPORT_SYMBOL(extent_write_full_page);

/*
 * basic invalidatepage code, this waits on any locked or writeback
 * ranges corresponding to the page, and then deletes any extent state
 * records from the tree
 */
int extent_invalidatepage(struct extent_map_tree *tree,
			  struct page *page, unsigned long offset)
{
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
	start += (offset + blocksize - 1) & ~(blocksize - 1);
	if (start > end)
		return 0;
	lock_extent(tree, start, end, GFP_NOFS);
	wait_on_extent_writeback(tree, start, end);
	clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY,
			 1, 1, GFP_NOFS);
	return 0;
}
EXPORT_SYMBOL(extent_invalidatepage);

/*
 * simple commit_write call, set_range_dirty is used to mark both
 * the pages and the extent records as dirty
 */
int extent_commit_write(struct extent_map_tree *tree,
			struct inode *inode, struct page *page,
			unsigned from, unsigned to)
{
	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
	if (!PagePrivate(page)) {
		SetPagePrivate(page);
		set_page_private(page, 1);
		page_cache_get(page);
	}
	set_page_dirty(page);
	if (pos > inode->i_size) {
		i_size_write(inode, pos);
		mark_inode_dirty(inode);
	}
	return 0;
}
EXPORT_SYMBOL(extent_commit_write);

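/*
 * prepare_write counterpart: for each block touched by [from, to) that is
 * not already up to date, either zero the parts of a newly allocated block
 * that fall outside the write, or read the block in from disk.  Any reads
 * are waited on before returning so the caller can copy in user data.
 */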
int extent_prepare_write(struct extent_map_tree *tree,
			 struct inode *inode, struct page *page,
			 unsigned from, unsigned to, get_extent_t *get_extent)
{
	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
	u64 block_start;
	u64 orig_block_start;
	u64 block_end;
	u64 cur_end;
	struct extent_map *em;
	unsigned blocksize = 1 << inode->i_blkbits;
	size_t page_offset = 0;
	size_t block_off_start;
	size_t block_off_end;
	int err = 0;
	int iocount = 0;
	int ret = 0;
	int isnew;
	if (!PagePrivate(page)) {
		SetPagePrivate(page);
		set_page_private(page, 1);
		page_cache_get(page);
	}
	block_start = (page_start + from) & ~((u64)blocksize - 1);
	block_end = (page_start + to - 1) | (blocksize - 1);
	orig_block_start = block_start;
	lock_extent(tree, page_start, page_end, GFP_NOFS);
	while (block_start <= block_end) {
		em = get_extent(inode, page, page_offset, block_start,
				block_end, 1);
		if (IS_ERR(em) || !em) {
			goto err;
		}
		cur_end = min(block_end, em->end);
		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
		block_off_end = block_off_start + blocksize;
		isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
		if (!PageUptodate(page) && isnew &&
		    (block_off_end > to || block_off_start < from)) {
			void *kaddr;
			kaddr = kmap_atomic(page, KM_USER0);
			if (block_off_end > to)
				memset(kaddr + to, 0, block_off_end - to);
			if (block_off_start < from)
				memset(kaddr + block_off_start, 0,
				       from - block_off_start);
			flush_dcache_page(page);
			kunmap_atomic(kaddr, KM_USER0);
		}
		if (!isnew && !PageUptodate(page) &&
		    (block_off_end > to || block_off_start < from) &&
		    !test_range_bit(tree, block_start, cur_end,
				    EXTENT_UPTODATE, 1)) {
			u64 sector;
			u64 extent_offset = block_start - em->start;
			size_t iosize;
			sector = (em->block_start + extent_offset) >> 9;
			iosize = (cur_end - block_start + blocksize - 1) &
				 ~((u64)blocksize - 1);
			/*
			 * we've already got the extent locked, but we
			 * need to split the state such that our end_bio
			 * handler can clear the lock.
			 */
			set_extent_bit(tree, block_start,
				       block_start + iosize - 1,
				       EXTENT_LOCKED, 0, NULL, GFP_NOFS);
			ret = submit_extent_page(READ, tree, page,
						 sector, iosize, page_offset,
						 em->bdev,
						 end_bio_extent_preparewrite);
			iocount++;
			block_start = block_start + iosize;
		} else {
			set_extent_uptodate(tree, block_start, cur_end,
					    GFP_NOFS);
			unlock_extent(tree, block_start, cur_end, GFP_NOFS);
			block_start = cur_end + 1;
		}
		page_offset = block_start & (PAGE_CACHE_SIZE - 1);
		free_extent_map(em);
	}
	if (iocount) {
		wait_extent_bit(tree, orig_block_start,
				block_end, EXTENT_LOCKED);
	}
	check_page_uptodate(tree, page);
err:
	/* FIXME, zero out newly allocated blocks on error */
	return err;
}
EXPORT_SYMBOL(extent_prepare_write);

/*
 * a helper for releasepage.  As long as there are no locked extents
 * in the range corresponding to the page, both state records and extent
 * map records are removed
 */
int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page)
{
	struct extent_map *em;
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	u64 orig_start = start;
	while (start <= end) {
		em = lookup_extent_mapping(tree, start, end);
		if (!em || IS_ERR(em))
			break;
		if (test_range_bit(tree, em->start, em->end,
				   EXTENT_LOCKED, 0)) {
			free_extent_map(em);
			start = em->end + 1;
			printk("range still locked %Lu %Lu\n",
			       em->start, em->end);
			break;
		}
		remove_extent_mapping(tree, em);
		start = em->end + 1;
		/* once for the rb tree */
		free_extent_map(em);
		/* once for us */
		free_extent_map(em);
	}
	WARN_ON(test_range_bit(tree, orig_start, end, EXTENT_WRITEBACK, 0));
	clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
			 1, 1, GFP_NOFS);
	return 1;
}
EXPORT_SYMBOL(try_release_extent_mapping);