/*
 * linux/fs/nfs/blocklayout/extents.c
 *
 * Module for the NFSv4.1 pNFS block layout driver.
 *
 * Copyright (c) 2006 The Regents of the University of Michigan.
 * All rights reserved.
 *
 * Andy Adamson <andros@citi.umich.edu>
 * Fred Isaman <iisaman@umich.edu>
 *
 * permission is granted to use, copy, create derivative works and
 * redistribute this software and such derivative works for any purpose,
 * so long as the name of the university of michigan is not used in
 * any advertising or publicity pertaining to the use or distribution
 * of this software without specific, written prior authorization. if
 * the above copyright notice or any other identification of the
 * university of michigan is included in any copy of any portion of
 * this software, then the disclaimer below must also be included.
 *
 * this software is provided as is, without representation from the
 * university of michigan as to its fitness for any purpose, and without
 * warranty by the university of michigan of any kind, either express
 * or implied, including without limitation the implied warranties of
 * merchantability and fitness for a particular purpose. the regents
 * of the university of michigan shall not be liable for any damages,
 * including special, indirect, incidental, or consequential damages,
 * with respect to any claim arising out of or in connection with the use
 * of the software, even if it has been or is hereafter advised of the
 * possibility of such damages.
 */

#include "blocklayout.h"

#define NFSDBG_FACILITY NFSDBG_PNFS_LD

/* Bit numbers */
#define EXTENT_INITIALIZED 0
#define EXTENT_WRITTEN     1
#define EXTENT_IN_COMMIT   2
#define INTERNAL_EXISTS    MY_MAX_TAGS
#define INTERNAL_MASK      ((1 << INTERNAL_EXISTS) - 1)
/* Returns largest t<=s s.t. t%base==0 */
static inline sector_t normalize(sector_t s, int base)
{
	sector_t tmp = s; /* Since do_div modifies its argument */

	return s - do_div(tmp, base);
}

static inline sector_t normalize_up(sector_t s, int base)
{
	return normalize(s + base - 1, base);
}
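/* For example, with base == 8: normalize(13, 8) == 8 and
 * normalize_up(13, 8) == 16, i.e. the enclosing aligned range [8, 16).
 */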
/* Simple stub that keeps the tracking entries in a sorted list until the
 * final API is decided on.
 */
/* Returns tags, or negative */
static int32_t _find_entry(struct my_tree *tree, u64 s)
{
	struct pnfs_inval_tracking *pos;

	dprintk("%s(%llu) enter\n", __func__, s);
	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
		if (pos->it_sector > s)
			continue;
		else if (pos->it_sector == s)
			return pos->it_tags & INTERNAL_MASK;
		else
			break;
	}
	return -ENOENT;
}
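/* Returns 1 if the step-aligned slot containing sector s carries @tag,
 * 0 otherwise (including when no entry exists for that slot).
 */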
static inline
int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
{
	int32_t tags;

	dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
	s = normalize(s, tree->mtt_step_size);
	tags = _find_entry(tree, s);
	if ((tags < 0) || !(tags & (1 << tag)))
		return 0;
	else
		return 1;
}

/* Creates entry with tag, or if entry already exists, unions tag to it.
 * If storage is not NULL, newly created entry will use it.
 * Returns number of entries added, or negative on error.
 */
static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
		      struct pnfs_inval_tracking *storage)
{
	int found = 0;
	struct pnfs_inval_tracking *pos;

	dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
		if (pos->it_sector > s)
			continue;
		else if (pos->it_sector == s) {
			found = 1;
			break;
		} else
			break;
	}
	if (found) {
		pos->it_tags |= (1 << tag);
		return 0;
	} else {
		struct pnfs_inval_tracking *new;

		new = storage;
		new->it_sector = s;
		new->it_tags = (1 << tag);
		list_add(&new->it_link, &pos->it_link);
		return 1;
	}
}

/* XXXX Really want option to not create */
/* Over range, unions tag with existing entries, else creates entry with tag */
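/* Note: a NULL storage pointer is passed down to _add_entry, so callers are
 * expected to have created an entry for every slot in the range beforehand
 * (see _preload_range).
 */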
static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
{
	u64 i;

	dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
	for (i = normalize(s, tree->mtt_step_size); i < s + length;
	     i += tree->mtt_step_size)
		if (_add_entry(tree, i, tag, NULL))
			return -ENOMEM;
	return 0;
}

/* Ensure that future operations on given range of tree will not malloc */
static int _preload_range(struct pnfs_inval_markings *marks,
			  u64 offset, u64 length)
{
	u64 start, end, s;
	int count, i, used = 0, status = -ENOMEM;
	struct pnfs_inval_tracking **storage;
	struct my_tree *tree = &marks->im_tree;

	dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
	start = normalize(offset, tree->mtt_step_size);
	end = normalize_up(offset + length, tree->mtt_step_size);
	count = (int)(end - start) / (int)tree->mtt_step_size;

	/* Pre-malloc what memory we might need */
	storage = kcalloc(count, sizeof(*storage), GFP_NOFS);
	if (!storage)
		return -ENOMEM;
	for (i = 0; i < count; i++) {
		storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
				     GFP_NOFS);
		if (!storage[i])
			goto out_cleanup;
	}

	spin_lock_bh(&marks->im_lock);
	for (s = start; s < end; s += tree->mtt_step_size)
		used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
	spin_unlock_bh(&marks->im_lock);

	status = 0;

out_cleanup:
	for (i = used; i < count; i++) {
		if (!storage[i])
			break;
		kfree(storage[i]);
	}
	kfree(storage);
	return status;
}

/* We are relying on page lock to serialize this */
int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
{
	int rv;

	spin_lock_bh(&marks->im_lock);
	rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
	spin_unlock_bh(&marks->im_lock);
	return rv;
}
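/* Walks the sorted tracking list backwards from 'end' and checks that every
 * step-sized slot down to 'start' carries @tag; returns 1 if the whole range
 * is tagged, 0 otherwise.
 */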
/* Assume start, end already sector aligned */
static int
_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
{
	struct pnfs_inval_tracking *pos;
	u64 expect = 0;

	dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
		if (pos->it_sector >= end)
			continue;
		if (!expect) {
			if ((pos->it_sector == end - tree->mtt_step_size) &&
			    (pos->it_tags & (1 << tag))) {
				expect = pos->it_sector - tree->mtt_step_size;
				if (pos->it_sector < tree->mtt_step_size ||
				    expect < start)
					return 1;
				continue;
			} else {
				return 0;
			}
		}
		if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
			return 0;
		expect -= tree->mtt_step_size;
		if (expect < start)
			return 1;
	}
	return 0;
}

static int is_range_written(struct pnfs_inval_markings *marks,
			    sector_t start, sector_t end)
{
	int rv;

	spin_lock_bh(&marks->im_lock);
	rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
	spin_unlock_bh(&marks->im_lock);
	return rv;
}
/* Marks sectors in [offset, offset+length) as having been initialized.
 * All lengths are step-aligned, where step is min(pagesize, blocksize).
 * Currently assumes offset is page-aligned.
 */
int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
			 sector_t offset, sector_t length)
{
	sector_t start, end;

	dprintk("%s(offset=%llu,len=%llu) enter\n",
		__func__, (u64)offset, (u64)length);

	start = normalize(offset, marks->im_block_size);
	end = normalize_up(offset + length, marks->im_block_size);
	if (_preload_range(marks, start, end - start))
		goto outerr;

	spin_lock_bh(&marks->im_lock);
	if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
		goto out_unlock;
	spin_unlock_bh(&marks->im_lock);

	return 0;

out_unlock:
	spin_unlock_bh(&marks->im_lock);
outerr:
	return -ENOMEM;
}

/* Marks sectors in [offset, offset+length) as having been written to disk.
 * All lengths should be block aligned.
 */
static int mark_written_sectors(struct pnfs_inval_markings *marks,
				sector_t offset, sector_t length)
{
	int status;

	dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
		(u64)offset, (u64)length);
	spin_lock_bh(&marks->im_lock);
	status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
	spin_unlock_bh(&marks->im_lock);
	return status;
}
static void print_short_extent(struct pnfs_block_short_extent *be)
{
	dprintk("PRINT SHORT EXTENT extent %p\n", be);
	if (be) {
		dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset);
		dprintk(" be_length %llu\n", (u64)be->bse_length);
	}
}

static void print_clist(struct list_head *list, unsigned int count)
{
	struct pnfs_block_short_extent *be;
	unsigned int i = 0;

	ifdebug(FACILITY) {
		printk(KERN_DEBUG "****************\n");
		printk(KERN_DEBUG "Extent list looks like:\n");
		list_for_each_entry(be, list, bse_node) {
			i++;
			print_short_extent(be);
		}
		if (i != count)
			printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
		printk(KERN_DEBUG "****************\n");
	}
}

/* Note: In theory, we should do more checking that devid's match between
 * old and new, but if they don't, the lists are too corrupt to salvage anyway.
 */
/* Note this is very similar to bl_add_merge_extent */
static void add_to_commitlist(struct pnfs_block_layout *bl,
			      struct pnfs_block_short_extent *new)
{
	struct list_head *clist = &bl->bl_commit;
	struct pnfs_block_short_extent *old, *save;
	sector_t end = new->bse_f_offset + new->bse_length;

	dprintk("%s enter\n", __func__);
	print_short_extent(new);
	print_clist(clist, bl->bl_count);
	bl->bl_count++;
	/* Scan for proper place to insert, extending new to the left
	 * as much as possible.
	 */
	list_for_each_entry_safe(old, save, clist, bse_node) {
		if (new->bse_f_offset < old->bse_f_offset)
			break;
		if (end <= old->bse_f_offset + old->bse_length) {
			/* Range is already in list */
			bl->bl_count--;
			kfree(new);
			return;
		} else if (new->bse_f_offset <=
				old->bse_f_offset + old->bse_length) {
			/* new overlaps or abuts existing be */
			if (new->bse_mdev == old->bse_mdev) {
				/* extend new to fully replace old */
				new->bse_length += new->bse_f_offset -
						   old->bse_f_offset;
				new->bse_f_offset = old->bse_f_offset;
				list_del(&old->bse_node);
				bl->bl_count--;
				kfree(old);
			}
		}
	}
	/* Note that if we never hit the above break, old will not point to a
	 * valid extent.  However, in that case &old->bse_node==list.
	 */
	list_add_tail(&new->bse_node, &old->bse_node);
	/* Scan forward for overlaps.  If we find any, extend new and
	 * remove the overlapped extent.
	 */
	old = list_prepare_entry(new, clist, bse_node);
	list_for_each_entry_safe_continue(old, save, clist, bse_node) {
		if (end < old->bse_f_offset)
			break;
		/* new overlaps or abuts old */
		if (new->bse_mdev == old->bse_mdev) {
			if (end < old->bse_f_offset + old->bse_length) {
				/* extend new to fully cover old */
				end = old->bse_f_offset + old->bse_length;
				new->bse_length = end - new->bse_f_offset;
			}
			list_del(&old->bse_node);
			bl->bl_count--;
			kfree(old);
		}
	}
	dprintk("%s: after merging\n", __func__);
	print_clist(clist, bl->bl_count);
}
/* Note the range described by offset, length is guaranteed to be contained
 * within be.
 * new will be freed, either by this function or by add_to_commitlist if it
 * decides not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
 */
int bl_mark_for_commit(struct pnfs_block_extent *be,
		       sector_t offset, sector_t length,
		       struct pnfs_block_short_extent *new)
{
	sector_t new_end, end = offset + length;
	struct pnfs_block_layout *bl = container_of(be->be_inval,
						    struct pnfs_block_layout,
						    bl_inval);

	mark_written_sectors(be->be_inval, offset, length);
	/* We want to add the range to commit list, but it must be
	 * block-normalized, and verified that the normalized range has
	 * been entirely written to disk.
	 */
	new->bse_f_offset = offset;
	offset = normalize(offset, bl->bl_blocksize);
	if (offset < new->bse_f_offset) {
		if (is_range_written(be->be_inval, offset, new->bse_f_offset))
			new->bse_f_offset = offset;
		else
			new->bse_f_offset = offset + bl->bl_blocksize;
	}
	new_end = normalize_up(end, bl->bl_blocksize);
	if (end < new_end) {
		if (is_range_written(be->be_inval, end, new_end))
			end = new_end;
		else
			end = new_end - bl->bl_blocksize;
	}
	if (end <= new->bse_f_offset) {
		kfree(new);
		return 0;
	}
	new->bse_length = end - new->bse_f_offset;
	new->bse_devid = be->be_devid;
	new->bse_mdev = be->be_mdev;

	spin_lock(&bl->bl_ext_lock);
	add_to_commitlist(bl, new);
	spin_unlock(&bl->bl_ext_lock);
	return 0;
}

static void print_bl_extent(struct pnfs_block_extent *be)
{
	dprintk("PRINT EXTENT extent %p\n", be);
	if (be) {
		dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset);
		dprintk(" be_length %llu\n", (u64)be->be_length);
		dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset);
		dprintk(" be_state %d\n", be->be_state);
	}
}
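/* Extents are reference counted via be_refcnt: bl_alloc_extent() returns an
 * extent holding a single reference, kref_get()/bl_put_extent() take and drop
 * references, and destroy_extent() frees the extent once the last reference
 * is gone.
 */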
static void
destroy_extent(struct kref *kref)
{
	struct pnfs_block_extent *be;

	be = container_of(kref, struct pnfs_block_extent, be_refcnt);
	dprintk("%s be=%p\n", __func__, be);
	kfree(be);
}

void
bl_put_extent(struct pnfs_block_extent *be)
{
	if (be) {
		dprintk("%s enter %p (%i)\n", __func__, be,
			atomic_read(&be->be_refcnt.refcount));
		kref_put(&be->be_refcnt, destroy_extent);
	}
}

struct pnfs_block_extent *bl_alloc_extent(void)
{
	struct pnfs_block_extent *be;

	be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
	if (!be)
		return NULL;
	INIT_LIST_HEAD(&be->be_node);
	kref_init(&be->be_refcnt);
	be->be_inval = NULL;
	return be;
}

static void print_elist(struct list_head *list)
{
	struct pnfs_block_extent *be;

	dprintk("****************\n");
	dprintk("Extent list looks like:\n");
	list_for_each_entry(be, list, be_node) {
		print_bl_extent(be);
	}
	dprintk("****************\n");
}

static inline int
extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
{
	/* Note this assumes new->be_f_offset >= old->be_f_offset */
	return (new->be_state == old->be_state) &&
	       ((new->be_state == PNFS_BLOCK_NONE_DATA) ||
		((new->be_v_offset - old->be_v_offset ==
		  new->be_f_offset - old->be_f_offset) &&
		 new->be_mdev == old->be_mdev));
}
/* Adds new to appropriate list in bl, modifying new and removing existing
 * extents as appropriate to deal with overlaps.
 *
 * See bl_find_get_extent for list constraints.
 *
 * Refcount on new is already set.  If we end up not using it, or error out,
 * the reference needs to be put.
 *
 * bl->bl_ext_lock is held by caller.
 */
int
bl_add_merge_extent(struct pnfs_block_layout *bl,
		    struct pnfs_block_extent *new)
{
	struct pnfs_block_extent *be, *tmp;
	sector_t end = new->be_f_offset + new->be_length;
	struct list_head *list;

	dprintk("%s enter with be=%p\n", __func__, new);
	print_bl_extent(new);
	list = &bl->bl_extents[bl_choose_list(new->be_state)];
	print_elist(list);

	/* Scan for proper place to insert, extending new to the left
	 * as much as possible.
	 */
	list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
		if (new->be_f_offset >= be->be_f_offset + be->be_length)
			break;
		if (new->be_f_offset >= be->be_f_offset) {
			if (end <= be->be_f_offset + be->be_length) {
				/* new is a subset of existing be */
				if (extents_consistent(be, new)) {
					dprintk("%s: new is subset, ignoring\n",
						__func__);
					bl_put_extent(new);
					return 0;
				} else {
					goto out_err;
				}
			} else {
				/* |<--   be   -->|
				 *        |<--   new   -->| */
				if (extents_consistent(be, new)) {
					/* extend new to fully replace be */
					new->be_length += new->be_f_offset -
							  be->be_f_offset;
					new->be_f_offset = be->be_f_offset;
					new->be_v_offset = be->be_v_offset;
					dprintk("%s: removing %p\n", __func__,
						be);
					list_del(&be->be_node);
					bl_put_extent(be);
				} else {
					goto out_err;
				}
			}
		} else if (end >= be->be_f_offset + be->be_length) {
			/* new fully covers existing be */
			if (extents_consistent(be, new)) {
				/* extend new to fully replace be */
				dprintk("%s: removing %p\n", __func__, be);
				list_del(&be->be_node);
				bl_put_extent(be);
			} else {
				goto out_err;
			}
		} else if (end > be->be_f_offset) {
			/*        |<--   be   -->|
			 * |<--   new   -->| */
			if (extents_consistent(new, be)) {
				/* extend new to fully cover be */
				new->be_length += be->be_f_offset +
						  be->be_length -
						  new->be_f_offset -
						  new->be_length;
				dprintk("%s: removing %p\n", __func__, be);
				list_del(&be->be_node);
				bl_put_extent(be);
			} else {
				goto out_err;
			}
		}
	}
	/* Note that if we never hit the above break, be will not point to a
	 * valid extent.  However, in that case &be->be_node==list.
	 */
	list_add(&new->be_node, &be->be_node);
	dprintk("%s: inserting new\n", __func__);
	print_elist(list);

	/* FIXME - The per-list consistency checks have all been done,
	 * should now check cross-list consistency.
	 */
	return 0;

out_err:
	bl_put_extent(new);
	return -EIO;
}
/* Returns extent, or NULL.  If a second READ extent exists, it is returned
 * in cow_read, if given.
 *
 * The extents are kept in two separate ordered lists, one for READ and NONE,
 * one for READWRITE and INVALID.  Within each list, we assume:
 * 1. Extents are ordered by file offset.
 * 2. For any given isect, there is at most one extent that matches.
 */
struct pnfs_block_extent *
bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
		   struct pnfs_block_extent **cow_read)
{
	struct pnfs_block_extent *be, *cow, *ret;
	int i;

	dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
	cow = ret = NULL;
	spin_lock(&bl->bl_ext_lock);
	for (i = 0; i < EXTENT_LISTS; i++) {
		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
			if (isect >= be->be_f_offset + be->be_length)
				break;
			if (isect >= be->be_f_offset) {
				/* We have found an extent */
				dprintk("%s Get %p (%i)\n", __func__, be,
					atomic_read(&be->be_refcnt.refcount));
				kref_get(&be->be_refcnt);
				if (!ret)
					ret = be;
				else if (be->be_state != PNFS_BLOCK_READ_DATA)
					bl_put_extent(be);
				else
					cow = be;
				break;
			}
		}
		if (ret &&
		    (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
			break;
	}
	spin_unlock(&bl->bl_ext_lock);
	if (cow_read)
		*cow_read = cow;
	print_bl_extent(ret);
	return ret;
}
/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */
static struct pnfs_block_extent *
bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
{
	struct pnfs_block_extent *be, *ret = NULL;
	int i;

	dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
	for (i = 0; i < EXTENT_LISTS; i++) {
		if (ret)
			break;
		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
			if (isect >= be->be_f_offset + be->be_length)
				break;
			if (isect >= be->be_f_offset) {
				/* We have found an extent */
				dprintk("%s Get %p (%i)\n", __func__, be,
					atomic_read(&be->be_refcnt.refcount));
				kref_get(&be->be_refcnt);
				ret = be;
				break;
			}
		}
	}
	print_bl_extent(ret);
	return ret;
}
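/* XDR-encodes the ranges currently on bl->bl_commit for a LAYOUTCOMMIT
 * request, moving each encoded range onto bl->bl_committing so that
 * clean_pnfs_block_layoutupdate() can process it once the reply arrives.
 */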
int
encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
			       struct xdr_stream *xdr,
			       const struct nfs4_layoutcommit_args *arg)
{
	struct pnfs_block_short_extent *lce, *save;
	unsigned int count = 0;
	__be32 *p, *xdr_start;

	dprintk("%s enter\n", __func__);
	/* BUG - creation of bl_commit is buggy - need to wait for
	 * entire block to be marked WRITTEN before it can be added.
	 */
	spin_lock(&bl->bl_ext_lock);
	/* Want to adjust for possible truncate */
	/* We now want to adjust argument range */

	/* XDR encode the ranges found */
	xdr_start = xdr_reserve_space(xdr, 8);
	if (!xdr_start)
		goto out;
	list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
		p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data));
		if (!p)
			break;
		p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE);
		p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT);
		p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
		p = xdr_encode_hyper(p, 0LL);
		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
		list_del(&lce->bse_node);
		list_add_tail(&lce->bse_node, &bl->bl_committing);
		bl->bl_count--;
		count++;
	}
	xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
	xdr_start[1] = cpu_to_be32(count);
out:
	spin_unlock(&bl->bl_ext_lock);
	dprintk("%s found %i ranges\n", __func__, count);
	return 0;
}
/* Helper function for set_to_rw that initializes a new extent */
static void
_prep_new_extent(struct pnfs_block_extent *new,
		 struct pnfs_block_extent *orig,
		 sector_t offset, sector_t length, int state)
{
	kref_init(&new->be_refcnt);
	/* don't need to INIT_LIST_HEAD(&new->be_node) */
	memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
	new->be_mdev = orig->be_mdev;
	new->be_f_offset = offset;
	new->be_length = length;
	new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
	new->be_state = state;
	new->be_inval = orig->be_inval;
}

/* Tries to merge be with extent in front of it in list.
 * Frees storage if not used.
 */
static struct pnfs_block_extent *
_front_merge(struct pnfs_block_extent *be, struct list_head *head,
	     struct pnfs_block_extent *storage)
{
	struct pnfs_block_extent *prev;

	if (!storage)
		goto no_merge;
	if (&be->be_node == head || be->be_node.prev == head)
		goto no_merge;
	prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
	if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
	    !extents_consistent(prev, be))
		goto no_merge;
	_prep_new_extent(storage, prev, prev->be_f_offset,
			 prev->be_length + be->be_length, prev->be_state);
	list_replace(&prev->be_node, &storage->be_node);
	bl_put_extent(prev);
	list_del(&be->be_node);
	bl_put_extent(be);
	return storage;

no_merge:
	kfree(storage);
	return be;
}
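/* Converts the part of the INVALID extent containing 'offset' that overlaps
 * [offset, offset + length) to READWRITE state, splitting the original extent
 * into up to three pieces and merging with its neighbors where possible.
 * Returns the offset at which the caller should continue; if allocation fails
 * or the extent is not INVALID, no split is done.
 */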
static u64
set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
{
	u64 rv = offset + length;
	struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
	struct pnfs_block_extent *children[3];
	struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
	int i = 0, j;

	dprintk("%s(%llu, %llu)\n", __func__, offset, length);
	/* Create storage for up to three new extents e1, e2, e3 */
	e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
	e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
	e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
	/* BUG - we are ignoring any failure */
	if (!e1 || !e2 || !e3)
		goto out_nosplit;

	spin_lock(&bl->bl_ext_lock);
	be = bl_find_get_extent_locked(bl, offset);
	rv = be->be_f_offset + be->be_length;
	if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
		spin_unlock(&bl->bl_ext_lock);
		goto out_nosplit;
	}

	/* Add e* to children, bumping e*'s krefs */
	if (be->be_f_offset != offset) {
		_prep_new_extent(e1, be, be->be_f_offset,
				 offset - be->be_f_offset,
				 PNFS_BLOCK_INVALID_DATA);
		children[i++] = e1;
		print_bl_extent(e1);
	} else
		merge1 = e1;
	_prep_new_extent(e2, be, offset,
			 min(length, be->be_f_offset + be->be_length - offset),
			 PNFS_BLOCK_READWRITE_DATA);
	children[i++] = e2;
	print_bl_extent(e2);
	if (offset + length < be->be_f_offset + be->be_length) {
		_prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
				 be->be_f_offset + be->be_length -
				 offset - length,
				 PNFS_BLOCK_INVALID_DATA);
		children[i++] = e3;
		print_bl_extent(e3);
	} else
		merge2 = e3;

	/* Remove be from list, and insert the e* */
	/* We don't get refs on e*, since this list is the base reference
	 * set when init'ed.
	 */
	if (i < 3)
		children[i] = NULL;
	new = children[0];
	list_replace(&be->be_node, &new->be_node);
	bl_put_extent(be);
	new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
	for (j = 1; j < i; j++) {
		old = new;
		new = children[j];
		list_add(&new->be_node, &old->be_node);
	}
	if (merge2) {
		/* This is a HACK, should just create a _back_merge function */
		new = list_entry(new->be_node.next,
				 struct pnfs_block_extent, be_node);
		new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
	}
	spin_unlock(&bl->bl_ext_lock);

	/* Since we removed the base reference above, be is now scheduled for
	 * destruction.
	 */
	bl_put_extent(be);
	dprintk("%s returns %llu after split\n", __func__, rv);
	return rv;

out_nosplit:
	kfree(e1);
	kfree(e2);
	kfree(e3);
	dprintk("%s returns %llu without splitting\n", __func__, rv);
	return rv;
}
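/* Called once the LAYOUTCOMMIT reply has been processed.  On success, each
 * range on bl->bl_committing is converted to READWRITE state (piece by piece
 * via set_to_rw) and freed; on failure the ranges are returned to the commit
 * list.
 */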
void
clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
			      const struct nfs4_layoutcommit_args *arg,
			      int status)
{
	struct pnfs_block_short_extent *lce, *save;

	dprintk("%s status %d\n", __func__, status);
	list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) {
		if (likely(!status)) {
			u64 offset = lce->bse_f_offset;
			u64 end = offset + lce->bse_length;

			do {
				offset = set_to_rw(bl, offset, end - offset);
			} while (offset < end);
			list_del(&lce->bse_node);
			kfree(lce);
		} else {
			list_del(&lce->bse_node);
			spin_lock(&bl->bl_ext_lock);
			add_to_commitlist(bl, lce);
			spin_unlock(&bl->bl_ext_lock);
		}
	}
}
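/* The routines below maintain a pool of pre-allocated short extents on
 * marks->im_extents: bl_push_one_short_extent() adds one, callers take one
 * with bl_pop_one_short_extent() when needed, and unused reservations are
 * released with bl_free_short_extents().
 */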
int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
{
	struct pnfs_block_short_extent *new;

	new = kmalloc(sizeof(*new), GFP_NOFS);
	if (unlikely(!new))
		return -ENOMEM;

	spin_lock_bh(&marks->im_lock);
	list_add(&new->bse_node, &marks->im_extents);
	spin_unlock_bh(&marks->im_lock);

	return 0;
}

struct pnfs_block_short_extent *
bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
{
	struct pnfs_block_short_extent *rv = NULL;

	spin_lock_bh(&marks->im_lock);
	if (!list_empty(&marks->im_extents)) {
		rv = list_entry((&marks->im_extents)->next,
				struct pnfs_block_short_extent, bse_node);
		list_del_init(&rv->bse_node);
	}
	spin_unlock_bh(&marks->im_lock);

	return rv;
}

void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
{
	struct pnfs_block_short_extent *se = NULL, *tmp;

	if (num_to_free <= 0)
		return;

	spin_lock(&marks->im_lock);
	list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
		list_del(&se->bse_node);
		kfree(se);
		if (--num_to_free == 0)
			break;
	}
	spin_unlock(&marks->im_lock);

	BUG_ON(num_to_free > 0);
}