extents.c

/*
 * linux/fs/nfs/blocklayout/extents.c
 *
 * Module for the NFSv4.1 pNFS block layout driver.
 *
 * Copyright (c) 2006 The Regents of the University of Michigan.
 * All rights reserved.
 *
 * Andy Adamson <andros@citi.umich.edu>
 * Fred Isaman <iisaman@umich.edu>
 *
 * permission is granted to use, copy, create derivative works and
 * redistribute this software and such derivative works for any purpose,
 * so long as the name of the university of michigan is not used in
 * any advertising or publicity pertaining to the use or distribution
 * of this software without specific, written prior authorization. if
 * the above copyright notice or any other identification of the
 * university of michigan is included in any copy of any portion of
 * this software, then the disclaimer below must also be included.
 *
 * this software is provided as is, without representation from the
 * university of michigan as to its fitness for any purpose, and without
 * warranty by the university of michigan of any kind, either express
 * or implied, including without limitation the implied warranties of
 * merchantability and fitness for a particular purpose. the regents
 * of the university of michigan shall not be liable for any damages,
 * including special, indirect, incidental, or consequential damages,
 * with respect to any claim arising out or in connection with the use
 * of the software, even if it has been or is hereafter advised of the
 * possibility of such damages.
 */

#include "blocklayout.h"

#define NFSDBG_FACILITY NFSDBG_PNFS_LD

/* Bit numbers */
#define EXTENT_INITIALIZED 0
#define EXTENT_WRITTEN 1
#define EXTENT_IN_COMMIT 2
#define INTERNAL_EXISTS MY_MAX_TAGS
#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1)

/* Returns largest t<=s s.t. t%base==0 */
static inline sector_t normalize(sector_t s, int base)
{
	sector_t tmp = s; /* Since do_div modifies its argument */

	return s - do_div(tmp, base);
}

static inline sector_t normalize_up(sector_t s, int base)
{
	return normalize(s + base - 1, base);
}
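
/*
 * For example (assuming a step/base of 8 sectors, i.e. a 4K page over
 * 512-byte sectors):
 *
 *	normalize(1000, 8)    == 1000	(already aligned)
 *	normalize(1003, 8)    == 1000	(rounded down)
 *	normalize_up(1000, 8) == 1000	(already aligned, stays put)
 *	normalize_up(1003, 8) == 1008	(rounded up to the next boundary)
 */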

/* Complete stub using a list while we determine the API wanted */

/* Returns tags, or negative */
static int32_t _find_entry(struct my_tree *tree, u64 s)
{
	struct pnfs_inval_tracking *pos;

	dprintk("%s(%llu) enter\n", __func__, s);
	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
		if (pos->it_sector > s)
			continue;
		else if (pos->it_sector == s)
			return pos->it_tags & INTERNAL_MASK;
		else
			break;
	}
	return -ENOENT;
}

static inline
int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
{
	int32_t tags;

	dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
	s = normalize(s, tree->mtt_step_size);
	tags = _find_entry(tree, s);
	if ((tags < 0) || !(tags & (1 << tag)))
		return 0;
	else
		return 1;
}
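
/*
 * A note on the stub list (inferred from the code here, not from separate
 * documentation): tree->mtt_stub is kept sorted by it_sector, ascending
 * from head to tail, with at most one pnfs_inval_tracking node per
 * step-aligned sector.  The reverse walks above and below stop at the
 * last node at or before the sector of interest; _add_entry() below, for
 * example, would link a new node for sector 16 right after an existing
 * node for sector 8 (and before one for sector 24).
 */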

/* Creates entry with tag, or if entry already exists, unions tag to it.
 * If storage is not NULL, newly created entry will use it.
 * Returns number of entries added, or negative on error.
 */
static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
		      struct pnfs_inval_tracking *storage)
{
	int found = 0;
	struct pnfs_inval_tracking *pos;

	dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
		if (pos->it_sector > s)
			continue;
		else if (pos->it_sector == s) {
			found = 1;
			break;
		} else
			break;
	}
	if (found) {
		pos->it_tags |= (1 << tag);
		return 0;
	} else {
		struct pnfs_inval_tracking *new;
		if (storage)
			new = storage;
		else {
			new = kmalloc(sizeof(*new), GFP_NOFS);
			if (!new)
				return -ENOMEM;
		}
		new->it_sector = s;
		new->it_tags = (1 << tag);
		list_add(&new->it_link, &pos->it_link);
		return 1;
	}
}

/* XXXX Really want option to not create */
/* Over range, unions tag with existing entries, else creates entry with tag */
static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
{
	u64 i;

	dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
	for (i = normalize(s, tree->mtt_step_size); i < s + length;
	     i += tree->mtt_step_size)
		if (_add_entry(tree, i, tag, NULL))
			return -ENOMEM;
	return 0;
}

/* Ensure that future operations on given range of tree will not malloc */
static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
{
	u64 start, end, s;
	int count, i, used = 0, status = -ENOMEM;
	struct pnfs_inval_tracking **storage;

	dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);

	start = normalize(offset, tree->mtt_step_size);
	end = normalize_up(offset + length, tree->mtt_step_size);
	count = (int)(end - start) / (int)tree->mtt_step_size;

	/* Pre-malloc what memory we might need */
	storage = kmalloc(sizeof(*storage) * count, GFP_NOFS);
	if (!storage)
		return -ENOMEM;
	for (i = 0; i < count; i++) {
		storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
				     GFP_NOFS);
		if (!storage[i])
			goto out_cleanup;
	}

	/* Now need lock - HOW??? */
	for (s = start; s < end; s += tree->mtt_step_size)
		used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
	/* Unlock - HOW??? */
	status = 0;

out_cleanup:
	for (i = used; i < count; i++) {
		if (!storage[i])
			break;
		kfree(storage[i]);
	}
	kfree(storage);
	return status;
}
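
/*
 * A rough sketch of how the two helpers above are meant to be paired
 * (this mirrors bl_mark_sectors_init() below rather than documenting a
 * separate API):
 *
 *	_preload_range(tree, start, end - start);    may sleep, allocates
 *	spin_lock(lock);
 *	_set_range(tree, EXTENT_INITIALIZED, offset, length);
 *	spin_unlock(lock);
 *
 * After the preload, every step-aligned sector in [start, end) already
 * has a tracking node, so _set_range() under the spinlock only ORs in
 * tag bits and should not need to allocate.  (The "HOW???" comments
 * above note that the preload itself still runs without the lock.)
 */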

static void set_needs_init(sector_t *array, sector_t offset)
{
	sector_t *p = array;

	dprintk("%s enter\n", __func__);
	if (!p)
		return;
	while (*p < offset)
		p++;
	if (*p == offset)
		return;
	else if (*p == ~0) {
		*p++ = offset;
		*p = ~0;
		return;
	} else {
		sector_t *save = p;
		dprintk("%s Adding %llu\n", __func__, (u64)offset);
		while (*p != ~0)
			p++;
		p++;
		memmove(save + 1, save, (char *)p - (char *)save);
		*save = offset;
		return;
	}
}
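
/*
 * The "array" handled by set_needs_init() is (as far as this file uses it)
 * a sorted list of page-aligned sector offsets terminated by ~0, sized by
 * the caller in bl_mark_sectors_init().  For example, inserting 16 into
 * {8, 24, ~0} shifts the tail right and yields {8, 16, 24, ~0}; an offset
 * that is already present is left alone.
 */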

/* We are relying on page lock to serialize this */
int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
{
	int rv;

	spin_lock(&marks->im_lock);
	rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
	spin_unlock(&marks->im_lock);
	return rv;
}

/* Assume start, end already sector aligned */
static int
_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
{
	struct pnfs_inval_tracking *pos;
	u64 expect = 0;

	dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
		if (pos->it_sector >= end)
			continue;
		if (!expect) {
			if ((pos->it_sector == end - tree->mtt_step_size) &&
			    (pos->it_tags & (1 << tag))) {
				expect = pos->it_sector - tree->mtt_step_size;
				if (pos->it_sector < tree->mtt_step_size ||
				    expect < start)
					return 1;
				continue;
			} else {
				return 0;
			}
		}
		if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
			return 0;
		expect -= tree->mtt_step_size;
		if (expect < start)
			return 1;
	}
	return 0;
}
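
/*
 * In other words: _range_has_tag() succeeds only if a tagged tracking node
 * exists for every step-sized slot in [start, end), checked from the top
 * down.  For example, with a step of 8, start = 8 and end = 32, it expects
 * tagged nodes at sectors 24, 16 and 8, in that order; any gap or untagged
 * node makes it return 0.
 */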

static int is_range_written(struct pnfs_inval_markings *marks,
			    sector_t start, sector_t end)
{
	int rv;

	spin_lock(&marks->im_lock);
	rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
	spin_unlock(&marks->im_lock);
	return rv;
}

/* Marks sectors in [offset, offset + length) as having been initialized.
 * All lengths are step-aligned, where step is min(pagesize, blocksize).
 * Notes where a partial block is initialized, and helps prepare it for
 * complete initialization later.
 */
/* Currently assumes offset is page-aligned */
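/*
 * Summarizing the code below: when the "pages" argument is non-NULL,
 * *pages is set either to NULL or to a kmalloc'ed, ~0-terminated, sorted
 * array of page-aligned sectors that fall inside the touched blocks but
 * outside [offset, offset + length) and are not yet initialized;
 * ownership of that array passes to the caller.
 */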
int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
			 sector_t offset, sector_t length,
			 sector_t **pages)
{
	sector_t s, start, end;
	sector_t *array = NULL; /* Pages to mark */

	dprintk("%s(offset=%llu,len=%llu) enter\n",
		__func__, (u64)offset, (u64)length);

	s = max((sector_t) 3,
		2 * (marks->im_block_size / (PAGE_CACHE_SECTORS)));
	dprintk("%s set max=%llu\n", __func__, (u64)s);
	if (pages) {
		array = kmalloc(s * sizeof(sector_t), GFP_NOFS);
		if (!array)
			goto outerr;
		array[0] = ~0;
	}

	start = normalize(offset, marks->im_block_size);
	end = normalize_up(offset + length, marks->im_block_size);
	if (_preload_range(&marks->im_tree, start, end - start))
		goto outerr;

	spin_lock(&marks->im_lock);
	for (s = normalize_up(start, PAGE_CACHE_SECTORS);
	     s < offset; s += PAGE_CACHE_SECTORS) {
		dprintk("%s pre-area pages\n", __func__);
		/* Portion of used block is not initialized */
		if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
			set_needs_init(array, s);
	}
	if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
		goto out_unlock;
	for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS);
	     s < end; s += PAGE_CACHE_SECTORS) {
		dprintk("%s post-area pages\n", __func__);
		if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
			set_needs_init(array, s);
	}
	spin_unlock(&marks->im_lock);

	if (pages) {
		if (array[0] == ~0) {
			kfree(array);
			*pages = NULL;
		} else
			*pages = array;
	}
	return 0;

out_unlock:
	spin_unlock(&marks->im_lock);
outerr:
	if (pages) {
		kfree(array);
		*pages = NULL;
	}
	return -ENOMEM;
}

/* Marks sectors in [offset, offset + length) as having been written to disk.
 * All lengths should be block aligned.
 */
static int mark_written_sectors(struct pnfs_inval_markings *marks,
				sector_t offset, sector_t length)
{
	int status;

	dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
		(u64)offset, (u64)length);
	spin_lock(&marks->im_lock);
	status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
	spin_unlock(&marks->im_lock);
	return status;
}

static void print_short_extent(struct pnfs_block_short_extent *be)
{
	dprintk("PRINT SHORT EXTENT extent %p\n", be);
	if (be) {
		dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset);
		dprintk(" be_length   %llu\n", (u64)be->bse_length);
	}
}

static void print_clist(struct list_head *list, unsigned int count)
{
	struct pnfs_block_short_extent *be;
	unsigned int i = 0;

	ifdebug(FACILITY) {
		printk(KERN_DEBUG "****************\n");
		printk(KERN_DEBUG "Extent list looks like:\n");
		list_for_each_entry(be, list, bse_node) {
			i++;
			print_short_extent(be);
		}
		if (i != count)
			printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
		printk(KERN_DEBUG "****************\n");
	}
}

/* Note: In theory, we should do more checking that devids match between
 * old and new, but if they don't, the lists are too corrupt to salvage anyway.
 */
/* Note this is very similar to bl_add_merge_extent */
static void add_to_commitlist(struct pnfs_block_layout *bl,
			      struct pnfs_block_short_extent *new)
{
	struct list_head *clist = &bl->bl_commit;
	struct pnfs_block_short_extent *old, *save;
	sector_t end = new->bse_f_offset + new->bse_length;

	dprintk("%s enter\n", __func__);
	print_short_extent(new);
	print_clist(clist, bl->bl_count);
	bl->bl_count++;
	/* Scan for proper place to insert, extending new to the left
	 * as much as possible.
	 */
	list_for_each_entry_safe(old, save, clist, bse_node) {
		if (new->bse_f_offset < old->bse_f_offset)
			break;
		if (end <= old->bse_f_offset + old->bse_length) {
			/* Range is already in list */
			bl->bl_count--;
			kfree(new);
			return;
		} else if (new->bse_f_offset <=
			   old->bse_f_offset + old->bse_length) {
			/* new overlaps or abuts existing be */
			if (new->bse_mdev == old->bse_mdev) {
				/* extend new to fully replace old */
				new->bse_length += new->bse_f_offset -
						   old->bse_f_offset;
				new->bse_f_offset = old->bse_f_offset;
				list_del(&old->bse_node);
				bl->bl_count--;
				kfree(old);
			}
		}
	}
	/* Note that if we never hit the above break, old will not point to a
	 * valid extent.  However, in that case &old->bse_node==list.
	 */
	list_add_tail(&new->bse_node, &old->bse_node);
	/* Scan forward for overlaps.  If we find any, extend new and
	 * remove the overlapped extent.
	 */
	old = list_prepare_entry(new, clist, bse_node);
	list_for_each_entry_safe_continue(old, save, clist, bse_node) {
		if (end < old->bse_f_offset)
			break;
		/* new overlaps or abuts old */
		if (new->bse_mdev == old->bse_mdev) {
			if (end < old->bse_f_offset + old->bse_length) {
				/* extend new to fully cover old */
				end = old->bse_f_offset + old->bse_length;
				new->bse_length = end - new->bse_f_offset;
			}
			list_del(&old->bse_node);
			bl->bl_count--;
			kfree(old);
		}
	}
	dprintk("%s: after merging\n", __func__);
	print_clist(clist, bl->bl_count);
}

/* Note the range described by offset, length is guaranteed to be contained
 * within be.
 */
int bl_mark_for_commit(struct pnfs_block_extent *be,
		       sector_t offset, sector_t length)
{
	sector_t new_end, end = offset + length;
	struct pnfs_block_short_extent *new;
	struct pnfs_block_layout *bl = container_of(be->be_inval,
						    struct pnfs_block_layout,
						    bl_inval);

	new = kmalloc(sizeof(*new), GFP_NOFS);
	if (!new)
		return -ENOMEM;

	mark_written_sectors(be->be_inval, offset, length);
	/* We want to add the range to commit list, but it must be
	 * block-normalized, and verified that the normalized range has
	 * been entirely written to disk.
	 */
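	/* For example (assuming a block size of 8 sectors): a request for
	 * offset 10, length 20 covers [10, 30).  The start is pulled back
	 * to 8 only if [8, 10) has already been written, otherwise it is
	 * pushed up to 16; likewise the end is pushed out to 32 only if
	 * [30, 32) has been written, otherwise it is pulled back to 24.
	 */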
	new->bse_f_offset = offset;
	offset = normalize(offset, bl->bl_blocksize);
	if (offset < new->bse_f_offset) {
		if (is_range_written(be->be_inval, offset, new->bse_f_offset))
			new->bse_f_offset = offset;
		else
			new->bse_f_offset = offset + bl->bl_blocksize;
	}
	new_end = normalize_up(end, bl->bl_blocksize);
	if (end < new_end) {
		if (is_range_written(be->be_inval, end, new_end))
			end = new_end;
		else
			end = new_end - bl->bl_blocksize;
	}
	if (end <= new->bse_f_offset) {
		kfree(new);
		return 0;
	}
	new->bse_length = end - new->bse_f_offset;
	new->bse_devid = be->be_devid;
	new->bse_mdev = be->be_mdev;

	spin_lock(&bl->bl_ext_lock);
	/* new will be freed, either by add_to_commitlist if it decides not
	 * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
	 */
	add_to_commitlist(bl, new);
	spin_unlock(&bl->bl_ext_lock);
	return 0;
}

static void print_bl_extent(struct pnfs_block_extent *be)
{
	dprintk("PRINT EXTENT extent %p\n", be);
	if (be) {
		dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset);
		dprintk(" be_length   %llu\n", (u64)be->be_length);
		dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset);
		dprintk(" be_state    %d\n", be->be_state);
	}
}

static void
destroy_extent(struct kref *kref)
{
	struct pnfs_block_extent *be;

	be = container_of(kref, struct pnfs_block_extent, be_refcnt);
	dprintk("%s be=%p\n", __func__, be);
	kfree(be);
}

void
bl_put_extent(struct pnfs_block_extent *be)
{
	if (be) {
		dprintk("%s enter %p (%i)\n", __func__, be,
			atomic_read(&be->be_refcnt.refcount));
		kref_put(&be->be_refcnt, destroy_extent);
	}
}

struct pnfs_block_extent *bl_alloc_extent(void)
{
	struct pnfs_block_extent *be;

	be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
	if (!be)
		return NULL;
	INIT_LIST_HEAD(&be->be_node);
	kref_init(&be->be_refcnt);
	be->be_inval = NULL;
	return be;
}

static void print_elist(struct list_head *list)
{
	struct pnfs_block_extent *be;

	dprintk("****************\n");
	dprintk("Extent list looks like:\n");
	list_for_each_entry(be, list, be_node) {
		print_bl_extent(be);
	}
	dprintk("****************\n");
}

static inline int
extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
{
	/* Note this assumes new->be_f_offset >= old->be_f_offset */
	return (new->be_state == old->be_state) &&
	       ((new->be_state == PNFS_BLOCK_NONE_DATA) ||
		((new->be_v_offset - old->be_v_offset ==
		  new->be_f_offset - old->be_f_offset) &&
		 new->be_mdev == old->be_mdev));
}
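
/*
 * Roughly: two extents are "consistent" if they are in the same state and,
 * unless that state is NONE_DATA, they live on the same device with file
 * and volume offsets shifted by the same amount.  E.g. an extent mapping
 * file sector 0 to volume sector 100 is consistent with one mapping file
 * sector 8 to volume sector 108 (same state, same mdev), but not with one
 * mapping file sector 8 to volume sector 200.
 */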

/* Adds new to appropriate list in bl, modifying new and removing existing
 * extents as appropriate to deal with overlaps.
 *
 * See bl_find_get_extent for list constraints.
 *
 * Refcount on new is already set.  If we end up not using it, or we error
 * out, we need to put the reference.
 *
 * bl->bl_ext_lock is held by caller.
 */
int
bl_add_merge_extent(struct pnfs_block_layout *bl,
		    struct pnfs_block_extent *new)
{
	struct pnfs_block_extent *be, *tmp;
	sector_t end = new->be_f_offset + new->be_length;
	struct list_head *list;

	dprintk("%s enter with be=%p\n", __func__, new);
	print_bl_extent(new);
	list = &bl->bl_extents[bl_choose_list(new->be_state)];
	print_elist(list);

	/* Scan for proper place to insert, extending new to the left
	 * as much as possible.
	 */
	list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
		if (new->be_f_offset >= be->be_f_offset + be->be_length)
			break;
		if (new->be_f_offset >= be->be_f_offset) {
			if (end <= be->be_f_offset + be->be_length) {
				/* new is a subset of existing be */
				if (extents_consistent(be, new)) {
					dprintk("%s: new is subset, ignoring\n",
						__func__);
					bl_put_extent(new);
					return 0;
				} else {
					goto out_err;
				}
			} else {
				/* |<--   be   -->|
				 *        |<--   new   -->| */
				if (extents_consistent(be, new)) {
					/* extend new to fully replace be */
					new->be_length += new->be_f_offset -
							  be->be_f_offset;
					new->be_f_offset = be->be_f_offset;
					new->be_v_offset = be->be_v_offset;
					dprintk("%s: removing %p\n", __func__, be);
					list_del(&be->be_node);
					bl_put_extent(be);
				} else {
					goto out_err;
				}
			}
		} else if (end >= be->be_f_offset + be->be_length) {
			/* new extent overlaps existing be */
			if (extents_consistent(be, new)) {
				/* extend new to fully replace be */
				dprintk("%s: removing %p\n", __func__, be);
				list_del(&be->be_node);
				bl_put_extent(be);
			} else {
				goto out_err;
			}
		} else if (end > be->be_f_offset) {
			/*        |<--   be   -->|
			 * |<--   new   -->| */
			if (extents_consistent(new, be)) {
				/* extend new to fully replace be */
				new->be_length += be->be_f_offset + be->be_length -
						  new->be_f_offset - new->be_length;
				dprintk("%s: removing %p\n", __func__, be);
				list_del(&be->be_node);
				bl_put_extent(be);
			} else {
				goto out_err;
			}
		}
	}
	/* Note that if we never hit the above break, be will not point to a
	 * valid extent.  However, in that case &be->be_node==list.
	 */
	list_add(&new->be_node, &be->be_node);
	dprintk("%s: inserting new\n", __func__);
	print_elist(list);

	/* FIXME - The per-list consistency checks have all been done,
	 * should now check cross-list consistency.
	 */
	return 0;

out_err:
	bl_put_extent(new);
	return -EIO;
}

/* Returns extent, or NULL.  If a second READ extent exists, it is returned
 * in cow_read, if given.
 *
 * The extents are kept in two separate ordered lists, one for READ and NONE,
 * one for READWRITE and INVALID.  Within each list, we assume:
 * 1. Extents are ordered by file offset.
 * 2. For any given isect, there is at most one extent that matches.
 */
struct pnfs_block_extent *
bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
		   struct pnfs_block_extent **cow_read)
{
	struct pnfs_block_extent *be, *cow, *ret;
	int i;

	dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
	cow = ret = NULL;
	spin_lock(&bl->bl_ext_lock);
	for (i = 0; i < EXTENT_LISTS; i++) {
		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
			if (isect >= be->be_f_offset + be->be_length)
				break;
			if (isect >= be->be_f_offset) {
				/* We have found an extent */
				dprintk("%s Get %p (%i)\n", __func__, be,
					atomic_read(&be->be_refcnt.refcount));
				kref_get(&be->be_refcnt);
				if (!ret)
					ret = be;
				else if (be->be_state != PNFS_BLOCK_READ_DATA)
					bl_put_extent(be);
				else
					cow = be;
				break;
			}
		}
		if (ret &&
		    (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
			break;
	}
	spin_unlock(&bl->bl_ext_lock);
	if (cow_read)
		*cow_read = cow;
	print_bl_extent(ret);
	return ret;
}

/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */
static struct pnfs_block_extent *
bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
{
	struct pnfs_block_extent *be, *ret = NULL;
	int i;

	dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
	for (i = 0; i < EXTENT_LISTS; i++) {
		if (ret)
			break;
		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
			if (isect >= be->be_f_offset + be->be_length)
				break;
			if (isect >= be->be_f_offset) {
				/* We have found an extent */
				dprintk("%s Get %p (%i)\n", __func__, be,
					atomic_read(&be->be_refcnt.refcount));
				kref_get(&be->be_refcnt);
				ret = be;
				break;
			}
		}
	}
	print_bl_extent(ret);
	return ret;
}

int
encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
			       struct xdr_stream *xdr,
			       const struct nfs4_layoutcommit_args *arg)
{
	struct pnfs_block_short_extent *lce, *save;
	unsigned int count = 0;
	__be32 *p, *xdr_start;

	dprintk("%s enter\n", __func__);
	/* BUG - creation of bl_commit is buggy - need to wait for
	 * entire block to be marked WRITTEN before it can be added.
	 */
	spin_lock(&bl->bl_ext_lock);
	/* Want to adjust for possible truncate */
	/* We now want to adjust argument range */

	/* XDR encode the ranges found */
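	/* Each commit entry below is encoded as: the 16-byte deviceid, the
	 * file offset and length converted from sectors to bytes, a storage
	 * offset of 0, and the state PNFS_BLOCK_READWRITE_DATA (44 bytes
	 * per entry).  The two words reserved at xdr_start are backfilled
	 * at the end with the byte length of everything that follows the
	 * first word and the number of extents encoded.
	 */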
	xdr_start = xdr_reserve_space(xdr, 8);
	if (!xdr_start)
		goto out;
	list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
		p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data));
		if (!p)
			break;
		p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE);
		p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT);
		p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
		p = xdr_encode_hyper(p, 0LL);
		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
		list_del(&lce->bse_node);
		list_add_tail(&lce->bse_node, &bl->bl_committing);
		bl->bl_count--;
		count++;
	}
	xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
	xdr_start[1] = cpu_to_be32(count);
out:
	spin_unlock(&bl->bl_ext_lock);

	dprintk("%s found %i ranges\n", __func__, count);
	return 0;
}

/* Helper function to set_to_rw that initializes a new extent */
static void
_prep_new_extent(struct pnfs_block_extent *new,
		 struct pnfs_block_extent *orig,
		 sector_t offset, sector_t length, int state)
{
	kref_init(&new->be_refcnt);
	/* don't need to INIT_LIST_HEAD(&new->be_node) */
	memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
	new->be_mdev = orig->be_mdev;
	new->be_f_offset = offset;
	new->be_length = length;
	new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
	new->be_state = state;
	new->be_inval = orig->be_inval;
}

/* Tries to merge be with extent in front of it in list.
 * Frees storage if not used.
 */
static struct pnfs_block_extent *
_front_merge(struct pnfs_block_extent *be, struct list_head *head,
	     struct pnfs_block_extent *storage)
{
	struct pnfs_block_extent *prev;

	if (!storage)
		goto no_merge;
	if (&be->be_node == head || be->be_node.prev == head)
		goto no_merge;
	prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
	if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
	    !extents_consistent(prev, be))
		goto no_merge;
	_prep_new_extent(storage, prev, prev->be_f_offset,
			 prev->be_length + be->be_length, prev->be_state);
	list_replace(&prev->be_node, &storage->be_node);
	bl_put_extent(prev);
	list_del(&be->be_node);
	bl_put_extent(be);
	return storage;

no_merge:
	kfree(storage);
	return be;
}
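
/* A worked example of the split below (the sector numbers are illustrative,
 * not taken from any particular trace): given an INVALID extent covering
 * sectors [0, 100) and a call set_to_rw(bl, 30, 20), the extent is replaced
 * by up to three children built in e1/e2/e3:
 *
 *	INVALID   [0, 30)
 *	READWRITE [30, 50)
 *	INVALID   [50, 100)
 *
 * and the return value is the end of the extent that was found (100 here),
 * so the caller in clean_pnfs_block_layoutupdate() can keep looping until
 * the whole committed range has been converted.  When offset or
 * offset + length already falls on an extent boundary, the corresponding
 * INVALID piece is skipped and e1/e3 is instead passed to _front_merge()
 * as scratch storage.
 */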
static u64
set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
{
	u64 rv = offset + length;
	struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
	struct pnfs_block_extent *children[3];
	struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
	int i = 0, j;

	dprintk("%s(%llu, %llu)\n", __func__, offset, length);
	/* Create storage for up to three new extents e1, e2, e3 */
	e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
	e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
	e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
	/* BUG - we are ignoring any failure */
	if (!e1 || !e2 || !e3)
		goto out_nosplit;

	spin_lock(&bl->bl_ext_lock);
	be = bl_find_get_extent_locked(bl, offset);
	rv = be->be_f_offset + be->be_length;
	if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
		spin_unlock(&bl->bl_ext_lock);
		goto out_nosplit;
	}

	/* Add e* to children, bumping e*'s krefs */
	if (be->be_f_offset != offset) {
		_prep_new_extent(e1, be, be->be_f_offset,
				 offset - be->be_f_offset,
				 PNFS_BLOCK_INVALID_DATA);
		children[i++] = e1;
		print_bl_extent(e1);
	} else
		merge1 = e1;
	_prep_new_extent(e2, be, offset,
			 min(length, be->be_f_offset + be->be_length - offset),
			 PNFS_BLOCK_READWRITE_DATA);
	children[i++] = e2;
	print_bl_extent(e2);
	if (offset + length < be->be_f_offset + be->be_length) {
		_prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
				 be->be_f_offset + be->be_length -
				 offset - length,
				 PNFS_BLOCK_INVALID_DATA);
		children[i++] = e3;
		print_bl_extent(e3);
	} else
		merge2 = e3;

	/* Remove be from list, and insert the e* */
	/* We don't get refs on e*, since this list is the base reference
	 * set when init'ed.
	 */
	if (i < 3)
		children[i] = NULL;
	new = children[0];
	list_replace(&be->be_node, &new->be_node);
	bl_put_extent(be);
	new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
	for (j = 1; j < i; j++) {
		old = new;
		new = children[j];
		list_add(&new->be_node, &old->be_node);
	}
	if (merge2) {
		/* This is a HACK, should just create a _back_merge function */
		new = list_entry(new->be_node.next,
				 struct pnfs_block_extent, be_node);
		new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
	}
	spin_unlock(&bl->bl_ext_lock);

	/* Since we removed the base reference above, be is now scheduled for
	 * destruction.
	 */
	bl_put_extent(be);
	dprintk("%s returns %llu after split\n", __func__, rv);
	return rv;

out_nosplit:
	kfree(e1);
	kfree(e2);
	kfree(e3);
	dprintk("%s returns %llu without splitting\n", __func__, rv);
	return rv;
}

void
clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
			      const struct nfs4_layoutcommit_args *arg,
			      int status)
{
	struct pnfs_block_short_extent *lce, *save;

	dprintk("%s status %d\n", __func__, status);
	list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) {
		if (likely(!status)) {
			u64 offset = lce->bse_f_offset;
			u64 end = offset + lce->bse_length;

			do {
				offset = set_to_rw(bl, offset, end - offset);
			} while (offset < end);
			list_del(&lce->bse_node);
			kfree(lce);
		} else {
			list_del(&lce->bse_node);
			spin_lock(&bl->bl_ext_lock);
			add_to_commitlist(bl, lce);
			spin_unlock(&bl->bl_ext_lock);
		}
	}
}