drbd_bitmap.c 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560
  1. /*
  2. drbd_bitmap.c
  3. This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
  4. Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
  5. Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
  6. Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
  7. drbd is free software; you can redistribute it and/or modify
  8. it under the terms of the GNU General Public License as published by
  9. the Free Software Foundation; either version 2, or (at your option)
  10. any later version.
  11. drbd is distributed in the hope that it will be useful,
  12. but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. GNU General Public License for more details.
  15. You should have received a copy of the GNU General Public License
  16. along with drbd; see the file COPYING. If not, write to
  17. the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  18. */
  19. #include <linux/bitops.h>
  20. #include <linux/vmalloc.h>
  21. #include <linux/string.h>
  22. #include <linux/drbd.h>
  23. #include <linux/slab.h>
  24. #include <asm/kmap_types.h>
  25. #include "drbd_int.h"
  26. /* OPAQUE outside this file!
  27. * interface defined in drbd_int.h
  28. * convention:
  29. * function name drbd_bm_... => used elsewhere, "public".
  30. * function name bm_... => internal to implementation, "private".
  31. * Note that since find_first_bit returns int, at the current granularity of
  32. * the bitmap (4KB of storage per bit), this implementation "only" supports
  33. * up to 1<<(32+12) bytes == 16 TB...
  34. */
  35. /*
  36. * NOTE
  37. * Access to the *bm_pages is protected by bm_lock.
  38. * It is safe to read the other members within the lock.
  39. *
  40. * drbd_bm_set_bits is called from bio_endio callbacks,
  41. * We may be called with irq already disabled,
  42. * so we need spin_lock_irqsave().
  43. * And we need the kmap_atomic.
  44. */
  /*
   * In-core representation of one device's bitmap.
   * The bitmap words live in an array of individually allocated pages
   * (bm_pages); the remaining members describe its size and locking state.
   */
  struct drbd_bitmap {
      /* array of bm_number_of_pages page pointers holding the bitmap words */
      struct page **bm_pages;
      /* protects access to *bm_pages */
      spinlock_t bm_lock;
      /* WARNING unsigned long bm_*:
       * 32bit number of bit offset is just enough for 512 MB bitmap.
       * it will blow up if we make the bitmap bigger...
       * not that it makes much sense to have a bitmap that large,
       * rather change the granularity to 16k or 64k or something.
       * (that implies other problems, however...)
       */
      unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */
      /* number of valid bits in the bitmap */
      unsigned long bm_bits;
      /* number of long words, as computed by drbd_bm_resize (64bit aligned) */
      size_t bm_words;
      /* number of entries in bm_pages */
      size_t bm_number_of_pages;
      /* the capacity last passed to drbd_bm_resize */
      sector_t bm_dev_capacity;
      struct mutex bm_change; /* serializes resize operations */
      wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */
      /* BM_LOCKED, BM_P_VMALLOCED, ... see the bit definitions for bm_flags */
      unsigned long bm_flags;
      /* debugging aid, in case we are still racy somewhere */
      /* reason string handed to drbd_bm_lock by the current lock holder */
      char *bm_why;
      /* task that currently holds the lock taken by drbd_bm_lock */
      struct task_struct *bm_task;
  };
  /* definition of bits in bm_flags */
  /* set by drbd_bm_lock, cleared by drbd_bm_unlock */
  #define BM_LOCKED 0
  // #define BM_MD_IO_ERROR 1 unused now.
  /* the bm_pages pointer array came from vmalloc rather than kmalloc */
  #define BM_P_VMALLOCED 2
  /* forward declaration */
  static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
      unsigned long e, int val, const enum km_type km);
  73. static int bm_is_locked(struct drbd_bitmap *b)
  74. {
  75. return test_bit(BM_LOCKED, &b->bm_flags);
  76. }
  77. #define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
  78. static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
  79. {
  80. struct drbd_bitmap *b = mdev->bitmap;
  81. if (!__ratelimit(&drbd_ratelimit_state))
  82. return;
  83. dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",
  84. current == mdev->receiver.task ? "receiver" :
  85. current == mdev->asender.task ? "asender" :
  86. current == mdev->worker.task ? "worker" : current->comm,
  87. func, b->bm_why ?: "?",
  88. b->bm_task == mdev->receiver.task ? "receiver" :
  89. b->bm_task == mdev->asender.task ? "asender" :
  90. b->bm_task == mdev->worker.task ? "worker" : "?");
  91. }
  92. void drbd_bm_lock(struct drbd_conf *mdev, char *why)
  93. {
  94. struct drbd_bitmap *b = mdev->bitmap;
  95. int trylock_failed;
  96. if (!b) {
  97. dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n");
  98. return;
  99. }
  100. trylock_failed = !mutex_trylock(&b->bm_change);
  101. if (trylock_failed) {
  102. dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",
  103. current == mdev->receiver.task ? "receiver" :
  104. current == mdev->asender.task ? "asender" :
  105. current == mdev->worker.task ? "worker" : current->comm,
  106. why, b->bm_why ?: "?",
  107. b->bm_task == mdev->receiver.task ? "receiver" :
  108. b->bm_task == mdev->asender.task ? "asender" :
  109. b->bm_task == mdev->worker.task ? "worker" : "?");
  110. mutex_lock(&b->bm_change);
  111. }
  112. if (__test_and_set_bit(BM_LOCKED, &b->bm_flags))
  113. dev_err(DEV, "FIXME bitmap already locked in bm_lock\n");
  114. b->bm_why = why;
  115. b->bm_task = current;
  116. }
  117. void drbd_bm_unlock(struct drbd_conf *mdev)
  118. {
  119. struct drbd_bitmap *b = mdev->bitmap;
  120. if (!b) {
  121. dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n");
  122. return;
  123. }
  124. if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags))
  125. dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n");
  126. b->bm_why = NULL;
  127. b->bm_task = NULL;
  128. mutex_unlock(&b->bm_change);
  129. }
  /* we store some "meta" info about our pages in page->private */
  /* at a granularity of 4k storage per bitmap bit:
   * one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks
   * 1<<38 bits,
   * 1<<23 4k bitmap pages.
   * Use 24 bits as page index, covers 2 peta byte storage
   * at a granularity of 4k per bit.
   * Used to report the failed page idx on io error from the endio handlers.
   */
  #define BM_PAGE_IDX_MASK ((1UL<<24)-1)
  /* this page is currently read in, or written back */
  #define BM_PAGE_IO_LOCK 31
  /* if there has been an IO error for this page */
  #define BM_PAGE_IO_ERROR 30
  /* this is to be able to intelligently skip disk IO,
   * set if bits have been set since last IO. */
  #define BM_PAGE_NEED_WRITEOUT 29
  /* to mark for lazy writeout once syncer cleared all clearable bits,
   * set if bits have been cleared since last IO. */
  #define BM_PAGE_LAZY_WRITEOUT 28
  150. /* store_page_idx uses non-atomic assingment. It is only used directly after
  151. * allocating the page. All other bm_set_page_* and bm_clear_page_* need to
  152. * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap
  153. * changes) may happen from various contexts, and wait_on_bit/wake_up_bit
  154. * requires it all to be atomic as well. */
  155. static void bm_store_page_idx(struct page *page, unsigned long idx)
  156. {
  157. BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
  158. page_private(page) |= idx;
  159. }
  160. static unsigned long bm_page_to_idx(struct page *page)
  161. {
  162. return page_private(page) & BM_PAGE_IDX_MASK;
  163. }
  164. /* As is very unlikely that the same page is under IO from more than one
  165. * context, we can get away with a bit per page and one wait queue per bitmap.
  166. */
  167. static void bm_page_lock_io(struct drbd_conf *mdev, int page_nr)
  168. {
  169. struct drbd_bitmap *b = mdev->bitmap;
  170. void *addr = &page_private(b->bm_pages[page_nr]);
  171. wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr));
  172. }
  173. static void bm_page_unlock_io(struct drbd_conf *mdev, int page_nr)
  174. {
  175. struct drbd_bitmap *b = mdev->bitmap;
  176. void *addr = &page_private(b->bm_pages[page_nr]);
  177. clear_bit(BM_PAGE_IO_LOCK, addr);
  178. smp_mb__after_clear_bit();
  179. wake_up(&mdev->bitmap->bm_io_wait);
  180. }
  181. /* set _before_ submit_io, so it may be reset due to being changed
  182. * while this page is in flight... will get submitted later again */
  183. static void bm_set_page_unchanged(struct page *page)
  184. {
  185. /* use cmpxchg? */
  186. clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
  187. clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
  188. }
  189. static void bm_set_page_need_writeout(struct page *page)
  190. {
  191. set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
  192. }
  193. static int bm_test_page_unchanged(struct page *page)
  194. {
  195. volatile const unsigned long *addr = &page_private(page);
  196. return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0;
  197. }
  198. static void bm_set_page_io_err(struct page *page)
  199. {
  200. set_bit(BM_PAGE_IO_ERROR, &page_private(page));
  201. }
  202. static void bm_clear_page_io_err(struct page *page)
  203. {
  204. clear_bit(BM_PAGE_IO_ERROR, &page_private(page));
  205. }
  206. static void bm_set_page_lazy_writeout(struct page *page)
  207. {
  208. set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
  209. }
  210. static int bm_test_page_lazy_writeout(struct page *page)
  211. {
  212. return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
  213. }
  214. /* on a 32bit box, this would allow for exactly (2<<38) bits. */
  215. static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr)
  216. {
  217. /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
  218. unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3);
  219. BUG_ON(page_nr >= b->bm_number_of_pages);
  220. return page_nr;
  221. }
  222. static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
  223. {
  224. /* page_nr = (bitnr/8) >> PAGE_SHIFT; */
  225. unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
  226. BUG_ON(page_nr >= b->bm_number_of_pages);
  227. return page_nr;
  228. }
  229. static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx, const enum km_type km)
  230. {
  231. struct page *page = b->bm_pages[idx];
  232. return (unsigned long *) kmap_atomic(page, km);
  233. }
  234. static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
  235. {
  236. return __bm_map_pidx(b, idx, KM_IRQ1);
  237. }
  238. static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
  239. {
  240. kunmap_atomic(p_addr, km);
  241. };
  242. static void bm_unmap(unsigned long *p_addr)
  243. {
  244. return __bm_unmap(p_addr, KM_IRQ1);
  245. }
  /* long word offset of _bitmap_ sector */
  #define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
  /* word offset from start of bitmap to word number _in_page_
   * modulo longs per page
   *   #define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)))
   * hm, well, Philipp thinks gcc might not optimize the % into & (... - 1)
   * so do it explicitly:
   */
  #define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
  /* Long words per page */
  #define LWPP (PAGE_SIZE/sizeof(long))
  257. /*
  258. * actually most functions herein should take a struct drbd_bitmap*, not a
  259. * struct drbd_conf*, but for the debug macros I like to have the mdev around
  260. * to be able to report device specific.
  261. */
  262. static void bm_free_pages(struct page **pages, unsigned long number)
  263. {
  264. unsigned long i;
  265. if (!pages)
  266. return;
  267. for (i = 0; i < number; i++) {
  268. if (!pages[i]) {
  269. printk(KERN_ALERT "drbd: bm_free_pages tried to free "
  270. "a NULL pointer; i=%lu n=%lu\n",
  271. i, number);
  272. continue;
  273. }
  274. __free_page(pages[i]);
  275. pages[i] = NULL;
  276. }
  277. }
  /*
   * Release "ptr" with vfree() when "v" is non-zero, with kfree() otherwise.
   */
  static void bm_vk_free(void *ptr, int v)
  {
      if (!v) {
          kfree(ptr);
          return;
      }
      vfree(ptr);
  }
  285. /*
  286. * "have" and "want" are NUMBER OF PAGES.
  287. */
  288. static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
  289. {
  290. struct page **old_pages = b->bm_pages;
  291. struct page **new_pages, *page;
  292. unsigned int i, bytes, vmalloced = 0;
  293. unsigned long have = b->bm_number_of_pages;
  294. BUG_ON(have == 0 && old_pages != NULL);
  295. BUG_ON(have != 0 && old_pages == NULL);
  296. if (have == want)
  297. return old_pages;
  298. /* Trying kmalloc first, falling back to vmalloc.
  299. * GFP_KERNEL is ok, as this is done when a lower level disk is
  300. * "attached" to the drbd. Context is receiver thread or cqueue
  301. * thread. As we have no disk yet, we are not in the IO path,
  302. * not even the IO path of the peer. */
  303. bytes = sizeof(struct page *)*want;
  304. new_pages = kmalloc(bytes, GFP_KERNEL);
  305. if (!new_pages) {
  306. new_pages = vmalloc(bytes);
  307. if (!new_pages)
  308. return NULL;
  309. vmalloced = 1;
  310. }
  311. memset(new_pages, 0, bytes);
  312. if (want >= have) {
  313. for (i = 0; i < have; i++)
  314. new_pages[i] = old_pages[i];
  315. for (; i < want; i++) {
  316. page = alloc_page(GFP_HIGHUSER);
  317. if (!page) {
  318. bm_free_pages(new_pages + have, i - have);
  319. bm_vk_free(new_pages, vmalloced);
  320. return NULL;
  321. }
  322. /* we want to know which page it is
  323. * from the endio handlers */
  324. bm_store_page_idx(page, i);
  325. new_pages[i] = page;
  326. }
  327. } else {
  328. for (i = 0; i < want; i++)
  329. new_pages[i] = old_pages[i];
  330. /* NOT HERE, we are outside the spinlock!
  331. bm_free_pages(old_pages + want, have - want);
  332. */
  333. }
  334. if (vmalloced)
  335. set_bit(BM_P_VMALLOCED, &b->bm_flags);
  336. else
  337. clear_bit(BM_P_VMALLOCED, &b->bm_flags);
  338. return new_pages;
  339. }
  340. /*
  341. * called on driver init only. TODO call when a device is created.
  342. * allocates the drbd_bitmap, and stores it in mdev->bitmap.
  343. */
  344. int drbd_bm_init(struct drbd_conf *mdev)
  345. {
  346. struct drbd_bitmap *b = mdev->bitmap;
  347. WARN_ON(b != NULL);
  348. b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
  349. if (!b)
  350. return -ENOMEM;
  351. spin_lock_init(&b->bm_lock);
  352. mutex_init(&b->bm_change);
  353. init_waitqueue_head(&b->bm_io_wait);
  354. mdev->bitmap = b;
  355. return 0;
  356. }
  357. sector_t drbd_bm_capacity(struct drbd_conf *mdev)
  358. {
  359. ERR_IF(!mdev->bitmap) return 0;
  360. return mdev->bitmap->bm_dev_capacity;
  361. }
  362. /* called on driver unload. TODO: call when a device is destroyed.
  363. */
  364. void drbd_bm_cleanup(struct drbd_conf *mdev)
  365. {
  366. ERR_IF (!mdev->bitmap) return;
  367. bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
  368. bm_vk_free(mdev->bitmap->bm_pages, test_bit(BM_P_VMALLOCED, &mdev->bitmap->bm_flags));
  369. kfree(mdev->bitmap);
  370. mdev->bitmap = NULL;
  371. }
  372. /*
  373. * since (b->bm_bits % BITS_PER_LONG) != 0,
  374. * this masks out the remaining bits.
  375. * Returns the number of bits cleared.
  376. */
  377. #define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3))
  378. #define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1)
  379. #define BITS_PER_LONG_MASK (BITS_PER_LONG - 1)
  380. static int bm_clear_surplus(struct drbd_bitmap *b)
  381. {
  382. unsigned long mask;
  383. unsigned long *p_addr, *bm;
  384. int tmp;
  385. int cleared = 0;
  386. /* number of bits modulo bits per page */
  387. tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
  388. /* mask the used bits of the word containing the last bit */
  389. mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
  390. /* bitmap is always stored little endian,
  391. * on disk and in core memory alike */
  392. mask = cpu_to_lel(mask);
  393. /* because of the "extra long to catch oob access" we allocate in
  394. * drbd_bm_resize, bm_number_of_pages -1 is not necessarily the page
  395. * containing the last _relevant_ bitmap word */
  396. p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, b->bm_bits - 1));
  397. bm = p_addr + (tmp/BITS_PER_LONG);
  398. if (mask) {
  399. /* If mask != 0, we are not exactly aligned, so bm now points
  400. * to the long containing the last bit.
  401. * If mask == 0, bm already points to the word immediately
  402. * after the last (long word aligned) bit. */
  403. cleared = hweight_long(*bm & ~mask);
  404. *bm &= mask;
  405. bm++;
  406. }
  407. if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
  408. /* on a 32bit arch, we may need to zero out
  409. * a padding long to align with a 64bit remote */
  410. cleared += hweight_long(*bm);
  411. *bm = 0;
  412. }
  413. bm_unmap(p_addr);
  414. return cleared;
  415. }
  416. static void bm_set_surplus(struct drbd_bitmap *b)
  417. {
  418. unsigned long mask;
  419. unsigned long *p_addr, *bm;
  420. int tmp;
  421. /* number of bits modulo bits per page */
  422. tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
  423. /* mask the used bits of the word containing the last bit */
  424. mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
  425. /* bitmap is always stored little endian,
  426. * on disk and in core memory alike */
  427. mask = cpu_to_lel(mask);
  428. /* because of the "extra long to catch oob access" we allocate in
  429. * drbd_bm_resize, bm_number_of_pages -1 is not necessarily the page
  430. * containing the last _relevant_ bitmap word */
  431. p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, b->bm_bits - 1));
  432. bm = p_addr + (tmp/BITS_PER_LONG);
  433. if (mask) {
  434. /* If mask != 0, we are not exactly aligned, so bm now points
  435. * to the long containing the last bit.
  436. * If mask == 0, bm already points to the word immediately
  437. * after the last (long word aligned) bit. */
  438. *bm |= ~mask;
  439. bm++;
  440. }
  441. if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
  442. /* on a 32bit arch, we may need to zero out
  443. * a padding long to align with a 64bit remote */
  444. *bm = ~0UL;
  445. }
  446. bm_unmap(p_addr);
  447. }
  448. static unsigned long bm_count_bits(struct drbd_bitmap *b)
  449. {
  450. unsigned long *p_addr, *bm, offset = 0;
  451. unsigned long bits = 0;
  452. unsigned long i, do_now;
  453. unsigned long words;
  454. /* due to 64bit alignment, the last long on a 32bit arch
  455. * may be not used at all. The last used long will likely
  456. * be only partially used, always. Don't count those bits,
  457. * but mask them out. */
  458. words = (b->bm_bits + BITS_PER_LONG - 1) >> LN2_BPL;
  459. while (offset < words) {
  460. i = do_now = min_t(size_t, words-offset, LWPP);
  461. p_addr = __bm_map_pidx(b, bm_word_to_page_idx(b, offset), KM_USER0);
  462. bm = p_addr + MLPP(offset);
  463. while (i--) {
  464. bits += hweight_long(*bm++);
  465. }
  466. offset += do_now;
  467. if (offset == words) {
  468. /* last word may only be partially used,
  469. * see also bm_clear_surplus. */
  470. i = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) -1;
  471. if (i) {
  472. bits -= hweight_long(p_addr[do_now-1] & ~i);
  473. p_addr[do_now-1] &= i;
  474. }
  475. /* 32bit arch, may have an unused padding long */
  476. if (words != b->bm_words)
  477. p_addr[do_now] = 0;
  478. }
  479. __bm_unmap(p_addr, KM_USER0);
  480. cond_resched();
  481. }
  482. return bits;
  483. }
  484. /* offset and len in long words.*/
  485. static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
  486. {
  487. unsigned long *p_addr, *bm;
  488. unsigned int idx;
  489. size_t do_now, end;
  490. #define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512)
  491. end = offset + len;
  492. if (end > b->bm_words) {
  493. printk(KERN_ALERT "drbd: bm_memset end > bm_words\n");
  494. return;
  495. }
  496. while (offset < end) {
  497. do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
  498. idx = bm_word_to_page_idx(b, offset);
  499. p_addr = bm_map_pidx(b, idx);
  500. bm = p_addr + MLPP(offset);
  501. if (bm+do_now > p_addr + LWPP) {
  502. printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
  503. p_addr, bm, (int)do_now);
  504. break; /* breaks to after catch_oob_access_end() only! */
  505. }
  506. memset(bm, c, do_now * sizeof(long));
  507. bm_unmap(p_addr);
  508. bm_set_page_need_writeout(b->bm_pages[idx]);
  509. offset += do_now;
  510. }
  511. }
  512. /*
  513. * make sure the bitmap has enough room for the attached storage,
  514. * if necessary, resize.
  515. * called whenever we may have changed the device size.
  516. * returns -ENOMEM if we could not allocate enough memory, 0 on success.
  517. * In case this is actually a resize, we copy the old bitmap into the new one.
  518. * Otherwise, the bitmap is initialized to all bits set.
  519. */
  520. int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
  521. {
  522. struct drbd_bitmap *b = mdev->bitmap;
  523. unsigned long bits, words, owords, obits, *p_addr, *bm;
  524. unsigned long want, have, onpages; /* number of pages */
  525. struct page **npages, **opages = NULL;
  526. int err = 0, growing;
  527. int opages_vmalloced;
  528. ERR_IF(!b) return -ENOMEM;
  529. drbd_bm_lock(mdev, "resize");
  530. dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n",
  531. (unsigned long long)capacity);
  532. if (capacity == b->bm_dev_capacity)
  533. goto out;
  534. opages_vmalloced = test_bit(BM_P_VMALLOCED, &b->bm_flags);
  535. if (capacity == 0) {
  536. spin_lock_irq(&b->bm_lock);
  537. opages = b->bm_pages;
  538. onpages = b->bm_number_of_pages;
  539. owords = b->bm_words;
  540. b->bm_pages = NULL;
  541. b->bm_number_of_pages =
  542. b->bm_set =
  543. b->bm_bits =
  544. b->bm_words =
  545. b->bm_dev_capacity = 0;
  546. spin_unlock_irq(&b->bm_lock);
  547. bm_free_pages(opages, onpages);
  548. bm_vk_free(opages, opages_vmalloced);
  549. goto out;
  550. }
  551. bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
  552. /* if we would use
  553. words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
  554. a 32bit host could present the wrong number of words
  555. to a 64bit host.
  556. */
  557. words = ALIGN(bits, 64) >> LN2_BPL;
  558. if (get_ldev(mdev)) {
  559. D_ASSERT((u64)bits <= (((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12));
  560. put_ldev(mdev);
  561. }
  562. /* one extra long to catch off by one errors */
  563. want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
  564. have = b->bm_number_of_pages;
  565. if (want == have) {
  566. D_ASSERT(b->bm_pages != NULL);
  567. npages = b->bm_pages;
  568. } else {
  569. if (drbd_insert_fault(mdev, DRBD_FAULT_BM_ALLOC))
  570. npages = NULL;
  571. else
  572. npages = bm_realloc_pages(b, want);
  573. }
  574. if (!npages) {
  575. err = -ENOMEM;
  576. goto out;
  577. }
  578. spin_lock_irq(&b->bm_lock);
  579. opages = b->bm_pages;
  580. owords = b->bm_words;
  581. obits = b->bm_bits;
  582. growing = bits > obits;
  583. if (opages && growing && set_new_bits)
  584. bm_set_surplus(b);
  585. b->bm_pages = npages;
  586. b->bm_number_of_pages = want;
  587. b->bm_bits = bits;
  588. b->bm_words = words;
  589. b->bm_dev_capacity = capacity;
  590. if (growing) {
  591. if (set_new_bits) {
  592. bm_memset(b, owords, 0xff, words-owords);
  593. b->bm_set += bits - obits;
  594. } else
  595. bm_memset(b, owords, 0x00, words-owords);
  596. }
  597. if (want < have) {
  598. /* implicit: (opages != NULL) && (opages != npages) */
  599. bm_free_pages(opages + want, have - want);
  600. }
  601. p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, words));
  602. bm = p_addr + MLPP(words);
  603. *bm = DRBD_MAGIC;
  604. bm_unmap(p_addr);
  605. (void)bm_clear_surplus(b);
  606. spin_unlock_irq(&b->bm_lock);
  607. if (opages != npages)
  608. bm_vk_free(opages, opages_vmalloced);
  609. if (!growing)
  610. b->bm_set = bm_count_bits(b);
  611. dev_info(DEV, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
  612. out:
  613. drbd_bm_unlock(mdev);
  614. return err;
  615. }
  616. /* inherently racy:
  617. * if not protected by other means, return value may be out of date when
  618. * leaving this function...
  619. * we still need to lock it, since it is important that this returns
  620. * bm_set == 0 precisely.
  621. *
  622. * maybe bm_set should be atomic_t ?
  623. */
  624. unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
  625. {
  626. struct drbd_bitmap *b = mdev->bitmap;
  627. unsigned long s;
  628. unsigned long flags;
  629. ERR_IF(!b) return 0;
  630. ERR_IF(!b->bm_pages) return 0;
  631. spin_lock_irqsave(&b->bm_lock, flags);
  632. s = b->bm_set;
  633. spin_unlock_irqrestore(&b->bm_lock, flags);
  634. return s;
  635. }
  636. unsigned long drbd_bm_total_weight(struct drbd_conf *mdev)
  637. {
  638. unsigned long s;
  639. /* if I don't have a disk, I don't know about out-of-sync status */
  640. if (!get_ldev_if_state(mdev, D_NEGOTIATING))
  641. return 0;
  642. s = _drbd_bm_total_weight(mdev);
  643. put_ldev(mdev);
  644. return s;
  645. }
  646. size_t drbd_bm_words(struct drbd_conf *mdev)
  647. {
  648. struct drbd_bitmap *b = mdev->bitmap;
  649. ERR_IF(!b) return 0;
  650. ERR_IF(!b->bm_pages) return 0;
  651. return b->bm_words;
  652. }
  653. unsigned long drbd_bm_bits(struct drbd_conf *mdev)
  654. {
  655. struct drbd_bitmap *b = mdev->bitmap;
  656. ERR_IF(!b) return 0;
  657. return b->bm_bits;
  658. }
  659. /* merge number words from buffer into the bitmap starting at offset.
  660. * buffer[i] is expected to be little endian unsigned long.
  661. * bitmap must be locked by drbd_bm_lock.
  662. * currently only used from receive_bitmap.
  663. */
  664. void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
  665. unsigned long *buffer)
  666. {
  667. struct drbd_bitmap *b = mdev->bitmap;
  668. unsigned long *p_addr, *bm;
  669. unsigned long word, bits;
  670. unsigned int idx;
  671. size_t end, do_now;
  672. end = offset + number;
  673. ERR_IF(!b) return;
  674. ERR_IF(!b->bm_pages) return;
  675. if (number == 0)
  676. return;
  677. WARN_ON(offset >= b->bm_words);
  678. WARN_ON(end > b->bm_words);
  679. spin_lock_irq(&b->bm_lock);
  680. while (offset < end) {
  681. do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
  682. idx = bm_word_to_page_idx(b, offset);
  683. p_addr = bm_map_pidx(b, idx);
  684. bm = p_addr + MLPP(offset);
  685. offset += do_now;
  686. while (do_now--) {
  687. bits = hweight_long(*bm);
  688. word = *bm | *buffer++;
  689. *bm++ = word;
  690. b->bm_set += hweight_long(word) - bits;
  691. }
  692. bm_unmap(p_addr);
  693. bm_set_page_need_writeout(b->bm_pages[idx]);
  694. }
  695. /* with 32bit <-> 64bit cross-platform connect
  696. * this is only correct for current usage,
  697. * where we _know_ that we are 64 bit aligned,
  698. * and know that this function is used in this way, too...
  699. */
  700. if (end == b->bm_words)
  701. b->bm_set -= bm_clear_surplus(b);
  702. spin_unlock_irq(&b->bm_lock);
  703. }
  704. /* copy number words from the bitmap starting at offset into the buffer.
  705. * buffer[i] will be little endian unsigned long.
  706. */
  707. void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
  708. unsigned long *buffer)
  709. {
  710. struct drbd_bitmap *b = mdev->bitmap;
  711. unsigned long *p_addr, *bm;
  712. size_t end, do_now;
  713. end = offset + number;
  714. ERR_IF(!b) return;
  715. ERR_IF(!b->bm_pages) return;
  716. spin_lock_irq(&b->bm_lock);
  717. if ((offset >= b->bm_words) ||
  718. (end > b->bm_words) ||
  719. (number <= 0))
  720. dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n",
  721. (unsigned long) offset,
  722. (unsigned long) number,
  723. (unsigned long) b->bm_words);
  724. else {
  725. while (offset < end) {
  726. do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
  727. p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset));
  728. bm = p_addr + MLPP(offset);
  729. offset += do_now;
  730. while (do_now--)
  731. *buffer++ = *bm++;
  732. bm_unmap(p_addr);
  733. }
  734. }
  735. spin_unlock_irq(&b->bm_lock);
  736. }
  737. /* set all bits in the bitmap */
  738. void drbd_bm_set_all(struct drbd_conf *mdev)
  739. {
  740. struct drbd_bitmap *b = mdev->bitmap;
  741. ERR_IF(!b) return;
  742. ERR_IF(!b->bm_pages) return;
  743. spin_lock_irq(&b->bm_lock);
  744. bm_memset(b, 0, 0xff, b->bm_words);
  745. (void)bm_clear_surplus(b);
  746. b->bm_set = b->bm_bits;
  747. spin_unlock_irq(&b->bm_lock);
  748. }
  749. /* clear all bits in the bitmap */
  750. void drbd_bm_clear_all(struct drbd_conf *mdev)
  751. {
  752. struct drbd_bitmap *b = mdev->bitmap;
  753. ERR_IF(!b) return;
  754. ERR_IF(!b->bm_pages) return;
  755. spin_lock_irq(&b->bm_lock);
  756. bm_memset(b, 0, 0, b->bm_words);
  757. b->bm_set = 0;
  758. spin_unlock_irq(&b->bm_lock);
  759. }
  760. struct bm_aio_ctx {
  761. struct drbd_conf *mdev;
  762. atomic_t in_flight;
  763. wait_queue_head_t io_wait;
  764. unsigned flags;
  765. #define BM_AIO_COPY_PAGES 1
  766. int error;
  767. };
  768. /* bv_page may be a copy, or may be the original */
  769. static void bm_async_io_complete(struct bio *bio, int error)
  770. {
  771. struct bm_aio_ctx *ctx = bio->bi_private;
  772. struct drbd_conf *mdev = ctx->mdev;
  773. struct drbd_bitmap *b = mdev->bitmap;
  774. unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
  775. int uptodate = bio_flagged(bio, BIO_UPTODATE);
  776. /* strange behavior of some lower level drivers...
  777. * fail the request by clearing the uptodate flag,
  778. * but do not return any error?!
  779. * do we want to WARN() on this? */
  780. if (!error && !uptodate)
  781. error = -EIO;
  782. if (!bm_test_page_unchanged(b->bm_pages[idx]))
  783. dev_info(DEV, "bitmap page idx %u changed during IO!\n", idx);
  784. if (error) {
  785. /* ctx error will hold the completed-last non-zero error code,
  786. * in case error codes differ. */
  787. ctx->error = error;
  788. bm_set_page_io_err(b->bm_pages[idx]);
  789. /* Not identical to on disk version of it.
  790. * Is BM_PAGE_IO_ERROR enough? */
  791. if (__ratelimit(&drbd_ratelimit_state))
  792. dev_err(DEV, "IO ERROR %d on bitmap page idx %u\n",
  793. error, idx);
  794. } else {
  795. bm_clear_page_io_err(b->bm_pages[idx]);
  796. dynamic_dev_dbg(DEV, "bitmap page idx %u completed\n", idx);
  797. }
  798. bm_page_unlock_io(mdev, idx);
  799. /* FIXME give back to page pool */
  800. if (ctx->flags & BM_AIO_COPY_PAGES)
  801. put_page(bio->bi_io_vec[0].bv_page);
  802. bio_put(bio);
  803. if (atomic_dec_and_test(&ctx->in_flight))
  804. wake_up(&ctx->io_wait);
  805. }
  806. static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
  807. {
  808. /* we are process context. we always get a bio */
  809. struct bio *bio = bio_alloc(GFP_KERNEL, 1);
  810. struct drbd_conf *mdev = ctx->mdev;
  811. struct drbd_bitmap *b = mdev->bitmap;
  812. struct page *page;
  813. unsigned int len;
  814. sector_t on_disk_sector =
  815. mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;
  816. on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
  817. /* this might happen with very small
  818. * flexible external meta data device,
  819. * or with PAGE_SIZE > 4k */
  820. len = min_t(unsigned int, PAGE_SIZE,
  821. (drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);
  822. /* serialize IO on this page */
  823. bm_page_lock_io(mdev, page_nr);
  824. /* before memcpy and submit,
  825. * so it can be redirtied any time */
  826. bm_set_page_unchanged(b->bm_pages[page_nr]);
  827. if (ctx->flags & BM_AIO_COPY_PAGES) {
  828. /* FIXME alloc_page is good enough for now, but actually needs
  829. * to use pre-allocated page pool */
  830. void *src, *dest;
  831. page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT);
  832. dest = kmap_atomic(page, KM_USER0);
  833. src = kmap_atomic(b->bm_pages[page_nr], KM_USER1);
  834. memcpy(dest, src, PAGE_SIZE);
  835. kunmap_atomic(src, KM_USER1);
  836. kunmap_atomic(dest, KM_USER0);
  837. bm_store_page_idx(page, page_nr);
  838. } else
  839. page = b->bm_pages[page_nr];
  840. bio->bi_bdev = mdev->ldev->md_bdev;
  841. bio->bi_sector = on_disk_sector;
  842. bio_add_page(bio, page, len, 0);
  843. bio->bi_private = ctx;
  844. bio->bi_end_io = bm_async_io_complete;
  845. if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
  846. bio->bi_rw |= rw;
  847. bio_endio(bio, -EIO);
  848. } else {
  849. submit_bio(rw, bio);
  850. }
  851. }
  852. /*
  853. * bm_rw: read/write the whole bitmap from/to its on disk location.
  854. */
  855. static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local)
  856. {
  857. struct bm_aio_ctx ctx =
  858. { .flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0 };
  859. struct drbd_bitmap *b = mdev->bitmap;
  860. int last_page, i, count = 0;
  861. unsigned long now;
  862. char ppb[10];
  863. int err = 0;
  864. /*
  865. * We are protected against bitmap disappearing/resizing by holding an
  866. * ldev reference (caller must have called get_ldev()).
  867. * For read/write, we are protected against changes to the bitmap by
  868. * the bitmap lock (see drbd_bitmap_io).
  869. * For lazy writeout, we don't care for ongoing changes to the bitmap,
  870. * as we submit copies of pages anyways.
  871. */
  872. if (!ctx.flags)
  873. WARN_ON(!bm_is_locked(b));
  874. /* because of the "extra long to catch oob access" we allocate in
  875. * drbd_bm_resize, bm_number_of_pages -1 is not necessarily the page
  876. * containing the last _relevant_ bitmap word */
  877. last_page = bm_word_to_page_idx(b, b->bm_words - 1);
  878. now = jiffies;
  879. ctx.mdev = mdev;
  880. atomic_set(&ctx.in_flight, 1); /* one extra ref */
  881. init_waitqueue_head(&ctx.io_wait);
  882. ctx.error = 0;
  883. /* let the layers below us try to merge these bios... */
  884. for (i = 0; i <= last_page; i++) {
  885. /* ignore completely unchanged pages */
  886. if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
  887. break;
  888. if (rw & WRITE) {
  889. if (bm_test_page_unchanged(b->bm_pages[i])) {
  890. dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
  891. continue;
  892. }
  893. /* during lazy writeout,
  894. * ignore those pages not marked for lazy writeout. */
  895. if (lazy_writeout_upper_idx &&
  896. !bm_test_page_lazy_writeout(b->bm_pages[i])) {
  897. dynamic_dev_dbg(DEV, "skipped bm lazy write for idx %u\n", i);
  898. continue;
  899. }
  900. }
  901. atomic_inc(&ctx.in_flight);
  902. bm_page_io_async(&ctx, i, rw);
  903. ++count;
  904. cond_resched();
  905. }
  906. atomic_dec(&ctx.in_flight); /* drop the extra ref */
  907. wait_event(ctx.io_wait, atomic_read(&ctx.in_flight) == 0);
  908. dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
  909. rw == WRITE ? "WRITE" : "READ",
  910. count, jiffies - now);
  911. if (ctx.error) {
  912. dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
  913. drbd_chk_io_error(mdev, 1, true);
  914. err = -EIO; /* ctx.error ? */
  915. }
  916. now = jiffies;
  917. if (rw == WRITE) {
  918. drbd_md_flush(mdev);
  919. } else /* rw == READ */ {
  920. b->bm_set = bm_count_bits(b);
  921. dev_info(DEV, "recounting of set bits took additional %lu jiffies\n",
  922. jiffies - now);
  923. }
  924. now = b->bm_set;
  925. dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
  926. ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
  927. return err;
  928. }
  929. /**
  930. * drbd_bm_read() - Read the whole bitmap from its on disk location.
  931. * @mdev: DRBD device.
  932. */
  933. int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
  934. {
  935. return bm_rw(mdev, READ, 0);
  936. }
  937. /**
  938. * drbd_bm_write() - Write the whole bitmap to its on disk location.
  939. * @mdev: DRBD device.
  940. *
  941. * Will only write pages that have changed since last IO.
  942. */
  943. int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
  944. {
  945. return bm_rw(mdev, WRITE, 0);
  946. }
  947. /**
  948. * drbd_bm_lazy_write_out() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
  949. * @mdev: DRBD device.
  950. * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages
  951. */
  952. int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local)
  953. {
  954. return bm_rw(mdev, WRITE, upper_idx);
  955. }
  956. /**
  957. * drbd_bm_write_page: Writes a PAGE_SIZE aligned piece of bitmap
  958. * @mdev: DRBD device.
  959. * @idx: bitmap page index
  960. *
  961. * We don't want to special case on logical_block_size of the underlaying
  962. * device, so we submit PAGE_SIZE aligned pieces containing the requested enr.
  963. * Note that on "most" systems, PAGE_SIZE is 4k.
  964. */
  965. int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local)
  966. {
  967. struct bm_aio_ctx ctx = { .flags = BM_AIO_COPY_PAGES, };
  968. if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
  969. dev_info(DEV, "skipped bm page write for idx %u\n", idx);
  970. return 0;
  971. }
  972. ctx.mdev = mdev;
  973. atomic_set(&ctx.in_flight, 1);
  974. init_waitqueue_head(&ctx.io_wait);
  975. bm_page_io_async(&ctx, idx, WRITE_SYNC);
  976. wait_event(ctx.io_wait, atomic_read(&ctx.in_flight) == 0);
  977. if (ctx.error)
  978. drbd_chk_io_error(mdev, 1, true);
  979. /* that should force detach, so the in memory bitmap will be
  980. * gone in a moment as well. */
  981. mdev->bm_writ_cnt++;
  982. return ctx.error;
  983. }
  984. /* NOTE
  985. * find_first_bit returns int, we return unsigned long.
  986. * should not make much difference anyways, but ...
  987. *
  988. * this returns a bit number, NOT a sector!
  989. */
  990. #define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1)
  991. static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
  992. const int find_zero_bit, const enum km_type km)
  993. {
  994. struct drbd_bitmap *b = mdev->bitmap;
  995. unsigned long i = -1UL;
  996. unsigned long *p_addr;
  997. unsigned long bit_offset; /* bit offset of the mapped page. */
  998. if (bm_fo > b->bm_bits) {
  999. dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
  1000. } else {
  1001. while (bm_fo < b->bm_bits) {
  1002. /* bit offset of the first bit in the page */
  1003. bit_offset = bm_fo & ~BPP_MASK;
  1004. p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo), km);
  1005. if (find_zero_bit)
  1006. i = generic_find_next_zero_le_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
  1007. else
  1008. i = generic_find_next_le_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
  1009. __bm_unmap(p_addr, km);
  1010. if (i < PAGE_SIZE*8) {
  1011. i = bit_offset + i;
  1012. if (i >= b->bm_bits)
  1013. break;
  1014. goto found;
  1015. }
  1016. bm_fo = bit_offset + PAGE_SIZE*8;
  1017. }
  1018. i = -1UL;
  1019. }
  1020. found:
  1021. return i;
  1022. }
  1023. static unsigned long bm_find_next(struct drbd_conf *mdev,
  1024. unsigned long bm_fo, const int find_zero_bit)
  1025. {
  1026. struct drbd_bitmap *b = mdev->bitmap;
  1027. unsigned long i = -1UL;
  1028. ERR_IF(!b) return i;
  1029. ERR_IF(!b->bm_pages) return i;
  1030. spin_lock_irq(&b->bm_lock);
  1031. if (bm_is_locked(b))
  1032. bm_print_lock_info(mdev);
  1033. i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);
  1034. spin_unlock_irq(&b->bm_lock);
  1035. return i;
  1036. }
  /* Find the next set bit, searching from bit offset @bm_fo.
   * Returns a bit number, or -1UL if none was found. */
  unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
  {
      const int find_zero_bit = 0;

      return bm_find_next(mdev, bm_fo, find_zero_bit);
  }
  #if 0
  /* not yet needed for anything.
   * Would find the next clear bit, searching from bit offset @bm_fo. */
  unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
  {
      const int find_zero_bit = 1;

      return bm_find_next(mdev, bm_fo, find_zero_bit);
  }
  #endif
  1048. /* does not spin_lock_irqsave.
  1049. * you must take drbd_bm_lock() first */
  1050. unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
  1051. {
  1052. /* WARN_ON(!bm_is_locked(mdev)); */
  1053. return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
  1054. }
  1055. unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
  1056. {
  1057. /* WARN_ON(!bm_is_locked(mdev)); */
  1058. return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
  1059. }
  1060. /* returns number of bits actually changed.
  1061. * for val != 0, we change 0 -> 1, return code positive
  1062. * for val == 0, we change 1 -> 0, return code negative
  1063. * wants bitnr, not sector.
  1064. * expected to be called for only a few bits (e - s about BITS_PER_LONG).
  1065. * Must hold bitmap lock already. */
  1066. static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
  1067. unsigned long e, int val, const enum km_type km)
  1068. {
  1069. struct drbd_bitmap *b = mdev->bitmap;
  1070. unsigned long *p_addr = NULL;
  1071. unsigned long bitnr;
  1072. unsigned int last_page_nr = -1U;
  1073. int c = 0;
  1074. int changed_total = 0;
  1075. if (e >= b->bm_bits) {
  1076. dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
  1077. s, e, b->bm_bits);
  1078. e = b->bm_bits ? b->bm_bits -1 : 0;
  1079. }
  1080. for (bitnr = s; bitnr <= e; bitnr++) {
  1081. unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
  1082. if (page_nr != last_page_nr) {
  1083. if (p_addr)
  1084. __bm_unmap(p_addr, km);
  1085. if (c < 0)
  1086. bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
  1087. else if (c > 0)
  1088. bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
  1089. changed_total += c;
  1090. c = 0;
  1091. p_addr = __bm_map_pidx(b, page_nr, km);
  1092. last_page_nr = page_nr;
  1093. }
  1094. if (val)
  1095. c += (0 == generic___test_and_set_le_bit(bitnr & BPP_MASK, p_addr));
  1096. else
  1097. c -= (0 != generic___test_and_clear_le_bit(bitnr & BPP_MASK, p_addr));
  1098. }
  1099. if (p_addr)
  1100. __bm_unmap(p_addr, km);
  1101. if (c < 0)
  1102. bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
  1103. else if (c > 0)
  1104. bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
  1105. changed_total += c;
  1106. b->bm_set += changed_total;
  1107. return changed_total;
  1108. }
  1109. /* returns number of bits actually changed.
  1110. * for val != 0, we change 0 -> 1, return code positive
  1111. * for val == 0, we change 1 -> 0, return code negative
  1112. * wants bitnr, not sector */
  1113. static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
  1114. const unsigned long e, int val)
  1115. {
  1116. unsigned long flags;
  1117. struct drbd_bitmap *b = mdev->bitmap;
  1118. int c = 0;
  1119. ERR_IF(!b) return 1;
  1120. ERR_IF(!b->bm_pages) return 0;
  1121. spin_lock_irqsave(&b->bm_lock, flags);
  1122. if (bm_is_locked(b))
  1123. bm_print_lock_info(mdev);
  1124. c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1);
  1125. spin_unlock_irqrestore(&b->bm_lock, flags);
  1126. return c;
  1127. }
  /* Set the bits s..e (inclusive).
   * returns number of bits changed 0 -> 1 */
  int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
  {
      const int val = 1;

      return bm_change_bits_to(mdev, s, e, val);
  }
  /* Clear the bits s..e (inclusive).
   * returns number of bits changed 1 -> 0
   * (the negated return value of bm_change_bits_to()) */
  int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
  {
      const int val = 0;

      return -bm_change_bits_to(mdev, s, e, val);
  }
  1138. /* sets all bits in full words,
  1139. * from first_word up to, but not including, last_word */
  1140. static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
  1141. int page_nr, int first_word, int last_word)
  1142. {
  1143. int i;
  1144. int bits;
  1145. unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_USER0);
  1146. for (i = first_word; i < last_word; i++) {
  1147. bits = hweight_long(paddr[i]);
  1148. paddr[i] = ~0UL;
  1149. b->bm_set += BITS_PER_LONG - bits;
  1150. }
  1151. kunmap_atomic(paddr, KM_USER0);
  1152. }
  1153. /* Same thing as drbd_bm_set_bits, but without taking the spin_lock_irqsave.
  1154. * You must first drbd_bm_lock().
  1155. * Can be called to set the whole bitmap in one go.
  1156. * Sets bits from s to e _inclusive_. */
  1157. void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
  1158. {
  1159. /* First set_bit from the first bit (s)
  1160. * up to the next long boundary (sl),
  1161. * then assign full words up to the last long boundary (el),
  1162. * then set_bit up to and including the last bit (e).
  1163. *
  1164. * Do not use memset, because we must account for changes,
  1165. * so we need to loop over the words with hweight() anyways.
  1166. */
  1167. unsigned long sl = ALIGN(s,BITS_PER_LONG);
  1168. unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
  1169. int first_page;
  1170. int last_page;
  1171. int page_nr;
  1172. int first_word;
  1173. int last_word;
  1174. if (e - s <= 3*BITS_PER_LONG) {
  1175. /* don't bother; el and sl may even be wrong. */
  1176. __bm_change_bits_to(mdev, s, e, 1, KM_USER0);
  1177. return;
  1178. }
  1179. /* difference is large enough that we can trust sl and el */
  1180. /* bits filling the current long */
  1181. if (sl)
  1182. __bm_change_bits_to(mdev, s, sl-1, 1, KM_USER0);
  1183. first_page = sl >> (3 + PAGE_SHIFT);
  1184. last_page = el >> (3 + PAGE_SHIFT);
  1185. /* MLPP: modulo longs per page */
  1186. /* LWPP: long words per page */
  1187. first_word = MLPP(sl >> LN2_BPL);
  1188. last_word = LWPP;
  1189. /* first and full pages, unless first page == last page */
  1190. for (page_nr = first_page; page_nr < last_page; page_nr++) {
  1191. bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word);
  1192. cond_resched();
  1193. first_word = 0;
  1194. }
  1195. /* last page (respectively only page, for first page == last page) */
  1196. last_word = MLPP(el >> LN2_BPL);
  1197. bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word);
  1198. /* possibly trailing bits.
  1199. * example: (e & 63) == 63, el will be e+1.
  1200. * if that even was the very last bit,
  1201. * it would trigger an assert in __bm_change_bits_to()
  1202. */
  1203. if (el <= e)
  1204. __bm_change_bits_to(mdev, el, e, 1, KM_USER0);
  1205. }
  1206. /* returns bit state
  1207. * wants bitnr, NOT sector.
  1208. * inherently racy... area needs to be locked by means of {al,rs}_lru
  1209. * 1 ... bit set
  1210. * 0 ... bit not set
  1211. * -1 ... first out of bounds access, stop testing for bits!
  1212. */
  1213. int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
  1214. {
  1215. unsigned long flags;
  1216. struct drbd_bitmap *b = mdev->bitmap;
  1217. unsigned long *p_addr;
  1218. int i;
  1219. ERR_IF(!b) return 0;
  1220. ERR_IF(!b->bm_pages) return 0;
  1221. spin_lock_irqsave(&b->bm_lock, flags);
  1222. if (bm_is_locked(b))
  1223. bm_print_lock_info(mdev);
  1224. if (bitnr < b->bm_bits) {
  1225. p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr));
  1226. i = generic_test_le_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0;
  1227. bm_unmap(p_addr);
  1228. } else if (bitnr == b->bm_bits) {
  1229. i = -1;
  1230. } else { /* (bitnr > b->bm_bits) */
  1231. dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
  1232. i = 0;
  1233. }
  1234. spin_unlock_irqrestore(&b->bm_lock, flags);
  1235. return i;
  1236. }
  1237. /* returns number of bits set in the range [s, e] */
  1238. int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
  1239. {
  1240. unsigned long flags;
  1241. struct drbd_bitmap *b = mdev->bitmap;
  1242. unsigned long *p_addr = NULL;
  1243. unsigned long bitnr;
  1244. unsigned int page_nr = -1U;
  1245. int c = 0;
  1246. /* If this is called without a bitmap, that is a bug. But just to be
  1247. * robust in case we screwed up elsewhere, in that case pretend there
  1248. * was one dirty bit in the requested area, so we won't try to do a
  1249. * local read there (no bitmap probably implies no disk) */
  1250. ERR_IF(!b) return 1;
  1251. ERR_IF(!b->bm_pages) return 1;
  1252. spin_lock_irqsave(&b->bm_lock, flags);
  1253. if (bm_is_locked(b))
  1254. bm_print_lock_info(mdev);
  1255. for (bitnr = s; bitnr <= e; bitnr++) {
  1256. unsigned int idx = bm_bit_to_page_idx(b, bitnr);
  1257. if (page_nr != idx) {
  1258. page_nr = idx;
  1259. if (p_addr)
  1260. bm_unmap(p_addr);
  1261. p_addr = bm_map_pidx(b, idx);
  1262. }
  1263. ERR_IF (bitnr >= b->bm_bits) {
  1264. dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
  1265. } else {
  1266. c += (0 != generic_test_le_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
  1267. }
  1268. }
  1269. if (p_addr)
  1270. bm_unmap(p_addr);
  1271. spin_unlock_irqrestore(&b->bm_lock, flags);
  1272. return c;
  1273. }
  1274. /* inherently racy...
  1275. * return value may be already out-of-date when this function returns.
  1276. * but the general usage is that this is only use during a cstate when bits are
  1277. * only cleared, not set, and typically only care for the case when the return
  1278. * value is zero, or we already "locked" this "bitmap extent" by other means.
  1279. *
  1280. * enr is bm-extent number, since we chose to name one sector (512 bytes)
  1281. * worth of the bitmap a "bitmap extent".
  1282. *
  1283. * TODO
  1284. * I think since we use it like a reference count, we should use the real
  1285. * reference count of some bitmap extent element from some lru instead...
  1286. *
  1287. */
  1288. int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
  1289. {
  1290. struct drbd_bitmap *b = mdev->bitmap;
  1291. int count, s, e;
  1292. unsigned long flags;
  1293. unsigned long *p_addr, *bm;
  1294. ERR_IF(!b) return 0;
  1295. ERR_IF(!b->bm_pages) return 0;
  1296. spin_lock_irqsave(&b->bm_lock, flags);
  1297. if (bm_is_locked(b))
  1298. bm_print_lock_info(mdev);
  1299. s = S2W(enr);
  1300. e = min((size_t)S2W(enr+1), b->bm_words);
  1301. count = 0;
  1302. if (s < b->bm_words) {
  1303. int n = e-s;
  1304. p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
  1305. bm = p_addr + MLPP(s);
  1306. while (n--)
  1307. count += hweight_long(*bm++);
  1308. bm_unmap(p_addr);
  1309. } else {
  1310. dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s);
  1311. }
  1312. spin_unlock_irqrestore(&b->bm_lock, flags);
  1313. return count;
  1314. }
  1315. /* set all bits covered by the AL-extent al_enr */
  1316. unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
  1317. {
  1318. struct drbd_bitmap *b = mdev->bitmap;
  1319. unsigned long *p_addr, *bm;
  1320. unsigned long weight;
  1321. int count, s, e, i, do_now;
  1322. ERR_IF(!b) return 0;
  1323. ERR_IF(!b->bm_pages) return 0;
  1324. spin_lock_irq(&b->bm_lock);
  1325. if (bm_is_locked(b))
  1326. bm_print_lock_info(mdev);
  1327. weight = b->bm_set;
  1328. s = al_enr * BM_WORDS_PER_AL_EXT;
  1329. e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
  1330. /* assert that s and e are on the same page */
  1331. D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
  1332. == s >> (PAGE_SHIFT - LN2_BPL + 3));
  1333. count = 0;
  1334. if (s < b->bm_words) {
  1335. i = do_now = e-s;
  1336. p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
  1337. bm = p_addr + MLPP(s);
  1338. while (i--) {
  1339. count += hweight_long(*bm);
  1340. *bm = -1UL;
  1341. bm++;
  1342. }
  1343. bm_unmap(p_addr);
  1344. b->bm_set += do_now*BITS_PER_LONG - count;
  1345. if (e == b->bm_words)
  1346. b->bm_set -= bm_clear_surplus(b);
  1347. } else {
  1348. dev_err(DEV, "start offset (%d) too large in drbd_bm_ALe_set_all\n", s);
  1349. }
  1350. weight = b->bm_set - weight;
  1351. spin_unlock_irq(&b->bm_lock);
  1352. return weight;
  1353. }