drbd_worker.c 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542
  1. /*
  2. drbd_worker.c
  3. This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
  4. Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
  5. Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
  6. Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
  7. drbd is free software; you can redistribute it and/or modify
  8. it under the terms of the GNU General Public License as published by
  9. the Free Software Foundation; either version 2, or (at your option)
  10. any later version.
  11. drbd is distributed in the hope that it will be useful,
  12. but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. GNU General Public License for more details.
  15. You should have received a copy of the GNU General Public License
  16. along with drbd; see the file COPYING. If not, write to
  17. the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  18. */
  19. #include <linux/module.h>
  20. #include <linux/drbd.h>
  21. #include <linux/sched.h>
  22. #include <linux/smp_lock.h>
  23. #include <linux/wait.h>
  24. #include <linux/mm.h>
  25. #include <linux/memcontrol.h>
  26. #include <linux/mm_inline.h>
  27. #include <linux/slab.h>
  28. #include <linux/random.h>
  29. #include <linux/string.h>
  30. #include <linux/scatterlist.h>
  31. #include "drbd_int.h"
  32. #include "drbd_req.h"
  33. #define SLEEP_TIME (HZ/10)
  34. static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
  35. /* defined here:
  36. drbd_md_io_complete
  37. drbd_endio_sec
  38. drbd_endio_pri
  39. * more endio handlers:
  40. atodb_endio in drbd_actlog.c
  41. drbd_bm_async_io_complete in drbd_bitmap.c
  42. * For all these callbacks, note the following:
  43. * The callbacks will be called in irq context by the IDE drivers,
  44. * and in Softirqs/Tasklets/BH context by the SCSI drivers.
  45. * Try to get the locking right :)
  46. *
  47. */
  48. /* About the global_state_lock
  49. Each state transition on a device holds a read lock. In case we have
  50. to evaluate the sync after dependencies, we grab a write lock, because
  51. we need stable states on all devices for that. */
  52. rwlock_t global_state_lock;
  53. /* used for synchronous meta data and bitmap IO
  54. * submitted by drbd_md_sync_page_io()
  55. */
  56. void drbd_md_io_complete(struct bio *bio, int error)
  57. {
  58. struct drbd_md_io *md_io;
  59. md_io = (struct drbd_md_io *)bio->bi_private;
  60. md_io->error = error;
  61. complete(&md_io->event);
  62. }
  63. /* reads on behalf of the partner,
  64. * "submitted" by the receiver
  65. */
  66. void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
  67. {
  68. unsigned long flags = 0;
  69. struct drbd_conf *mdev = e->mdev;
  70. D_ASSERT(e->block_id != ID_VACANT);
  71. spin_lock_irqsave(&mdev->req_lock, flags);
  72. mdev->read_cnt += e->size >> 9;
  73. list_del(&e->w.list);
  74. if (list_empty(&mdev->read_ee))
  75. wake_up(&mdev->ee_wait);
  76. if (test_bit(__EE_WAS_ERROR, &e->flags))
  77. __drbd_chk_io_error(mdev, FALSE);
  78. spin_unlock_irqrestore(&mdev->req_lock, flags);
  79. drbd_queue_work(&mdev->data.work, &e->w);
  80. put_ldev(mdev);
  81. }
  82. static int is_failed_barrier(int ee_flags)
  83. {
  84. return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED))
  85. == (EE_IS_BARRIER|EE_WAS_ERROR);
  86. }
  87. /* writes on behalf of the partner, or resync writes,
  88. * "submitted" by the receiver, final stage. */
  89. static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
  90. {
  91. unsigned long flags = 0;
  92. struct drbd_conf *mdev = e->mdev;
  93. sector_t e_sector;
  94. int do_wake;
  95. int is_syncer_req;
  96. int do_al_complete_io;
  97. /* if this is a failed barrier request, disable use of barriers,
  98. * and schedule for resubmission */
  99. if (is_failed_barrier(e->flags)) {
  100. drbd_bump_write_ordering(mdev, WO_bdev_flush);
  101. spin_lock_irqsave(&mdev->req_lock, flags);
  102. list_del(&e->w.list);
  103. e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED;
  104. e->w.cb = w_e_reissue;
  105. /* put_ldev actually happens below, once we come here again. */
  106. __release(local);
  107. spin_unlock_irqrestore(&mdev->req_lock, flags);
  108. drbd_queue_work(&mdev->data.work, &e->w);
  109. return;
  110. }
  111. D_ASSERT(e->block_id != ID_VACANT);
  112. /* after we moved e to done_ee,
  113. * we may no longer access it,
  114. * it may be freed/reused already!
  115. * (as soon as we release the req_lock) */
  116. e_sector = e->sector;
  117. do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
  118. is_syncer_req = is_syncer_block_id(e->block_id);
  119. spin_lock_irqsave(&mdev->req_lock, flags);
  120. mdev->writ_cnt += e->size >> 9;
  121. list_del(&e->w.list); /* has been on active_ee or sync_ee */
  122. list_add_tail(&e->w.list, &mdev->done_ee);
  123. /* No hlist_del_init(&e->colision) here, we did not send the Ack yet,
  124. * neither did we wake possibly waiting conflicting requests.
  125. * done from "drbd_process_done_ee" within the appropriate w.cb
  126. * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
  127. do_wake = is_syncer_req
  128. ? list_empty(&mdev->sync_ee)
  129. : list_empty(&mdev->active_ee);
  130. if (test_bit(__EE_WAS_ERROR, &e->flags))
  131. __drbd_chk_io_error(mdev, FALSE);
  132. spin_unlock_irqrestore(&mdev->req_lock, flags);
  133. if (is_syncer_req)
  134. drbd_rs_complete_io(mdev, e_sector);
  135. if (do_wake)
  136. wake_up(&mdev->ee_wait);
  137. if (do_al_complete_io)
  138. drbd_al_complete_io(mdev, e_sector);
  139. wake_asender(mdev);
  140. put_ldev(mdev);
  141. }
  142. /* writes on behalf of the partner, or resync writes,
  143. * "submitted" by the receiver.
  144. */
  145. void drbd_endio_sec(struct bio *bio, int error)
  146. {
  147. struct drbd_epoch_entry *e = bio->bi_private;
  148. struct drbd_conf *mdev = e->mdev;
  149. int uptodate = bio_flagged(bio, BIO_UPTODATE);
  150. int is_write = bio_data_dir(bio) == WRITE;
  151. if (error)
  152. dev_warn(DEV, "%s: error=%d s=%llus\n",
  153. is_write ? "write" : "read", error,
  154. (unsigned long long)e->sector);
  155. if (!error && !uptodate) {
  156. dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
  157. is_write ? "write" : "read",
  158. (unsigned long long)e->sector);
  159. /* strange behavior of some lower level drivers...
  160. * fail the request by clearing the uptodate flag,
  161. * but do not return any error?! */
  162. error = -EIO;
  163. }
  164. if (error)
  165. set_bit(__EE_WAS_ERROR, &e->flags);
  166. bio_put(bio); /* no need for the bio anymore */
  167. if (atomic_dec_and_test(&e->pending_bios)) {
  168. if (is_write)
  169. drbd_endio_write_sec_final(e);
  170. else
  171. drbd_endio_read_sec_final(e);
  172. }
  173. }
  174. /* read, readA or write requests on R_PRIMARY coming from drbd_make_request
  175. */
  176. void drbd_endio_pri(struct bio *bio, int error)
  177. {
  178. unsigned long flags;
  179. struct drbd_request *req = bio->bi_private;
  180. struct drbd_conf *mdev = req->mdev;
  181. struct bio_and_error m;
  182. enum drbd_req_event what;
  183. int uptodate = bio_flagged(bio, BIO_UPTODATE);
  184. if (error)
  185. dev_warn(DEV, "p %s: error=%d\n",
  186. bio_data_dir(bio) == WRITE ? "write" : "read", error);
  187. if (!error && !uptodate) {
  188. dev_warn(DEV, "p %s: setting error to -EIO\n",
  189. bio_data_dir(bio) == WRITE ? "write" : "read");
  190. /* strange behavior of some lower level drivers...
  191. * fail the request by clearing the uptodate flag,
  192. * but do not return any error?! */
  193. error = -EIO;
  194. }
  195. /* to avoid recursion in __req_mod */
  196. if (unlikely(error)) {
  197. what = (bio_data_dir(bio) == WRITE)
  198. ? write_completed_with_error
  199. : (bio_rw(bio) == READ)
  200. ? read_completed_with_error
  201. : read_ahead_completed_with_error;
  202. } else
  203. what = completed_ok;
  204. bio_put(req->private_bio);
  205. req->private_bio = ERR_PTR(error);
  206. spin_lock_irqsave(&mdev->req_lock, flags);
  207. __req_mod(req, what, &m);
  208. spin_unlock_irqrestore(&mdev->req_lock, flags);
  209. if (m.bio)
  210. complete_master_bio(mdev, &m);
  211. }
  212. int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  213. {
  214. struct drbd_request *req = container_of(w, struct drbd_request, w);
  215. /* We should not detach for read io-error,
  216. * but try to WRITE the P_DATA_REPLY to the failed location,
  217. * to give the disk the chance to relocate that block */
  218. spin_lock_irq(&mdev->req_lock);
  219. if (cancel || mdev->state.pdsk != D_UP_TO_DATE) {
  220. _req_mod(req, read_retry_remote_canceled);
  221. spin_unlock_irq(&mdev->req_lock);
  222. dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n");
  223. return 1;
  224. }
  225. spin_unlock_irq(&mdev->req_lock);
  226. return w_send_read_req(mdev, w, 0);
  227. }
  228. int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  229. {
  230. ERR_IF(cancel) return 1;
  231. dev_err(DEV, "resync inactive, but callback triggered??\n");
  232. return 1; /* Simply ignore this! */
  233. }
  234. void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
  235. {
  236. struct hash_desc desc;
  237. struct scatterlist sg;
  238. struct page *page = e->pages;
  239. struct page *tmp;
  240. unsigned len;
  241. desc.tfm = tfm;
  242. desc.flags = 0;
  243. sg_init_table(&sg, 1);
  244. crypto_hash_init(&desc);
  245. while ((tmp = page_chain_next(page))) {
  246. /* all but the last page will be fully used */
  247. sg_set_page(&sg, page, PAGE_SIZE, 0);
  248. crypto_hash_update(&desc, &sg, sg.length);
  249. page = tmp;
  250. }
  251. /* and now the last, possibly only partially used page */
  252. len = e->size & (PAGE_SIZE - 1);
  253. sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
  254. crypto_hash_update(&desc, &sg, sg.length);
  255. crypto_hash_final(&desc, digest);
  256. }
  257. void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
  258. {
  259. struct hash_desc desc;
  260. struct scatterlist sg;
  261. struct bio_vec *bvec;
  262. int i;
  263. desc.tfm = tfm;
  264. desc.flags = 0;
  265. sg_init_table(&sg, 1);
  266. crypto_hash_init(&desc);
  267. __bio_for_each_segment(bvec, bio, i, 0) {
  268. sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
  269. crypto_hash_update(&desc, &sg, sg.length);
  270. }
  271. crypto_hash_final(&desc, digest);
  272. }
  273. static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  274. {
  275. struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
  276. int digest_size;
  277. void *digest;
  278. int ok;
  279. D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
  280. if (unlikely(cancel)) {
  281. drbd_free_ee(mdev, e);
  282. return 1;
  283. }
  284. if (likely((e->flags & EE_WAS_ERROR) == 0)) {
  285. digest_size = crypto_hash_digestsize(mdev->csums_tfm);
  286. digest = kmalloc(digest_size, GFP_NOIO);
  287. if (digest) {
  288. drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
  289. inc_rs_pending(mdev);
  290. ok = drbd_send_drequest_csum(mdev,
  291. e->sector,
  292. e->size,
  293. digest,
  294. digest_size,
  295. P_CSUM_RS_REQUEST);
  296. kfree(digest);
  297. } else {
  298. dev_err(DEV, "kmalloc() of digest failed.\n");
  299. ok = 0;
  300. }
  301. } else
  302. ok = 1;
  303. drbd_free_ee(mdev, e);
  304. if (unlikely(!ok))
  305. dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
  306. return ok;
  307. }
  308. #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
  309. static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
  310. {
  311. struct drbd_epoch_entry *e;
  312. if (!get_ldev(mdev))
  313. return 0;
  314. /* GFP_TRY, because if there is no memory available right now, this may
  315. * be rescheduled for later. It is "only" background resync, after all. */
  316. e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
  317. if (!e)
  318. goto fail;
  319. spin_lock_irq(&mdev->req_lock);
  320. list_add(&e->w.list, &mdev->read_ee);
  321. spin_unlock_irq(&mdev->req_lock);
  322. e->w.cb = w_e_send_csum;
  323. if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
  324. return 1;
  325. drbd_free_ee(mdev, e);
  326. fail:
  327. put_ldev(mdev);
  328. return 2;
  329. }
  330. void resync_timer_fn(unsigned long data)
  331. {
  332. unsigned long flags;
  333. struct drbd_conf *mdev = (struct drbd_conf *) data;
  334. int queue;
  335. spin_lock_irqsave(&mdev->req_lock, flags);
  336. if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) {
  337. queue = 1;
  338. if (mdev->state.conn == C_VERIFY_S)
  339. mdev->resync_work.cb = w_make_ov_request;
  340. else
  341. mdev->resync_work.cb = w_make_resync_request;
  342. } else {
  343. queue = 0;
  344. mdev->resync_work.cb = w_resync_inactive;
  345. }
  346. spin_unlock_irqrestore(&mdev->req_lock, flags);
  347. /* harmless race: list_empty outside data.work.q_lock */
  348. if (list_empty(&mdev->resync_work.list) && queue)
  349. drbd_queue_work(&mdev->data.work, &mdev->resync_work);
  350. }
  351. static int calc_resync_rate(struct drbd_conf *mdev)
  352. {
  353. int d = mdev->data_delay / 1000; /* us -> ms */
  354. int td = mdev->sync_conf.throttle_th * 100; /* 0.1s -> ms */
  355. int hd = mdev->sync_conf.hold_off_th * 100; /* 0.1s -> ms */
  356. int cr = mdev->sync_conf.rate;
  357. return d <= td ? cr :
  358. d >= hd ? 0 :
  359. cr + (cr * (td - d) / (hd - td));
  360. }
  361. int w_make_resync_request(struct drbd_conf *mdev,
  362. struct drbd_work *w, int cancel)
  363. {
  364. unsigned long bit;
  365. sector_t sector;
  366. const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
  367. int max_segment_size;
  368. int number, i, size, pe, mx;
  369. int align, queued, sndbuf;
  370. if (unlikely(cancel))
  371. return 1;
  372. if (unlikely(mdev->state.conn < C_CONNECTED)) {
  373. dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected");
  374. return 0;
  375. }
  376. if (mdev->state.conn != C_SYNC_TARGET)
  377. dev_err(DEV, "%s in w_make_resync_request\n",
  378. drbd_conn_str(mdev->state.conn));
  379. if (!get_ldev(mdev)) {
  380. /* Since we only need to access mdev->rsync a
  381. get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
  382. to continue resync with a broken disk makes no sense at
  383. all */
  384. dev_err(DEV, "Disk broke down during resync!\n");
  385. mdev->resync_work.cb = w_resync_inactive;
  386. return 1;
  387. }
  388. /* starting with drbd 8.3.8, we can handle multi-bio EEs,
  389. * if it should be necessary */
  390. max_segment_size = mdev->agreed_pro_version < 94 ?
  391. queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE;
  392. mdev->c_sync_rate = calc_resync_rate(mdev);
  393. number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
  394. pe = atomic_read(&mdev->rs_pending_cnt);
  395. mutex_lock(&mdev->data.mutex);
  396. if (mdev->data.socket)
  397. mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req);
  398. else
  399. mx = 1;
  400. mutex_unlock(&mdev->data.mutex);
  401. /* For resync rates >160MB/sec, allow more pending RS requests */
  402. if (number > mx)
  403. mx = number;
  404. /* Limit the number of pending RS requests to no more than the peer's receive buffer */
  405. if ((pe + number) > mx) {
  406. number = mx - pe;
  407. }
  408. for (i = 0; i < number; i++) {
  409. /* Stop generating RS requests, when half of the send buffer is filled */
  410. mutex_lock(&mdev->data.mutex);
  411. if (mdev->data.socket) {
  412. queued = mdev->data.socket->sk->sk_wmem_queued;
  413. sndbuf = mdev->data.socket->sk->sk_sndbuf;
  414. } else {
  415. queued = 1;
  416. sndbuf = 0;
  417. }
  418. mutex_unlock(&mdev->data.mutex);
  419. if (queued > sndbuf / 2)
  420. goto requeue;
  421. next_sector:
  422. size = BM_BLOCK_SIZE;
  423. bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo);
  424. if (bit == -1UL) {
  425. mdev->bm_resync_fo = drbd_bm_bits(mdev);
  426. mdev->resync_work.cb = w_resync_inactive;
  427. put_ldev(mdev);
  428. return 1;
  429. }
  430. sector = BM_BIT_TO_SECT(bit);
  431. if (drbd_try_rs_begin_io(mdev, sector)) {
  432. mdev->bm_resync_fo = bit;
  433. goto requeue;
  434. }
  435. mdev->bm_resync_fo = bit + 1;
  436. if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
  437. drbd_rs_complete_io(mdev, sector);
  438. goto next_sector;
  439. }
  440. #if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE
  441. /* try to find some adjacent bits.
  442. * we stop if we have already the maximum req size.
  443. *
  444. * Additionally always align bigger requests, in order to
  445. * be prepared for all stripe sizes of software RAIDs.
  446. */
  447. align = 1;
  448. for (;;) {
  449. if (size + BM_BLOCK_SIZE > max_segment_size)
  450. break;
  451. /* Be always aligned */
  452. if (sector & ((1<<(align+3))-1))
  453. break;
  454. /* do not cross extent boundaries */
  455. if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
  456. break;
  457. /* now, is it actually dirty, after all?
  458. * caution, drbd_bm_test_bit is tri-state for some
  459. * obscure reason; ( b == 0 ) would get the out-of-band
  460. * only accidentally right because of the "oddly sized"
  461. * adjustment below */
  462. if (drbd_bm_test_bit(mdev, bit+1) != 1)
  463. break;
  464. bit++;
  465. size += BM_BLOCK_SIZE;
  466. if ((BM_BLOCK_SIZE << align) <= size)
  467. align++;
  468. i++;
  469. }
  470. /* if we merged some,
  471. * reset the offset to start the next drbd_bm_find_next from */
  472. if (size > BM_BLOCK_SIZE)
  473. mdev->bm_resync_fo = bit + 1;
  474. #endif
  475. /* adjust very last sectors, in case we are oddly sized */
  476. if (sector + (size>>9) > capacity)
  477. size = (capacity-sector)<<9;
  478. if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
  479. switch (read_for_csum(mdev, sector, size)) {
  480. case 0: /* Disk failure*/
  481. put_ldev(mdev);
  482. return 0;
  483. case 2: /* Allocation failed */
  484. drbd_rs_complete_io(mdev, sector);
  485. mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
  486. goto requeue;
  487. /* case 1: everything ok */
  488. }
  489. } else {
  490. inc_rs_pending(mdev);
  491. if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
  492. sector, size, ID_SYNCER)) {
  493. dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
  494. dec_rs_pending(mdev);
  495. put_ldev(mdev);
  496. return 0;
  497. }
  498. }
  499. }
  500. if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
  501. /* last syncer _request_ was sent,
  502. * but the P_RS_DATA_REPLY not yet received. sync will end (and
  503. * next sync group will resume), as soon as we receive the last
  504. * resync data block, and the last bit is cleared.
  505. * until then resync "work" is "inactive" ...
  506. */
  507. mdev->resync_work.cb = w_resync_inactive;
  508. put_ldev(mdev);
  509. return 1;
  510. }
  511. requeue:
  512. mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
  513. put_ldev(mdev);
  514. return 1;
  515. }
  516. static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  517. {
  518. int number, i, size;
  519. sector_t sector;
  520. const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
  521. if (unlikely(cancel))
  522. return 1;
  523. if (unlikely(mdev->state.conn < C_CONNECTED)) {
  524. dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected");
  525. return 0;
  526. }
  527. number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
  528. if (atomic_read(&mdev->rs_pending_cnt) > number)
  529. goto requeue;
  530. number -= atomic_read(&mdev->rs_pending_cnt);
  531. sector = mdev->ov_position;
  532. for (i = 0; i < number; i++) {
  533. if (sector >= capacity) {
  534. mdev->resync_work.cb = w_resync_inactive;
  535. return 1;
  536. }
  537. size = BM_BLOCK_SIZE;
  538. if (drbd_try_rs_begin_io(mdev, sector)) {
  539. mdev->ov_position = sector;
  540. goto requeue;
  541. }
  542. if (sector + (size>>9) > capacity)
  543. size = (capacity-sector)<<9;
  544. inc_rs_pending(mdev);
  545. if (!drbd_send_ov_request(mdev, sector, size)) {
  546. dec_rs_pending(mdev);
  547. return 0;
  548. }
  549. sector += BM_SECT_PER_BIT;
  550. }
  551. mdev->ov_position = sector;
  552. requeue:
  553. mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
  554. return 1;
  555. }
  /*
   * Worker callback run when online verify is done: frees the work object
   * @w, prints the out-of-sync summary via ov_oos_print() and finishes the
   * resync.  Always returns 1.
   */
  int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  {
      /* w is owned by this callback, it is not touched afterwards */
      kfree(w);

      ov_oos_print(mdev);
      drbd_resync_finished(mdev);

      return 1;
  }
  /*
   * Worker callback queued by drbd_resync_finished() to retry itself:
   * frees the work object @w and calls drbd_resync_finished() again.
   * Always returns 1.
   */
  static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  {
      /* w is owned by this callback, it is not touched afterwards */
      kfree(w);

      drbd_resync_finished(mdev);

      return 1;
  }
  569. int drbd_resync_finished(struct drbd_conf *mdev)
  570. {
  571. unsigned long db, dt, dbdt;
  572. unsigned long n_oos;
  573. union drbd_state os, ns;
  574. struct drbd_work *w;
  575. char *khelper_cmd = NULL;
  576. /* Remove all elements from the resync LRU. Since future actions
  577. * might set bits in the (main) bitmap, then the entries in the
  578. * resync LRU would be wrong. */
  579. if (drbd_rs_del_all(mdev)) {
  580. /* In case this is not possible now, most probably because
  581. * there are P_RS_DATA_REPLY Packets lingering on the worker's
  582. * queue (or even the read operations for those packets
  583. * is not finished by now). Retry in 100ms. */
  584. drbd_kick_lo(mdev);
  585. __set_current_state(TASK_INTERRUPTIBLE);
  586. schedule_timeout(HZ / 10);
  587. w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
  588. if (w) {
  589. w->cb = w_resync_finished;
  590. drbd_queue_work(&mdev->data.work, w);
  591. return 1;
  592. }
  593. dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
  594. }
  595. dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
  596. if (dt <= 0)
  597. dt = 1;
  598. db = mdev->rs_total;
  599. dbdt = Bit2KB(db/dt);
  600. mdev->rs_paused /= HZ;
  601. if (!get_ldev(mdev))
  602. goto out;
  603. spin_lock_irq(&mdev->req_lock);
  604. os = mdev->state;
  605. /* This protects us against multiple calls (that can happen in the presence
  606. of application IO), and against connectivity loss just before we arrive here. */
  607. if (os.conn <= C_CONNECTED)
  608. goto out_unlock;
  609. ns = os;
  610. ns.conn = C_CONNECTED;
  611. dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
  612. (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ?
  613. "Online verify " : "Resync",
  614. dt + mdev->rs_paused, mdev->rs_paused, dbdt);
  615. n_oos = drbd_bm_total_weight(mdev);
  616. if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
  617. if (n_oos) {
  618. dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n",
  619. n_oos, Bit2KB(1));
  620. khelper_cmd = "out-of-sync";
  621. }
  622. } else {
  623. D_ASSERT((n_oos - mdev->rs_failed) == 0);
  624. if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
  625. khelper_cmd = "after-resync-target";
  626. if (mdev->csums_tfm && mdev->rs_total) {
  627. const unsigned long s = mdev->rs_same_csum;
  628. const unsigned long t = mdev->rs_total;
  629. const int ratio =
  630. (t == 0) ? 0 :
  631. (t < 100000) ? ((s*100)/t) : (s/(t/100));
  632. dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; "
  633. "transferred %luK total %luK\n",
  634. ratio,
  635. Bit2KB(mdev->rs_same_csum),
  636. Bit2KB(mdev->rs_total - mdev->rs_same_csum),
  637. Bit2KB(mdev->rs_total));
  638. }
  639. }
  640. if (mdev->rs_failed) {
  641. dev_info(DEV, " %lu failed blocks\n", mdev->rs_failed);
  642. if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
  643. ns.disk = D_INCONSISTENT;
  644. ns.pdsk = D_UP_TO_DATE;
  645. } else {
  646. ns.disk = D_UP_TO_DATE;
  647. ns.pdsk = D_INCONSISTENT;
  648. }
  649. } else {
  650. ns.disk = D_UP_TO_DATE;
  651. ns.pdsk = D_UP_TO_DATE;
  652. if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
  653. if (mdev->p_uuid) {
  654. int i;
  655. for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
  656. _drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
  657. drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
  658. _drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
  659. } else {
  660. dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
  661. }
  662. }
  663. drbd_uuid_set_bm(mdev, 0UL);
  664. if (mdev->p_uuid) {
  665. /* Now the two UUID sets are equal, update what we
  666. * know of the peer. */
  667. int i;
  668. for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
  669. mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
  670. }
  671. }
  672. _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
  673. out_unlock:
  674. spin_unlock_irq(&mdev->req_lock);
  675. put_ldev(mdev);
  676. out:
  677. mdev->rs_total = 0;
  678. mdev->rs_failed = 0;
  679. mdev->rs_paused = 0;
  680. mdev->ov_start_sector = 0;
  681. if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
  682. dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
  683. drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
  684. }
  685. if (khelper_cmd)
  686. drbd_khelper(mdev, khelper_cmd);
  687. return 1;
  688. }
  689. /* helper */
  690. static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
  691. {
  692. if (drbd_ee_has_active_page(e)) {
  693. /* This might happen if sendpage() has not finished */
  694. spin_lock_irq(&mdev->req_lock);
  695. list_add_tail(&e->w.list, &mdev->net_ee);
  696. spin_unlock_irq(&mdev->req_lock);
  697. } else
  698. drbd_free_ee(mdev, e);
  699. }
  700. /**
  701. * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
  702. * @mdev: DRBD device.
  703. * @w: work object.
  704. * @cancel: The connection will be closed anyways
  705. */
  706. int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  707. {
  708. struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
  709. int ok;
  710. if (unlikely(cancel)) {
  711. drbd_free_ee(mdev, e);
  712. dec_unacked(mdev);
  713. return 1;
  714. }
  715. if (likely((e->flags & EE_WAS_ERROR) == 0)) {
  716. ok = drbd_send_block(mdev, P_DATA_REPLY, e);
  717. } else {
  718. if (__ratelimit(&drbd_ratelimit_state))
  719. dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
  720. (unsigned long long)e->sector);
  721. ok = drbd_send_ack(mdev, P_NEG_DREPLY, e);
  722. }
  723. dec_unacked(mdev);
  724. move_to_net_ee_or_free(mdev, e);
  725. if (unlikely(!ok))
  726. dev_err(DEV, "drbd_send_block() failed\n");
  727. return ok;
  728. }
  729. /**
  730. * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS
  731. * @mdev: DRBD device.
  732. * @w: work object.
  733. * @cancel: The connection will be closed anyways
  734. */
  735. int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  736. {
  737. struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
  738. int ok;
  739. if (unlikely(cancel)) {
  740. drbd_free_ee(mdev, e);
  741. dec_unacked(mdev);
  742. return 1;
  743. }
  744. if (get_ldev_if_state(mdev, D_FAILED)) {
  745. drbd_rs_complete_io(mdev, e->sector);
  746. put_ldev(mdev);
  747. }
  748. if (likely((e->flags & EE_WAS_ERROR) == 0)) {
  749. if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
  750. inc_rs_pending(mdev);
  751. ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
  752. } else {
  753. if (__ratelimit(&drbd_ratelimit_state))
  754. dev_err(DEV, "Not sending RSDataReply, "
  755. "partner DISKLESS!\n");
  756. ok = 1;
  757. }
  758. } else {
  759. if (__ratelimit(&drbd_ratelimit_state))
  760. dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
  761. (unsigned long long)e->sector);
  762. ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
  763. /* update resync data with failure */
  764. drbd_rs_failed_io(mdev, e->sector, e->size);
  765. }
  766. dec_unacked(mdev);
  767. move_to_net_ee_or_free(mdev, e);
  768. if (unlikely(!ok))
  769. dev_err(DEV, "drbd_send_block() failed\n");
  770. return ok;
  771. }
  772. int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  773. {
  774. struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
  775. struct digest_info *di;
  776. int digest_size;
  777. void *digest = NULL;
  778. int ok, eq = 0;
  779. if (unlikely(cancel)) {
  780. drbd_free_ee(mdev, e);
  781. dec_unacked(mdev);
  782. return 1;
  783. }
  784. drbd_rs_complete_io(mdev, e->sector);
  785. di = (struct digest_info *)(unsigned long)e->block_id;
  786. if (likely((e->flags & EE_WAS_ERROR) == 0)) {
  787. /* quick hack to try to avoid a race against reconfiguration.
  788. * a real fix would be much more involved,
  789. * introducing more locking mechanisms */
  790. if (mdev->csums_tfm) {
  791. digest_size = crypto_hash_digestsize(mdev->csums_tfm);
  792. D_ASSERT(digest_size == di->digest_size);
  793. digest = kmalloc(digest_size, GFP_NOIO);
  794. }
  795. if (digest) {
  796. drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
  797. eq = !memcmp(digest, di->digest, digest_size);
  798. kfree(digest);
  799. }
  800. if (eq) {
  801. drbd_set_in_sync(mdev, e->sector, e->size);
  802. /* rs_same_csums unit is BM_BLOCK_SIZE */
  803. mdev->rs_same_csum += e->size >> BM_BLOCK_SHIFT;
  804. ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
  805. } else {
  806. inc_rs_pending(mdev);
  807. e->block_id = ID_SYNCER;
  808. ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
  809. }
  810. } else {
  811. ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
  812. if (__ratelimit(&drbd_ratelimit_state))
  813. dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
  814. }
  815. dec_unacked(mdev);
  816. kfree(di);
  817. move_to_net_ee_or_free(mdev, e);
  818. if (unlikely(!ok))
  819. dev_err(DEV, "drbd_send_block/ack() failed\n");
  820. return ok;
  821. }
  822. int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  823. {
  824. struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
  825. int digest_size;
  826. void *digest;
  827. int ok = 1;
  828. if (unlikely(cancel))
  829. goto out;
  830. if (unlikely((e->flags & EE_WAS_ERROR) != 0))
  831. goto out;
  832. digest_size = crypto_hash_digestsize(mdev->verify_tfm);
  833. /* FIXME if this allocation fails, online verify will not terminate! */
  834. digest = kmalloc(digest_size, GFP_NOIO);
  835. if (digest) {
  836. drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
  837. inc_rs_pending(mdev);
  838. ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
  839. digest, digest_size, P_OV_REPLY);
  840. if (!ok)
  841. dec_rs_pending(mdev);
  842. kfree(digest);
  843. }
  844. out:
  845. drbd_free_ee(mdev, e);
  846. dec_unacked(mdev);
  847. return ok;
  848. }
  849. void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
  850. {
  851. if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
  852. mdev->ov_last_oos_size += size>>9;
  853. } else {
  854. mdev->ov_last_oos_start = sector;
  855. mdev->ov_last_oos_size = size>>9;
  856. }
  857. drbd_set_out_of_sync(mdev, sector, size);
  858. set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
  859. }
  860. int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  861. {
  862. struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
  863. struct digest_info *di;
  864. int digest_size;
  865. void *digest;
  866. int ok, eq = 0;
  867. if (unlikely(cancel)) {
  868. drbd_free_ee(mdev, e);
  869. dec_unacked(mdev);
  870. return 1;
  871. }
  872. /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
  873. * the resync lru has been cleaned up already */
  874. drbd_rs_complete_io(mdev, e->sector);
  875. di = (struct digest_info *)(unsigned long)e->block_id;
  876. if (likely((e->flags & EE_WAS_ERROR) == 0)) {
  877. digest_size = crypto_hash_digestsize(mdev->verify_tfm);
  878. digest = kmalloc(digest_size, GFP_NOIO);
  879. if (digest) {
  880. drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
  881. D_ASSERT(digest_size == di->digest_size);
  882. eq = !memcmp(digest, di->digest, digest_size);
  883. kfree(digest);
  884. }
  885. } else {
  886. ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
  887. if (__ratelimit(&drbd_ratelimit_state))
  888. dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
  889. }
  890. dec_unacked(mdev);
  891. kfree(di);
  892. if (!eq)
  893. drbd_ov_oos_found(mdev, e->sector, e->size);
  894. else
  895. ov_oos_print(mdev);
  896. ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size,
  897. eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
  898. drbd_free_ee(mdev, e);
  899. if (--mdev->ov_left == 0) {
  900. ov_oos_print(mdev);
  901. drbd_resync_finished(mdev);
  902. }
  903. return ok;
  904. }
  905. int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  906. {
  907. struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
  908. complete(&b->done);
  909. return 1;
  910. }
  911. int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  912. {
  913. struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
  914. struct p_barrier *p = &mdev->data.sbuf.barrier;
  915. int ok = 1;
  916. /* really avoid racing with tl_clear. w.cb may have been referenced
  917. * just before it was reassigned and re-queued, so double check that.
  918. * actually, this race was harmless, since we only try to send the
  919. * barrier packet here, and otherwise do nothing with the object.
  920. * but compare with the head of w_clear_epoch */
  921. spin_lock_irq(&mdev->req_lock);
  922. if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
  923. cancel = 1;
  924. spin_unlock_irq(&mdev->req_lock);
  925. if (cancel)
  926. return 1;
  927. if (!drbd_get_data_sock(mdev))
  928. return 0;
  929. p->barrier = b->br_number;
  930. /* inc_ap_pending was done where this was queued.
  931. * dec_ap_pending will be done in got_BarrierAck
  932. * or (on connection loss) in w_clear_epoch. */
  933. ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
  934. (struct p_header *)p, sizeof(*p), 0);
  935. drbd_put_data_sock(mdev);
  936. return ok;
  937. }
  938. int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  939. {
  940. if (cancel)
  941. return 1;
  942. return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
  943. }
  944. /**
  945. * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
  946. * @mdev: DRBD device.
  947. * @w: work object.
  948. * @cancel: The connection will be closed anyways
  949. */
  950. int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  951. {
  952. struct drbd_request *req = container_of(w, struct drbd_request, w);
  953. int ok;
  954. if (unlikely(cancel)) {
  955. req_mod(req, send_canceled);
  956. return 1;
  957. }
  958. ok = drbd_send_dblock(mdev, req);
  959. req_mod(req, ok ? handed_over_to_network : send_failed);
  960. return ok;
  961. }
  962. /**
  963. * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
  964. * @mdev: DRBD device.
  965. * @w: work object.
  966. * @cancel: The connection will be closed anyways
  967. */
  968. int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  969. {
  970. struct drbd_request *req = container_of(w, struct drbd_request, w);
  971. int ok;
  972. if (unlikely(cancel)) {
  973. req_mod(req, send_canceled);
  974. return 1;
  975. }
  976. ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size,
  977. (unsigned long)req);
  978. if (!ok) {
  979. /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send();
  980. * so this is probably redundant */
  981. if (mdev->state.conn >= C_CONNECTED)
  982. drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
  983. }
  984. req_mod(req, ok ? handed_over_to_network : send_failed);
  985. return ok;
  986. }
  987. static int _drbd_may_sync_now(struct drbd_conf *mdev)
  988. {
  989. struct drbd_conf *odev = mdev;
  990. while (1) {
  991. if (odev->sync_conf.after == -1)
  992. return 1;
  993. odev = minor_to_mdev(odev->sync_conf.after);
  994. ERR_IF(!odev) return 1;
  995. if ((odev->state.conn >= C_SYNC_SOURCE &&
  996. odev->state.conn <= C_PAUSED_SYNC_T) ||
  997. odev->state.aftr_isp || odev->state.peer_isp ||
  998. odev->state.user_isp)
  999. return 0;
  1000. }
  1001. }
  1002. /**
  1003. * _drbd_pause_after() - Pause resync on all devices that may not resync now
  1004. * @mdev: DRBD device.
  1005. *
  1006. * Called from process context only (admin command and after_state_ch).
  1007. */
  1008. static int _drbd_pause_after(struct drbd_conf *mdev)
  1009. {
  1010. struct drbd_conf *odev;
  1011. int i, rv = 0;
  1012. for (i = 0; i < minor_count; i++) {
  1013. odev = minor_to_mdev(i);
  1014. if (!odev)
  1015. continue;
  1016. if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
  1017. continue;
  1018. if (!_drbd_may_sync_now(odev))
  1019. rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
  1020. != SS_NOTHING_TO_DO);
  1021. }
  1022. return rv;
  1023. }
  1024. /**
  1025. * _drbd_resume_next() - Resume resync on all devices that may resync now
  1026. * @mdev: DRBD device.
  1027. *
  1028. * Called from process context only (admin command and worker).
  1029. */
  1030. static int _drbd_resume_next(struct drbd_conf *mdev)
  1031. {
  1032. struct drbd_conf *odev;
  1033. int i, rv = 0;
  1034. for (i = 0; i < minor_count; i++) {
  1035. odev = minor_to_mdev(i);
  1036. if (!odev)
  1037. continue;
  1038. if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
  1039. continue;
  1040. if (odev->state.aftr_isp) {
  1041. if (_drbd_may_sync_now(odev))
  1042. rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
  1043. CS_HARD, NULL)
  1044. != SS_NOTHING_TO_DO) ;
  1045. }
  1046. }
  1047. return rv;
  1048. }
  1049. void resume_next_sg(struct drbd_conf *mdev)
  1050. {
  1051. write_lock_irq(&global_state_lock);
  1052. _drbd_resume_next(mdev);
  1053. write_unlock_irq(&global_state_lock);
  1054. }
  1055. void suspend_other_sg(struct drbd_conf *mdev)
  1056. {
  1057. write_lock_irq(&global_state_lock);
  1058. _drbd_pause_after(mdev);
  1059. write_unlock_irq(&global_state_lock);
  1060. }
  1061. static int sync_after_error(struct drbd_conf *mdev, int o_minor)
  1062. {
  1063. struct drbd_conf *odev;
  1064. if (o_minor == -1)
  1065. return NO_ERROR;
  1066. if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
  1067. return ERR_SYNC_AFTER;
  1068. /* check for loops */
  1069. odev = minor_to_mdev(o_minor);
  1070. while (1) {
  1071. if (odev == mdev)
  1072. return ERR_SYNC_AFTER_CYCLE;
  1073. /* dependency chain ends here, no cycles. */
  1074. if (odev->sync_conf.after == -1)
  1075. return NO_ERROR;
  1076. /* follow the dependency chain */
  1077. odev = minor_to_mdev(odev->sync_conf.after);
  1078. }
  1079. }
  1080. int drbd_alter_sa(struct drbd_conf *mdev, int na)
  1081. {
  1082. int changes;
  1083. int retcode;
  1084. write_lock_irq(&global_state_lock);
  1085. retcode = sync_after_error(mdev, na);
  1086. if (retcode == NO_ERROR) {
  1087. mdev->sync_conf.after = na;
  1088. do {
  1089. changes = _drbd_pause_after(mdev);
  1090. changes |= _drbd_resume_next(mdev);
  1091. } while (changes);
  1092. }
  1093. write_unlock_irq(&global_state_lock);
  1094. return retcode;
  1095. }
  1096. static void ping_peer(struct drbd_conf *mdev)
  1097. {
  1098. clear_bit(GOT_PING_ACK, &mdev->flags);
  1099. request_ping(mdev);
  1100. wait_event(mdev->misc_wait,
  1101. test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
  1102. }
  1103. /**
  1104. * drbd_start_resync() - Start the resync process
  1105. * @mdev: DRBD device.
  1106. * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET
  1107. *
  1108. * This function might bring you directly into one of the
  1109. * C_PAUSED_SYNC_* states.
  1110. */
  1111. void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
  1112. {
  1113. union drbd_state ns;
  1114. int r;
  1115. if (mdev->state.conn >= C_SYNC_SOURCE) {
  1116. dev_err(DEV, "Resync already running!\n");
  1117. return;
  1118. }
  1119. /* In case a previous resync run was aborted by an IO error/detach on the peer. */
  1120. drbd_rs_cancel_all(mdev);
  1121. if (side == C_SYNC_TARGET) {
  1122. /* Since application IO was locked out during C_WF_BITMAP_T and
  1123. C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
  1124. we check that we might make the data inconsistent. */
  1125. r = drbd_khelper(mdev, "before-resync-target");
  1126. r = (r >> 8) & 0xff;
  1127. if (r > 0) {
  1128. dev_info(DEV, "before-resync-target handler returned %d, "
  1129. "dropping connection.\n", r);
  1130. drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
  1131. return;
  1132. }
  1133. }
  1134. drbd_state_lock(mdev);
  1135. if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
  1136. drbd_state_unlock(mdev);
  1137. return;
  1138. }
  1139. if (side == C_SYNC_TARGET) {
  1140. mdev->bm_resync_fo = 0;
  1141. } else /* side == C_SYNC_SOURCE */ {
  1142. u64 uuid;
  1143. get_random_bytes(&uuid, sizeof(u64));
  1144. drbd_uuid_set(mdev, UI_BITMAP, uuid);
  1145. drbd_send_sync_uuid(mdev, uuid);
  1146. D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
  1147. }
  1148. write_lock_irq(&global_state_lock);
  1149. ns = mdev->state;
  1150. ns.aftr_isp = !_drbd_may_sync_now(mdev);
  1151. ns.conn = side;
  1152. if (side == C_SYNC_TARGET)
  1153. ns.disk = D_INCONSISTENT;
  1154. else /* side == C_SYNC_SOURCE */
  1155. ns.pdsk = D_INCONSISTENT;
  1156. r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
  1157. ns = mdev->state;
  1158. if (ns.conn < C_CONNECTED)
  1159. r = SS_UNKNOWN_ERROR;
  1160. if (r == SS_SUCCESS) {
  1161. mdev->rs_total =
  1162. mdev->rs_mark_left = drbd_bm_total_weight(mdev);
  1163. mdev->rs_failed = 0;
  1164. mdev->rs_paused = 0;
  1165. mdev->rs_start =
  1166. mdev->rs_mark_time = jiffies;
  1167. mdev->rs_same_csum = 0;
  1168. _drbd_pause_after(mdev);
  1169. }
  1170. write_unlock_irq(&global_state_lock);
  1171. put_ldev(mdev);
  1172. if (r == SS_SUCCESS) {
  1173. dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
  1174. drbd_conn_str(ns.conn),
  1175. (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
  1176. (unsigned long) mdev->rs_total);
  1177. if (mdev->rs_total == 0) {
  1178. /* Peer still reachable? Beware of failing before-resync-target handlers! */
  1179. ping_peer(mdev);
  1180. drbd_resync_finished(mdev);
  1181. }
  1182. /* ns.conn may already be != mdev->state.conn,
  1183. * we may have been paused in between, or become paused until
  1184. * the timer triggers.
  1185. * No matter, that is handled in resync_timer_fn() */
  1186. if (ns.conn == C_SYNC_TARGET)
  1187. mod_timer(&mdev->resync_timer, jiffies);
  1188. drbd_md_sync(mdev);
  1189. }
  1190. drbd_state_unlock(mdev);
  1191. }
  1192. int drbd_worker(struct drbd_thread *thi)
  1193. {
  1194. struct drbd_conf *mdev = thi->mdev;
  1195. struct drbd_work *w = NULL;
  1196. LIST_HEAD(work_list);
  1197. int intr = 0, i;
  1198. sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));
  1199. while (get_t_state(thi) == Running) {
  1200. drbd_thread_current_set_cpu(mdev);
  1201. if (down_trylock(&mdev->data.work.s)) {
  1202. mutex_lock(&mdev->data.mutex);
  1203. if (mdev->data.socket && !mdev->net_conf->no_cork)
  1204. drbd_tcp_uncork(mdev->data.socket);
  1205. mutex_unlock(&mdev->data.mutex);
  1206. intr = down_interruptible(&mdev->data.work.s);
  1207. mutex_lock(&mdev->data.mutex);
  1208. if (mdev->data.socket && !mdev->net_conf->no_cork)
  1209. drbd_tcp_cork(mdev->data.socket);
  1210. mutex_unlock(&mdev->data.mutex);
  1211. }
  1212. if (intr) {
  1213. D_ASSERT(intr == -EINTR);
  1214. flush_signals(current);
  1215. ERR_IF (get_t_state(thi) == Running)
  1216. continue;
  1217. break;
  1218. }
  1219. if (get_t_state(thi) != Running)
  1220. break;
  1221. /* With this break, we have done a down() but not consumed
  1222. the entry from the list. The cleanup code takes care of
  1223. this... */
  1224. w = NULL;
  1225. spin_lock_irq(&mdev->data.work.q_lock);
  1226. ERR_IF(list_empty(&mdev->data.work.q)) {
  1227. /* something terribly wrong in our logic.
  1228. * we were able to down() the semaphore,
  1229. * but the list is empty... doh.
  1230. *
  1231. * what is the best thing to do now?
  1232. * try again from scratch, restarting the receiver,
  1233. * asender, whatnot? could break even more ugly,
  1234. * e.g. when we are primary, but no good local data.
  1235. *
  1236. * I'll try to get away just starting over this loop.
  1237. */
  1238. spin_unlock_irq(&mdev->data.work.q_lock);
  1239. continue;
  1240. }
  1241. w = list_entry(mdev->data.work.q.next, struct drbd_work, list);
  1242. list_del_init(&w->list);
  1243. spin_unlock_irq(&mdev->data.work.q_lock);
  1244. if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) {
  1245. /* dev_warn(DEV, "worker: a callback failed! \n"); */
  1246. if (mdev->state.conn >= C_CONNECTED)
  1247. drbd_force_state(mdev,
  1248. NS(conn, C_NETWORK_FAILURE));
  1249. }
  1250. }
  1251. D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags));
  1252. D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags));
  1253. spin_lock_irq(&mdev->data.work.q_lock);
  1254. i = 0;
  1255. while (!list_empty(&mdev->data.work.q)) {
  1256. list_splice_init(&mdev->data.work.q, &work_list);
  1257. spin_unlock_irq(&mdev->data.work.q_lock);
  1258. while (!list_empty(&work_list)) {
  1259. w = list_entry(work_list.next, struct drbd_work, list);
  1260. list_del_init(&w->list);
  1261. w->cb(mdev, w, 1);
  1262. i++; /* dead debugging code */
  1263. }
  1264. spin_lock_irq(&mdev->data.work.q_lock);
  1265. }
  1266. sema_init(&mdev->data.work.s, 0);
  1267. /* DANGEROUS race: if someone did queue his work within the spinlock,
  1268. * but up() ed outside the spinlock, we could get an up() on the
  1269. * semaphore without corresponding list entry.
  1270. * So don't do that.
  1271. */
  1272. spin_unlock_irq(&mdev->data.work.q_lock);
  1273. D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
  1274. /* _drbd_set_state only uses stop_nowait.
  1275. * wait here for the Exiting receiver. */
  1276. drbd_thread_stop(&mdev->receiver);
  1277. drbd_mdev_cleanup(mdev);
  1278. dev_info(DEV, "worker terminated\n");
  1279. clear_bit(DEVICE_DYING, &mdev->flags);
  1280. clear_bit(CONFIG_PENDING, &mdev->flags);
  1281. wake_up(&mdev->state_wait);
  1282. return 0;
  1283. }