drbd_worker.c 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558
  1. /*
  2. drbd_worker.c
  3. This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
  4. Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
  5. Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
  6. Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
  7. drbd is free software; you can redistribute it and/or modify
  8. it under the terms of the GNU General Public License as published by
  9. the Free Software Foundation; either version 2, or (at your option)
  10. any later version.
  11. drbd is distributed in the hope that it will be useful,
  12. but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. GNU General Public License for more details.
  15. You should have received a copy of the GNU General Public License
  16. along with drbd; see the file COPYING. If not, write to
  17. the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  18. */
  19. #include <linux/module.h>
  20. #include <linux/drbd.h>
  21. #include <linux/sched.h>
  22. #include <linux/smp_lock.h>
  23. #include <linux/wait.h>
  24. #include <linux/mm.h>
  25. #include <linux/memcontrol.h>
  26. #include <linux/mm_inline.h>
  27. #include <linux/slab.h>
  28. #include <linux/random.h>
  29. #include <linux/string.h>
  30. #include <linux/scatterlist.h>
  31. #include "drbd_int.h"
  32. #include "drbd_req.h"
  33. #define SLEEP_TIME (HZ/10)
  34. static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
  35. /* defined here:
  36. drbd_md_io_complete
  37. drbd_endio_sec
  38. drbd_endio_pri
  39. * more endio handlers:
  40. atodb_endio in drbd_actlog.c
  41. drbd_bm_async_io_complete in drbd_bitmap.c
  42. * For all these callbacks, note the following:
  43. * The callbacks will be called in irq context by the IDE drivers,
  44. * and in Softirqs/Tasklets/BH context by the SCSI drivers.
  45. * Try to get the locking right :)
  46. *
  47. */
  48. /* About the global_state_lock
  49. Each state transition on a device holds a read lock. In case we have
  50. to evaluate the sync after dependencies, we grab a write lock, because
  51. we need stable states on all devices for that. */
  52. rwlock_t global_state_lock;
  53. /* used for synchronous meta data and bitmap IO
  54. * submitted by drbd_md_sync_page_io()
  55. */
  56. void drbd_md_io_complete(struct bio *bio, int error)
  57. {
  58. struct drbd_md_io *md_io;
  59. md_io = (struct drbd_md_io *)bio->bi_private;
  60. md_io->error = error;
  61. complete(&md_io->event);
  62. }
  63. /* reads on behalf of the partner,
  64. * "submitted" by the receiver
  65. */
  66. void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
  67. {
  68. unsigned long flags = 0;
  69. struct drbd_conf *mdev = e->mdev;
  70. D_ASSERT(e->block_id != ID_VACANT);
  71. spin_lock_irqsave(&mdev->req_lock, flags);
  72. mdev->read_cnt += e->size >> 9;
  73. list_del(&e->w.list);
  74. if (list_empty(&mdev->read_ee))
  75. wake_up(&mdev->ee_wait);
  76. if (test_bit(__EE_WAS_ERROR, &e->flags))
  77. __drbd_chk_io_error(mdev, FALSE);
  78. spin_unlock_irqrestore(&mdev->req_lock, flags);
  79. drbd_queue_work(&mdev->data.work, &e->w);
  80. put_ldev(mdev);
  81. }
  82. static int is_failed_barrier(int ee_flags)
  83. {
  84. return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED))
  85. == (EE_IS_BARRIER|EE_WAS_ERROR);
  86. }
  87. /* writes on behalf of the partner, or resync writes,
  88. * "submitted" by the receiver, final stage. */
  89. static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
  90. {
  91. unsigned long flags = 0;
  92. struct drbd_conf *mdev = e->mdev;
  93. sector_t e_sector;
  94. int do_wake;
  95. int is_syncer_req;
  96. int do_al_complete_io;
  97. /* if this is a failed barrier request, disable use of barriers,
  98. * and schedule for resubmission */
  99. if (is_failed_barrier(e->flags)) {
  100. drbd_bump_write_ordering(mdev, WO_bdev_flush);
  101. spin_lock_irqsave(&mdev->req_lock, flags);
  102. list_del(&e->w.list);
  103. e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED;
  104. e->w.cb = w_e_reissue;
  105. /* put_ldev actually happens below, once we come here again. */
  106. __release(local);
  107. spin_unlock_irqrestore(&mdev->req_lock, flags);
  108. drbd_queue_work(&mdev->data.work, &e->w);
  109. return;
  110. }
  111. D_ASSERT(e->block_id != ID_VACANT);
  112. /* after we moved e to done_ee,
  113. * we may no longer access it,
  114. * it may be freed/reused already!
  115. * (as soon as we release the req_lock) */
  116. e_sector = e->sector;
  117. do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
  118. is_syncer_req = is_syncer_block_id(e->block_id);
  119. spin_lock_irqsave(&mdev->req_lock, flags);
  120. mdev->writ_cnt += e->size >> 9;
  121. list_del(&e->w.list); /* has been on active_ee or sync_ee */
  122. list_add_tail(&e->w.list, &mdev->done_ee);
  123. /* No hlist_del_init(&e->colision) here, we did not send the Ack yet,
  124. * neither did we wake possibly waiting conflicting requests.
  125. * done from "drbd_process_done_ee" within the appropriate w.cb
  126. * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
  127. do_wake = is_syncer_req
  128. ? list_empty(&mdev->sync_ee)
  129. : list_empty(&mdev->active_ee);
  130. if (test_bit(__EE_WAS_ERROR, &e->flags))
  131. __drbd_chk_io_error(mdev, FALSE);
  132. spin_unlock_irqrestore(&mdev->req_lock, flags);
  133. if (is_syncer_req)
  134. drbd_rs_complete_io(mdev, e_sector);
  135. if (do_wake)
  136. wake_up(&mdev->ee_wait);
  137. if (do_al_complete_io)
  138. drbd_al_complete_io(mdev, e_sector);
  139. wake_asender(mdev);
  140. put_ldev(mdev);
  141. }
  142. /* writes on behalf of the partner, or resync writes,
  143. * "submitted" by the receiver.
  144. */
  145. void drbd_endio_sec(struct bio *bio, int error)
  146. {
  147. struct drbd_epoch_entry *e = bio->bi_private;
  148. struct drbd_conf *mdev = e->mdev;
  149. int uptodate = bio_flagged(bio, BIO_UPTODATE);
  150. int is_write = bio_data_dir(bio) == WRITE;
  151. if (error)
  152. dev_warn(DEV, "%s: error=%d s=%llus\n",
  153. is_write ? "write" : "read", error,
  154. (unsigned long long)e->sector);
  155. if (!error && !uptodate) {
  156. dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
  157. is_write ? "write" : "read",
  158. (unsigned long long)e->sector);
  159. /* strange behavior of some lower level drivers...
  160. * fail the request by clearing the uptodate flag,
  161. * but do not return any error?! */
  162. error = -EIO;
  163. }
  164. if (error)
  165. set_bit(__EE_WAS_ERROR, &e->flags);
  166. bio_put(bio); /* no need for the bio anymore */
  167. if (atomic_dec_and_test(&e->pending_bios)) {
  168. if (is_write)
  169. drbd_endio_write_sec_final(e);
  170. else
  171. drbd_endio_read_sec_final(e);
  172. }
  173. }
  174. /* read, readA or write requests on R_PRIMARY coming from drbd_make_request
  175. */
  176. void drbd_endio_pri(struct bio *bio, int error)
  177. {
  178. unsigned long flags;
  179. struct drbd_request *req = bio->bi_private;
  180. struct drbd_conf *mdev = req->mdev;
  181. struct bio_and_error m;
  182. enum drbd_req_event what;
  183. int uptodate = bio_flagged(bio, BIO_UPTODATE);
  184. if (error)
  185. dev_warn(DEV, "p %s: error=%d\n",
  186. bio_data_dir(bio) == WRITE ? "write" : "read", error);
  187. if (!error && !uptodate) {
  188. dev_warn(DEV, "p %s: setting error to -EIO\n",
  189. bio_data_dir(bio) == WRITE ? "write" : "read");
  190. /* strange behavior of some lower level drivers...
  191. * fail the request by clearing the uptodate flag,
  192. * but do not return any error?! */
  193. error = -EIO;
  194. }
  195. /* to avoid recursion in __req_mod */
  196. if (unlikely(error)) {
  197. what = (bio_data_dir(bio) == WRITE)
  198. ? write_completed_with_error
  199. : (bio_rw(bio) == READA)
  200. ? read_completed_with_error
  201. : read_ahead_completed_with_error;
  202. } else
  203. what = completed_ok;
  204. bio_put(req->private_bio);
  205. req->private_bio = ERR_PTR(error);
  206. spin_lock_irqsave(&mdev->req_lock, flags);
  207. __req_mod(req, what, &m);
  208. spin_unlock_irqrestore(&mdev->req_lock, flags);
  209. if (m.bio)
  210. complete_master_bio(mdev, &m);
  211. }
  212. int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  213. {
  214. struct drbd_request *req = container_of(w, struct drbd_request, w);
  215. /* NOTE: mdev->ldev can be NULL by the time we get here! */
  216. /* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */
  217. /* the only way this callback is scheduled is from _req_may_be_done,
  218. * when it is done and had a local write error, see comments there */
  219. drbd_req_free(req);
  220. return TRUE;
  221. }
  222. int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  223. {
  224. struct drbd_request *req = container_of(w, struct drbd_request, w);
  225. /* We should not detach for read io-error,
  226. * but try to WRITE the P_DATA_REPLY to the failed location,
  227. * to give the disk the chance to relocate that block */
  228. spin_lock_irq(&mdev->req_lock);
  229. if (cancel ||
  230. mdev->state.conn < C_CONNECTED ||
  231. mdev->state.pdsk <= D_INCONSISTENT) {
  232. _req_mod(req, send_canceled);
  233. spin_unlock_irq(&mdev->req_lock);
  234. dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n");
  235. return 1;
  236. }
  237. spin_unlock_irq(&mdev->req_lock);
  238. return w_send_read_req(mdev, w, 0);
  239. }
  240. int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  241. {
  242. ERR_IF(cancel) return 1;
  243. dev_err(DEV, "resync inactive, but callback triggered??\n");
  244. return 1; /* Simply ignore this! */
  245. }
  246. void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
  247. {
  248. struct hash_desc desc;
  249. struct scatterlist sg;
  250. struct page *page = e->pages;
  251. struct page *tmp;
  252. unsigned len;
  253. desc.tfm = tfm;
  254. desc.flags = 0;
  255. sg_init_table(&sg, 1);
  256. crypto_hash_init(&desc);
  257. while ((tmp = page_chain_next(page))) {
  258. /* all but the last page will be fully used */
  259. sg_set_page(&sg, page, PAGE_SIZE, 0);
  260. crypto_hash_update(&desc, &sg, sg.length);
  261. page = tmp;
  262. }
  263. /* and now the last, possibly only partially used page */
  264. len = e->size & (PAGE_SIZE - 1);
  265. sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
  266. crypto_hash_update(&desc, &sg, sg.length);
  267. crypto_hash_final(&desc, digest);
  268. }
  269. void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
  270. {
  271. struct hash_desc desc;
  272. struct scatterlist sg;
  273. struct bio_vec *bvec;
  274. int i;
  275. desc.tfm = tfm;
  276. desc.flags = 0;
  277. sg_init_table(&sg, 1);
  278. crypto_hash_init(&desc);
  279. __bio_for_each_segment(bvec, bio, i, 0) {
  280. sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
  281. crypto_hash_update(&desc, &sg, sg.length);
  282. }
  283. crypto_hash_final(&desc, digest);
  284. }
  285. static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  286. {
  287. struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
  288. int digest_size;
  289. void *digest;
  290. int ok;
  291. D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
  292. if (unlikely(cancel)) {
  293. drbd_free_ee(mdev, e);
  294. return 1;
  295. }
  296. if (likely((e->flags & EE_WAS_ERROR) == 0)) {
  297. digest_size = crypto_hash_digestsize(mdev->csums_tfm);
  298. digest = kmalloc(digest_size, GFP_NOIO);
  299. if (digest) {
  300. drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
  301. inc_rs_pending(mdev);
  302. ok = drbd_send_drequest_csum(mdev,
  303. e->sector,
  304. e->size,
  305. digest,
  306. digest_size,
  307. P_CSUM_RS_REQUEST);
  308. kfree(digest);
  309. } else {
  310. dev_err(DEV, "kmalloc() of digest failed.\n");
  311. ok = 0;
  312. }
  313. } else
  314. ok = 1;
  315. drbd_free_ee(mdev, e);
  316. if (unlikely(!ok))
  317. dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
  318. return ok;
  319. }
  320. #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
  321. static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
  322. {
  323. struct drbd_epoch_entry *e;
  324. if (!get_ldev(mdev))
  325. return 0;
  326. /* GFP_TRY, because if there is no memory available right now, this may
  327. * be rescheduled for later. It is "only" background resync, after all. */
  328. e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
  329. if (!e)
  330. goto fail;
  331. spin_lock_irq(&mdev->req_lock);
  332. list_add(&e->w.list, &mdev->read_ee);
  333. spin_unlock_irq(&mdev->req_lock);
  334. e->w.cb = w_e_send_csum;
  335. if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
  336. return 1;
  337. drbd_free_ee(mdev, e);
  338. fail:
  339. put_ldev(mdev);
  340. return 2;
  341. }
  342. void resync_timer_fn(unsigned long data)
  343. {
  344. unsigned long flags;
  345. struct drbd_conf *mdev = (struct drbd_conf *) data;
  346. int queue;
  347. spin_lock_irqsave(&mdev->req_lock, flags);
  348. if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) {
  349. queue = 1;
  350. if (mdev->state.conn == C_VERIFY_S)
  351. mdev->resync_work.cb = w_make_ov_request;
  352. else
  353. mdev->resync_work.cb = w_make_resync_request;
  354. } else {
  355. queue = 0;
  356. mdev->resync_work.cb = w_resync_inactive;
  357. }
  358. spin_unlock_irqrestore(&mdev->req_lock, flags);
  359. /* harmless race: list_empty outside data.work.q_lock */
  360. if (list_empty(&mdev->resync_work.list) && queue)
  361. drbd_queue_work(&mdev->data.work, &mdev->resync_work);
  362. }
  363. static int calc_resync_rate(struct drbd_conf *mdev)
  364. {
  365. int d = mdev->data_delay / 1000; /* us -> ms */
  366. int td = mdev->sync_conf.throttle_th * 100; /* 0.1s -> ms */
  367. int hd = mdev->sync_conf.hold_off_th * 100; /* 0.1s -> ms */
  368. int cr = mdev->sync_conf.rate;
  369. return d <= td ? cr :
  370. d >= hd ? 0 :
  371. cr + (cr * (td - d) / (hd - td));
  372. }
  373. int w_make_resync_request(struct drbd_conf *mdev,
  374. struct drbd_work *w, int cancel)
  375. {
  376. unsigned long bit;
  377. sector_t sector;
  378. const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
  379. int max_segment_size;
  380. int number, i, size, pe, mx;
  381. int align, queued, sndbuf;
  382. if (unlikely(cancel))
  383. return 1;
  384. if (unlikely(mdev->state.conn < C_CONNECTED)) {
  385. dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected");
  386. return 0;
  387. }
  388. if (mdev->state.conn != C_SYNC_TARGET)
  389. dev_err(DEV, "%s in w_make_resync_request\n",
  390. drbd_conn_str(mdev->state.conn));
  391. if (!get_ldev(mdev)) {
  392. /* Since we only need to access mdev->rsync a
  393. get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
  394. to continue resync with a broken disk makes no sense at
  395. all */
  396. dev_err(DEV, "Disk broke down during resync!\n");
  397. mdev->resync_work.cb = w_resync_inactive;
  398. return 1;
  399. }
  400. /* starting with drbd 8.3.8, we can handle multi-bio EEs,
  401. * if it should be necessary */
  402. max_segment_size = mdev->agreed_pro_version < 94 ?
  403. queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE;
  404. mdev->c_sync_rate = calc_resync_rate(mdev);
  405. number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
  406. pe = atomic_read(&mdev->rs_pending_cnt);
  407. mutex_lock(&mdev->data.mutex);
  408. if (mdev->data.socket)
  409. mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req);
  410. else
  411. mx = 1;
  412. mutex_unlock(&mdev->data.mutex);
  413. /* For resync rates >160MB/sec, allow more pending RS requests */
  414. if (number > mx)
  415. mx = number;
  416. /* Limit the number of pending RS requests to no more than the peer's receive buffer */
  417. if ((pe + number) > mx) {
  418. number = mx - pe;
  419. }
  420. for (i = 0; i < number; i++) {
  421. /* Stop generating RS requests, when half of the send buffer is filled */
  422. mutex_lock(&mdev->data.mutex);
  423. if (mdev->data.socket) {
  424. queued = mdev->data.socket->sk->sk_wmem_queued;
  425. sndbuf = mdev->data.socket->sk->sk_sndbuf;
  426. } else {
  427. queued = 1;
  428. sndbuf = 0;
  429. }
  430. mutex_unlock(&mdev->data.mutex);
  431. if (queued > sndbuf / 2)
  432. goto requeue;
  433. next_sector:
  434. size = BM_BLOCK_SIZE;
  435. bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo);
  436. if (bit == -1UL) {
  437. mdev->bm_resync_fo = drbd_bm_bits(mdev);
  438. mdev->resync_work.cb = w_resync_inactive;
  439. put_ldev(mdev);
  440. return 1;
  441. }
  442. sector = BM_BIT_TO_SECT(bit);
  443. if (drbd_try_rs_begin_io(mdev, sector)) {
  444. mdev->bm_resync_fo = bit;
  445. goto requeue;
  446. }
  447. mdev->bm_resync_fo = bit + 1;
  448. if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
  449. drbd_rs_complete_io(mdev, sector);
  450. goto next_sector;
  451. }
  452. #if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE
  453. /* try to find some adjacent bits.
  454. * we stop if we have already the maximum req size.
  455. *
  456. * Additionally always align bigger requests, in order to
  457. * be prepared for all stripe sizes of software RAIDs.
  458. */
  459. align = 1;
  460. for (;;) {
  461. if (size + BM_BLOCK_SIZE > max_segment_size)
  462. break;
  463. /* Be always aligned */
  464. if (sector & ((1<<(align+3))-1))
  465. break;
  466. /* do not cross extent boundaries */
  467. if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
  468. break;
  469. /* now, is it actually dirty, after all?
  470. * caution, drbd_bm_test_bit is tri-state for some
  471. * obscure reason; ( b == 0 ) would get the out-of-band
  472. * only accidentally right because of the "oddly sized"
  473. * adjustment below */
  474. if (drbd_bm_test_bit(mdev, bit+1) != 1)
  475. break;
  476. bit++;
  477. size += BM_BLOCK_SIZE;
  478. if ((BM_BLOCK_SIZE << align) <= size)
  479. align++;
  480. i++;
  481. }
  482. /* if we merged some,
  483. * reset the offset to start the next drbd_bm_find_next from */
  484. if (size > BM_BLOCK_SIZE)
  485. mdev->bm_resync_fo = bit + 1;
  486. #endif
  487. /* adjust very last sectors, in case we are oddly sized */
  488. if (sector + (size>>9) > capacity)
  489. size = (capacity-sector)<<9;
  490. if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
  491. switch (read_for_csum(mdev, sector, size)) {
  492. case 0: /* Disk failure*/
  493. put_ldev(mdev);
  494. return 0;
  495. case 2: /* Allocation failed */
  496. drbd_rs_complete_io(mdev, sector);
  497. mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
  498. goto requeue;
  499. /* case 1: everything ok */
  500. }
  501. } else {
  502. inc_rs_pending(mdev);
  503. if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
  504. sector, size, ID_SYNCER)) {
  505. dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
  506. dec_rs_pending(mdev);
  507. put_ldev(mdev);
  508. return 0;
  509. }
  510. }
  511. }
  512. if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
  513. /* last syncer _request_ was sent,
  514. * but the P_RS_DATA_REPLY not yet received. sync will end (and
  515. * next sync group will resume), as soon as we receive the last
  516. * resync data block, and the last bit is cleared.
  517. * until then resync "work" is "inactive" ...
  518. */
  519. mdev->resync_work.cb = w_resync_inactive;
  520. put_ldev(mdev);
  521. return 1;
  522. }
  523. requeue:
  524. mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
  525. put_ldev(mdev);
  526. return 1;
  527. }
  528. static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  529. {
  530. int number, i, size;
  531. sector_t sector;
  532. const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
  533. if (unlikely(cancel))
  534. return 1;
  535. if (unlikely(mdev->state.conn < C_CONNECTED)) {
  536. dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected");
  537. return 0;
  538. }
  539. number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
  540. if (atomic_read(&mdev->rs_pending_cnt) > number)
  541. goto requeue;
  542. number -= atomic_read(&mdev->rs_pending_cnt);
  543. sector = mdev->ov_position;
  544. for (i = 0; i < number; i++) {
  545. if (sector >= capacity) {
  546. mdev->resync_work.cb = w_resync_inactive;
  547. return 1;
  548. }
  549. size = BM_BLOCK_SIZE;
  550. if (drbd_try_rs_begin_io(mdev, sector)) {
  551. mdev->ov_position = sector;
  552. goto requeue;
  553. }
  554. if (sector + (size>>9) > capacity)
  555. size = (capacity-sector)<<9;
  556. inc_rs_pending(mdev);
  557. if (!drbd_send_ov_request(mdev, sector, size)) {
  558. dec_rs_pending(mdev);
  559. return 0;
  560. }
  561. sector += BM_SECT_PER_BIT;
  562. }
  563. mdev->ov_position = sector;
  564. requeue:
  565. mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
  566. return 1;
  567. }
  /*
   * Worker callback run when online verify is done: frees the work object
   * (so @w must have come from kmalloc), prints the out-of-sync summary
   * via ov_oos_print() and finishes through drbd_resync_finished().
   * @cancel is not looked at.  Always returns 1.
   */
  int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  {
      /* w is not embedded anywhere; release it first */
      kfree(w);

      ov_oos_print(mdev);
      drbd_resync_finished(mdev);

      return 1;
  }
  /*
   * Worker callback used by drbd_resync_finished() to retry itself: frees
   * the kmalloc'ed work object and calls drbd_resync_finished() again.
   * @cancel is not looked at.  Always returns 1.
   */
  static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  {
      /* allocated by drbd_resync_finished() solely for this retry */
      kfree(w);

      drbd_resync_finished(mdev);

      return 1;
  }
  581. int drbd_resync_finished(struct drbd_conf *mdev)
  582. {
  583. unsigned long db, dt, dbdt;
  584. unsigned long n_oos;
  585. union drbd_state os, ns;
  586. struct drbd_work *w;
  587. char *khelper_cmd = NULL;
  588. /* Remove all elements from the resync LRU. Since future actions
  589. * might set bits in the (main) bitmap, then the entries in the
  590. * resync LRU would be wrong. */
  591. if (drbd_rs_del_all(mdev)) {
  592. /* In case this is not possible now, most probably because
  593. * there are P_RS_DATA_REPLY Packets lingering on the worker's
  594. * queue (or even the read operations for those packets
  595. * is not finished by now). Retry in 100ms. */
  596. drbd_kick_lo(mdev);
  597. __set_current_state(TASK_INTERRUPTIBLE);
  598. schedule_timeout(HZ / 10);
  599. w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
  600. if (w) {
  601. w->cb = w_resync_finished;
  602. drbd_queue_work(&mdev->data.work, w);
  603. return 1;
  604. }
  605. dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
  606. }
  607. dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
  608. if (dt <= 0)
  609. dt = 1;
  610. db = mdev->rs_total;
  611. dbdt = Bit2KB(db/dt);
  612. mdev->rs_paused /= HZ;
  613. if (!get_ldev(mdev))
  614. goto out;
  615. spin_lock_irq(&mdev->req_lock);
  616. os = mdev->state;
  617. /* This protects us against multiple calls (that can happen in the presence
  618. of application IO), and against connectivity loss just before we arrive here. */
  619. if (os.conn <= C_CONNECTED)
  620. goto out_unlock;
  621. ns = os;
  622. ns.conn = C_CONNECTED;
  623. dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
  624. (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ?
  625. "Online verify " : "Resync",
  626. dt + mdev->rs_paused, mdev->rs_paused, dbdt);
  627. n_oos = drbd_bm_total_weight(mdev);
  628. if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
  629. if (n_oos) {
  630. dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n",
  631. n_oos, Bit2KB(1));
  632. khelper_cmd = "out-of-sync";
  633. }
  634. } else {
  635. D_ASSERT((n_oos - mdev->rs_failed) == 0);
  636. if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
  637. khelper_cmd = "after-resync-target";
  638. if (mdev->csums_tfm && mdev->rs_total) {
  639. const unsigned long s = mdev->rs_same_csum;
  640. const unsigned long t = mdev->rs_total;
  641. const int ratio =
  642. (t == 0) ? 0 :
  643. (t < 100000) ? ((s*100)/t) : (s/(t/100));
  644. dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; "
  645. "transferred %luK total %luK\n",
  646. ratio,
  647. Bit2KB(mdev->rs_same_csum),
  648. Bit2KB(mdev->rs_total - mdev->rs_same_csum),
  649. Bit2KB(mdev->rs_total));
  650. }
  651. }
  652. if (mdev->rs_failed) {
  653. dev_info(DEV, " %lu failed blocks\n", mdev->rs_failed);
  654. if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
  655. ns.disk = D_INCONSISTENT;
  656. ns.pdsk = D_UP_TO_DATE;
  657. } else {
  658. ns.disk = D_UP_TO_DATE;
  659. ns.pdsk = D_INCONSISTENT;
  660. }
  661. } else {
  662. ns.disk = D_UP_TO_DATE;
  663. ns.pdsk = D_UP_TO_DATE;
  664. if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
  665. if (mdev->p_uuid) {
  666. int i;
  667. for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
  668. _drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
  669. drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
  670. _drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
  671. } else {
  672. dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
  673. }
  674. }
  675. drbd_uuid_set_bm(mdev, 0UL);
  676. if (mdev->p_uuid) {
  677. /* Now the two UUID sets are equal, update what we
  678. * know of the peer. */
  679. int i;
  680. for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
  681. mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
  682. }
  683. }
  684. _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
  685. out_unlock:
  686. spin_unlock_irq(&mdev->req_lock);
  687. put_ldev(mdev);
  688. out:
  689. mdev->rs_total = 0;
  690. mdev->rs_failed = 0;
  691. mdev->rs_paused = 0;
  692. mdev->ov_start_sector = 0;
  693. if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
  694. dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
  695. drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
  696. }
  697. if (khelper_cmd)
  698. drbd_khelper(mdev, khelper_cmd);
  699. return 1;
  700. }
  701. /* helper */
  702. static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
  703. {
  704. if (drbd_ee_has_active_page(e)) {
  705. /* This might happen if sendpage() has not finished */
  706. spin_lock_irq(&mdev->req_lock);
  707. list_add_tail(&e->w.list, &mdev->net_ee);
  708. spin_unlock_irq(&mdev->req_lock);
  709. } else
  710. drbd_free_ee(mdev, e);
  711. }
  712. /**
  713. * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
  714. * @mdev: DRBD device.
  715. * @w: work object.
  716. * @cancel: The connection will be closed anyways
  717. */
  718. int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  719. {
  720. struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
  721. int ok;
  722. if (unlikely(cancel)) {
  723. drbd_free_ee(mdev, e);
  724. dec_unacked(mdev);
  725. return 1;
  726. }
  727. if (likely((e->flags & EE_WAS_ERROR) == 0)) {
  728. ok = drbd_send_block(mdev, P_DATA_REPLY, e);
  729. } else {
  730. if (__ratelimit(&drbd_ratelimit_state))
  731. dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
  732. (unsigned long long)e->sector);
  733. ok = drbd_send_ack(mdev, P_NEG_DREPLY, e);
  734. }
  735. dec_unacked(mdev);
  736. move_to_net_ee_or_free(mdev, e);
  737. if (unlikely(!ok))
  738. dev_err(DEV, "drbd_send_block() failed\n");
  739. return ok;
  740. }
  741. /**
  742. * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS
  743. * @mdev: DRBD device.
  744. * @w: work object.
  745. * @cancel: The connection will be closed anyways
  746. */
  747. int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  748. {
  749. struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
  750. int ok;
  751. if (unlikely(cancel)) {
  752. drbd_free_ee(mdev, e);
  753. dec_unacked(mdev);
  754. return 1;
  755. }
  756. if (get_ldev_if_state(mdev, D_FAILED)) {
  757. drbd_rs_complete_io(mdev, e->sector);
  758. put_ldev(mdev);
  759. }
  760. if (likely((e->flags & EE_WAS_ERROR) == 0)) {
  761. if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
  762. inc_rs_pending(mdev);
  763. ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
  764. } else {
  765. if (__ratelimit(&drbd_ratelimit_state))
  766. dev_err(DEV, "Not sending RSDataReply, "
  767. "partner DISKLESS!\n");
  768. ok = 1;
  769. }
  770. } else {
  771. if (__ratelimit(&drbd_ratelimit_state))
  772. dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
  773. (unsigned long long)e->sector);
  774. ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
  775. /* update resync data with failure */
  776. drbd_rs_failed_io(mdev, e->sector, e->size);
  777. }
  778. dec_unacked(mdev);
  779. move_to_net_ee_or_free(mdev, e);
  780. if (unlikely(!ok))
  781. dev_err(DEV, "drbd_send_block() failed\n");
  782. return ok;
  783. }
  784. int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  785. {
  786. struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
  787. struct digest_info *di;
  788. int digest_size;
  789. void *digest = NULL;
  790. int ok, eq = 0;
  791. if (unlikely(cancel)) {
  792. drbd_free_ee(mdev, e);
  793. dec_unacked(mdev);
  794. return 1;
  795. }
  796. drbd_rs_complete_io(mdev, e->sector);
  797. di = (struct digest_info *)(unsigned long)e->block_id;
  798. if (likely((e->flags & EE_WAS_ERROR) == 0)) {
  799. /* quick hack to try to avoid a race against reconfiguration.
  800. * a real fix would be much more involved,
  801. * introducing more locking mechanisms */
  802. if (mdev->csums_tfm) {
  803. digest_size = crypto_hash_digestsize(mdev->csums_tfm);
  804. D_ASSERT(digest_size == di->digest_size);
  805. digest = kmalloc(digest_size, GFP_NOIO);
  806. }
  807. if (digest) {
  808. drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
  809. eq = !memcmp(digest, di->digest, digest_size);
  810. kfree(digest);
  811. }
  812. if (eq) {
  813. drbd_set_in_sync(mdev, e->sector, e->size);
  814. /* rs_same_csums unit is BM_BLOCK_SIZE */
  815. mdev->rs_same_csum += e->size >> BM_BLOCK_SHIFT;
  816. ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
  817. } else {
  818. inc_rs_pending(mdev);
  819. e->block_id = ID_SYNCER;
  820. ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
  821. }
  822. } else {
  823. ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
  824. if (__ratelimit(&drbd_ratelimit_state))
  825. dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
  826. }
  827. dec_unacked(mdev);
  828. kfree(di);
  829. move_to_net_ee_or_free(mdev, e);
  830. if (unlikely(!ok))
  831. dev_err(DEV, "drbd_send_block/ack() failed\n");
  832. return ok;
  833. }
  834. int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  835. {
  836. struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
  837. int digest_size;
  838. void *digest;
  839. int ok = 1;
  840. if (unlikely(cancel))
  841. goto out;
  842. if (unlikely((e->flags & EE_WAS_ERROR) != 0))
  843. goto out;
  844. digest_size = crypto_hash_digestsize(mdev->verify_tfm);
  845. /* FIXME if this allocation fails, online verify will not terminate! */
  846. digest = kmalloc(digest_size, GFP_NOIO);
  847. if (digest) {
  848. drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
  849. inc_rs_pending(mdev);
  850. ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
  851. digest, digest_size, P_OV_REPLY);
  852. if (!ok)
  853. dec_rs_pending(mdev);
  854. kfree(digest);
  855. }
  856. out:
  857. drbd_free_ee(mdev, e);
  858. dec_unacked(mdev);
  859. return ok;
  860. }
  861. void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
  862. {
  863. if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
  864. mdev->ov_last_oos_size += size>>9;
  865. } else {
  866. mdev->ov_last_oos_start = sector;
  867. mdev->ov_last_oos_size = size>>9;
  868. }
  869. drbd_set_out_of_sync(mdev, sector, size);
  870. set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
  871. }
  872. int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  873. {
  874. struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
  875. struct digest_info *di;
  876. int digest_size;
  877. void *digest;
  878. int ok, eq = 0;
  879. if (unlikely(cancel)) {
  880. drbd_free_ee(mdev, e);
  881. dec_unacked(mdev);
  882. return 1;
  883. }
  884. /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
  885. * the resync lru has been cleaned up already */
  886. drbd_rs_complete_io(mdev, e->sector);
  887. di = (struct digest_info *)(unsigned long)e->block_id;
  888. if (likely((e->flags & EE_WAS_ERROR) == 0)) {
  889. digest_size = crypto_hash_digestsize(mdev->verify_tfm);
  890. digest = kmalloc(digest_size, GFP_NOIO);
  891. if (digest) {
  892. drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
  893. D_ASSERT(digest_size == di->digest_size);
  894. eq = !memcmp(digest, di->digest, digest_size);
  895. kfree(digest);
  896. }
  897. } else {
  898. ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
  899. if (__ratelimit(&drbd_ratelimit_state))
  900. dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
  901. }
  902. dec_unacked(mdev);
  903. kfree(di);
  904. if (!eq)
  905. drbd_ov_oos_found(mdev, e->sector, e->size);
  906. else
  907. ov_oos_print(mdev);
  908. ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size,
  909. eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
  910. drbd_free_ee(mdev, e);
  911. if (--mdev->ov_left == 0) {
  912. ov_oos_print(mdev);
  913. drbd_resync_finished(mdev);
  914. }
  915. return ok;
  916. }
  917. int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  918. {
  919. struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
  920. complete(&b->done);
  921. return 1;
  922. }
  923. int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  924. {
  925. struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
  926. struct p_barrier *p = &mdev->data.sbuf.barrier;
  927. int ok = 1;
  928. /* really avoid racing with tl_clear. w.cb may have been referenced
  929. * just before it was reassigned and re-queued, so double check that.
  930. * actually, this race was harmless, since we only try to send the
  931. * barrier packet here, and otherwise do nothing with the object.
  932. * but compare with the head of w_clear_epoch */
  933. spin_lock_irq(&mdev->req_lock);
  934. if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
  935. cancel = 1;
  936. spin_unlock_irq(&mdev->req_lock);
  937. if (cancel)
  938. return 1;
  939. if (!drbd_get_data_sock(mdev))
  940. return 0;
  941. p->barrier = b->br_number;
  942. /* inc_ap_pending was done where this was queued.
  943. * dec_ap_pending will be done in got_BarrierAck
  944. * or (on connection loss) in w_clear_epoch. */
  945. ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
  946. (struct p_header *)p, sizeof(*p), 0);
  947. drbd_put_data_sock(mdev);
  948. return ok;
  949. }
  950. int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  951. {
  952. if (cancel)
  953. return 1;
  954. return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
  955. }
  956. /**
  957. * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
  958. * @mdev: DRBD device.
  959. * @w: work object.
  960. * @cancel: The connection will be closed anyways
  961. */
  962. int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  963. {
  964. struct drbd_request *req = container_of(w, struct drbd_request, w);
  965. int ok;
  966. if (unlikely(cancel)) {
  967. req_mod(req, send_canceled);
  968. return 1;
  969. }
  970. ok = drbd_send_dblock(mdev, req);
  971. req_mod(req, ok ? handed_over_to_network : send_failed);
  972. return ok;
  973. }
  974. /**
  975. * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
  976. * @mdev: DRBD device.
  977. * @w: work object.
  978. * @cancel: The connection will be closed anyways
  979. */
  980. int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  981. {
  982. struct drbd_request *req = container_of(w, struct drbd_request, w);
  983. int ok;
  984. if (unlikely(cancel)) {
  985. req_mod(req, send_canceled);
  986. return 1;
  987. }
  988. ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size,
  989. (unsigned long)req);
  990. if (!ok) {
  991. /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send();
  992. * so this is probably redundant */
  993. if (mdev->state.conn >= C_CONNECTED)
  994. drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
  995. }
  996. req_mod(req, ok ? handed_over_to_network : send_failed);
  997. return ok;
  998. }
  999. static int _drbd_may_sync_now(struct drbd_conf *mdev)
  1000. {
  1001. struct drbd_conf *odev = mdev;
  1002. while (1) {
  1003. if (odev->sync_conf.after == -1)
  1004. return 1;
  1005. odev = minor_to_mdev(odev->sync_conf.after);
  1006. ERR_IF(!odev) return 1;
  1007. if ((odev->state.conn >= C_SYNC_SOURCE &&
  1008. odev->state.conn <= C_PAUSED_SYNC_T) ||
  1009. odev->state.aftr_isp || odev->state.peer_isp ||
  1010. odev->state.user_isp)
  1011. return 0;
  1012. }
  1013. }
  1014. /**
  1015. * _drbd_pause_after() - Pause resync on all devices that may not resync now
  1016. * @mdev: DRBD device.
  1017. *
  1018. * Called from process context only (admin command and after_state_ch).
  1019. */
  1020. static int _drbd_pause_after(struct drbd_conf *mdev)
  1021. {
  1022. struct drbd_conf *odev;
  1023. int i, rv = 0;
  1024. for (i = 0; i < minor_count; i++) {
  1025. odev = minor_to_mdev(i);
  1026. if (!odev)
  1027. continue;
  1028. if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
  1029. continue;
  1030. if (!_drbd_may_sync_now(odev))
  1031. rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
  1032. != SS_NOTHING_TO_DO);
  1033. }
  1034. return rv;
  1035. }
  1036. /**
  1037. * _drbd_resume_next() - Resume resync on all devices that may resync now
  1038. * @mdev: DRBD device.
  1039. *
  1040. * Called from process context only (admin command and worker).
  1041. */
  1042. static int _drbd_resume_next(struct drbd_conf *mdev)
  1043. {
  1044. struct drbd_conf *odev;
  1045. int i, rv = 0;
  1046. for (i = 0; i < minor_count; i++) {
  1047. odev = minor_to_mdev(i);
  1048. if (!odev)
  1049. continue;
  1050. if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
  1051. continue;
  1052. if (odev->state.aftr_isp) {
  1053. if (_drbd_may_sync_now(odev))
  1054. rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
  1055. CS_HARD, NULL)
  1056. != SS_NOTHING_TO_DO) ;
  1057. }
  1058. }
  1059. return rv;
  1060. }
  1061. void resume_next_sg(struct drbd_conf *mdev)
  1062. {
  1063. write_lock_irq(&global_state_lock);
  1064. _drbd_resume_next(mdev);
  1065. write_unlock_irq(&global_state_lock);
  1066. }
  1067. void suspend_other_sg(struct drbd_conf *mdev)
  1068. {
  1069. write_lock_irq(&global_state_lock);
  1070. _drbd_pause_after(mdev);
  1071. write_unlock_irq(&global_state_lock);
  1072. }
  1073. static int sync_after_error(struct drbd_conf *mdev, int o_minor)
  1074. {
  1075. struct drbd_conf *odev;
  1076. if (o_minor == -1)
  1077. return NO_ERROR;
  1078. if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
  1079. return ERR_SYNC_AFTER;
  1080. /* check for loops */
  1081. odev = minor_to_mdev(o_minor);
  1082. while (1) {
  1083. if (odev == mdev)
  1084. return ERR_SYNC_AFTER_CYCLE;
  1085. /* dependency chain ends here, no cycles. */
  1086. if (odev->sync_conf.after == -1)
  1087. return NO_ERROR;
  1088. /* follow the dependency chain */
  1089. odev = minor_to_mdev(odev->sync_conf.after);
  1090. }
  1091. }
  1092. int drbd_alter_sa(struct drbd_conf *mdev, int na)
  1093. {
  1094. int changes;
  1095. int retcode;
  1096. write_lock_irq(&global_state_lock);
  1097. retcode = sync_after_error(mdev, na);
  1098. if (retcode == NO_ERROR) {
  1099. mdev->sync_conf.after = na;
  1100. do {
  1101. changes = _drbd_pause_after(mdev);
  1102. changes |= _drbd_resume_next(mdev);
  1103. } while (changes);
  1104. }
  1105. write_unlock_irq(&global_state_lock);
  1106. return retcode;
  1107. }
  1108. static void ping_peer(struct drbd_conf *mdev)
  1109. {
  1110. clear_bit(GOT_PING_ACK, &mdev->flags);
  1111. request_ping(mdev);
  1112. wait_event(mdev->misc_wait,
  1113. test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
  1114. }
  1115. /**
  1116. * drbd_start_resync() - Start the resync process
  1117. * @mdev: DRBD device.
  1118. * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET
  1119. *
  1120. * This function might bring you directly into one of the
  1121. * C_PAUSED_SYNC_* states.
  1122. */
  1123. void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
  1124. {
  1125. union drbd_state ns;
  1126. int r;
  1127. if (mdev->state.conn >= C_SYNC_SOURCE) {
  1128. dev_err(DEV, "Resync already running!\n");
  1129. return;
  1130. }
  1131. /* In case a previous resync run was aborted by an IO error/detach on the peer. */
  1132. drbd_rs_cancel_all(mdev);
  1133. if (side == C_SYNC_TARGET) {
  1134. /* Since application IO was locked out during C_WF_BITMAP_T and
  1135. C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
  1136. we check that we might make the data inconsistent. */
  1137. r = drbd_khelper(mdev, "before-resync-target");
  1138. r = (r >> 8) & 0xff;
  1139. if (r > 0) {
  1140. dev_info(DEV, "before-resync-target handler returned %d, "
  1141. "dropping connection.\n", r);
  1142. drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
  1143. return;
  1144. }
  1145. }
  1146. drbd_state_lock(mdev);
  1147. if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
  1148. drbd_state_unlock(mdev);
  1149. return;
  1150. }
  1151. if (side == C_SYNC_TARGET) {
  1152. mdev->bm_resync_fo = 0;
  1153. } else /* side == C_SYNC_SOURCE */ {
  1154. u64 uuid;
  1155. get_random_bytes(&uuid, sizeof(u64));
  1156. drbd_uuid_set(mdev, UI_BITMAP, uuid);
  1157. drbd_send_sync_uuid(mdev, uuid);
  1158. D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
  1159. }
  1160. write_lock_irq(&global_state_lock);
  1161. ns = mdev->state;
  1162. ns.aftr_isp = !_drbd_may_sync_now(mdev);
  1163. ns.conn = side;
  1164. if (side == C_SYNC_TARGET)
  1165. ns.disk = D_INCONSISTENT;
  1166. else /* side == C_SYNC_SOURCE */
  1167. ns.pdsk = D_INCONSISTENT;
  1168. r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
  1169. ns = mdev->state;
  1170. if (ns.conn < C_CONNECTED)
  1171. r = SS_UNKNOWN_ERROR;
  1172. if (r == SS_SUCCESS) {
  1173. mdev->rs_total =
  1174. mdev->rs_mark_left = drbd_bm_total_weight(mdev);
  1175. mdev->rs_failed = 0;
  1176. mdev->rs_paused = 0;
  1177. mdev->rs_start =
  1178. mdev->rs_mark_time = jiffies;
  1179. mdev->rs_same_csum = 0;
  1180. _drbd_pause_after(mdev);
  1181. }
  1182. write_unlock_irq(&global_state_lock);
  1183. put_ldev(mdev);
  1184. if (r == SS_SUCCESS) {
  1185. dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
  1186. drbd_conn_str(ns.conn),
  1187. (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
  1188. (unsigned long) mdev->rs_total);
  1189. if (mdev->rs_total == 0) {
  1190. /* Peer still reachable? Beware of failing before-resync-target handlers! */
  1191. ping_peer(mdev);
  1192. drbd_resync_finished(mdev);
  1193. }
  1194. /* ns.conn may already be != mdev->state.conn,
  1195. * we may have been paused in between, or become paused until
  1196. * the timer triggers.
  1197. * No matter, that is handled in resync_timer_fn() */
  1198. if (ns.conn == C_SYNC_TARGET)
  1199. mod_timer(&mdev->resync_timer, jiffies);
  1200. drbd_md_sync(mdev);
  1201. }
  1202. drbd_state_unlock(mdev);
  1203. }
  1204. int drbd_worker(struct drbd_thread *thi)
  1205. {
  1206. struct drbd_conf *mdev = thi->mdev;
  1207. struct drbd_work *w = NULL;
  1208. LIST_HEAD(work_list);
  1209. int intr = 0, i;
  1210. sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));
  1211. while (get_t_state(thi) == Running) {
  1212. drbd_thread_current_set_cpu(mdev);
  1213. if (down_trylock(&mdev->data.work.s)) {
  1214. mutex_lock(&mdev->data.mutex);
  1215. if (mdev->data.socket && !mdev->net_conf->no_cork)
  1216. drbd_tcp_uncork(mdev->data.socket);
  1217. mutex_unlock(&mdev->data.mutex);
  1218. intr = down_interruptible(&mdev->data.work.s);
  1219. mutex_lock(&mdev->data.mutex);
  1220. if (mdev->data.socket && !mdev->net_conf->no_cork)
  1221. drbd_tcp_cork(mdev->data.socket);
  1222. mutex_unlock(&mdev->data.mutex);
  1223. }
  1224. if (intr) {
  1225. D_ASSERT(intr == -EINTR);
  1226. flush_signals(current);
  1227. ERR_IF (get_t_state(thi) == Running)
  1228. continue;
  1229. break;
  1230. }
  1231. if (get_t_state(thi) != Running)
  1232. break;
  1233. /* With this break, we have done a down() but not consumed
  1234. the entry from the list. The cleanup code takes care of
  1235. this... */
  1236. w = NULL;
  1237. spin_lock_irq(&mdev->data.work.q_lock);
  1238. ERR_IF(list_empty(&mdev->data.work.q)) {
  1239. /* something terribly wrong in our logic.
  1240. * we were able to down() the semaphore,
  1241. * but the list is empty... doh.
  1242. *
  1243. * what is the best thing to do now?
  1244. * try again from scratch, restarting the receiver,
  1245. * asender, whatnot? could break even more ugly,
  1246. * e.g. when we are primary, but no good local data.
  1247. *
  1248. * I'll try to get away just starting over this loop.
  1249. */
  1250. spin_unlock_irq(&mdev->data.work.q_lock);
  1251. continue;
  1252. }
  1253. w = list_entry(mdev->data.work.q.next, struct drbd_work, list);
  1254. list_del_init(&w->list);
  1255. spin_unlock_irq(&mdev->data.work.q_lock);
  1256. if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) {
  1257. /* dev_warn(DEV, "worker: a callback failed! \n"); */
  1258. if (mdev->state.conn >= C_CONNECTED)
  1259. drbd_force_state(mdev,
  1260. NS(conn, C_NETWORK_FAILURE));
  1261. }
  1262. }
  1263. D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags));
  1264. D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags));
  1265. spin_lock_irq(&mdev->data.work.q_lock);
  1266. i = 0;
  1267. while (!list_empty(&mdev->data.work.q)) {
  1268. list_splice_init(&mdev->data.work.q, &work_list);
  1269. spin_unlock_irq(&mdev->data.work.q_lock);
  1270. while (!list_empty(&work_list)) {
  1271. w = list_entry(work_list.next, struct drbd_work, list);
  1272. list_del_init(&w->list);
  1273. w->cb(mdev, w, 1);
  1274. i++; /* dead debugging code */
  1275. }
  1276. spin_lock_irq(&mdev->data.work.q_lock);
  1277. }
  1278. sema_init(&mdev->data.work.s, 0);
  1279. /* DANGEROUS race: if someone did queue his work within the spinlock,
  1280. * but up() ed outside the spinlock, we could get an up() on the
  1281. * semaphore without corresponding list entry.
  1282. * So don't do that.
  1283. */
  1284. spin_unlock_irq(&mdev->data.work.q_lock);
  1285. D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
  1286. /* _drbd_set_state only uses stop_nowait.
  1287. * wait here for the Exiting receiver. */
  1288. drbd_thread_stop(&mdev->receiver);
  1289. drbd_mdev_cleanup(mdev);
  1290. dev_info(DEV, "worker terminated\n");
  1291. clear_bit(DEVICE_DYING, &mdev->flags);
  1292. clear_bit(CONFIG_PENDING, &mdev->flags);
  1293. wake_up(&mdev->state_wait);
  1294. return 0;
  1295. }