drbd_worker.c 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538
  1. /*
  2. drbd_worker.c
  3. This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
  4. Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
  5. Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
  6. Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
  7. drbd is free software; you can redistribute it and/or modify
  8. it under the terms of the GNU General Public License as published by
  9. the Free Software Foundation; either version 2, or (at your option)
  10. any later version.
  11. drbd is distributed in the hope that it will be useful,
  12. but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. GNU General Public License for more details.
  15. You should have received a copy of the GNU General Public License
  16. along with drbd; see the file COPYING. If not, write to
  17. the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  18. */
  19. #include <linux/module.h>
  20. #include <linux/drbd.h>
  21. #include <linux/sched.h>
  22. #include <linux/smp_lock.h>
  23. #include <linux/wait.h>
  24. #include <linux/mm.h>
  25. #include <linux/memcontrol.h>
  26. #include <linux/mm_inline.h>
  27. #include <linux/slab.h>
  28. #include <linux/random.h>
  29. #include <linux/string.h>
  30. #include <linux/scatterlist.h>
  31. #include "drbd_int.h"
  32. #include "drbd_req.h"
  33. #define SLEEP_TIME (HZ/10)
  34. static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
  35. /* defined here:
  36. drbd_md_io_complete
  37. drbd_endio_sec
  38. drbd_endio_pri
  39. * more endio handlers:
  40. atodb_endio in drbd_actlog.c
  41. drbd_bm_async_io_complete in drbd_bitmap.c
  42. * For all these callbacks, note the following:
  43. * The callbacks will be called in irq context by the IDE drivers,
  44. * and in Softirqs/Tasklets/BH context by the SCSI drivers.
  45. * Try to get the locking right :)
  46. *
  47. */
  48. /* About the global_state_lock
  49. Each state transition on an device holds a read lock. In case we have
  50. to evaluate the sync after dependencies, we grab a write lock, because
  51. we need stable states on all devices for that. */
  52. rwlock_t global_state_lock;
  53. /* used for synchronous meta data and bitmap IO
  54. * submitted by drbd_md_sync_page_io()
  55. */
  56. void drbd_md_io_complete(struct bio *bio, int error)
  57. {
  58. struct drbd_md_io *md_io;
  59. md_io = (struct drbd_md_io *)bio->bi_private;
  60. md_io->error = error;
  61. complete(&md_io->event);
  62. }
  63. /* reads on behalf of the partner,
  64. * "submitted" by the receiver
  65. */
  66. void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
  67. {
  68. unsigned long flags = 0;
  69. struct drbd_conf *mdev = e->mdev;
  70. D_ASSERT(e->block_id != ID_VACANT);
  71. spin_lock_irqsave(&mdev->req_lock, flags);
  72. mdev->read_cnt += e->size >> 9;
  73. list_del(&e->w.list);
  74. if (list_empty(&mdev->read_ee))
  75. wake_up(&mdev->ee_wait);
  76. if (test_bit(__EE_WAS_ERROR, &e->flags))
  77. __drbd_chk_io_error(mdev, FALSE);
  78. spin_unlock_irqrestore(&mdev->req_lock, flags);
  79. drbd_queue_work(&mdev->data.work, &e->w);
  80. put_ldev(mdev);
  81. }
  82. static int is_failed_barrier(int ee_flags)
  83. {
  84. return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED))
  85. == (EE_IS_BARRIER|EE_WAS_ERROR);
  86. }
  87. /* writes on behalf of the partner, or resync writes,
  88. * "submitted" by the receiver, final stage. */
  89. static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
  90. {
  91. unsigned long flags = 0;
  92. struct drbd_conf *mdev = e->mdev;
  93. sector_t e_sector;
  94. int do_wake;
  95. int is_syncer_req;
  96. int do_al_complete_io;
  97. /* if this is a failed barrier request, disable use of barriers,
  98. * and schedule for resubmission */
  99. if (is_failed_barrier(e->flags)) {
  100. drbd_bump_write_ordering(mdev, WO_bdev_flush);
  101. spin_lock_irqsave(&mdev->req_lock, flags);
  102. list_del(&e->w.list);
  103. e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED;
  104. e->w.cb = w_e_reissue;
  105. /* put_ldev actually happens below, once we come here again. */
  106. __release(local);
  107. spin_unlock_irqrestore(&mdev->req_lock, flags);
  108. drbd_queue_work(&mdev->data.work, &e->w);
  109. return;
  110. }
  111. D_ASSERT(e->block_id != ID_VACANT);
  112. /* after we moved e to done_ee,
  113. * we may no longer access it,
  114. * it may be freed/reused already!
  115. * (as soon as we release the req_lock) */
  116. e_sector = e->sector;
  117. do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
  118. is_syncer_req = is_syncer_block_id(e->block_id);
  119. spin_lock_irqsave(&mdev->req_lock, flags);
  120. mdev->writ_cnt += e->size >> 9;
  121. list_del(&e->w.list); /* has been on active_ee or sync_ee */
  122. list_add_tail(&e->w.list, &mdev->done_ee);
  123. /* No hlist_del_init(&e->colision) here, we did not send the Ack yet,
  124. * neither did we wake possibly waiting conflicting requests.
  125. * done from "drbd_process_done_ee" within the appropriate w.cb
  126. * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
  127. do_wake = is_syncer_req
  128. ? list_empty(&mdev->sync_ee)
  129. : list_empty(&mdev->active_ee);
  130. if (test_bit(__EE_WAS_ERROR, &e->flags))
  131. __drbd_chk_io_error(mdev, FALSE);
  132. spin_unlock_irqrestore(&mdev->req_lock, flags);
  133. if (is_syncer_req)
  134. drbd_rs_complete_io(mdev, e_sector);
  135. if (do_wake)
  136. wake_up(&mdev->ee_wait);
  137. if (do_al_complete_io)
  138. drbd_al_complete_io(mdev, e_sector);
  139. wake_asender(mdev);
  140. put_ldev(mdev);
  141. }
  142. /* writes on behalf of the partner, or resync writes,
  143. * "submitted" by the receiver.
  144. */
  145. void drbd_endio_sec(struct bio *bio, int error)
  146. {
  147. struct drbd_epoch_entry *e = bio->bi_private;
  148. struct drbd_conf *mdev = e->mdev;
  149. int uptodate = bio_flagged(bio, BIO_UPTODATE);
  150. int is_write = bio_data_dir(bio) == WRITE;
  151. if (error)
  152. dev_warn(DEV, "%s: error=%d s=%llus\n",
  153. is_write ? "write" : "read", error,
  154. (unsigned long long)e->sector);
  155. if (!error && !uptodate) {
  156. dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
  157. is_write ? "write" : "read",
  158. (unsigned long long)e->sector);
  159. /* strange behavior of some lower level drivers...
  160. * fail the request by clearing the uptodate flag,
  161. * but do not return any error?! */
  162. error = -EIO;
  163. }
  164. if (error)
  165. set_bit(__EE_WAS_ERROR, &e->flags);
  166. bio_put(bio); /* no need for the bio anymore */
  167. if (atomic_dec_and_test(&e->pending_bios)) {
  168. if (is_write)
  169. drbd_endio_write_sec_final(e);
  170. else
  171. drbd_endio_read_sec_final(e);
  172. }
  173. }
  174. /* read, readA or write requests on R_PRIMARY coming from drbd_make_request
  175. */
  176. void drbd_endio_pri(struct bio *bio, int error)
  177. {
  178. unsigned long flags;
  179. struct drbd_request *req = bio->bi_private;
  180. struct drbd_conf *mdev = req->mdev;
  181. struct bio_and_error m;
  182. enum drbd_req_event what;
  183. int uptodate = bio_flagged(bio, BIO_UPTODATE);
  184. if (!error && !uptodate) {
  185. dev_warn(DEV, "p %s: setting error to -EIO\n",
  186. bio_data_dir(bio) == WRITE ? "write" : "read");
  187. /* strange behavior of some lower level drivers...
  188. * fail the request by clearing the uptodate flag,
  189. * but do not return any error?! */
  190. error = -EIO;
  191. }
  192. /* to avoid recursion in __req_mod */
  193. if (unlikely(error)) {
  194. what = (bio_data_dir(bio) == WRITE)
  195. ? write_completed_with_error
  196. : (bio_rw(bio) == READ)
  197. ? read_completed_with_error
  198. : read_ahead_completed_with_error;
  199. } else
  200. what = completed_ok;
  201. bio_put(req->private_bio);
  202. req->private_bio = ERR_PTR(error);
  203. spin_lock_irqsave(&mdev->req_lock, flags);
  204. __req_mod(req, what, &m);
  205. spin_unlock_irqrestore(&mdev->req_lock, flags);
  206. if (m.bio)
  207. complete_master_bio(mdev, &m);
  208. }
  209. int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  210. {
  211. struct drbd_request *req = container_of(w, struct drbd_request, w);
  212. /* We should not detach for read io-error,
  213. * but try to WRITE the P_DATA_REPLY to the failed location,
  214. * to give the disk the chance to relocate that block */
  215. spin_lock_irq(&mdev->req_lock);
  216. if (cancel || mdev->state.pdsk != D_UP_TO_DATE) {
  217. _req_mod(req, read_retry_remote_canceled);
  218. spin_unlock_irq(&mdev->req_lock);
  219. return 1;
  220. }
  221. spin_unlock_irq(&mdev->req_lock);
  222. return w_send_read_req(mdev, w, 0);
  223. }
  224. int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  225. {
  226. ERR_IF(cancel) return 1;
  227. dev_err(DEV, "resync inactive, but callback triggered??\n");
  228. return 1; /* Simply ignore this! */
  229. }
  230. void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
  231. {
  232. struct hash_desc desc;
  233. struct scatterlist sg;
  234. struct page *page = e->pages;
  235. struct page *tmp;
  236. unsigned len;
  237. desc.tfm = tfm;
  238. desc.flags = 0;
  239. sg_init_table(&sg, 1);
  240. crypto_hash_init(&desc);
  241. while ((tmp = page_chain_next(page))) {
  242. /* all but the last page will be fully used */
  243. sg_set_page(&sg, page, PAGE_SIZE, 0);
  244. crypto_hash_update(&desc, &sg, sg.length);
  245. page = tmp;
  246. }
  247. /* and now the last, possibly only partially used page */
  248. len = e->size & (PAGE_SIZE - 1);
  249. sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
  250. crypto_hash_update(&desc, &sg, sg.length);
  251. crypto_hash_final(&desc, digest);
  252. }
  253. void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
  254. {
  255. struct hash_desc desc;
  256. struct scatterlist sg;
  257. struct bio_vec *bvec;
  258. int i;
  259. desc.tfm = tfm;
  260. desc.flags = 0;
  261. sg_init_table(&sg, 1);
  262. crypto_hash_init(&desc);
  263. __bio_for_each_segment(bvec, bio, i, 0) {
  264. sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
  265. crypto_hash_update(&desc, &sg, sg.length);
  266. }
  267. crypto_hash_final(&desc, digest);
  268. }
  269. static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  270. {
  271. struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
  272. int digest_size;
  273. void *digest;
  274. int ok;
  275. D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
  276. if (unlikely(cancel)) {
  277. drbd_free_ee(mdev, e);
  278. return 1;
  279. }
  280. if (likely((e->flags & EE_WAS_ERROR) == 0)) {
  281. digest_size = crypto_hash_digestsize(mdev->csums_tfm);
  282. digest = kmalloc(digest_size, GFP_NOIO);
  283. if (digest) {
  284. drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
  285. inc_rs_pending(mdev);
  286. ok = drbd_send_drequest_csum(mdev,
  287. e->sector,
  288. e->size,
  289. digest,
  290. digest_size,
  291. P_CSUM_RS_REQUEST);
  292. kfree(digest);
  293. } else {
  294. dev_err(DEV, "kmalloc() of digest failed.\n");
  295. ok = 0;
  296. }
  297. } else
  298. ok = 1;
  299. drbd_free_ee(mdev, e);
  300. if (unlikely(!ok))
  301. dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
  302. return ok;
  303. }
  304. #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
  305. static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
  306. {
  307. struct drbd_epoch_entry *e;
  308. if (!get_ldev(mdev))
  309. return 0;
  310. /* GFP_TRY, because if there is no memory available right now, this may
  311. * be rescheduled for later. It is "only" background resync, after all. */
  312. e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
  313. if (!e)
  314. goto fail;
  315. spin_lock_irq(&mdev->req_lock);
  316. list_add(&e->w.list, &mdev->read_ee);
  317. spin_unlock_irq(&mdev->req_lock);
  318. e->w.cb = w_e_send_csum;
  319. if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
  320. return 1;
  321. drbd_free_ee(mdev, e);
  322. fail:
  323. put_ldev(mdev);
  324. return 2;
  325. }
  326. void resync_timer_fn(unsigned long data)
  327. {
  328. unsigned long flags;
  329. struct drbd_conf *mdev = (struct drbd_conf *) data;
  330. int queue;
  331. spin_lock_irqsave(&mdev->req_lock, flags);
  332. if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) {
  333. queue = 1;
  334. if (mdev->state.conn == C_VERIFY_S)
  335. mdev->resync_work.cb = w_make_ov_request;
  336. else
  337. mdev->resync_work.cb = w_make_resync_request;
  338. } else {
  339. queue = 0;
  340. mdev->resync_work.cb = w_resync_inactive;
  341. }
  342. spin_unlock_irqrestore(&mdev->req_lock, flags);
  343. /* harmless race: list_empty outside data.work.q_lock */
  344. if (list_empty(&mdev->resync_work.list) && queue)
  345. drbd_queue_work(&mdev->data.work, &mdev->resync_work);
  346. }
  347. static int calc_resync_rate(struct drbd_conf *mdev)
  348. {
  349. int d = mdev->data_delay / 1000; /* us -> ms */
  350. int td = mdev->sync_conf.throttle_th * 100; /* 0.1s -> ms */
  351. int hd = mdev->sync_conf.hold_off_th * 100; /* 0.1s -> ms */
  352. int cr = mdev->sync_conf.rate;
  353. return d <= td ? cr :
  354. d >= hd ? 0 :
  355. cr + (cr * (td - d) / (hd - td));
  356. }
  357. int w_make_resync_request(struct drbd_conf *mdev,
  358. struct drbd_work *w, int cancel)
  359. {
  360. unsigned long bit;
  361. sector_t sector;
  362. const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
  363. int max_segment_size;
  364. int number, i, size, pe, mx;
  365. int align, queued, sndbuf;
  366. if (unlikely(cancel))
  367. return 1;
  368. if (unlikely(mdev->state.conn < C_CONNECTED)) {
  369. dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected");
  370. return 0;
  371. }
  372. if (mdev->state.conn != C_SYNC_TARGET)
  373. dev_err(DEV, "%s in w_make_resync_request\n",
  374. drbd_conn_str(mdev->state.conn));
  375. if (!get_ldev(mdev)) {
  376. /* Since we only need to access mdev->rsync a
  377. get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
  378. to continue resync with a broken disk makes no sense at
  379. all */
  380. dev_err(DEV, "Disk broke down during resync!\n");
  381. mdev->resync_work.cb = w_resync_inactive;
  382. return 1;
  383. }
  384. /* starting with drbd 8.3.8, we can handle multi-bio EEs,
  385. * if it should be necessary */
  386. max_segment_size = mdev->agreed_pro_version < 94 ?
  387. queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE;
  388. mdev->c_sync_rate = calc_resync_rate(mdev);
  389. number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
  390. pe = atomic_read(&mdev->rs_pending_cnt);
  391. mutex_lock(&mdev->data.mutex);
  392. if (mdev->data.socket)
  393. mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req);
  394. else
  395. mx = 1;
  396. mutex_unlock(&mdev->data.mutex);
  397. /* For resync rates >160MB/sec, allow more pending RS requests */
  398. if (number > mx)
  399. mx = number;
  400. /* Limit the number of pending RS requests to no more than the peer's receive buffer */
  401. if ((pe + number) > mx) {
  402. number = mx - pe;
  403. }
  404. for (i = 0; i < number; i++) {
  405. /* Stop generating RS requests, when half of the send buffer is filled */
  406. mutex_lock(&mdev->data.mutex);
  407. if (mdev->data.socket) {
  408. queued = mdev->data.socket->sk->sk_wmem_queued;
  409. sndbuf = mdev->data.socket->sk->sk_sndbuf;
  410. } else {
  411. queued = 1;
  412. sndbuf = 0;
  413. }
  414. mutex_unlock(&mdev->data.mutex);
  415. if (queued > sndbuf / 2)
  416. goto requeue;
  417. next_sector:
  418. size = BM_BLOCK_SIZE;
  419. bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo);
  420. if (bit == -1UL) {
  421. mdev->bm_resync_fo = drbd_bm_bits(mdev);
  422. mdev->resync_work.cb = w_resync_inactive;
  423. put_ldev(mdev);
  424. return 1;
  425. }
  426. sector = BM_BIT_TO_SECT(bit);
  427. if (drbd_try_rs_begin_io(mdev, sector)) {
  428. mdev->bm_resync_fo = bit;
  429. goto requeue;
  430. }
  431. mdev->bm_resync_fo = bit + 1;
  432. if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
  433. drbd_rs_complete_io(mdev, sector);
  434. goto next_sector;
  435. }
  436. #if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE
  437. /* try to find some adjacent bits.
  438. * we stop if we have already the maximum req size.
  439. *
  440. * Additionally always align bigger requests, in order to
  441. * be prepared for all stripe sizes of software RAIDs.
  442. */
  443. align = 1;
  444. for (;;) {
  445. if (size + BM_BLOCK_SIZE > max_segment_size)
  446. break;
  447. /* Be always aligned */
  448. if (sector & ((1<<(align+3))-1))
  449. break;
  450. /* do not cross extent boundaries */
  451. if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
  452. break;
  453. /* now, is it actually dirty, after all?
  454. * caution, drbd_bm_test_bit is tri-state for some
  455. * obscure reason; ( b == 0 ) would get the out-of-band
  456. * only accidentally right because of the "oddly sized"
  457. * adjustment below */
  458. if (drbd_bm_test_bit(mdev, bit+1) != 1)
  459. break;
  460. bit++;
  461. size += BM_BLOCK_SIZE;
  462. if ((BM_BLOCK_SIZE << align) <= size)
  463. align++;
  464. i++;
  465. }
  466. /* if we merged some,
  467. * reset the offset to start the next drbd_bm_find_next from */
  468. if (size > BM_BLOCK_SIZE)
  469. mdev->bm_resync_fo = bit + 1;
  470. #endif
  471. /* adjust very last sectors, in case we are oddly sized */
  472. if (sector + (size>>9) > capacity)
  473. size = (capacity-sector)<<9;
  474. if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
  475. switch (read_for_csum(mdev, sector, size)) {
  476. case 0: /* Disk failure*/
  477. put_ldev(mdev);
  478. return 0;
  479. case 2: /* Allocation failed */
  480. drbd_rs_complete_io(mdev, sector);
  481. mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
  482. goto requeue;
  483. /* case 1: everything ok */
  484. }
  485. } else {
  486. inc_rs_pending(mdev);
  487. if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
  488. sector, size, ID_SYNCER)) {
  489. dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
  490. dec_rs_pending(mdev);
  491. put_ldev(mdev);
  492. return 0;
  493. }
  494. }
  495. }
  496. if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
  497. /* last syncer _request_ was sent,
  498. * but the P_RS_DATA_REPLY not yet received. sync will end (and
  499. * next sync group will resume), as soon as we receive the last
  500. * resync data block, and the last bit is cleared.
  501. * until then resync "work" is "inactive" ...
  502. */
  503. mdev->resync_work.cb = w_resync_inactive;
  504. put_ldev(mdev);
  505. return 1;
  506. }
  507. requeue:
  508. mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
  509. put_ldev(mdev);
  510. return 1;
  511. }
  512. static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  513. {
  514. int number, i, size;
  515. sector_t sector;
  516. const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
  517. if (unlikely(cancel))
  518. return 1;
  519. if (unlikely(mdev->state.conn < C_CONNECTED)) {
  520. dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected");
  521. return 0;
  522. }
  523. number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
  524. if (atomic_read(&mdev->rs_pending_cnt) > number)
  525. goto requeue;
  526. number -= atomic_read(&mdev->rs_pending_cnt);
  527. sector = mdev->ov_position;
  528. for (i = 0; i < number; i++) {
  529. if (sector >= capacity) {
  530. mdev->resync_work.cb = w_resync_inactive;
  531. return 1;
  532. }
  533. size = BM_BLOCK_SIZE;
  534. if (drbd_try_rs_begin_io(mdev, sector)) {
  535. mdev->ov_position = sector;
  536. goto requeue;
  537. }
  538. if (sector + (size>>9) > capacity)
  539. size = (capacity-sector)<<9;
  540. inc_rs_pending(mdev);
  541. if (!drbd_send_ov_request(mdev, sector, size)) {
  542. dec_rs_pending(mdev);
  543. return 0;
  544. }
  545. sector += BM_SECT_PER_BIT;
  546. }
  547. mdev->ov_position = sector;
  548. requeue:
  549. mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
  550. return 1;
  551. }
  /*
   * Worker callback run when online verify is done: frees the work item
   * @w, prints the out-of-sync report via ov_oos_print() and finishes the
   * run through drbd_resync_finished().  @cancel is not looked at.
   * Always returns 1.
   */
  int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  {
      /* the work item is owned by this callback and is not needed anymore */
      kfree(w);

      ov_oos_print(mdev);
      drbd_resync_finished(mdev);

      return 1;
  }
  /*
   * Worker callback used by drbd_resync_finished() to retry itself later:
   * frees the work item @w (allocated with kmalloc by
   * drbd_resync_finished()) and calls drbd_resync_finished() again.
   * @cancel is not looked at.  Always returns 1.
   */
  static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  {
      kfree(w);

      drbd_resync_finished(mdev);

      return 1;
  }
  565. int drbd_resync_finished(struct drbd_conf *mdev)
  566. {
  567. unsigned long db, dt, dbdt;
  568. unsigned long n_oos;
  569. union drbd_state os, ns;
  570. struct drbd_work *w;
  571. char *khelper_cmd = NULL;
  572. /* Remove all elements from the resync LRU. Since future actions
  573. * might set bits in the (main) bitmap, then the entries in the
  574. * resync LRU would be wrong. */
  575. if (drbd_rs_del_all(mdev)) {
  576. /* In case this is not possible now, most probably because
  577. * there are P_RS_DATA_REPLY Packets lingering on the worker's
  578. * queue (or even the read operations for those packets
  579. * is not finished by now). Retry in 100ms. */
  580. drbd_kick_lo(mdev);
  581. __set_current_state(TASK_INTERRUPTIBLE);
  582. schedule_timeout(HZ / 10);
  583. w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
  584. if (w) {
  585. w->cb = w_resync_finished;
  586. drbd_queue_work(&mdev->data.work, w);
  587. return 1;
  588. }
  589. dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
  590. }
  591. dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
  592. if (dt <= 0)
  593. dt = 1;
  594. db = mdev->rs_total;
  595. dbdt = Bit2KB(db/dt);
  596. mdev->rs_paused /= HZ;
  597. if (!get_ldev(mdev))
  598. goto out;
  599. spin_lock_irq(&mdev->req_lock);
  600. os = mdev->state;
  601. /* This protects us against multiple calls (that can happen in the presence
  602. of application IO), and against connectivity loss just before we arrive here. */
  603. if (os.conn <= C_CONNECTED)
  604. goto out_unlock;
  605. ns = os;
  606. ns.conn = C_CONNECTED;
  607. dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
  608. (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ?
  609. "Online verify " : "Resync",
  610. dt + mdev->rs_paused, mdev->rs_paused, dbdt);
  611. n_oos = drbd_bm_total_weight(mdev);
  612. if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
  613. if (n_oos) {
  614. dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n",
  615. n_oos, Bit2KB(1));
  616. khelper_cmd = "out-of-sync";
  617. }
  618. } else {
  619. D_ASSERT((n_oos - mdev->rs_failed) == 0);
  620. if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
  621. khelper_cmd = "after-resync-target";
  622. if (mdev->csums_tfm && mdev->rs_total) {
  623. const unsigned long s = mdev->rs_same_csum;
  624. const unsigned long t = mdev->rs_total;
  625. const int ratio =
  626. (t == 0) ? 0 :
  627. (t < 100000) ? ((s*100)/t) : (s/(t/100));
  628. dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; "
  629. "transferred %luK total %luK\n",
  630. ratio,
  631. Bit2KB(mdev->rs_same_csum),
  632. Bit2KB(mdev->rs_total - mdev->rs_same_csum),
  633. Bit2KB(mdev->rs_total));
  634. }
  635. }
  636. if (mdev->rs_failed) {
  637. dev_info(DEV, " %lu failed blocks\n", mdev->rs_failed);
  638. if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
  639. ns.disk = D_INCONSISTENT;
  640. ns.pdsk = D_UP_TO_DATE;
  641. } else {
  642. ns.disk = D_UP_TO_DATE;
  643. ns.pdsk = D_INCONSISTENT;
  644. }
  645. } else {
  646. ns.disk = D_UP_TO_DATE;
  647. ns.pdsk = D_UP_TO_DATE;
  648. if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
  649. if (mdev->p_uuid) {
  650. int i;
  651. for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
  652. _drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
  653. drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
  654. _drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
  655. } else {
  656. dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
  657. }
  658. }
  659. drbd_uuid_set_bm(mdev, 0UL);
  660. if (mdev->p_uuid) {
  661. /* Now the two UUID sets are equal, update what we
  662. * know of the peer. */
  663. int i;
  664. for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
  665. mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
  666. }
  667. }
  668. _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
  669. out_unlock:
  670. spin_unlock_irq(&mdev->req_lock);
  671. put_ldev(mdev);
  672. out:
  673. mdev->rs_total = 0;
  674. mdev->rs_failed = 0;
  675. mdev->rs_paused = 0;
  676. mdev->ov_start_sector = 0;
  677. if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
  678. dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
  679. drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
  680. }
  681. if (khelper_cmd)
  682. drbd_khelper(mdev, khelper_cmd);
  683. return 1;
  684. }
  685. /* helper */
  686. static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
  687. {
  688. if (drbd_ee_has_active_page(e)) {
  689. /* This might happen if sendpage() has not finished */
  690. spin_lock_irq(&mdev->req_lock);
  691. list_add_tail(&e->w.list, &mdev->net_ee);
  692. spin_unlock_irq(&mdev->req_lock);
  693. } else
  694. drbd_free_ee(mdev, e);
  695. }
  696. /**
  697. * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
  698. * @mdev: DRBD device.
  699. * @w: work object.
  700. * @cancel: The connection will be closed anyways
  701. */
  702. int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  703. {
  704. struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
  705. int ok;
  706. if (unlikely(cancel)) {
  707. drbd_free_ee(mdev, e);
  708. dec_unacked(mdev);
  709. return 1;
  710. }
  711. if (likely((e->flags & EE_WAS_ERROR) == 0)) {
  712. ok = drbd_send_block(mdev, P_DATA_REPLY, e);
  713. } else {
  714. if (__ratelimit(&drbd_ratelimit_state))
  715. dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
  716. (unsigned long long)e->sector);
  717. ok = drbd_send_ack(mdev, P_NEG_DREPLY, e);
  718. }
  719. dec_unacked(mdev);
  720. move_to_net_ee_or_free(mdev, e);
  721. if (unlikely(!ok))
  722. dev_err(DEV, "drbd_send_block() failed\n");
  723. return ok;
  724. }
  725. /**
  726. * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS
  727. * @mdev: DRBD device.
  728. * @w: work object.
  729. * @cancel: The connection will be closed anyways
  730. */
  731. int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  732. {
  733. struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
  734. int ok;
  735. if (unlikely(cancel)) {
  736. drbd_free_ee(mdev, e);
  737. dec_unacked(mdev);
  738. return 1;
  739. }
  740. if (get_ldev_if_state(mdev, D_FAILED)) {
  741. drbd_rs_complete_io(mdev, e->sector);
  742. put_ldev(mdev);
  743. }
  744. if (likely((e->flags & EE_WAS_ERROR) == 0)) {
  745. if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
  746. inc_rs_pending(mdev);
  747. ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
  748. } else {
  749. if (__ratelimit(&drbd_ratelimit_state))
  750. dev_err(DEV, "Not sending RSDataReply, "
  751. "partner DISKLESS!\n");
  752. ok = 1;
  753. }
  754. } else {
  755. if (__ratelimit(&drbd_ratelimit_state))
  756. dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
  757. (unsigned long long)e->sector);
  758. ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
  759. /* update resync data with failure */
  760. drbd_rs_failed_io(mdev, e->sector, e->size);
  761. }
  762. dec_unacked(mdev);
  763. move_to_net_ee_or_free(mdev, e);
  764. if (unlikely(!ok))
  765. dev_err(DEV, "drbd_send_block() failed\n");
  766. return ok;
  767. }
  768. int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  769. {
  770. struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
  771. struct digest_info *di;
  772. int digest_size;
  773. void *digest = NULL;
  774. int ok, eq = 0;
  775. if (unlikely(cancel)) {
  776. drbd_free_ee(mdev, e);
  777. dec_unacked(mdev);
  778. return 1;
  779. }
  780. drbd_rs_complete_io(mdev, e->sector);
  781. di = (struct digest_info *)(unsigned long)e->block_id;
  782. if (likely((e->flags & EE_WAS_ERROR) == 0)) {
  783. /* quick hack to try to avoid a race against reconfiguration.
  784. * a real fix would be much more involved,
  785. * introducing more locking mechanisms */
  786. if (mdev->csums_tfm) {
  787. digest_size = crypto_hash_digestsize(mdev->csums_tfm);
  788. D_ASSERT(digest_size == di->digest_size);
  789. digest = kmalloc(digest_size, GFP_NOIO);
  790. }
  791. if (digest) {
  792. drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
  793. eq = !memcmp(digest, di->digest, digest_size);
  794. kfree(digest);
  795. }
  796. if (eq) {
  797. drbd_set_in_sync(mdev, e->sector, e->size);
  798. /* rs_same_csums unit is BM_BLOCK_SIZE */
  799. mdev->rs_same_csum += e->size >> BM_BLOCK_SHIFT;
  800. ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
  801. } else {
  802. inc_rs_pending(mdev);
  803. e->block_id = ID_SYNCER;
  804. ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
  805. }
  806. } else {
  807. ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
  808. if (__ratelimit(&drbd_ratelimit_state))
  809. dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
  810. }
  811. dec_unacked(mdev);
  812. kfree(di);
  813. move_to_net_ee_or_free(mdev, e);
  814. if (unlikely(!ok))
  815. dev_err(DEV, "drbd_send_block/ack() failed\n");
  816. return ok;
  817. }
  818. int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  819. {
  820. struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
  821. int digest_size;
  822. void *digest;
  823. int ok = 1;
  824. if (unlikely(cancel))
  825. goto out;
  826. if (unlikely((e->flags & EE_WAS_ERROR) != 0))
  827. goto out;
  828. digest_size = crypto_hash_digestsize(mdev->verify_tfm);
  829. /* FIXME if this allocation fails, online verify will not terminate! */
  830. digest = kmalloc(digest_size, GFP_NOIO);
  831. if (digest) {
  832. drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
  833. inc_rs_pending(mdev);
  834. ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
  835. digest, digest_size, P_OV_REPLY);
  836. if (!ok)
  837. dec_rs_pending(mdev);
  838. kfree(digest);
  839. }
  840. out:
  841. drbd_free_ee(mdev, e);
  842. dec_unacked(mdev);
  843. return ok;
  844. }
  845. void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
  846. {
  847. if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
  848. mdev->ov_last_oos_size += size>>9;
  849. } else {
  850. mdev->ov_last_oos_start = sector;
  851. mdev->ov_last_oos_size = size>>9;
  852. }
  853. drbd_set_out_of_sync(mdev, sector, size);
  854. set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
  855. }
  856. int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  857. {
  858. struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
  859. struct digest_info *di;
  860. int digest_size;
  861. void *digest;
  862. int ok, eq = 0;
  863. if (unlikely(cancel)) {
  864. drbd_free_ee(mdev, e);
  865. dec_unacked(mdev);
  866. return 1;
  867. }
  868. /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
  869. * the resync lru has been cleaned up already */
  870. drbd_rs_complete_io(mdev, e->sector);
  871. di = (struct digest_info *)(unsigned long)e->block_id;
  872. if (likely((e->flags & EE_WAS_ERROR) == 0)) {
  873. digest_size = crypto_hash_digestsize(mdev->verify_tfm);
  874. digest = kmalloc(digest_size, GFP_NOIO);
  875. if (digest) {
  876. drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
  877. D_ASSERT(digest_size == di->digest_size);
  878. eq = !memcmp(digest, di->digest, digest_size);
  879. kfree(digest);
  880. }
  881. } else {
  882. ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
  883. if (__ratelimit(&drbd_ratelimit_state))
  884. dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
  885. }
  886. dec_unacked(mdev);
  887. kfree(di);
  888. if (!eq)
  889. drbd_ov_oos_found(mdev, e->sector, e->size);
  890. else
  891. ov_oos_print(mdev);
  892. ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size,
  893. eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
  894. drbd_free_ee(mdev, e);
  895. if (--mdev->ov_left == 0) {
  896. ov_oos_print(mdev);
  897. drbd_resync_finished(mdev);
  898. }
  899. return ok;
  900. }
  901. int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  902. {
  903. struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
  904. complete(&b->done);
  905. return 1;
  906. }
  907. int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  908. {
  909. struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
  910. struct p_barrier *p = &mdev->data.sbuf.barrier;
  911. int ok = 1;
  912. /* really avoid racing with tl_clear. w.cb may have been referenced
  913. * just before it was reassigned and re-queued, so double check that.
  914. * actually, this race was harmless, since we only try to send the
  915. * barrier packet here, and otherwise do nothing with the object.
  916. * but compare with the head of w_clear_epoch */
  917. spin_lock_irq(&mdev->req_lock);
  918. if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
  919. cancel = 1;
  920. spin_unlock_irq(&mdev->req_lock);
  921. if (cancel)
  922. return 1;
  923. if (!drbd_get_data_sock(mdev))
  924. return 0;
  925. p->barrier = b->br_number;
  926. /* inc_ap_pending was done where this was queued.
  927. * dec_ap_pending will be done in got_BarrierAck
  928. * or (on connection loss) in w_clear_epoch. */
  929. ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
  930. (struct p_header *)p, sizeof(*p), 0);
  931. drbd_put_data_sock(mdev);
  932. return ok;
  933. }
  934. int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  935. {
  936. if (cancel)
  937. return 1;
  938. return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
  939. }
  940. /**
  941. * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
  942. * @mdev: DRBD device.
  943. * @w: work object.
  944. * @cancel: The connection will be closed anyways
  945. */
  946. int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  947. {
  948. struct drbd_request *req = container_of(w, struct drbd_request, w);
  949. int ok;
  950. if (unlikely(cancel)) {
  951. req_mod(req, send_canceled);
  952. return 1;
  953. }
  954. ok = drbd_send_dblock(mdev, req);
  955. req_mod(req, ok ? handed_over_to_network : send_failed);
  956. return ok;
  957. }
  958. /**
  959. * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
  960. * @mdev: DRBD device.
  961. * @w: work object.
  962. * @cancel: The connection will be closed anyways
  963. */
  964. int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
  965. {
  966. struct drbd_request *req = container_of(w, struct drbd_request, w);
  967. int ok;
  968. if (unlikely(cancel)) {
  969. req_mod(req, send_canceled);
  970. return 1;
  971. }
  972. ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size,
  973. (unsigned long)req);
  974. if (!ok) {
  975. /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send();
  976. * so this is probably redundant */
  977. if (mdev->state.conn >= C_CONNECTED)
  978. drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
  979. }
  980. req_mod(req, ok ? handed_over_to_network : send_failed);
  981. return ok;
  982. }
  983. static int _drbd_may_sync_now(struct drbd_conf *mdev)
  984. {
  985. struct drbd_conf *odev = mdev;
  986. while (1) {
  987. if (odev->sync_conf.after == -1)
  988. return 1;
  989. odev = minor_to_mdev(odev->sync_conf.after);
  990. ERR_IF(!odev) return 1;
  991. if ((odev->state.conn >= C_SYNC_SOURCE &&
  992. odev->state.conn <= C_PAUSED_SYNC_T) ||
  993. odev->state.aftr_isp || odev->state.peer_isp ||
  994. odev->state.user_isp)
  995. return 0;
  996. }
  997. }
  998. /**
  999. * _drbd_pause_after() - Pause resync on all devices that may not resync now
  1000. * @mdev: DRBD device.
  1001. *
  1002. * Called from process context only (admin command and after_state_ch).
  1003. */
  1004. static int _drbd_pause_after(struct drbd_conf *mdev)
  1005. {
  1006. struct drbd_conf *odev;
  1007. int i, rv = 0;
  1008. for (i = 0; i < minor_count; i++) {
  1009. odev = minor_to_mdev(i);
  1010. if (!odev)
  1011. continue;
  1012. if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
  1013. continue;
  1014. if (!_drbd_may_sync_now(odev))
  1015. rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
  1016. != SS_NOTHING_TO_DO);
  1017. }
  1018. return rv;
  1019. }
  1020. /**
  1021. * _drbd_resume_next() - Resume resync on all devices that may resync now
  1022. * @mdev: DRBD device.
  1023. *
  1024. * Called from process context only (admin command and worker).
  1025. */
  1026. static int _drbd_resume_next(struct drbd_conf *mdev)
  1027. {
  1028. struct drbd_conf *odev;
  1029. int i, rv = 0;
  1030. for (i = 0; i < minor_count; i++) {
  1031. odev = minor_to_mdev(i);
  1032. if (!odev)
  1033. continue;
  1034. if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
  1035. continue;
  1036. if (odev->state.aftr_isp) {
  1037. if (_drbd_may_sync_now(odev))
  1038. rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
  1039. CS_HARD, NULL)
  1040. != SS_NOTHING_TO_DO) ;
  1041. }
  1042. }
  1043. return rv;
  1044. }
  1045. void resume_next_sg(struct drbd_conf *mdev)
  1046. {
  1047. write_lock_irq(&global_state_lock);
  1048. _drbd_resume_next(mdev);
  1049. write_unlock_irq(&global_state_lock);
  1050. }
  1051. void suspend_other_sg(struct drbd_conf *mdev)
  1052. {
  1053. write_lock_irq(&global_state_lock);
  1054. _drbd_pause_after(mdev);
  1055. write_unlock_irq(&global_state_lock);
  1056. }
  1057. static int sync_after_error(struct drbd_conf *mdev, int o_minor)
  1058. {
  1059. struct drbd_conf *odev;
  1060. if (o_minor == -1)
  1061. return NO_ERROR;
  1062. if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
  1063. return ERR_SYNC_AFTER;
  1064. /* check for loops */
  1065. odev = minor_to_mdev(o_minor);
  1066. while (1) {
  1067. if (odev == mdev)
  1068. return ERR_SYNC_AFTER_CYCLE;
  1069. /* dependency chain ends here, no cycles. */
  1070. if (odev->sync_conf.after == -1)
  1071. return NO_ERROR;
  1072. /* follow the dependency chain */
  1073. odev = minor_to_mdev(odev->sync_conf.after);
  1074. }
  1075. }
  1076. int drbd_alter_sa(struct drbd_conf *mdev, int na)
  1077. {
  1078. int changes;
  1079. int retcode;
  1080. write_lock_irq(&global_state_lock);
  1081. retcode = sync_after_error(mdev, na);
  1082. if (retcode == NO_ERROR) {
  1083. mdev->sync_conf.after = na;
  1084. do {
  1085. changes = _drbd_pause_after(mdev);
  1086. changes |= _drbd_resume_next(mdev);
  1087. } while (changes);
  1088. }
  1089. write_unlock_irq(&global_state_lock);
  1090. return retcode;
  1091. }
  1092. static void ping_peer(struct drbd_conf *mdev)
  1093. {
  1094. clear_bit(GOT_PING_ACK, &mdev->flags);
  1095. request_ping(mdev);
  1096. wait_event(mdev->misc_wait,
  1097. test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
  1098. }
  1099. /**
  1100. * drbd_start_resync() - Start the resync process
  1101. * @mdev: DRBD device.
  1102. * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET
  1103. *
  1104. * This function might bring you directly into one of the
  1105. * C_PAUSED_SYNC_* states.
  1106. */
  1107. void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
  1108. {
  1109. union drbd_state ns;
  1110. int r;
  1111. if (mdev->state.conn >= C_SYNC_SOURCE) {
  1112. dev_err(DEV, "Resync already running!\n");
  1113. return;
  1114. }
  1115. /* In case a previous resync run was aborted by an IO error/detach on the peer. */
  1116. drbd_rs_cancel_all(mdev);
  1117. if (side == C_SYNC_TARGET) {
  1118. /* Since application IO was locked out during C_WF_BITMAP_T and
  1119. C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
  1120. we check that we might make the data inconsistent. */
  1121. r = drbd_khelper(mdev, "before-resync-target");
  1122. r = (r >> 8) & 0xff;
  1123. if (r > 0) {
  1124. dev_info(DEV, "before-resync-target handler returned %d, "
  1125. "dropping connection.\n", r);
  1126. drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
  1127. return;
  1128. }
  1129. }
  1130. drbd_state_lock(mdev);
  1131. if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
  1132. drbd_state_unlock(mdev);
  1133. return;
  1134. }
  1135. if (side == C_SYNC_TARGET) {
  1136. mdev->bm_resync_fo = 0;
  1137. } else /* side == C_SYNC_SOURCE */ {
  1138. u64 uuid;
  1139. get_random_bytes(&uuid, sizeof(u64));
  1140. drbd_uuid_set(mdev, UI_BITMAP, uuid);
  1141. drbd_send_sync_uuid(mdev, uuid);
  1142. D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
  1143. }
  1144. write_lock_irq(&global_state_lock);
  1145. ns = mdev->state;
  1146. ns.aftr_isp = !_drbd_may_sync_now(mdev);
  1147. ns.conn = side;
  1148. if (side == C_SYNC_TARGET)
  1149. ns.disk = D_INCONSISTENT;
  1150. else /* side == C_SYNC_SOURCE */
  1151. ns.pdsk = D_INCONSISTENT;
  1152. r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
  1153. ns = mdev->state;
  1154. if (ns.conn < C_CONNECTED)
  1155. r = SS_UNKNOWN_ERROR;
  1156. if (r == SS_SUCCESS) {
  1157. mdev->rs_total =
  1158. mdev->rs_mark_left = drbd_bm_total_weight(mdev);
  1159. mdev->rs_failed = 0;
  1160. mdev->rs_paused = 0;
  1161. mdev->rs_start =
  1162. mdev->rs_mark_time = jiffies;
  1163. mdev->rs_same_csum = 0;
  1164. _drbd_pause_after(mdev);
  1165. }
  1166. write_unlock_irq(&global_state_lock);
  1167. put_ldev(mdev);
  1168. if (r == SS_SUCCESS) {
  1169. dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
  1170. drbd_conn_str(ns.conn),
  1171. (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
  1172. (unsigned long) mdev->rs_total);
  1173. if (mdev->rs_total == 0) {
  1174. /* Peer still reachable? Beware of failing before-resync-target handlers! */
  1175. ping_peer(mdev);
  1176. drbd_resync_finished(mdev);
  1177. }
  1178. /* ns.conn may already be != mdev->state.conn,
  1179. * we may have been paused in between, or become paused until
  1180. * the timer triggers.
  1181. * No matter, that is handled in resync_timer_fn() */
  1182. if (ns.conn == C_SYNC_TARGET)
  1183. mod_timer(&mdev->resync_timer, jiffies);
  1184. drbd_md_sync(mdev);
  1185. }
  1186. drbd_state_unlock(mdev);
  1187. }
  1188. int drbd_worker(struct drbd_thread *thi)
  1189. {
  1190. struct drbd_conf *mdev = thi->mdev;
  1191. struct drbd_work *w = NULL;
  1192. LIST_HEAD(work_list);
  1193. int intr = 0, i;
  1194. sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));
  1195. while (get_t_state(thi) == Running) {
  1196. drbd_thread_current_set_cpu(mdev);
  1197. if (down_trylock(&mdev->data.work.s)) {
  1198. mutex_lock(&mdev->data.mutex);
  1199. if (mdev->data.socket && !mdev->net_conf->no_cork)
  1200. drbd_tcp_uncork(mdev->data.socket);
  1201. mutex_unlock(&mdev->data.mutex);
  1202. intr = down_interruptible(&mdev->data.work.s);
  1203. mutex_lock(&mdev->data.mutex);
  1204. if (mdev->data.socket && !mdev->net_conf->no_cork)
  1205. drbd_tcp_cork(mdev->data.socket);
  1206. mutex_unlock(&mdev->data.mutex);
  1207. }
  1208. if (intr) {
  1209. D_ASSERT(intr == -EINTR);
  1210. flush_signals(current);
  1211. ERR_IF (get_t_state(thi) == Running)
  1212. continue;
  1213. break;
  1214. }
  1215. if (get_t_state(thi) != Running)
  1216. break;
  1217. /* With this break, we have done a down() but not consumed
  1218. the entry from the list. The cleanup code takes care of
  1219. this... */
  1220. w = NULL;
  1221. spin_lock_irq(&mdev->data.work.q_lock);
  1222. ERR_IF(list_empty(&mdev->data.work.q)) {
  1223. /* something terribly wrong in our logic.
  1224. * we were able to down() the semaphore,
  1225. * but the list is empty... doh.
  1226. *
  1227. * what is the best thing to do now?
  1228. * try again from scratch, restarting the receiver,
  1229. * asender, whatnot? could break even more ugly,
  1230. * e.g. when we are primary, but no good local data.
  1231. *
  1232. * I'll try to get away just starting over this loop.
  1233. */
  1234. spin_unlock_irq(&mdev->data.work.q_lock);
  1235. continue;
  1236. }
  1237. w = list_entry(mdev->data.work.q.next, struct drbd_work, list);
  1238. list_del_init(&w->list);
  1239. spin_unlock_irq(&mdev->data.work.q_lock);
  1240. if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) {
  1241. /* dev_warn(DEV, "worker: a callback failed! \n"); */
  1242. if (mdev->state.conn >= C_CONNECTED)
  1243. drbd_force_state(mdev,
  1244. NS(conn, C_NETWORK_FAILURE));
  1245. }
  1246. }
  1247. D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags));
  1248. D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags));
  1249. spin_lock_irq(&mdev->data.work.q_lock);
  1250. i = 0;
  1251. while (!list_empty(&mdev->data.work.q)) {
  1252. list_splice_init(&mdev->data.work.q, &work_list);
  1253. spin_unlock_irq(&mdev->data.work.q_lock);
  1254. while (!list_empty(&work_list)) {
  1255. w = list_entry(work_list.next, struct drbd_work, list);
  1256. list_del_init(&w->list);
  1257. w->cb(mdev, w, 1);
  1258. i++; /* dead debugging code */
  1259. }
  1260. spin_lock_irq(&mdev->data.work.q_lock);
  1261. }
  1262. sema_init(&mdev->data.work.s, 0);
  1263. /* DANGEROUS race: if someone did queue his work within the spinlock,
  1264. * but up() ed outside the spinlock, we could get an up() on the
  1265. * semaphore without corresponding list entry.
  1266. * So don't do that.
  1267. */
  1268. spin_unlock_irq(&mdev->data.work.q_lock);
  1269. D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
  1270. /* _drbd_set_state only uses stop_nowait.
  1271. * wait here for the Exiting receiver. */
  1272. drbd_thread_stop(&mdev->receiver);
  1273. drbd_mdev_cleanup(mdev);
  1274. dev_info(DEV, "worker terminated\n");
  1275. clear_bit(DEVICE_DYING, &mdev->flags);
  1276. clear_bit(CONFIG_PENDING, &mdev->flags);
  1277. wake_up(&mdev->state_wait);
  1278. return 0;
  1279. }