scrub.c
  1. /*
  2. * Copyright (C) 2011 STRATO. All rights reserved.
  3. *
  4. * This program is free software; you can redistribute it and/or
  5. * modify it under the terms of the GNU General Public
  6. * License v2 as published by the Free Software Foundation.
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. * General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU General Public
  14. * License along with this program; if not, write to the
  15. * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16. * Boston, MA 021110-1307, USA.
  17. */
  18. #include <linux/blkdev.h>
  19. #include <linux/ratelimit.h>
  20. #include "ctree.h"
  21. #include "volumes.h"
  22. #include "disk-io.h"
  23. #include "ordered-data.h"
  24. #include "transaction.h"
  25. #include "backref.h"
  26. #include "extent_io.h"
  27. #include "check-integrity.h"
  28. #include "rcu-string.h"
  29. /*
  30. * This is only the first step towards a full-featured scrub. It reads all
  31. * extents and super blocks and verifies the checksums. In case a bad checksum
  32. * is found or the extent cannot be read, good data will be written back if
  33. * any can be found.
  34. *
  35. * Future enhancements:
  36. * - In case an unrepairable extent is encountered, track which files are
  37. * affected and report them
  38. * - track and record media errors, throw out bad devices
  39. * - add a mode to also read unallocated space
  40. */
  41. struct scrub_block;
  42. struct scrub_ctx;
  43. #define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */
  44. #define SCRUB_BIOS_PER_CTX 16 /* 1 MB per device in flight */
  45. /*
  46. * the following value times PAGE_SIZE needs to be large enough to match the
  47. * largest node/leaf/sector size that shall be supported.
  48. * Values larger than BTRFS_STRIPE_LEN are not supported.
  49. */
  50. #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
  51. struct scrub_page {
  52. struct scrub_block *sblock;
  53. struct page *page;
  54. struct btrfs_device *dev;
  55. u64 flags; /* extent flags */
  56. u64 generation;
  57. u64 logical;
  58. u64 physical;
  59. atomic_t ref_count;
  60. struct {
  61. unsigned int mirror_num:8;
  62. unsigned int have_csum:1;
  63. unsigned int io_error:1;
  64. };
  65. u8 csum[BTRFS_CSUM_SIZE];
  66. };
  67. struct scrub_bio {
  68. int index;
  69. struct scrub_ctx *sctx;
  70. struct btrfs_device *dev;
  71. struct bio *bio;
  72. int err;
  73. u64 logical;
  74. u64 physical;
  75. struct scrub_page *pagev[SCRUB_PAGES_PER_BIO];
  76. int page_count;
  77. int next_free;
  78. struct btrfs_work work;
  79. };
  80. struct scrub_block {
  81. struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
  82. int page_count;
  83. atomic_t outstanding_pages;
  84. atomic_t ref_count; /* free mem on transition to zero */
  85. struct scrub_ctx *sctx;
  86. struct {
  87. unsigned int header_error:1;
  88. unsigned int checksum_error:1;
  89. unsigned int no_io_error_seen:1;
  90. unsigned int generation_error:1; /* also sets header_error */
  91. };
  92. };
  93. struct scrub_ctx {
  94. struct scrub_bio *bios[SCRUB_BIOS_PER_CTX];
  95. struct btrfs_root *dev_root;
  96. int first_free;
  97. int curr;
  98. atomic_t in_flight;
  99. atomic_t fixup_cnt;
  100. spinlock_t list_lock;
  101. wait_queue_head_t list_wait;
  102. u16 csum_size;
  103. struct list_head csum_list;
  104. atomic_t cancel_req;
  105. int readonly;
  106. int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */
  107. u32 sectorsize;
  108. u32 nodesize;
  109. u32 leafsize;
  110. /*
  111. * statistics
  112. */
  113. struct btrfs_scrub_progress stat;
  114. spinlock_t stat_lock;
  115. };
  116. struct scrub_fixup_nodatasum {
  117. struct scrub_ctx *sctx;
  118. struct btrfs_device *dev;
  119. u64 logical;
  120. struct btrfs_root *root;
  121. struct btrfs_work work;
  122. int mirror_num;
  123. };
  124. struct scrub_warning {
  125. struct btrfs_path *path;
  126. u64 extent_item_size;
  127. char *scratch_buf;
  128. char *msg_buf;
  129. const char *errstr;
  130. sector_t sector;
  131. u64 logical;
  132. struct btrfs_device *dev;
  133. int msg_bufsize;
  134. int scratch_bufsize;
  135. };
  136. static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
  137. static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
  138. struct btrfs_mapping_tree *map_tree,
  139. u64 length, u64 logical,
  140. struct scrub_block *sblock);
  141. static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
  142. struct scrub_block *sblock, int is_metadata,
  143. int have_csum, u8 *csum, u64 generation,
  144. u16 csum_size);
  145. static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
  146. struct scrub_block *sblock,
  147. int is_metadata, int have_csum,
  148. const u8 *csum, u64 generation,
  149. u16 csum_size);
  150. static void scrub_complete_bio_end_io(struct bio *bio, int err);
  151. static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
  152. struct scrub_block *sblock_good,
  153. int force_write);
  154. static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
  155. struct scrub_block *sblock_good,
  156. int page_num, int force_write);
  157. static int scrub_checksum_data(struct scrub_block *sblock);
  158. static int scrub_checksum_tree_block(struct scrub_block *sblock);
  159. static int scrub_checksum_super(struct scrub_block *sblock);
  160. static void scrub_block_get(struct scrub_block *sblock);
  161. static void scrub_block_put(struct scrub_block *sblock);
  162. static void scrub_page_get(struct scrub_page *spage);
  163. static void scrub_page_put(struct scrub_page *spage);
  164. static int scrub_add_page_to_bio(struct scrub_ctx *sctx,
  165. struct scrub_page *spage);
  166. static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
  167. u64 physical, struct btrfs_device *dev, u64 flags,
  168. u64 gen, int mirror_num, u8 *csum, int force);
  169. static void scrub_bio_end_io(struct bio *bio, int err);
  170. static void scrub_bio_end_io_worker(struct btrfs_work *work);
  171. static void scrub_block_complete(struct scrub_block *sblock);
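/* release any checksums still queued on the context's csum_list */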
  172. static void scrub_free_csums(struct scrub_ctx *sctx)
  173. {
  174. while (!list_empty(&sctx->csum_list)) {
  175. struct btrfs_ordered_sum *sum;
  176. sum = list_first_entry(&sctx->csum_list,
  177. struct btrfs_ordered_sum, list);
  178. list_del(&sum->list);
  179. kfree(sum);
  180. }
  181. }
  182. static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
  183. {
  184. int i;
  185. if (!sctx)
  186. return;
  187. /* this can happen when scrub is cancelled */
  188. if (sctx->curr != -1) {
  189. struct scrub_bio *sbio = sctx->bios[sctx->curr];
  190. for (i = 0; i < sbio->page_count; i++) {
  191. BUG_ON(!sbio->pagev[i]);
  192. BUG_ON(!sbio->pagev[i]->page);
  193. scrub_block_put(sbio->pagev[i]->sblock);
  194. }
  195. bio_put(sbio->bio);
  196. }
  197. for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) {
  198. struct scrub_bio *sbio = sctx->bios[i];
  199. if (!sbio)
  200. break;
  201. kfree(sbio);
  202. }
  203. scrub_free_csums(sctx);
  204. kfree(sctx);
  205. }
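/*
 * allocate a scrub context for @dev and pre-allocate its pool of
 * SCRUB_BIOS_PER_CTX scrub bios, chained together via next_free
 */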
  206. static noinline_for_stack
  207. struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev)
  208. {
  209. struct scrub_ctx *sctx;
  210. int i;
  211. struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
  212. int pages_per_bio;
  213. pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO,
  214. bio_get_nr_vecs(dev->bdev));
  215. sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
  216. if (!sctx)
  217. goto nomem;
  218. sctx->pages_per_bio = pages_per_bio;
  219. sctx->curr = -1;
  220. sctx->dev_root = dev->dev_root;
  221. for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) {
  222. struct scrub_bio *sbio;
  223. sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
  224. if (!sbio)
  225. goto nomem;
  226. sctx->bios[i] = sbio;
  227. sbio->index = i;
  228. sbio->sctx = sctx;
  229. sbio->page_count = 0;
  230. sbio->work.func = scrub_bio_end_io_worker;
  231. if (i != SCRUB_BIOS_PER_CTX - 1)
  232. sctx->bios[i]->next_free = i + 1;
  233. else
  234. sctx->bios[i]->next_free = -1;
  235. }
  236. sctx->first_free = 0;
  237. sctx->nodesize = dev->dev_root->nodesize;
  238. sctx->leafsize = dev->dev_root->leafsize;
  239. sctx->sectorsize = dev->dev_root->sectorsize;
  240. atomic_set(&sctx->in_flight, 0);
  241. atomic_set(&sctx->fixup_cnt, 0);
  242. atomic_set(&sctx->cancel_req, 0);
  243. sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
  244. INIT_LIST_HEAD(&sctx->csum_list);
  245. spin_lock_init(&sctx->list_lock);
  246. spin_lock_init(&sctx->stat_lock);
  247. init_waitqueue_head(&sctx->list_wait);
  248. return sctx;
  249. nomem:
  250. scrub_free_ctx(sctx);
  251. return ERR_PTR(-ENOMEM);
  252. }
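/*
 * backref walking callback: resolve all paths to an inode that references
 * the errored extent and print one warning line per path
 */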
  253. static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
  254. {
  255. u64 isize;
  256. u32 nlink;
  257. int ret;
  258. int i;
  259. struct extent_buffer *eb;
  260. struct btrfs_inode_item *inode_item;
  261. struct scrub_warning *swarn = ctx;
  262. struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
  263. struct inode_fs_paths *ipath = NULL;
  264. struct btrfs_root *local_root;
  265. struct btrfs_key root_key;
  266. root_key.objectid = root;
  267. root_key.type = BTRFS_ROOT_ITEM_KEY;
  268. root_key.offset = (u64)-1;
  269. local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
  270. if (IS_ERR(local_root)) {
  271. ret = PTR_ERR(local_root);
  272. goto err;
  273. }
  274. ret = inode_item_info(inum, 0, local_root, swarn->path);
  275. if (ret) {
  276. btrfs_release_path(swarn->path);
  277. goto err;
  278. }
  279. eb = swarn->path->nodes[0];
  280. inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
  281. struct btrfs_inode_item);
  282. isize = btrfs_inode_size(eb, inode_item);
  283. nlink = btrfs_inode_nlink(eb, inode_item);
  284. btrfs_release_path(swarn->path);
  285. ipath = init_ipath(4096, local_root, swarn->path);
  286. if (IS_ERR(ipath)) {
  287. ret = PTR_ERR(ipath);
  288. ipath = NULL;
  289. goto err;
  290. }
  291. ret = paths_from_inode(inum, ipath);
  292. if (ret < 0)
  293. goto err;
  294. /*
  295. * we deliberately ignore the fact that ipath might have been too small to
  296. * hold all of the paths here
  297. */
  298. for (i = 0; i < ipath->fspath->elem_cnt; ++i)
  299. printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
  300. "%s, sector %llu, root %llu, inode %llu, offset %llu, "
  301. "length %llu, links %u (path: %s)\n", swarn->errstr,
  302. swarn->logical, rcu_str_deref(swarn->dev->name),
  303. (unsigned long long)swarn->sector, root, inum, offset,
  304. min(isize - offset, (u64)PAGE_SIZE), nlink,
  305. (char *)(unsigned long)ipath->fspath->val[i]);
  306. free_ipath(ipath);
  307. return 0;
  308. err:
  309. printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
  310. "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
  311. "resolving failed with ret=%d\n", swarn->errstr,
  312. swarn->logical, rcu_str_deref(swarn->dev->name),
  313. (unsigned long long)swarn->sector, root, inum, offset, ret);
  314. free_ipath(ipath);
  315. return 0;
  316. }
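/*
 * print a warning for an errored block, naming either the tree blocks or the
 * files (via their paths) that reference the affected extent
 */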
  317. static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
  318. {
  319. struct btrfs_device *dev;
  320. struct btrfs_fs_info *fs_info;
  321. struct btrfs_path *path;
  322. struct btrfs_key found_key;
  323. struct extent_buffer *eb;
  324. struct btrfs_extent_item *ei;
  325. struct scrub_warning swarn;
  326. unsigned long ptr = 0;
  327. u64 extent_item_pos;
  328. u64 flags = 0;
  329. u64 ref_root;
  330. u32 item_size;
  331. u8 ref_level;
  332. const int bufsize = 4096;
  333. int ret;
  334. WARN_ON(sblock->page_count < 1);
  335. dev = sblock->pagev[0]->dev;
  336. fs_info = sblock->sctx->dev_root->fs_info;
  337. path = btrfs_alloc_path();
  338. swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
  339. swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
  340. swarn.sector = (sblock->pagev[0]->physical) >> 9;
  341. swarn.logical = sblock->pagev[0]->logical;
  342. swarn.errstr = errstr;
  343. swarn.dev = NULL;
  344. swarn.msg_bufsize = bufsize;
  345. swarn.scratch_bufsize = bufsize;
  346. if (!path || !swarn.scratch_buf || !swarn.msg_buf)
  347. goto out;
  348. ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
  349. &flags);
  350. if (ret < 0)
  351. goto out;
  352. extent_item_pos = swarn.logical - found_key.objectid;
  353. swarn.extent_item_size = found_key.offset;
  354. eb = path->nodes[0];
  355. ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
  356. item_size = btrfs_item_size_nr(eb, path->slots[0]);
  357. btrfs_release_path(path);
  358. if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
  359. do {
  360. ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
  361. &ref_root, &ref_level);
  362. printk_in_rcu(KERN_WARNING
  363. "btrfs: %s at logical %llu on dev %s, "
  364. "sector %llu: metadata %s (level %d) in tree "
  365. "%llu\n", errstr, swarn.logical,
  366. rcu_str_deref(dev->name),
  367. (unsigned long long)swarn.sector,
  368. ref_level ? "node" : "leaf",
  369. ret < 0 ? -1 : ref_level,
  370. ret < 0 ? -1 : ref_root);
  371. } while (ret != 1);
  372. } else {
  373. swarn.path = path;
  374. swarn.dev = dev;
  375. iterate_extent_inodes(fs_info, found_key.objectid,
  376. extent_item_pos, 1,
  377. scrub_print_warning_inode, &swarn);
  378. }
  379. out:
  380. btrfs_free_path(path);
  381. kfree(swarn.scratch_buf);
  382. kfree(swarn.msg_buf);
  383. }
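/*
 * callback for iterate_inodes_from_logical(): bring a good copy of the page
 * back in, either by rewriting an uptodate page from the page cache directly
 * via repair_io_failure() or by re-reading the failed mirror so that the
 * generic read-repair path kicks in
 */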
  384. static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
  385. {
  386. struct page *page = NULL;
  387. unsigned long index;
  388. struct scrub_fixup_nodatasum *fixup = ctx;
  389. int ret;
  390. int corrected = 0;
  391. struct btrfs_key key;
  392. struct inode *inode = NULL;
  393. u64 end = offset + PAGE_SIZE - 1;
  394. struct btrfs_root *local_root;
  395. key.objectid = root;
  396. key.type = BTRFS_ROOT_ITEM_KEY;
  397. key.offset = (u64)-1;
  398. local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
  399. if (IS_ERR(local_root))
  400. return PTR_ERR(local_root);
  401. key.type = BTRFS_INODE_ITEM_KEY;
  402. key.objectid = inum;
  403. key.offset = 0;
  404. inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
  405. if (IS_ERR(inode))
  406. return PTR_ERR(inode);
  407. index = offset >> PAGE_CACHE_SHIFT;
  408. page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
  409. if (!page) {
  410. ret = -ENOMEM;
  411. goto out;
  412. }
  413. if (PageUptodate(page)) {
  414. struct btrfs_mapping_tree *map_tree;
  415. if (PageDirty(page)) {
  416. /*
  417. * we need to write the data to the defective sector. the
  418. * data that was in that sector is not in memory,
  419. * because the page was modified. we must not write the
  420. * modified page to that sector.
  421. *
  422. * TODO: what could be done here: wait for the delalloc
  423. * runner to write out that page (might involve
  424. * COW) and see whether the sector is still
  425. * referenced afterwards.
  426. *
  427. * For the time being, we'll treat this error as
  428. * uncorrectable, although there is a chance that a
  429. * later scrub will find the bad sector again and that
  430. * there's no dirty page in memory, then.
  431. */
  432. ret = -EIO;
  433. goto out;
  434. }
  435. map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
  436. ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
  437. fixup->logical, page,
  438. fixup->mirror_num);
  439. unlock_page(page);
  440. corrected = !ret;
  441. } else {
  442. /*
  443. * we need to get good data first. the general readpage path
  444. * will call repair_io_failure for us, we just have to make
  445. * sure we read the bad mirror.
  446. */
  447. ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
  448. EXTENT_DAMAGED, GFP_NOFS);
  449. if (ret) {
  450. /* set_extent_bits should give proper error */
  451. WARN_ON(ret > 0);
  452. if (ret > 0)
  453. ret = -EFAULT;
  454. goto out;
  455. }
  456. ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
  457. btrfs_get_extent,
  458. fixup->mirror_num);
  459. wait_on_page_locked(page);
  460. corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
  461. end, EXTENT_DAMAGED, 0, NULL);
  462. if (!corrected)
  463. clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
  464. EXTENT_DAMAGED, GFP_NOFS);
  465. }
  466. out:
  467. if (page)
  468. put_page(page);
  469. if (inode)
  470. iput(inode);
  471. if (ret < 0)
  472. return ret;
  473. if (ret == 0 && corrected) {
  474. /*
  475. * we only need to call readpage for one of the inodes belonging
  476. * to this extent. so make iterate_extent_inodes stop
  477. */
  478. return 1;
  479. }
  480. return -EIO;
  481. }
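/*
 * worker to repair a read error in a data extent that carries no checksum;
 * it iterates over the inodes referencing the logical address and lets
 * scrub_fixup_readpage() do the actual repair
 */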
  482. static void scrub_fixup_nodatasum(struct btrfs_work *work)
  483. {
  484. int ret;
  485. struct scrub_fixup_nodatasum *fixup;
  486. struct scrub_ctx *sctx;
  487. struct btrfs_trans_handle *trans = NULL;
  488. struct btrfs_fs_info *fs_info;
  489. struct btrfs_path *path;
  490. int uncorrectable = 0;
  491. fixup = container_of(work, struct scrub_fixup_nodatasum, work);
  492. sctx = fixup->sctx;
  493. fs_info = fixup->root->fs_info;
  494. path = btrfs_alloc_path();
  495. if (!path) {
  496. spin_lock(&sctx->stat_lock);
  497. ++sctx->stat.malloc_errors;
  498. spin_unlock(&sctx->stat_lock);
  499. uncorrectable = 1;
  500. goto out;
  501. }
  502. trans = btrfs_join_transaction(fixup->root);
  503. if (IS_ERR(trans)) {
  504. uncorrectable = 1;
  505. goto out;
  506. }
  507. /*
  508. * the idea is to trigger a regular read through the standard path. we
  509. * read a page from the (failed) logical address by specifying the
  510. * corresponding copynum of the failed sector. thus, that readpage is
  511. * expected to fail.
  512. * that is the point where on-the-fly error correction will kick in
  513. * (once it's finished) and rewrite the failed sector if a good copy
  514. * can be found.
  515. */
  516. ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
  517. path, scrub_fixup_readpage,
  518. fixup);
  519. if (ret < 0) {
  520. uncorrectable = 1;
  521. goto out;
  522. }
  523. WARN_ON(ret != 1);
  524. spin_lock(&sctx->stat_lock);
  525. ++sctx->stat.corrected_errors;
  526. spin_unlock(&sctx->stat_lock);
  527. out:
  528. if (trans && !IS_ERR(trans))
  529. btrfs_end_transaction(trans, fixup->root);
  530. if (uncorrectable) {
  531. spin_lock(&sctx->stat_lock);
  532. ++sctx->stat.uncorrectable_errors;
  533. spin_unlock(&sctx->stat_lock);
  534. printk_ratelimited_in_rcu(KERN_ERR
  535. "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
  536. (unsigned long long)fixup->logical,
  537. rcu_str_deref(fixup->dev->name));
  538. }
  539. btrfs_free_path(path);
  540. kfree(fixup);
  541. /* see the caller for why we pretend to be paused in the scrub counters */
  542. mutex_lock(&fs_info->scrub_lock);
  543. atomic_dec(&fs_info->scrubs_running);
  544. atomic_dec(&fs_info->scrubs_paused);
  545. mutex_unlock(&fs_info->scrub_lock);
  546. atomic_dec(&sctx->fixup_cnt);
  547. wake_up(&fs_info->scrub_pause_wait);
  548. wake_up(&sctx->list_wait);
  549. }
  550. /*
  551. * scrub_handle_errored_block gets called when either verification of the
  552. * pages failed or the bio failed to read, e.g. with EIO. In the latter
  553. * case, this function handles all pages in the bio, even though only one
  554. * may be bad.
  555. * The goal of this function is to repair the errored block by using the
  556. * contents of one of the mirrors.
  557. */
  558. static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
  559. {
  560. struct scrub_ctx *sctx = sblock_to_check->sctx;
  561. struct btrfs_device *dev;
  562. struct btrfs_fs_info *fs_info;
  563. u64 length;
  564. u64 logical;
  565. u64 generation;
  566. unsigned int failed_mirror_index;
  567. unsigned int is_metadata;
  568. unsigned int have_csum;
  569. u8 *csum;
  570. struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
  571. struct scrub_block *sblock_bad;
  572. int ret;
  573. int mirror_index;
  574. int page_num;
  575. int success;
  576. static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
  577. DEFAULT_RATELIMIT_BURST);
  578. BUG_ON(sblock_to_check->page_count < 1);
  579. fs_info = sctx->dev_root->fs_info;
  580. length = sblock_to_check->page_count * PAGE_SIZE;
  581. logical = sblock_to_check->pagev[0]->logical;
  582. generation = sblock_to_check->pagev[0]->generation;
  583. BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
  584. failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
  585. is_metadata = !(sblock_to_check->pagev[0]->flags &
  586. BTRFS_EXTENT_FLAG_DATA);
  587. have_csum = sblock_to_check->pagev[0]->have_csum;
  588. csum = sblock_to_check->pagev[0]->csum;
  589. dev = sblock_to_check->pagev[0]->dev;
  590. /*
  591. * read all mirrors one after the other. This includes re-reading
  592. * the extent or metadata block that failed (that was
  593. * the cause that this fixup code is called) another time,
  594. * page by page this time in order to know which pages
  595. * caused I/O errors and which ones are good (for all mirrors).
  596. * The goal is to handle the situation where more than one
  597. * mirror contains I/O errors, but the errors do not
  598. * overlap, i.e. the data can be repaired by selecting the
  599. * pages from those mirrors without I/O error on the
  600. * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
  601. * would be that mirror #1 has an I/O error on the first page,
  602. * the second page is good, and mirror #2 has an I/O error on
  603. * the second page, but the first page is good.
  604. * Then the first page of the first mirror can be repaired by
  605. * taking the first page of the second mirror, and the
  606. * second page of the second mirror can be repaired by
  607. * copying the contents of the 2nd page of the 1st mirror.
  608. * One more note: if the pages of one mirror contain I/O
  609. * errors, the checksum cannot be verified. In order to get
  610. * the best data for repairing, the first attempt is to find
  611. * a mirror without I/O errors and with a validated checksum.
  612. * Only if this is not possible, the pages are picked from
  613. * mirrors with I/O errors without considering the checksum.
  614. * If the latter is the case, at the end, the checksum of the
  615. * repaired area is verified in order to correctly maintain
  616. * the statistics.
  617. */
  618. sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
  619. sizeof(*sblocks_for_recheck),
  620. GFP_NOFS);
  621. if (!sblocks_for_recheck) {
  622. spin_lock(&sctx->stat_lock);
  623. sctx->stat.malloc_errors++;
  624. sctx->stat.read_errors++;
  625. sctx->stat.uncorrectable_errors++;
  626. spin_unlock(&sctx->stat_lock);
  627. btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
  628. goto out;
  629. }
  630. /* setup the context, map the logical blocks and alloc the pages */
  631. ret = scrub_setup_recheck_block(sctx, &fs_info->mapping_tree, length,
  632. logical, sblocks_for_recheck);
  633. if (ret) {
  634. spin_lock(&sctx->stat_lock);
  635. sctx->stat.read_errors++;
  636. sctx->stat.uncorrectable_errors++;
  637. spin_unlock(&sctx->stat_lock);
  638. btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
  639. goto out;
  640. }
  641. BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
  642. sblock_bad = sblocks_for_recheck + failed_mirror_index;
  643. /* build and submit the bios for the failed mirror, check checksums */
  644. ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
  645. csum, generation, sctx->csum_size);
  646. if (ret) {
  647. spin_lock(&sctx->stat_lock);
  648. sctx->stat.read_errors++;
  649. sctx->stat.uncorrectable_errors++;
  650. spin_unlock(&sctx->stat_lock);
  651. btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
  652. goto out;
  653. }
  654. if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
  655. sblock_bad->no_io_error_seen) {
  656. /*
  657. * the error disappeared after reading page by page, or
  658. * the area was part of a huge bio and other parts of the
  659. * bio caused I/O errors, or the block layer merged several
  660. * read requests into one and the error is caused by a
  661. * different bio (usually one of the two latter cases is
  662. * the cause)
  663. */
  664. spin_lock(&sctx->stat_lock);
  665. sctx->stat.unverified_errors++;
  666. spin_unlock(&sctx->stat_lock);
  667. goto out;
  668. }
  669. if (!sblock_bad->no_io_error_seen) {
  670. spin_lock(&sctx->stat_lock);
  671. sctx->stat.read_errors++;
  672. spin_unlock(&sctx->stat_lock);
  673. if (__ratelimit(&_rs))
  674. scrub_print_warning("i/o error", sblock_to_check);
  675. btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
  676. } else if (sblock_bad->checksum_error) {
  677. spin_lock(&sctx->stat_lock);
  678. sctx->stat.csum_errors++;
  679. spin_unlock(&sctx->stat_lock);
  680. if (__ratelimit(&_rs))
  681. scrub_print_warning("checksum error", sblock_to_check);
  682. btrfs_dev_stat_inc_and_print(dev,
  683. BTRFS_DEV_STAT_CORRUPTION_ERRS);
  684. } else if (sblock_bad->header_error) {
  685. spin_lock(&sctx->stat_lock);
  686. sctx->stat.verify_errors++;
  687. spin_unlock(&sctx->stat_lock);
  688. if (__ratelimit(&_rs))
  689. scrub_print_warning("checksum/header error",
  690. sblock_to_check);
  691. if (sblock_bad->generation_error)
  692. btrfs_dev_stat_inc_and_print(dev,
  693. BTRFS_DEV_STAT_GENERATION_ERRS);
  694. else
  695. btrfs_dev_stat_inc_and_print(dev,
  696. BTRFS_DEV_STAT_CORRUPTION_ERRS);
  697. }
  698. if (sctx->readonly)
  699. goto did_not_correct_error;
  700. if (!is_metadata && !have_csum) {
  701. struct scrub_fixup_nodatasum *fixup_nodatasum;
  702. /*
  703. * !is_metadata and !have_csum, this means that the data
  704. * might not be COW'ed, that it might be modified
  705. * concurrently. The general strategy of working on the
  706. * commit root does not help in the case when COW is not
  707. * used.
  708. */
  709. fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
  710. if (!fixup_nodatasum)
  711. goto did_not_correct_error;
  712. fixup_nodatasum->sctx = sctx;
  713. fixup_nodatasum->dev = dev;
  714. fixup_nodatasum->logical = logical;
  715. fixup_nodatasum->root = fs_info->extent_root;
  716. fixup_nodatasum->mirror_num = failed_mirror_index + 1;
  717. /*
  718. * increment scrubs_running to prevent cancel requests from
  719. * completing as long as a fixup worker is running. we must also
  720. * increment scrubs_paused to prevent deadlocking on pause
  721. * requests used for transactions commits (as the worker uses a
  722. * transaction context). it is safe to regard the fixup worker
  723. * as paused for all practical purposes. effectively, we only
  724. * avoid cancellation requests from completing.
  725. */
  726. mutex_lock(&fs_info->scrub_lock);
  727. atomic_inc(&fs_info->scrubs_running);
  728. atomic_inc(&fs_info->scrubs_paused);
  729. mutex_unlock(&fs_info->scrub_lock);
  730. atomic_inc(&sctx->fixup_cnt);
  731. fixup_nodatasum->work.func = scrub_fixup_nodatasum;
  732. btrfs_queue_worker(&fs_info->scrub_workers,
  733. &fixup_nodatasum->work);
  734. goto out;
  735. }
  736. /*
  737. * now build and submit the bios for the other mirrors, check
  738. * checksums.
  739. * First try to pick the mirror which is completely without I/O
  740. * errors and also does not have a checksum error.
  741. * If one is found, and if a checksum is present, the full block
  742. * that is known to contain an error is rewritten. Afterwards
  743. * the block is known to be corrected.
  744. * If a mirror is found which is completely correct, and no
  745. * checksum is present, only those pages are rewritten that had
  746. * an I/O error in the block to be repaired, since it cannot be
  747. * determined which copy of the other pages is better (and it
  748. * could happen otherwise that a correct page would be
  749. * overwritten by a bad one).
  750. */
  751. for (mirror_index = 0;
  752. mirror_index < BTRFS_MAX_MIRRORS &&
  753. sblocks_for_recheck[mirror_index].page_count > 0;
  754. mirror_index++) {
  755. struct scrub_block *sblock_other;
  756. if (mirror_index == failed_mirror_index)
  757. continue;
  758. sblock_other = sblocks_for_recheck + mirror_index;
  759. /* build and submit the bios, check checksums */
  760. ret = scrub_recheck_block(fs_info, sblock_other, is_metadata,
  761. have_csum, csum, generation,
  762. sctx->csum_size);
  763. if (!ret && !sblock_other->header_error &&
  764. !sblock_other->checksum_error &&
  765. sblock_other->no_io_error_seen) {
  766. int force_write = is_metadata || have_csum;
  767. ret = scrub_repair_block_from_good_copy(sblock_bad,
  768. sblock_other,
  769. force_write);
  770. if (0 == ret)
  771. goto corrected_error;
  772. }
  773. }
  774. /*
  775. * in case of I/O errors in the area that is supposed to be
  776. * repaired, continue by picking good copies of those pages.
  777. * Select the good pages from mirrors to rewrite bad pages from
  778. * the area to fix. Afterwards verify the checksum of the block
  779. * that is supposed to be repaired. This verification step is
  780. * only done for the purpose of statistics counting and for the
  781. * final scrub report on whether errors remain.
  782. * A perfect algorithm could make use of the checksum and try
  783. * all possible combinations of pages from the different mirrors
  784. * until the checksum verification succeeds. For example, when
  785. * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
  786. * of mirror #2 is readable but the final checksum test fails,
  787. * then the 2nd page of mirror #3 could be tried, to see whether
  788. * the final checksum now succeeds. But this would be a rare
  789. * exception and is therefore not implemented. At least this
  790. * approach never overwrites the good copy.
  791. * A more useful improvement would be to pick the sectors
  792. * without I/O error based on sector sizes (512 bytes on legacy
  793. * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
  794. * mirror could be repaired by taking 512 bytes of a different
  795. * mirror, even if other 512 byte sectors in the same PAGE_SIZE
  796. * area are unreadable.
  797. */
  798. /* can only fix I/O errors from here on */
  799. if (sblock_bad->no_io_error_seen)
  800. goto did_not_correct_error;
  801. success = 1;
  802. for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
  803. struct scrub_page *page_bad = sblock_bad->pagev[page_num];
  804. if (!page_bad->io_error)
  805. continue;
  806. for (mirror_index = 0;
  807. mirror_index < BTRFS_MAX_MIRRORS &&
  808. sblocks_for_recheck[mirror_index].page_count > 0;
  809. mirror_index++) {
  810. struct scrub_block *sblock_other = sblocks_for_recheck +
  811. mirror_index;
  812. struct scrub_page *page_other = sblock_other->pagev[
  813. page_num];
  814. if (!page_other->io_error) {
  815. ret = scrub_repair_page_from_good_copy(
  816. sblock_bad, sblock_other, page_num, 0);
  817. if (0 == ret) {
  818. page_bad->io_error = 0;
  819. break; /* succeeded for this page */
  820. }
  821. }
  822. }
  823. if (page_bad->io_error) {
  824. /* did not find a mirror to copy the page from */
  825. success = 0;
  826. }
  827. }
  828. if (success) {
  829. if (is_metadata || have_csum) {
  830. /*
  831. * need to verify the checksum now that all
  832. * sectors on disk are repaired (the write
  833. * request for data to be repaired is on its way).
  834. * Just be lazy and use scrub_recheck_block()
  835. * which re-reads the data before the checksum
  836. * is verified, but most likely the data comes out
  837. * of the page cache.
  838. */
  839. ret = scrub_recheck_block(fs_info, sblock_bad,
  840. is_metadata, have_csum, csum,
  841. generation, sctx->csum_size);
  842. if (!ret && !sblock_bad->header_error &&
  843. !sblock_bad->checksum_error &&
  844. sblock_bad->no_io_error_seen)
  845. goto corrected_error;
  846. else
  847. goto did_not_correct_error;
  848. } else {
  849. corrected_error:
  850. spin_lock(&sctx->stat_lock);
  851. sctx->stat.corrected_errors++;
  852. spin_unlock(&sctx->stat_lock);
  853. printk_ratelimited_in_rcu(KERN_ERR
  854. "btrfs: fixed up error at logical %llu on dev %s\n",
  855. (unsigned long long)logical,
  856. rcu_str_deref(dev->name));
  857. }
  858. } else {
  859. did_not_correct_error:
  860. spin_lock(&sctx->stat_lock);
  861. sctx->stat.uncorrectable_errors++;
  862. spin_unlock(&sctx->stat_lock);
  863. printk_ratelimited_in_rcu(KERN_ERR
  864. "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
  865. (unsigned long long)logical,
  866. rcu_str_deref(dev->name));
  867. }
  868. out:
  869. if (sblocks_for_recheck) {
  870. for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
  871. mirror_index++) {
  872. struct scrub_block *sblock = sblocks_for_recheck +
  873. mirror_index;
  874. int page_index;
  875. for (page_index = 0; page_index < sblock->page_count;
  876. page_index++) {
  877. sblock->pagev[page_index]->sblock = NULL;
  878. scrub_page_put(sblock->pagev[page_index]);
  879. }
  880. }
  881. kfree(sblocks_for_recheck);
  882. }
  883. return 0;
  884. }
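/*
 * map the errored range once per PAGE_SIZE chunk and fill one scrub_block
 * per mirror with freshly allocated pages for the recheck
 */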
  885. static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
  886. struct btrfs_mapping_tree *map_tree,
  887. u64 length, u64 logical,
  888. struct scrub_block *sblocks_for_recheck)
  889. {
  890. int page_index;
  891. int mirror_index;
  892. int ret;
  893. /*
  894. * note: the two members ref_count and outstanding_pages
  895. * are not used (and not set) in the blocks that are used for
  896. * the recheck procedure
  897. */
  898. page_index = 0;
  899. while (length > 0) {
  900. u64 sublen = min_t(u64, length, PAGE_SIZE);
  901. u64 mapped_length = sublen;
  902. struct btrfs_bio *bbio = NULL;
  903. /*
  904. * with a length of PAGE_SIZE, each returned stripe
  905. * represents one mirror
  906. */
  907. ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length,
  908. &bbio, 0);
  909. if (ret || !bbio || mapped_length < sublen) {
  910. kfree(bbio);
  911. return -EIO;
  912. }
  913. BUG_ON(page_index >= SCRUB_PAGES_PER_BIO);
  914. for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
  915. mirror_index++) {
  916. struct scrub_block *sblock;
  917. struct scrub_page *page;
  918. if (mirror_index >= BTRFS_MAX_MIRRORS)
  919. continue;
  920. sblock = sblocks_for_recheck + mirror_index;
  921. sblock->sctx = sctx;
  922. page = kzalloc(sizeof(*page), GFP_NOFS);
  923. if (!page) {
  924. leave_nomem:
  925. spin_lock(&sctx->stat_lock);
  926. sctx->stat.malloc_errors++;
  927. spin_unlock(&sctx->stat_lock);
  928. kfree(bbio);
  929. return -ENOMEM;
  930. }
  931. scrub_page_get(page);
  932. sblock->pagev[page_index] = page;
  933. page->logical = logical;
  934. page->physical = bbio->stripes[mirror_index].physical;
  935. /* for missing devices, dev->bdev is NULL */
  936. page->dev = bbio->stripes[mirror_index].dev;
  937. page->mirror_num = mirror_index + 1;
  938. sblock->page_count++;
  939. page->page = alloc_page(GFP_NOFS);
  940. if (!page->page)
  941. goto leave_nomem;
  942. }
  943. kfree(bbio);
  944. length -= sublen;
  945. logical += sublen;
  946. page_index++;
  947. }
  948. return 0;
  949. }
  950. /*
  951. * this function will check the on-disk data for checksum errors, header
  952. * errors and read I/O errors. If any I/O errors happen, the exact pages
  953. * that errored are marked as bad. The goal is to enable scrub
  954. * to take those pages that are not errored from all the mirrors so that
  955. * the pages that are errored in the just handled mirror can be repaired.
  956. */
  957. static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
  958. struct scrub_block *sblock, int is_metadata,
  959. int have_csum, u8 *csum, u64 generation,
  960. u16 csum_size)
  961. {
  962. int page_num;
  963. sblock->no_io_error_seen = 1;
  964. sblock->header_error = 0;
  965. sblock->checksum_error = 0;
  966. for (page_num = 0; page_num < sblock->page_count; page_num++) {
  967. struct bio *bio;
  968. int ret;
  969. struct scrub_page *page = sblock->pagev[page_num];
  970. DECLARE_COMPLETION_ONSTACK(complete);
  971. if (page->dev->bdev == NULL) {
  972. page->io_error = 1;
  973. sblock->no_io_error_seen = 0;
  974. continue;
  975. }
  976. WARN_ON(!page->page);
  977. bio = bio_alloc(GFP_NOFS, 1);
  978. if (!bio)
  979. return -EIO;
  980. bio->bi_bdev = page->dev->bdev;
  981. bio->bi_sector = page->physical >> 9;
  982. bio->bi_end_io = scrub_complete_bio_end_io;
  983. bio->bi_private = &complete;
  984. ret = bio_add_page(bio, page->page, PAGE_SIZE, 0);
  985. if (PAGE_SIZE != ret) {
  986. bio_put(bio);
  987. return -EIO;
  988. }
  989. btrfsic_submit_bio(READ, bio);
  990. /* this will also unplug the queue */
  991. wait_for_completion(&complete);
  992. page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
  993. if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
  994. sblock->no_io_error_seen = 0;
  995. bio_put(bio);
  996. }
  997. if (sblock->no_io_error_seen)
  998. scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
  999. have_csum, csum, generation,
  1000. csum_size);
  1001. return 0;
  1002. }
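/*
 * recompute the checksum of a re-read block and, for metadata, verify the
 * header fields; the results are stored in the sblock error flags
 */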
  1003. static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
  1004. struct scrub_block *sblock,
  1005. int is_metadata, int have_csum,
  1006. const u8 *csum, u64 generation,
  1007. u16 csum_size)
  1008. {
  1009. int page_num;
  1010. u8 calculated_csum[BTRFS_CSUM_SIZE];
  1011. u32 crc = ~(u32)0;
  1012. struct btrfs_root *root = fs_info->extent_root;
  1013. void *mapped_buffer;
  1014. WARN_ON(!sblock->pagev[0]->page);
  1015. if (is_metadata) {
  1016. struct btrfs_header *h;
  1017. mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
  1018. h = (struct btrfs_header *)mapped_buffer;
  1019. if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) ||
  1020. memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
  1021. memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
  1022. BTRFS_UUID_SIZE)) {
  1023. sblock->header_error = 1;
  1024. } else if (generation != le64_to_cpu(h->generation)) {
  1025. sblock->header_error = 1;
  1026. sblock->generation_error = 1;
  1027. }
  1028. csum = h->csum;
  1029. } else {
  1030. if (!have_csum)
  1031. return;
  1032. mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
  1033. }
  1034. for (page_num = 0;;) {
  1035. if (page_num == 0 && is_metadata)
  1036. crc = btrfs_csum_data(root,
  1037. ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
  1038. crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
  1039. else
  1040. crc = btrfs_csum_data(root, mapped_buffer, crc,
  1041. PAGE_SIZE);
  1042. kunmap_atomic(mapped_buffer);
  1043. page_num++;
  1044. if (page_num >= sblock->page_count)
  1045. break;
  1046. WARN_ON(!sblock->pagev[page_num]->page);
  1047. mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
  1048. }
  1049. btrfs_csum_final(crc, calculated_csum);
  1050. if (memcmp(calculated_csum, csum, csum_size))
  1051. sblock->checksum_error = 1;
  1052. }
  1053. static void scrub_complete_bio_end_io(struct bio *bio, int err)
  1054. {
  1055. complete((struct completion *)bio->bi_private);
  1056. }
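/*
 * overwrite the bad block with data from a good mirror, page by page; with
 * force_write set every page is rewritten, otherwise only pages of blocks
 * with header/checksum errors or pages that had an I/O error
 */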
  1057. static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
  1058. struct scrub_block *sblock_good,
  1059. int force_write)
  1060. {
  1061. int page_num;
  1062. int ret = 0;
  1063. for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
  1064. int ret_sub;
  1065. ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
  1066. sblock_good,
  1067. page_num,
  1068. force_write);
  1069. if (ret_sub)
  1070. ret = ret_sub;
  1071. }
  1072. return ret;
  1073. }
  1074. static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
  1075. struct scrub_block *sblock_good,
  1076. int page_num, int force_write)
  1077. {
  1078. struct scrub_page *page_bad = sblock_bad->pagev[page_num];
  1079. struct scrub_page *page_good = sblock_good->pagev[page_num];
  1080. BUG_ON(page_bad->page == NULL);
  1081. BUG_ON(page_good->page == NULL);
  1082. if (force_write || sblock_bad->header_error ||
  1083. sblock_bad->checksum_error || page_bad->io_error) {
  1084. struct bio *bio;
  1085. int ret;
  1086. DECLARE_COMPLETION_ONSTACK(complete);
  1087. bio = bio_alloc(GFP_NOFS, 1);
  1088. if (!bio)
  1089. return -EIO;
  1090. bio->bi_bdev = page_bad->dev->bdev;
  1091. bio->bi_sector = page_bad->physical >> 9;
  1092. bio->bi_end_io = scrub_complete_bio_end_io;
  1093. bio->bi_private = &complete;
  1094. ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
  1095. if (PAGE_SIZE != ret) {
  1096. bio_put(bio);
  1097. return -EIO;
  1098. }
  1099. btrfsic_submit_bio(WRITE, bio);
  1100. /* this will also unplug the queue */
  1101. wait_for_completion(&complete);
  1102. if (!bio_flagged(bio, BIO_UPTODATE)) {
  1103. btrfs_dev_stat_inc_and_print(page_bad->dev,
  1104. BTRFS_DEV_STAT_WRITE_ERRS);
  1105. bio_put(bio);
  1106. return -EIO;
  1107. }
  1108. bio_put(bio);
  1109. }
  1110. return 0;
  1111. }
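/*
 * dispatch checksum verification based on the extent flags and hand the
 * block to the error correction code if the check fails
 */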
  1112. static void scrub_checksum(struct scrub_block *sblock)
  1113. {
  1114. u64 flags;
  1115. int ret;
  1116. WARN_ON(sblock->page_count < 1);
  1117. flags = sblock->pagev[0]->flags;
  1118. ret = 0;
  1119. if (flags & BTRFS_EXTENT_FLAG_DATA)
  1120. ret = scrub_checksum_data(sblock);
  1121. else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
  1122. ret = scrub_checksum_tree_block(sblock);
  1123. else if (flags & BTRFS_EXTENT_FLAG_SUPER)
  1124. (void)scrub_checksum_super(sblock);
  1125. else
  1126. WARN_ON(1);
  1127. if (ret)
  1128. scrub_handle_errored_block(sblock);
  1129. }
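/* verify a data block against the checksum stored for it, if there is one */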
  1130. static int scrub_checksum_data(struct scrub_block *sblock)
  1131. {
  1132. struct scrub_ctx *sctx = sblock->sctx;
  1133. u8 csum[BTRFS_CSUM_SIZE];
  1134. u8 *on_disk_csum;
  1135. struct page *page;
  1136. void *buffer;
  1137. u32 crc = ~(u32)0;
  1138. int fail = 0;
  1139. struct btrfs_root *root = sctx->dev_root;
  1140. u64 len;
  1141. int index;
  1142. BUG_ON(sblock->page_count < 1);
  1143. if (!sblock->pagev[0]->have_csum)
  1144. return 0;
  1145. on_disk_csum = sblock->pagev[0]->csum;
  1146. page = sblock->pagev[0]->page;
  1147. buffer = kmap_atomic(page);
  1148. len = sctx->sectorsize;
  1149. index = 0;
  1150. for (;;) {
  1151. u64 l = min_t(u64, len, PAGE_SIZE);
  1152. crc = btrfs_csum_data(root, buffer, crc, l);
  1153. kunmap_atomic(buffer);
  1154. len -= l;
  1155. if (len == 0)
  1156. break;
  1157. index++;
  1158. BUG_ON(index >= sblock->page_count);
  1159. BUG_ON(!sblock->pagev[index]->page);
  1160. page = sblock->pagev[index]->page;
  1161. buffer = kmap_atomic(page);
  1162. }
  1163. btrfs_csum_final(crc, csum);
  1164. if (memcmp(csum, on_disk_csum, sctx->csum_size))
  1165. fail = 1;
  1166. return fail;
  1167. }
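/*
 * verify a tree block: compare bytenr, generation, fsid and chunk tree uuid
 * from the header and recompute the checksum over the whole node
 */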
  1168. static int scrub_checksum_tree_block(struct scrub_block *sblock)
  1169. {
  1170. struct scrub_ctx *sctx = sblock->sctx;
  1171. struct btrfs_header *h;
  1172. struct btrfs_root *root = sctx->dev_root;
  1173. struct btrfs_fs_info *fs_info = root->fs_info;
  1174. u8 calculated_csum[BTRFS_CSUM_SIZE];
  1175. u8 on_disk_csum[BTRFS_CSUM_SIZE];
  1176. struct page *page;
  1177. void *mapped_buffer;
  1178. u64 mapped_size;
  1179. void *p;
  1180. u32 crc = ~(u32)0;
  1181. int fail = 0;
  1182. int crc_fail = 0;
  1183. u64 len;
  1184. int index;
  1185. BUG_ON(sblock->page_count < 1);
  1186. page = sblock->pagev[0]->page;
  1187. mapped_buffer = kmap_atomic(page);
  1188. h = (struct btrfs_header *)mapped_buffer;
  1189. memcpy(on_disk_csum, h->csum, sctx->csum_size);
  1190. /*
  1191. * we don't use the getter functions here, as we
  1192. * a) don't have an extent buffer and
  1193. * b) the page is already kmapped
  1194. */
  1195. if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))
  1196. ++fail;
  1197. if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))
  1198. ++fail;
  1199. if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
  1200. ++fail;
  1201. if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
  1202. BTRFS_UUID_SIZE))
  1203. ++fail;
  1204. BUG_ON(sctx->nodesize != sctx->leafsize);
  1205. len = sctx->nodesize - BTRFS_CSUM_SIZE;
  1206. mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
  1207. p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
  1208. index = 0;
  1209. for (;;) {
  1210. u64 l = min_t(u64, len, mapped_size);
  1211. crc = btrfs_csum_data(root, p, crc, l);
  1212. kunmap_atomic(mapped_buffer);
  1213. len -= l;
  1214. if (len == 0)
  1215. break;
  1216. index++;
  1217. BUG_ON(index >= sblock->page_count);
  1218. BUG_ON(!sblock->pagev[index]->page);
  1219. page = sblock->pagev[index]->page;
  1220. mapped_buffer = kmap_atomic(page);
  1221. mapped_size = PAGE_SIZE;
  1222. p = mapped_buffer;
  1223. }
  1224. btrfs_csum_final(crc, calculated_csum);
  1225. if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
  1226. ++crc_fail;
  1227. return fail || crc_fail;
  1228. }
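/*
 * verify a super block copy; errors are only counted here because super
 * blocks are rewritten with the next transaction commit anyway
 */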
  1229. static int scrub_checksum_super(struct scrub_block *sblock)
  1230. {
  1231. struct btrfs_super_block *s;
  1232. struct scrub_ctx *sctx = sblock->sctx;
  1233. struct btrfs_root *root = sctx->dev_root;
  1234. struct btrfs_fs_info *fs_info = root->fs_info;
  1235. u8 calculated_csum[BTRFS_CSUM_SIZE];
  1236. u8 on_disk_csum[BTRFS_CSUM_SIZE];
  1237. struct page *page;
  1238. void *mapped_buffer;
  1239. u64 mapped_size;
  1240. void *p;
  1241. u32 crc = ~(u32)0;
  1242. int fail_gen = 0;
  1243. int fail_cor = 0;
  1244. u64 len;
  1245. int index;
  1246. BUG_ON(sblock->page_count < 1);
  1247. page = sblock->pagev[0]->page;
  1248. mapped_buffer = kmap_atomic(page);
  1249. s = (struct btrfs_super_block *)mapped_buffer;
  1250. memcpy(on_disk_csum, s->csum, sctx->csum_size);
  1251. if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))
  1252. ++fail_cor;
  1253. if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))
  1254. ++fail_gen;
  1255. if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
  1256. ++fail_cor;
  1257. len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
  1258. mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
  1259. p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
  1260. index = 0;
  1261. for (;;) {
  1262. u64 l = min_t(u64, len, mapped_size);
  1263. crc = btrfs_csum_data(root, p, crc, l);
  1264. kunmap_atomic(mapped_buffer);
  1265. len -= l;
  1266. if (len == 0)
  1267. break;
  1268. index++;
  1269. BUG_ON(index >= sblock->page_count);
  1270. BUG_ON(!sblock->pagev[index]->page);
  1271. page = sblock->pagev[index]->page;
  1272. mapped_buffer = kmap_atomic(page);
  1273. mapped_size = PAGE_SIZE;
  1274. p = mapped_buffer;
  1275. }
  1276. btrfs_csum_final(crc, calculated_csum);
  1277. if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
  1278. ++fail_cor;
  1279. if (fail_cor + fail_gen) {
  1280. /*
  1281. * if we find an error in a super block, we just report it.
  1282. * Super blocks get rewritten with the next transaction commit
  1283. * anyway.
  1284. */
  1285. spin_lock(&sctx->stat_lock);
  1286. ++sctx->stat.super_errors;
  1287. spin_unlock(&sctx->stat_lock);
  1288. if (fail_cor)
  1289. btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
  1290. BTRFS_DEV_STAT_CORRUPTION_ERRS);
  1291. else
  1292. btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
  1293. BTRFS_DEV_STAT_GENERATION_ERRS);
  1294. }
  1295. return fail_cor + fail_gen;
  1296. }
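
/*
 * Reference counting helpers for scrub_block and scrub_page. Dropping the
 * last reference on a block releases its pages; dropping the last reference
 * on a page frees the backing page and the scrub_page itself.
 */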
static void scrub_block_get(struct scrub_block *sblock)
{
	atomic_inc(&sblock->ref_count);
}

static void scrub_block_put(struct scrub_block *sblock)
{
	if (atomic_dec_and_test(&sblock->ref_count)) {
		int i;

		for (i = 0; i < sblock->page_count; i++)
			scrub_page_put(sblock->pagev[i]);
		kfree(sblock);
	}
}

static void scrub_page_get(struct scrub_page *spage)
{
	atomic_inc(&spage->ref_count);
}

static void scrub_page_put(struct scrub_page *spage)
{
	if (atomic_dec_and_test(&spage->ref_count)) {
		if (spage->page)
			__free_page(spage->page);
		kfree(spage);
	}
}
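
/*
 * scrub_submit - send the bio that is currently being assembled to the block
 * layer and mark it in flight. A no-op if no bio is currently in progress.
 */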
static void scrub_submit(struct scrub_ctx *sctx)
{
	struct scrub_bio *sbio;

	if (sctx->curr == -1)
		return;

	sbio = sctx->bios[sctx->curr];
	sctx->curr = -1;
	atomic_inc(&sctx->in_flight);

	btrfsic_submit_bio(READ, sbio->bio);
}
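
/*
 * scrub_add_page_to_bio - append one scrub_page to the bio that is currently
 * being assembled. A new bio is started when none is in progress; the current
 * one is submitted first whenever the page is not physically or logically
 * contiguous, belongs to another device, or the bio is full.
 */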
static int scrub_add_page_to_bio(struct scrub_ctx *sctx,
				 struct scrub_page *spage)
{
	struct scrub_block *sblock = spage->sblock;
	struct scrub_bio *sbio;
	int ret;

again:
	/*
	 * grab a fresh bio or wait for one to become available
	 */
	while (sctx->curr == -1) {
		spin_lock(&sctx->list_lock);
		sctx->curr = sctx->first_free;
		if (sctx->curr != -1) {
			sctx->first_free = sctx->bios[sctx->curr]->next_free;
			sctx->bios[sctx->curr]->next_free = -1;
			sctx->bios[sctx->curr]->page_count = 0;
			spin_unlock(&sctx->list_lock);
		} else {
			spin_unlock(&sctx->list_lock);
			wait_event(sctx->list_wait, sctx->first_free != -1);
		}
	}
	sbio = sctx->bios[sctx->curr];
	if (sbio->page_count == 0) {
		struct bio *bio;

		sbio->physical = spage->physical;
		sbio->logical = spage->logical;
		sbio->dev = spage->dev;
		bio = sbio->bio;
		if (!bio) {
			bio = bio_alloc(GFP_NOFS, sctx->pages_per_bio);
			if (!bio)
				return -ENOMEM;
			sbio->bio = bio;
		}

		bio->bi_private = sbio;
		bio->bi_end_io = scrub_bio_end_io;
		bio->bi_bdev = sbio->dev->bdev;
		bio->bi_sector = sbio->physical >> 9;
		sbio->err = 0;
	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
		   spage->physical ||
		   sbio->logical + sbio->page_count * PAGE_SIZE !=
		   spage->logical ||
		   sbio->dev != spage->dev) {
		scrub_submit(sctx);
		goto again;
	}

	sbio->pagev[sbio->page_count] = spage;
	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
	if (ret != PAGE_SIZE) {
		if (sbio->page_count < 1) {
			bio_put(sbio->bio);
			sbio->bio = NULL;
			return -EIO;
		}
		scrub_submit(sctx);
		goto again;
	}

	scrub_block_get(sblock); /* one for the added page */
	atomic_inc(&sblock->outstanding_pages);
	sbio->page_count++;
	if (sbio->page_count == sctx->pages_per_bio)
		scrub_submit(sctx);

	return 0;
}
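
/*
 * scrub_pages - split a range into page sized scrub_pages, group them in a
 * scrub_block and queue them for reading. The block starts with one local
 * reference and gains one per page added to a bio; the local reference is
 * dropped at the end, the others when the corresponding bio completes.
 */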
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
		       u64 physical, struct btrfs_device *dev, u64 flags,
		       u64 gen, int mirror_num, u8 *csum, int force)
{
	struct scrub_block *sblock;
	int index;

	sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
	if (!sblock) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		spin_unlock(&sctx->stat_lock);
		return -ENOMEM;
	}

	/* one ref inside this function, plus one for each page added to
	 * a bio later on */
	atomic_set(&sblock->ref_count, 1);
	sblock->sctx = sctx;
	sblock->no_io_error_seen = 1;

	for (index = 0; len > 0; index++) {
		struct scrub_page *spage;
		u64 l = min_t(u64, len, PAGE_SIZE);

		spage = kzalloc(sizeof(*spage), GFP_NOFS);
		if (!spage) {
leave_nomem:
			spin_lock(&sctx->stat_lock);
			sctx->stat.malloc_errors++;
			spin_unlock(&sctx->stat_lock);
			scrub_block_put(sblock);
			return -ENOMEM;
		}
		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
		scrub_page_get(spage);
		sblock->pagev[index] = spage;
		spage->sblock = sblock;
		spage->dev = dev;
		spage->flags = flags;
		spage->generation = gen;
		spage->logical = logical;
		spage->physical = physical;
		spage->mirror_num = mirror_num;
		if (csum) {
			spage->have_csum = 1;
			memcpy(spage->csum, csum, sctx->csum_size);
		} else {
			spage->have_csum = 0;
		}
		sblock->page_count++;
		spage->page = alloc_page(GFP_NOFS);
		if (!spage->page)
			goto leave_nomem;
		len -= l;
		logical += l;
		physical += l;
	}

	WARN_ON(sblock->page_count == 0);
	for (index = 0; index < sblock->page_count; index++) {
		struct scrub_page *spage = sblock->pagev[index];
		int ret;

		ret = scrub_add_page_to_bio(sctx, spage);
		if (ret) {
			scrub_block_put(sblock);
			return ret;
		}
	}

	if (force)
		scrub_submit(sctx);

	/* last one frees, either here or in bio completion for last page */
	scrub_block_put(sblock);
	return 0;
}
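
/*
 * Bio completion: scrub_bio_end_io only hands the finished bio over to a
 * worker thread. scrub_bio_end_io_worker records I/O errors on the pages,
 * completes the scrub_blocks whose last page just finished, recycles the
 * scrub_bio and wakes up waiters.
 */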
static void scrub_bio_end_io(struct bio *bio, int err)
{
	struct scrub_bio *sbio = bio->bi_private;
	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;

	sbio->err = err;
	sbio->bio = bio;

	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
}

static void scrub_bio_end_io_worker(struct btrfs_work *work)
{
	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
	struct scrub_ctx *sctx = sbio->sctx;
	int i;

	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO);
	if (sbio->err) {
		for (i = 0; i < sbio->page_count; i++) {
			struct scrub_page *spage = sbio->pagev[i];

			spage->io_error = 1;
			spage->sblock->no_io_error_seen = 0;
		}
	}

	/* now complete the scrub_block items that have all pages completed */
	for (i = 0; i < sbio->page_count; i++) {
		struct scrub_page *spage = sbio->pagev[i];
		struct scrub_block *sblock = spage->sblock;

		if (atomic_dec_and_test(&sblock->outstanding_pages))
			scrub_block_complete(sblock);
		scrub_block_put(sblock);
	}

	bio_put(sbio->bio);
	sbio->bio = NULL;
	spin_lock(&sctx->list_lock);
	sbio->next_free = sctx->first_free;
	sctx->first_free = sbio->index;
	spin_unlock(&sctx->list_lock);
	atomic_dec(&sctx->in_flight);
	wake_up(&sctx->list_wait);
}
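
/*
 * scrub_block_complete - dispatch a fully read block either to error
 * handling (if any page saw an I/O error) or to checksum verification.
 */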
static void scrub_block_complete(struct scrub_block *sblock)
{
	if (!sblock->no_io_error_seen)
		scrub_handle_errored_block(sblock);
	else
		scrub_checksum(sblock);
}
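
/*
 * scrub_find_csum - look up the data checksum for @logical in the list that
 * was prefetched into sctx->csum_list. Sums that lie entirely before the
 * requested logical address are discarded on the way. Returns 1 and copies
 * the checksum into @csum if one was found, 0 otherwise.
 */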
static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
			   u8 *csum)
{
	struct btrfs_ordered_sum *sum = NULL;
	int ret = 0;
	unsigned long i;
	unsigned long num_sectors;

	while (!list_empty(&sctx->csum_list)) {
		sum = list_first_entry(&sctx->csum_list,
				       struct btrfs_ordered_sum, list);
		if (sum->bytenr > logical)
			return 0;
		if (sum->bytenr + sum->len > logical)
			break;

		++sctx->stat.csum_discards;
		list_del(&sum->list);
		kfree(sum);
		sum = NULL;
	}
	if (!sum)
		return 0;

	num_sectors = sum->len / sctx->sectorsize;
	for (i = 0; i < num_sectors; ++i) {
		if (sum->sums[i].bytenr == logical) {
			memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
			ret = 1;
			break;
		}
	}
	if (ret && i == num_sectors - 1) {
		list_del(&sum->list);
		kfree(sum);
	}
	return ret;
}

/* scrub extent tries to collect up to 64 kB for each bio */
static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
			u64 physical, struct btrfs_device *dev, u64 flags,
			u64 gen, int mirror_num)
{
	int ret;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 blocksize;

	if (flags & BTRFS_EXTENT_FLAG_DATA) {
		blocksize = sctx->sectorsize;
		spin_lock(&sctx->stat_lock);
		sctx->stat.data_extents_scrubbed++;
		sctx->stat.data_bytes_scrubbed += len;
		spin_unlock(&sctx->stat_lock);
	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		BUG_ON(sctx->nodesize != sctx->leafsize);
		blocksize = sctx->nodesize;
		spin_lock(&sctx->stat_lock);
		sctx->stat.tree_extents_scrubbed++;
		sctx->stat.tree_bytes_scrubbed += len;
		spin_unlock(&sctx->stat_lock);
	} else {
		blocksize = sctx->sectorsize;
		BUG_ON(1);
	}

	while (len) {
		u64 l = min_t(u64, len, blocksize);
		int have_csum = 0;

		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			/* push csums to sbio */
			have_csum = scrub_find_csum(sctx, logical, l, csum);
			if (have_csum == 0)
				++sctx->stat.no_csum;
		}
		ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
				  mirror_num, have_csum ? csum : NULL, 0);
		if (ret)
			return ret;
		len -= l;
		logical += l;
		physical += l;
	}
	return 0;
}
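
/*
 * scrub_stripe - scrub the part of one chunk that is stored as stripe @num
 * on @scrub_dev. Works on the commit root, prefetches the relevant extent
 * and csum tree ranges via readahead, then walks the extent items one
 * stripe_len at a time, honoring pause and cancel requests between stripes.
 */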
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
					   struct map_lookup *map,
					   struct btrfs_device *scrub_dev,
					   int num, u64 base, u64 length)
{
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_root *csum_root = fs_info->csum_root;
	struct btrfs_extent_item *extent;
	struct blk_plug plug;
	u64 flags;
	int ret;
	int slot;
	int i;
	u64 nstripes;
	struct extent_buffer *l;
	struct btrfs_key key;
	u64 physical;
	u64 logical;
	u64 generation;
	int mirror_num;
	struct reada_control *reada1;
	struct reada_control *reada2;
	struct btrfs_key key_start;
	struct btrfs_key key_end;
	u64 increment = map->stripe_len;
	u64 offset;

	nstripes = length;
	offset = 0;
	do_div(nstripes, map->stripe_len);
	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
		offset = map->stripe_len * num;
		increment = map->stripe_len * map->num_stripes;
		mirror_num = 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		int factor = map->num_stripes / map->sub_stripes;
		offset = map->stripe_len * (num / map->sub_stripes);
		increment = map->stripe_len * factor;
		mirror_num = num % map->sub_stripes + 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
		increment = map->stripe_len;
		mirror_num = num % map->num_stripes + 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		increment = map->stripe_len;
		mirror_num = num % map->num_stripes + 1;
	} else {
		increment = map->stripe_len;
		mirror_num = 1;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * work on commit root. The related disk blocks are static as
	 * long as COW is applied. This means it is safe to rewrite
	 * them to repair disk errors without any race conditions
	 */
	path->search_commit_root = 1;
	path->skip_locking = 1;

	/*
	 * trigger the readahead for the extent tree and csum tree and wait
	 * for completion. During readahead, the scrub is officially paused
	 * to not hold off transaction commits
	 */
	logical = base + offset;

	wait_event(sctx->list_wait,
		   atomic_read(&sctx->in_flight) == 0);
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);

	/* FIXME it might be better to start readahead at commit root */
	key_start.objectid = logical;
	key_start.type = BTRFS_EXTENT_ITEM_KEY;
	key_start.offset = (u64)0;
	key_end.objectid = base + offset + nstripes * increment;
	key_end.type = BTRFS_EXTENT_ITEM_KEY;
	key_end.offset = (u64)0;
	reada1 = btrfs_reada_add(root, &key_start, &key_end);

	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
	key_start.type = BTRFS_EXTENT_CSUM_KEY;
	key_start.offset = logical;
	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
	key_end.type = BTRFS_EXTENT_CSUM_KEY;
	key_end.offset = base + offset + nstripes * increment;
	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);

	if (!IS_ERR(reada1))
		btrfs_reada_wait(reada1);
	if (!IS_ERR(reada2))
		btrfs_reada_wait(reada2);

	mutex_lock(&fs_info->scrub_lock);
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);
	wake_up(&fs_info->scrub_pause_wait);

	/*
	 * collect all data csums for the stripe to avoid seeking during
	 * the scrub. This might currently (crc32) end up being about 1MB
	 */
	blk_start_plug(&plug);

	/*
	 * now find all extents for each stripe and scrub them
	 */
	logical = base + offset;
	physical = map->stripes[num].physical;
	ret = 0;
	for (i = 0; i < nstripes; ++i) {
		/*
		 * canceled?
		 */
		if (atomic_read(&fs_info->scrub_cancel_req) ||
		    atomic_read(&sctx->cancel_req)) {
			ret = -ECANCELED;
			goto out;
		}
		/*
		 * check to see if we have to pause
		 */
		if (atomic_read(&fs_info->scrub_pause_req)) {
			/* push queued extents */
			scrub_submit(sctx);
			wait_event(sctx->list_wait,
				   atomic_read(&sctx->in_flight) == 0);
			atomic_inc(&fs_info->scrubs_paused);
			wake_up(&fs_info->scrub_pause_wait);
			mutex_lock(&fs_info->scrub_lock);
			while (atomic_read(&fs_info->scrub_pause_req)) {
				mutex_unlock(&fs_info->scrub_lock);
				wait_event(fs_info->scrub_pause_wait,
				   atomic_read(&fs_info->scrub_pause_req) == 0);
				mutex_lock(&fs_info->scrub_lock);
			}
			atomic_dec(&fs_info->scrubs_paused);
			mutex_unlock(&fs_info->scrub_lock);
			wake_up(&fs_info->scrub_pause_wait);
		}

		ret = btrfs_lookup_csums_range(csum_root, logical,
					       logical + map->stripe_len - 1,
					       &sctx->csum_list, 1);
		if (ret)
			goto out;

		key.objectid = logical;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = (u64)0;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = btrfs_previous_item(root, path, 0,
						  BTRFS_EXTENT_ITEM_KEY);
			if (ret < 0)
				goto out;
			if (ret > 0) {
				/* there's no smaller item, so stick with the
				 * larger one */
				btrfs_release_path(path);
				ret = btrfs_search_slot(NULL, root, &key,
							path, 0, 0);
				if (ret < 0)
					goto out;
			}
		}

		while (1) {
			l = path->nodes[0];
			slot = path->slots[0];
			if (slot >= btrfs_header_nritems(l)) {
				ret = btrfs_next_leaf(root, path);
				if (ret == 0)
					continue;
				if (ret < 0)
					goto out;

				break;
			}
			btrfs_item_key_to_cpu(l, &key, slot);

			if (key.objectid + key.offset <= logical)
				goto next;

			if (key.objectid >= logical + map->stripe_len)
				break;

			if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
				goto next;

			extent = btrfs_item_ptr(l, slot,
						struct btrfs_extent_item);
			flags = btrfs_extent_flags(l, extent);
			generation = btrfs_extent_generation(l, extent);

			if (key.objectid < logical &&
			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
				printk(KERN_ERR
				       "btrfs scrub: tree block %llu spanning "
				       "stripes, ignored. logical=%llu\n",
				       (unsigned long long)key.objectid,
				       (unsigned long long)logical);
				goto next;
			}

			/*
			 * trim extent to this stripe
			 */
			if (key.objectid < logical) {
				key.offset -= logical - key.objectid;
				key.objectid = logical;
			}
			if (key.objectid + key.offset >
			    logical + map->stripe_len) {
				key.offset = logical + map->stripe_len -
					     key.objectid;
			}

			ret = scrub_extent(sctx, key.objectid, key.offset,
					   key.objectid - logical + physical,
					   scrub_dev, flags, generation,
					   mirror_num);
			if (ret)
				goto out;

next:
			path->slots[0]++;
		}
		btrfs_release_path(path);
		logical += increment;
		physical += map->stripe_len;
		spin_lock(&sctx->stat_lock);
		sctx->stat.last_physical = physical;
		spin_unlock(&sctx->stat_lock);
	}

	/* push queued extents */
	scrub_submit(sctx);

out:
	blk_finish_plug(&plug);
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}
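
/*
 * scrub_chunk - map a chunk back to the stripe(s) it has on @scrub_dev at
 * @dev_offset and scrub each matching stripe.
 */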
static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
					  struct btrfs_device *scrub_dev,
					  u64 chunk_tree, u64 chunk_objectid,
					  u64 chunk_offset, u64 length,
					  u64 dev_offset)
{
	struct btrfs_mapping_tree *map_tree =
		&sctx->dev_root->fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	int i;
	int ret = -EINVAL;

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
	read_unlock(&map_tree->map_tree.lock);

	if (!em)
		return -EINVAL;

	map = (struct map_lookup *)em->bdev;
	if (em->start != chunk_offset)
		goto out;

	if (em->len < length)
		goto out;

	for (i = 0; i < map->num_stripes; ++i) {
		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
		    map->stripes[i].physical == dev_offset) {
			ret = scrub_stripe(sctx, map, scrub_dev, i,
					   chunk_offset, length);
			if (ret)
				goto out;
		}
	}
out:
	free_extent_map(em);

	return ret;
}
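
/*
 * scrub_enumerate_chunks - walk the dev extents of @scrub_dev between @start
 * and @end and scrub the chunk behind each of them. A reference on the
 * corresponding block group is held while a chunk is being scrubbed.
 */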
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_ctx *sctx,
			   struct btrfs_device *scrub_dev, u64 start, u64 end)
{
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	struct btrfs_root *root = sctx->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 length;
	u64 chunk_tree;
	u64 chunk_objectid;
	u64 chunk_offset;
	int ret;
	int slot;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_block_group_cache *cache;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 2;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = scrub_dev->devid;
	key.offset = 0ull;
	key.type = BTRFS_DEV_EXTENT_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] >=
			    btrfs_header_nritems(path->nodes[0])) {
				ret = btrfs_next_leaf(root, path);
				if (ret)
					break;
			}
		}

		l = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.objectid != scrub_dev->devid)
			break;

		if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
			break;

		if (found_key.offset >= end)
			break;

		if (found_key.offset < key.offset)
			break;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		if (found_key.offset + length <= start) {
			key.offset = found_key.offset + length;
			btrfs_release_path(path);
			continue;
		}

		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);

		/*
		 * get a reference on the corresponding block group to prevent
		 * the chunk from going away while we scrub it
		 */
		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
		if (!cache) {
			ret = -ENOENT;
			break;
		}
		ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
				  chunk_offset, length, found_key.offset);
		btrfs_put_block_group(cache);
		if (ret)
			break;

		key.offset = found_key.offset + length;
		btrfs_release_path(path);
	}

	btrfs_free_path(path);

	/*
	 * ret can still be 1 from search_slot or next_leaf,
	 * that's not an error
	 */
	return ret < 0 ? ret : 0;
}
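
/*
 * scrub_supers - read and verify all super block copies of @scrub_dev that
 * fit on the device, using the last committed transaction generation as the
 * expected one.
 */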
static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
					   struct btrfs_device *scrub_dev)
{
	int i;
	u64 bytenr;
	u64 gen;
	int ret;
	struct btrfs_root *root = sctx->dev_root;

	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
		return -EIO;

	gen = root->fs_info->last_trans_committed;

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
			break;

		ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
				  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
				  NULL, 1);
		if (ret)
			return ret;
	}
	wait_event(sctx->list_wait, atomic_read(&sctx->in_flight) == 0);

	return 0;
}

/*
 * get a reference count on fs_info->scrub_workers. start worker threads if
 * necessary
 */
static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	mutex_lock(&fs_info->scrub_lock);
	if (fs_info->scrub_workers_refcnt == 0) {
		btrfs_init_workers(&fs_info->scrub_workers, "scrub",
			   fs_info->thread_pool_size, &fs_info->generic_worker);
		fs_info->scrub_workers.idle_thresh = 4;
		ret = btrfs_start_workers(&fs_info->scrub_workers);
		if (ret)
			goto out;
	}
	++fs_info->scrub_workers_refcnt;
out:
	mutex_unlock(&fs_info->scrub_lock);

	return ret;
}

static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (--fs_info->scrub_workers_refcnt == 0)
		btrfs_stop_workers(&fs_info->scrub_workers);
	WARN_ON(fs_info->scrub_workers_refcnt < 0);
	mutex_unlock(&fs_info->scrub_lock);
}
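
/*
 * btrfs_scrub_dev - entry point for scrubbing one device. Checks the size
 * assumptions scrub relies on, sets up a scrub context, scrubs the super
 * blocks and all chunks on the device, then tears everything down again.
 * The accumulated statistics are copied into @progress if it is non-NULL.
 */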
int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
		    struct btrfs_scrub_progress *progress, int readonly)
{
	struct scrub_ctx *sctx;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	struct btrfs_device *dev;

	if (btrfs_fs_closing(root->fs_info))
		return -EINVAL;

	/*
	 * check some assumptions
	 */
	if (root->nodesize != root->leafsize) {
		printk(KERN_ERR
		       "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
		       root->nodesize, root->leafsize);
		return -EINVAL;
	}

	if (root->nodesize > BTRFS_STRIPE_LEN) {
		/*
		 * in this case scrub is unable to calculate the checksum
		 * the way scrub is implemented. Do not handle this
		 * situation at all because it won't ever happen.
		 */
		printk(KERN_ERR
		       "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
		       root->nodesize, BTRFS_STRIPE_LEN);
		return -EINVAL;
	}

	if (root->sectorsize != PAGE_SIZE) {
		/* not supported for data w/o checksums */
		printk(KERN_ERR
		       "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
		       root->sectorsize, (unsigned long long)PAGE_SIZE);
		return -EINVAL;
	}

	if (fs_info->chunk_root->nodesize >
	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
	    fs_info->chunk_root->sectorsize >
	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
		/*
		 * would exhaust the array bounds of pagev member in
		 * struct scrub_block
		 */
		pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
		       fs_info->chunk_root->nodesize,
		       SCRUB_MAX_PAGES_PER_BLOCK,
		       fs_info->chunk_root->sectorsize,
		       SCRUB_MAX_PAGES_PER_BLOCK);
		return -EINVAL;
	}

	ret = scrub_workers_get(root);
	if (ret)
		return ret;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (!dev || dev->missing) {
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -ENODEV;
	}
	mutex_lock(&fs_info->scrub_lock);

	if (!dev->in_fs_metadata) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -ENODEV;
	}

	if (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -EINPROGRESS;
	}
	sctx = scrub_setup_ctx(dev);
	if (IS_ERR(sctx)) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return PTR_ERR(sctx);
	}
	sctx->readonly = readonly;
	dev->scrub_device = sctx;

	atomic_inc(&fs_info->scrubs_running);
	mutex_unlock(&fs_info->scrub_lock);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	down_read(&fs_info->scrub_super_lock);
	ret = scrub_supers(sctx, dev);
	up_read(&fs_info->scrub_super_lock);

	if (!ret)
		ret = scrub_enumerate_chunks(sctx, dev, start, end);

	wait_event(sctx->list_wait, atomic_read(&sctx->in_flight) == 0);
	atomic_dec(&fs_info->scrubs_running);
	wake_up(&fs_info->scrub_pause_wait);

	wait_event(sctx->list_wait, atomic_read(&sctx->fixup_cnt) == 0);

	if (progress)
		memcpy(progress, &sctx->stat, sizeof(*progress));

	mutex_lock(&fs_info->scrub_lock);
	dev->scrub_device = NULL;
	mutex_unlock(&fs_info->scrub_lock);

	scrub_free_ctx(sctx);
	scrub_workers_put(root);

	return ret;
}
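
/*
 * btrfs_scrub_pause blocks until every running scrub has reached the paused
 * state; btrfs_scrub_continue lets them resume. The pause request is honored
 * by the scrub loops between stripes. The _super variants take the
 * scrub_super_lock exclusively, which scrub_supers() holds for reading.
 */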
void btrfs_scrub_pause(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrub_pause_req);
	while (atomic_read(&fs_info->scrubs_paused) !=
	       atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_paused) ==
			   atomic_read(&fs_info->scrubs_running));
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);
}

void btrfs_scrub_continue(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	atomic_dec(&fs_info->scrub_pause_req);
	wake_up(&fs_info->scrub_pause_wait);
}

void btrfs_scrub_pause_super(struct btrfs_root *root)
{
	down_write(&root->fs_info->scrub_super_lock);
}

void btrfs_scrub_continue_super(struct btrfs_root *root)
{
	up_write(&root->fs_info->scrub_super_lock);
}
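
/*
 * __btrfs_scrub_cancel - request cancellation of all scrubs running on this
 * filesystem and wait until they have finished. Returns -ENOTCONN if no
 * scrub is running.
 */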
int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	if (!atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->scrub_cancel_req);
	while (atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_running) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrub_cancel_req);
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

int btrfs_scrub_cancel(struct btrfs_root *root)
{
	return __btrfs_scrub_cancel(root->fs_info);
}
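
/*
 * btrfs_scrub_cancel_dev - cancel the scrub running on one device and wait
 * for it to finish. Returns -ENOTCONN if the device is not being scrubbed.
 */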
int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct scrub_ctx *sctx;

	mutex_lock(&fs_info->scrub_lock);
	sctx = dev->scrub_device;
	if (!sctx) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}
	atomic_inc(&sctx->cancel_req);
	while (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   dev->scrub_device == NULL);
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_device *dev;
	int ret;

	/*
	 * we have to hold the device_list_mutex here so the device
	 * does not go away in cancel_dev. FIXME: find a better solution
	 */
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (!dev) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		return -ENODEV;
	}
	ret = btrfs_scrub_cancel_dev(root, dev);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	return ret;
}
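
/*
 * btrfs_scrub_progress - copy the current scrub statistics for @devid into
 * @progress. Returns -ENODEV if the device does not exist and -ENOTCONN if
 * it is not being scrubbed.
 */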
int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
			 struct btrfs_scrub_progress *progress)
{
	struct btrfs_device *dev;
	struct scrub_ctx *sctx = NULL;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (dev)
		sctx = dev->scrub_device;
	if (sctx)
		memcpy(progress, &sctx->stat, sizeof(*progress));
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
}