scrub.c

  1. /*
  2. * Copyright (C) 2011, 2012 STRATO. All rights reserved.
  3. *
  4. * This program is free software; you can redistribute it and/or
  5. * modify it under the terms of the GNU General Public
  6. * License v2 as published by the Free Software Foundation.
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. * General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU General Public
  14. * License along with this program; if not, write to the
  15. * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16. * Boston, MA 02111-1307, USA.
  17. */
  18. #include <linux/blkdev.h>
  19. #include <linux/ratelimit.h>
  20. #include "ctree.h"
  21. #include "volumes.h"
  22. #include "disk-io.h"
  23. #include "ordered-data.h"
  24. #include "transaction.h"
  25. #include "backref.h"
  26. #include "extent_io.h"
  27. #include "dev-replace.h"
  28. #include "check-integrity.h"
  29. #include "rcu-string.h"
  30. #include "raid56.h"
  31. /*
  32. * This is only the first step towards a full-featured scrub. It reads all
  33. * extents and super blocks and verifies the checksums. In case a bad checksum
  34. * is found or the extent cannot be read, good data will be written back if
  35. * any can be found.
  36. *
  37. * Future enhancements:
  38. * - In case an unrepairable extent is encountered, track which files are
  39. * affected and report them
  40. * - track and record media errors, throw out bad devices
  41. * - add a mode to also read unallocated space
  42. */
  43. struct scrub_block;
  44. struct scrub_ctx;
  45. /*
  46. * the following three values only influence the performance.
  47. * The last one configures the number of parallel and outstanding I/O
  48. * operations. The first two values configure an upper limit for the number
  49. * of (dynamically allocated) pages that are added to a bio.
  50. */
  51. #define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
  52. #define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
  53. #define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
  54. /*
  55. * the following value times PAGE_SIZE needs to be large enough to match the
  56. * largest node/leaf/sector size that shall be supported.
  57. * Values larger than BTRFS_STRIPE_LEN are not supported.
  58. */
  59. #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
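/*
 * Sizing sketch (illustrative only, assuming the common 4K PAGE_SIZE; the
 * helper name below is made up for the illustration). This is the arithmetic
 * behind the "128k per bio", "8MB per device in flight" and "64k per
 * node/leaf/sector" comments on the constants above.
 */
#if 0
static inline void scrub_limits_sizing_sketch(void)
{
	BUILD_BUG_ON(SCRUB_PAGES_PER_RD_BIO * 4096 != 128 * 1024);	/* 32 * 4K */
	BUILD_BUG_ON(SCRUB_BIOS_PER_SCTX * SCRUB_PAGES_PER_RD_BIO * 4096 !=
		     8 * 1024 * 1024);					/* 64 * 128K */
	BUILD_BUG_ON(SCRUB_MAX_PAGES_PER_BLOCK * 4096 != 64 * 1024);	/* 16 * 4K */
}
#endif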
  60. struct scrub_page {
  61. struct scrub_block *sblock;
  62. struct page *page;
  63. struct btrfs_device *dev;
  64. u64 flags; /* extent flags */
  65. u64 generation;
  66. u64 logical;
  67. u64 physical;
  68. u64 physical_for_dev_replace;
  69. atomic_t ref_count;
  70. struct {
  71. unsigned int mirror_num:8;
  72. unsigned int have_csum:1;
  73. unsigned int io_error:1;
  74. };
  75. u8 csum[BTRFS_CSUM_SIZE];
  76. };
  77. struct scrub_bio {
  78. int index;
  79. struct scrub_ctx *sctx;
  80. struct btrfs_device *dev;
  81. struct bio *bio;
  82. int err;
  83. u64 logical;
  84. u64 physical;
  85. #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
  86. struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
  87. #else
  88. struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
  89. #endif
  90. int page_count;
  91. int next_free;
  92. struct btrfs_work work;
  93. };
  94. struct scrub_block {
  95. struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
  96. int page_count;
  97. atomic_t outstanding_pages;
  98. atomic_t ref_count; /* free mem on transition to zero */
  99. struct scrub_ctx *sctx;
  100. struct {
  101. unsigned int header_error:1;
  102. unsigned int checksum_error:1;
  103. unsigned int no_io_error_seen:1;
  104. unsigned int generation_error:1; /* also sets header_error */
  105. };
  106. };
  107. struct scrub_wr_ctx {
  108. struct scrub_bio *wr_curr_bio;
  109. struct btrfs_device *tgtdev;
  110. int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
  111. atomic_t flush_all_writes;
  112. struct mutex wr_lock;
  113. };
  114. struct scrub_ctx {
  115. struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
  116. struct btrfs_root *dev_root;
  117. int first_free;
  118. int curr;
  119. atomic_t bios_in_flight;
  120. atomic_t workers_pending;
  121. spinlock_t list_lock;
  122. wait_queue_head_t list_wait;
  123. u16 csum_size;
  124. struct list_head csum_list;
  125. atomic_t cancel_req;
  126. int readonly;
  127. int pages_per_rd_bio;
  128. u32 sectorsize;
  129. u32 nodesize;
  130. u32 leafsize;
  131. int is_dev_replace;
  132. struct scrub_wr_ctx wr_ctx;
  133. /*
  134. * statistics
  135. */
  136. struct btrfs_scrub_progress stat;
  137. spinlock_t stat_lock;
  138. };
  139. struct scrub_fixup_nodatasum {
  140. struct scrub_ctx *sctx;
  141. struct btrfs_device *dev;
  142. u64 logical;
  143. struct btrfs_root *root;
  144. struct btrfs_work work;
  145. int mirror_num;
  146. };
  147. struct scrub_nocow_inode {
  148. u64 inum;
  149. u64 offset;
  150. u64 root;
  151. struct list_head list;
  152. };
  153. struct scrub_copy_nocow_ctx {
  154. struct scrub_ctx *sctx;
  155. u64 logical;
  156. u64 len;
  157. int mirror_num;
  158. u64 physical_for_dev_replace;
  159. struct list_head inodes;
  160. struct btrfs_work work;
  161. };
  162. struct scrub_warning {
  163. struct btrfs_path *path;
  164. u64 extent_item_size;
  165. char *scratch_buf;
  166. char *msg_buf;
  167. const char *errstr;
  168. sector_t sector;
  169. u64 logical;
  170. struct btrfs_device *dev;
  171. int msg_bufsize;
  172. int scratch_bufsize;
  173. };
  174. static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
  175. static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
  176. static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
  177. static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
  178. static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
  179. static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
  180. struct btrfs_fs_info *fs_info,
  181. struct scrub_block *original_sblock,
  182. u64 length, u64 logical,
  183. struct scrub_block *sblocks_for_recheck);
  184. static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
  185. struct scrub_block *sblock, int is_metadata,
  186. int have_csum, u8 *csum, u64 generation,
  187. u16 csum_size);
  188. static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
  189. struct scrub_block *sblock,
  190. int is_metadata, int have_csum,
  191. const u8 *csum, u64 generation,
  192. u16 csum_size);
  193. static void scrub_complete_bio_end_io(struct bio *bio, int err);
  194. static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
  195. struct scrub_block *sblock_good,
  196. int force_write);
  197. static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
  198. struct scrub_block *sblock_good,
  199. int page_num, int force_write);
  200. static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
  201. static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
  202. int page_num);
  203. static int scrub_checksum_data(struct scrub_block *sblock);
  204. static int scrub_checksum_tree_block(struct scrub_block *sblock);
  205. static int scrub_checksum_super(struct scrub_block *sblock);
  206. static void scrub_block_get(struct scrub_block *sblock);
  207. static void scrub_block_put(struct scrub_block *sblock);
  208. static void scrub_page_get(struct scrub_page *spage);
  209. static void scrub_page_put(struct scrub_page *spage);
  210. static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
  211. struct scrub_page *spage);
  212. static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
  213. u64 physical, struct btrfs_device *dev, u64 flags,
  214. u64 gen, int mirror_num, u8 *csum, int force,
  215. u64 physical_for_dev_replace);
  216. static void scrub_bio_end_io(struct bio *bio, int err);
  217. static void scrub_bio_end_io_worker(struct btrfs_work *work);
  218. static void scrub_block_complete(struct scrub_block *sblock);
  219. static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
  220. u64 extent_logical, u64 extent_len,
  221. u64 *extent_physical,
  222. struct btrfs_device **extent_dev,
  223. int *extent_mirror_num);
  224. static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
  225. struct scrub_wr_ctx *wr_ctx,
  226. struct btrfs_fs_info *fs_info,
  227. struct btrfs_device *dev,
  228. int is_dev_replace);
  229. static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
  230. static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
  231. struct scrub_page *spage);
  232. static void scrub_wr_submit(struct scrub_ctx *sctx);
  233. static void scrub_wr_bio_end_io(struct bio *bio, int err);
  234. static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
  235. static int write_page_nocow(struct scrub_ctx *sctx,
  236. u64 physical_for_dev_replace, struct page *page);
  237. static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
  238. struct scrub_copy_nocow_ctx *ctx);
  239. static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
  240. int mirror_num, u64 physical_for_dev_replace);
  241. static void copy_nocow_pages_worker(struct btrfs_work *work);
  242. static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
  243. {
  244. atomic_inc(&sctx->bios_in_flight);
  245. }
  246. static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
  247. {
  248. atomic_dec(&sctx->bios_in_flight);
  249. wake_up(&sctx->list_wait);
  250. }
  251. /*
  252. * used for workers that require transaction commits (i.e., for the
  253. * NOCOW case)
  254. */
  255. static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
  256. {
  257. struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
  258. /*
  259. * increment scrubs_running to prevent cancel requests from
  260. * completing as long as a worker is running. we must also
  261. * increment scrubs_paused to prevent deadlocking on pause
  262. * requests used for transaction commits (as the worker uses a
  263. * transaction context). it is safe to regard the worker
  264. * as paused for all practical matters. effectively, we only
  265. * prevent cancellation requests from completing.
  266. */
  267. mutex_lock(&fs_info->scrub_lock);
  268. atomic_inc(&fs_info->scrubs_running);
  269. atomic_inc(&fs_info->scrubs_paused);
  270. mutex_unlock(&fs_info->scrub_lock);
  271. atomic_inc(&sctx->workers_pending);
  272. }
  273. /* used for workers that require transaction commits */
  274. static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
  275. {
  276. struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
  277. /*
  278. * see scrub_pending_trans_workers_inc() for why we're pretending
  279. * to be paused in the scrub counters
  280. */
  281. mutex_lock(&fs_info->scrub_lock);
  282. atomic_dec(&fs_info->scrubs_running);
  283. atomic_dec(&fs_info->scrubs_paused);
  284. mutex_unlock(&fs_info->scrub_lock);
  285. atomic_dec(&sctx->workers_pending);
  286. wake_up(&fs_info->scrub_pause_wait);
  287. wake_up(&sctx->list_wait);
  288. }
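/*
 * Usage sketch (illustrative): a worker that needs a transaction context is
 * bracketed by the two helpers above, mirroring how the nodatasum fixup
 * worker is queued further below in this file.
 */
#if 0
	scrub_pending_trans_workers_inc(sctx);
	fixup->work.func = scrub_fixup_nodatasum;
	btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
	/* ... the worker itself finishes with scrub_pending_trans_workers_dec(sctx) */
#endif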
  289. static void scrub_free_csums(struct scrub_ctx *sctx)
  290. {
  291. while (!list_empty(&sctx->csum_list)) {
  292. struct btrfs_ordered_sum *sum;
  293. sum = list_first_entry(&sctx->csum_list,
  294. struct btrfs_ordered_sum, list);
  295. list_del(&sum->list);
  296. kfree(sum);
  297. }
  298. }
  299. static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
  300. {
  301. int i;
  302. if (!sctx)
  303. return;
  304. scrub_free_wr_ctx(&sctx->wr_ctx);
  305. /* this can happen when scrub is cancelled */
  306. if (sctx->curr != -1) {
  307. struct scrub_bio *sbio = sctx->bios[sctx->curr];
  308. for (i = 0; i < sbio->page_count; i++) {
  309. WARN_ON(!sbio->pagev[i]->page);
  310. scrub_block_put(sbio->pagev[i]->sblock);
  311. }
  312. bio_put(sbio->bio);
  313. }
  314. for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
  315. struct scrub_bio *sbio = sctx->bios[i];
  316. if (!sbio)
  317. break;
  318. kfree(sbio);
  319. }
  320. scrub_free_csums(sctx);
  321. kfree(sctx);
  322. }
  323. static noinline_for_stack
  324. struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
  325. {
  326. struct scrub_ctx *sctx;
  327. int i;
  328. struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
  329. int pages_per_rd_bio;
  330. int ret;
  331. /*
  332. * the setting of pages_per_rd_bio is correct for scrub but might
  333. * be wrong for the dev_replace code where we might read from
  334. * different devices in the initial huge bios. However, that
  335. * code is able to correctly handle the case when adding a page
  336. * to a bio fails.
  337. */
  338. if (dev->bdev)
  339. pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
  340. bio_get_nr_vecs(dev->bdev));
  341. else
  342. pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
  343. sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
  344. if (!sctx)
  345. goto nomem;
  346. sctx->is_dev_replace = is_dev_replace;
  347. sctx->pages_per_rd_bio = pages_per_rd_bio;
  348. sctx->curr = -1;
  349. sctx->dev_root = dev->dev_root;
  350. for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
  351. struct scrub_bio *sbio;
  352. sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
  353. if (!sbio)
  354. goto nomem;
  355. sctx->bios[i] = sbio;
  356. sbio->index = i;
  357. sbio->sctx = sctx;
  358. sbio->page_count = 0;
  359. sbio->work.func = scrub_bio_end_io_worker;
  360. if (i != SCRUB_BIOS_PER_SCTX - 1)
  361. sctx->bios[i]->next_free = i + 1;
  362. else
  363. sctx->bios[i]->next_free = -1;
  364. }
  365. sctx->first_free = 0;
  366. sctx->nodesize = dev->dev_root->nodesize;
  367. sctx->leafsize = dev->dev_root->leafsize;
  368. sctx->sectorsize = dev->dev_root->sectorsize;
  369. atomic_set(&sctx->bios_in_flight, 0);
  370. atomic_set(&sctx->workers_pending, 0);
  371. atomic_set(&sctx->cancel_req, 0);
  372. sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
  373. INIT_LIST_HEAD(&sctx->csum_list);
  374. spin_lock_init(&sctx->list_lock);
  375. spin_lock_init(&sctx->stat_lock);
  376. init_waitqueue_head(&sctx->list_wait);
  377. ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
  378. fs_info->dev_replace.tgtdev, is_dev_replace);
  379. if (ret) {
  380. scrub_free_ctx(sctx);
  381. return ERR_PTR(ret);
  382. }
  383. return sctx;
  384. nomem:
  385. scrub_free_ctx(sctx);
  386. return ERR_PTR(-ENOMEM);
  387. }
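/*
 * Note on the setup above (sketch): the loop threads sctx->bios[] into a
 * simple singly linked free list, first_free -> 0 -> 1 -> ... ->
 * SCRUB_BIOS_PER_SCTX - 1 -> -1, while curr == -1 means that no read bio is
 * currently being filled.
 */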
  388. static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
  389. void *warn_ctx)
  390. {
  391. u64 isize;
  392. u32 nlink;
  393. int ret;
  394. int i;
  395. struct extent_buffer *eb;
  396. struct btrfs_inode_item *inode_item;
  397. struct scrub_warning *swarn = warn_ctx;
  398. struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
  399. struct inode_fs_paths *ipath = NULL;
  400. struct btrfs_root *local_root;
  401. struct btrfs_key root_key;
  402. root_key.objectid = root;
  403. root_key.type = BTRFS_ROOT_ITEM_KEY;
  404. root_key.offset = (u64)-1;
  405. local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
  406. if (IS_ERR(local_root)) {
  407. ret = PTR_ERR(local_root);
  408. goto err;
  409. }
  410. ret = inode_item_info(inum, 0, local_root, swarn->path);
  411. if (ret) {
  412. btrfs_release_path(swarn->path);
  413. goto err;
  414. }
  415. eb = swarn->path->nodes[0];
  416. inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
  417. struct btrfs_inode_item);
  418. isize = btrfs_inode_size(eb, inode_item);
  419. nlink = btrfs_inode_nlink(eb, inode_item);
  420. btrfs_release_path(swarn->path);
  421. ipath = init_ipath(4096, local_root, swarn->path);
  422. if (IS_ERR(ipath)) {
  423. ret = PTR_ERR(ipath);
  424. ipath = NULL;
  425. goto err;
  426. }
  427. ret = paths_from_inode(inum, ipath);
  428. if (ret < 0)
  429. goto err;
  430. /*
  431. * we deliberately ignore the fact that ipath might have been too small
  432. * to hold all of the paths here
  433. */
  434. for (i = 0; i < ipath->fspath->elem_cnt; ++i)
  435. printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
  436. "%s, sector %llu, root %llu, inode %llu, offset %llu, "
  437. "length %llu, links %u (path: %s)\n", swarn->errstr,
  438. swarn->logical, rcu_str_deref(swarn->dev->name),
  439. (unsigned long long)swarn->sector, root, inum, offset,
  440. min(isize - offset, (u64)PAGE_SIZE), nlink,
  441. (char *)(unsigned long)ipath->fspath->val[i]);
  442. free_ipath(ipath);
  443. return 0;
  444. err:
  445. printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
  446. "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
  447. "resolving failed with ret=%d\n", swarn->errstr,
  448. swarn->logical, rcu_str_deref(swarn->dev->name),
  449. (unsigned long long)swarn->sector, root, inum, offset, ret);
  450. free_ipath(ipath);
  451. return 0;
  452. }
  453. static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
  454. {
  455. struct btrfs_device *dev;
  456. struct btrfs_fs_info *fs_info;
  457. struct btrfs_path *path;
  458. struct btrfs_key found_key;
  459. struct extent_buffer *eb;
  460. struct btrfs_extent_item *ei;
  461. struct scrub_warning swarn;
  462. unsigned long ptr = 0;
  463. u64 extent_item_pos;
  464. u64 flags = 0;
  465. u64 ref_root;
  466. u32 item_size;
  467. u8 ref_level;
  468. const int bufsize = 4096;
  469. int ret;
  470. WARN_ON(sblock->page_count < 1);
  471. dev = sblock->pagev[0]->dev;
  472. fs_info = sblock->sctx->dev_root->fs_info;
  473. path = btrfs_alloc_path();
  474. swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
  475. swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
  476. swarn.sector = (sblock->pagev[0]->physical) >> 9;
  477. swarn.logical = sblock->pagev[0]->logical;
  478. swarn.errstr = errstr;
  479. swarn.dev = NULL;
  480. swarn.msg_bufsize = bufsize;
  481. swarn.scratch_bufsize = bufsize;
  482. if (!path || !swarn.scratch_buf || !swarn.msg_buf)
  483. goto out;
  484. ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
  485. &flags);
  486. if (ret < 0)
  487. goto out;
  488. extent_item_pos = swarn.logical - found_key.objectid;
  489. swarn.extent_item_size = found_key.offset;
  490. eb = path->nodes[0];
  491. ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
  492. item_size = btrfs_item_size_nr(eb, path->slots[0]);
  493. if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
  494. do {
  495. ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
  496. &ref_root, &ref_level);
  497. printk_in_rcu(KERN_WARNING
  498. "btrfs: %s at logical %llu on dev %s, "
  499. "sector %llu: metadata %s (level %d) in tree "
  500. "%llu\n", errstr, swarn.logical,
  501. rcu_str_deref(dev->name),
  502. (unsigned long long)swarn.sector,
  503. ref_level ? "node" : "leaf",
  504. ret < 0 ? -1 : ref_level,
  505. ret < 0 ? -1 : ref_root);
  506. } while (ret != 1);
  507. btrfs_release_path(path);
  508. } else {
  509. btrfs_release_path(path);
  510. swarn.path = path;
  511. swarn.dev = dev;
  512. iterate_extent_inodes(fs_info, found_key.objectid,
  513. extent_item_pos, 1,
  514. scrub_print_warning_inode, &swarn);
  515. }
  516. out:
  517. btrfs_free_path(path);
  518. kfree(swarn.scratch_buf);
  519. kfree(swarn.msg_buf);
  520. }
  521. static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
  522. {
  523. struct page *page = NULL;
  524. unsigned long index;
  525. struct scrub_fixup_nodatasum *fixup = fixup_ctx;
  526. int ret;
  527. int corrected = 0;
  528. struct btrfs_key key;
  529. struct inode *inode = NULL;
  530. struct btrfs_fs_info *fs_info;
  531. u64 end = offset + PAGE_SIZE - 1;
  532. struct btrfs_root *local_root;
  533. int srcu_index;
  534. key.objectid = root;
  535. key.type = BTRFS_ROOT_ITEM_KEY;
  536. key.offset = (u64)-1;
  537. fs_info = fixup->root->fs_info;
  538. srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
  539. local_root = btrfs_read_fs_root_no_name(fs_info, &key);
  540. if (IS_ERR(local_root)) {
  541. srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
  542. return PTR_ERR(local_root);
  543. }
  544. key.type = BTRFS_INODE_ITEM_KEY;
  545. key.objectid = inum;
  546. key.offset = 0;
  547. inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
  548. srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
  549. if (IS_ERR(inode))
  550. return PTR_ERR(inode);
  551. index = offset >> PAGE_CACHE_SHIFT;
  552. page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
  553. if (!page) {
  554. ret = -ENOMEM;
  555. goto out;
  556. }
  557. if (PageUptodate(page)) {
  558. if (PageDirty(page)) {
  559. /*
  560. * we need to write the data to the defective sector. the
  561. * data that was in that sector is not in memory,
  562. * because the page was modified. we must not write the
  563. * modified page to that sector.
  564. *
  565. * TODO: what could be done here: wait for the delalloc
  566. * runner to write out that page (might involve
  567. * COW) and see whether the sector is still
  568. * referenced afterwards.
  569. *
  570. * For the time being, we'll treat this error as
  571. * uncorrectable, although there is a chance that a
  572. * later scrub will find the bad sector again and that
  573. * there's no dirty page in memory by then.
  574. */
  575. ret = -EIO;
  576. goto out;
  577. }
  578. fs_info = BTRFS_I(inode)->root->fs_info;
  579. ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
  580. fixup->logical, page,
  581. fixup->mirror_num);
  582. unlock_page(page);
  583. corrected = !ret;
  584. } else {
  585. /*
  586. * we need to get good data first. the general readpage path
  587. * will call repair_io_failure for us; we just have to make
  588. * sure we read the bad mirror.
  589. */
  590. ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
  591. EXTENT_DAMAGED, GFP_NOFS);
  592. if (ret) {
  593. /* set_extent_bits should give proper error */
  594. WARN_ON(ret > 0);
  595. if (ret > 0)
  596. ret = -EFAULT;
  597. goto out;
  598. }
  599. ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
  600. btrfs_get_extent,
  601. fixup->mirror_num);
  602. wait_on_page_locked(page);
  603. corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
  604. end, EXTENT_DAMAGED, 0, NULL);
  605. if (!corrected)
  606. clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
  607. EXTENT_DAMAGED, GFP_NOFS);
  608. }
  609. out:
  610. if (page)
  611. put_page(page);
  612. if (inode)
  613. iput(inode);
  614. if (ret < 0)
  615. return ret;
  616. if (ret == 0 && corrected) {
  617. /*
  618. * we only need to call readpage for one of the inodes belonging
  619. * to this extent. so make iterate_extent_inodes stop
  620. */
  621. return 1;
  622. }
  623. return -EIO;
  624. }
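/*
 * In short, the two branches above (sketch): if the page is uptodate and
 * clean, repair_io_failure() rewrites the defective sector directly from the
 * in-memory copy; otherwise the range is tagged EXTENT_DAMAGED and re-read
 * from the bad mirror, so the generic read path performs the repair, and the
 * tag having been cleared afterwards indicates success. A dirty page is
 * treated as uncorrectable for now.
 */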
  625. static void scrub_fixup_nodatasum(struct btrfs_work *work)
  626. {
  627. int ret;
  628. struct scrub_fixup_nodatasum *fixup;
  629. struct scrub_ctx *sctx;
  630. struct btrfs_trans_handle *trans = NULL;
  631. struct btrfs_fs_info *fs_info;
  632. struct btrfs_path *path;
  633. int uncorrectable = 0;
  634. fixup = container_of(work, struct scrub_fixup_nodatasum, work);
  635. sctx = fixup->sctx;
  636. fs_info = fixup->root->fs_info;
  637. path = btrfs_alloc_path();
  638. if (!path) {
  639. spin_lock(&sctx->stat_lock);
  640. ++sctx->stat.malloc_errors;
  641. spin_unlock(&sctx->stat_lock);
  642. uncorrectable = 1;
  643. goto out;
  644. }
  645. trans = btrfs_join_transaction(fixup->root);
  646. if (IS_ERR(trans)) {
  647. uncorrectable = 1;
  648. goto out;
  649. }
  650. /*
  651. * the idea is to trigger a regular read through the standard path. we
  652. * read a page from the (failed) logical address by specifying the
  653. * corresponding copynum of the failed sector. thus, that readpage is
  654. * expected to fail.
  655. * that is the point where on-the-fly error correction will kick in
  656. * (once it's finished) and rewrite the failed sector if a good copy
  657. * can be found.
  658. */
  659. ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
  660. path, scrub_fixup_readpage,
  661. fixup);
  662. if (ret < 0) {
  663. uncorrectable = 1;
  664. goto out;
  665. }
  666. WARN_ON(ret != 1);
  667. spin_lock(&sctx->stat_lock);
  668. ++sctx->stat.corrected_errors;
  669. spin_unlock(&sctx->stat_lock);
  670. out:
  671. if (trans && !IS_ERR(trans))
  672. btrfs_end_transaction(trans, fixup->root);
  673. if (uncorrectable) {
  674. spin_lock(&sctx->stat_lock);
  675. ++sctx->stat.uncorrectable_errors;
  676. spin_unlock(&sctx->stat_lock);
  677. btrfs_dev_replace_stats_inc(
  678. &sctx->dev_root->fs_info->dev_replace.
  679. num_uncorrectable_read_errors);
  680. printk_ratelimited_in_rcu(KERN_ERR
  681. "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
  682. fixup->logical, rcu_str_deref(fixup->dev->name));
  683. }
  684. btrfs_free_path(path);
  685. kfree(fixup);
  686. scrub_pending_trans_workers_dec(sctx);
  687. }
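/*
 * Rough call chain for the nodatasum repair described above (sketch):
 *
 *   scrub_fixup_nodatasum()
 *     iterate_inodes_from_logical(..., scrub_fixup_readpage, fixup)
 *       scrub_fixup_readpage()
 *         extent_read_full_page(..., fixup->mirror_num) reads the bad copy,
 *         so the generic read path notices the error and repair_io_failure()
 *         rewrites the sector from a good mirror
 */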
  688. /*
  689. * scrub_handle_errored_block gets called when either verification of the
  690. * pages failed or the bio failed to read, e.g. with EIO. In the latter
  691. * case, this function handles all pages in the bio, even though only one
  692. * may be bad.
  693. * The goal of this function is to repair the errored block by using the
  694. * contents of one of the mirrors.
  695. */
  696. static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
  697. {
  698. struct scrub_ctx *sctx = sblock_to_check->sctx;
  699. struct btrfs_device *dev;
  700. struct btrfs_fs_info *fs_info;
  701. u64 length;
  702. u64 logical;
  703. u64 generation;
  704. unsigned int failed_mirror_index;
  705. unsigned int is_metadata;
  706. unsigned int have_csum;
  707. u8 *csum;
  708. struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
  709. struct scrub_block *sblock_bad;
  710. int ret;
  711. int mirror_index;
  712. int page_num;
  713. int success;
  714. static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
  715. DEFAULT_RATELIMIT_BURST);
  716. BUG_ON(sblock_to_check->page_count < 1);
  717. fs_info = sctx->dev_root->fs_info;
  718. if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
  719. /*
  720. * if we find an error in a super block, we just report it.
  721. * Super blocks get rewritten with the next transaction commit
  722. * anyway.
  723. */
  724. spin_lock(&sctx->stat_lock);
  725. ++sctx->stat.super_errors;
  726. spin_unlock(&sctx->stat_lock);
  727. return 0;
  728. }
  729. length = sblock_to_check->page_count * PAGE_SIZE;
  730. logical = sblock_to_check->pagev[0]->logical;
  731. generation = sblock_to_check->pagev[0]->generation;
  732. BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
  733. failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
  734. is_metadata = !(sblock_to_check->pagev[0]->flags &
  735. BTRFS_EXTENT_FLAG_DATA);
  736. have_csum = sblock_to_check->pagev[0]->have_csum;
  737. csum = sblock_to_check->pagev[0]->csum;
  738. dev = sblock_to_check->pagev[0]->dev;
  739. if (sctx->is_dev_replace && !is_metadata && !have_csum) {
  740. sblocks_for_recheck = NULL;
  741. goto nodatasum_case;
  742. }
  743. /*
  744. * read all mirrors one after the other. This includes re-reading
  745. * the extent or metadata block that failed (the reason this
  746. * fixup code is called at all), this time page by page, in
  747. * order to know which pages caused I/O errors and which ones
  748. * are good (for all mirrors).
  749. * The goal is to handle the situation when more than one
  750. * mirror contains I/O errors, but the errors do not
  751. * overlap, i.e. the data can be repaired by selecting the
  752. * pages from those mirrors without I/O error on the
  753. * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
  754. * would be that mirror #1 has an I/O error on the first page,
  755. * the second page is good, and mirror #2 has an I/O error on
  756. * the second page, but the first page is good.
  757. * Then the first page of the first mirror can be repaired by
  758. * taking the first page of the second mirror, and the
  759. * second page of the second mirror can be repaired by
  760. * copying the contents of the 2nd page of the 1st mirror.
  761. * One more note: if the pages of one mirror contain I/O
  762. * errors, the checksum cannot be verified. In order to get
  763. * the best data for repairing, the first attempt is to find
  764. * a mirror without I/O errors and with a validated checksum.
  765. * Only if this is not possible are the pages picked from
  766. * mirrors with I/O errors, without considering the checksum.
  767. * If the latter is the case, then at the end the checksum of the
  768. * repaired area is verified in order to correctly maintain
  769. * the statistics.
  770. */
  771. sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
  772. sizeof(*sblocks_for_recheck),
  773. GFP_NOFS);
  774. if (!sblocks_for_recheck) {
  775. spin_lock(&sctx->stat_lock);
  776. sctx->stat.malloc_errors++;
  777. sctx->stat.read_errors++;
  778. sctx->stat.uncorrectable_errors++;
  779. spin_unlock(&sctx->stat_lock);
  780. btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
  781. goto out;
  782. }
  783. /* setup the context, map the logical blocks and alloc the pages */
  784. ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
  785. logical, sblocks_for_recheck);
  786. if (ret) {
  787. spin_lock(&sctx->stat_lock);
  788. sctx->stat.read_errors++;
  789. sctx->stat.uncorrectable_errors++;
  790. spin_unlock(&sctx->stat_lock);
  791. btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
  792. goto out;
  793. }
  794. BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
  795. sblock_bad = sblocks_for_recheck + failed_mirror_index;
  796. /* build and submit the bios for the failed mirror, check checksums */
  797. scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
  798. csum, generation, sctx->csum_size);
  799. if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
  800. sblock_bad->no_io_error_seen) {
  801. /*
  802. * the error disappeared after reading page by page, or
  803. * the area was part of a huge bio and other parts of the
  804. * bio caused I/O errors, or the block layer merged several
  805. * read requests into one and the error is caused by a
  806. * different bio (usually one of the latter two cases is
  807. * the cause)
  808. */
  809. spin_lock(&sctx->stat_lock);
  810. sctx->stat.unverified_errors++;
  811. spin_unlock(&sctx->stat_lock);
  812. if (sctx->is_dev_replace)
  813. scrub_write_block_to_dev_replace(sblock_bad);
  814. goto out;
  815. }
  816. if (!sblock_bad->no_io_error_seen) {
  817. spin_lock(&sctx->stat_lock);
  818. sctx->stat.read_errors++;
  819. spin_unlock(&sctx->stat_lock);
  820. if (__ratelimit(&_rs))
  821. scrub_print_warning("i/o error", sblock_to_check);
  822. btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
  823. } else if (sblock_bad->checksum_error) {
  824. spin_lock(&sctx->stat_lock);
  825. sctx->stat.csum_errors++;
  826. spin_unlock(&sctx->stat_lock);
  827. if (__ratelimit(&_rs))
  828. scrub_print_warning("checksum error", sblock_to_check);
  829. btrfs_dev_stat_inc_and_print(dev,
  830. BTRFS_DEV_STAT_CORRUPTION_ERRS);
  831. } else if (sblock_bad->header_error) {
  832. spin_lock(&sctx->stat_lock);
  833. sctx->stat.verify_errors++;
  834. spin_unlock(&sctx->stat_lock);
  835. if (__ratelimit(&_rs))
  836. scrub_print_warning("checksum/header error",
  837. sblock_to_check);
  838. if (sblock_bad->generation_error)
  839. btrfs_dev_stat_inc_and_print(dev,
  840. BTRFS_DEV_STAT_GENERATION_ERRS);
  841. else
  842. btrfs_dev_stat_inc_and_print(dev,
  843. BTRFS_DEV_STAT_CORRUPTION_ERRS);
  844. }
  845. if (sctx->readonly && !sctx->is_dev_replace)
  846. goto did_not_correct_error;
  847. if (!is_metadata && !have_csum) {
  848. struct scrub_fixup_nodatasum *fixup_nodatasum;
  849. nodatasum_case:
  850. WARN_ON(sctx->is_dev_replace);
  851. /*
  852. * !is_metadata and !have_csum: this means that the data
  853. * might not be COWed and might be modified
  854. * concurrently. The general strategy of working on the
  855. * commit root does not help in the case when COW is not
  856. * used.
  857. */
  858. fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
  859. if (!fixup_nodatasum)
  860. goto did_not_correct_error;
  861. fixup_nodatasum->sctx = sctx;
  862. fixup_nodatasum->dev = dev;
  863. fixup_nodatasum->logical = logical;
  864. fixup_nodatasum->root = fs_info->extent_root;
  865. fixup_nodatasum->mirror_num = failed_mirror_index + 1;
  866. scrub_pending_trans_workers_inc(sctx);
  867. fixup_nodatasum->work.func = scrub_fixup_nodatasum;
  868. btrfs_queue_worker(&fs_info->scrub_workers,
  869. &fixup_nodatasum->work);
  870. goto out;
  871. }
  872. /*
  873. * now build and submit the bios for the other mirrors, check
  874. * checksums.
  875. * First try to pick the mirror which is completely without I/O
  876. * errors and also does not have a checksum error.
  877. * If one is found, and if a checksum is present, the full block
  878. * that is known to contain an error is rewritten. Afterwards
  879. * the block is known to be corrected.
  880. * If a mirror is found which is completely correct, and no
  881. * checksum is present, only those pages are rewritten that had
  882. * an I/O error in the block to be repaired, since it cannot be
  883. * determined which copy of the other pages is better (and it
  884. * could happen otherwise that a correct page would be
  885. * overwritten by a bad one).
  886. */
  887. for (mirror_index = 0;
  888. mirror_index < BTRFS_MAX_MIRRORS &&
  889. sblocks_for_recheck[mirror_index].page_count > 0;
  890. mirror_index++) {
  891. struct scrub_block *sblock_other;
  892. if (mirror_index == failed_mirror_index)
  893. continue;
  894. sblock_other = sblocks_for_recheck + mirror_index;
  895. /* build and submit the bios, check checksums */
  896. scrub_recheck_block(fs_info, sblock_other, is_metadata,
  897. have_csum, csum, generation,
  898. sctx->csum_size);
  899. if (!sblock_other->header_error &&
  900. !sblock_other->checksum_error &&
  901. sblock_other->no_io_error_seen) {
  902. if (sctx->is_dev_replace) {
  903. scrub_write_block_to_dev_replace(sblock_other);
  904. } else {
  905. int force_write = is_metadata || have_csum;
  906. ret = scrub_repair_block_from_good_copy(
  907. sblock_bad, sblock_other,
  908. force_write);
  909. }
  910. if (0 == ret)
  911. goto corrected_error;
  912. }
  913. }
  914. /*
  915. * for dev_replace, pick good pages and write to the target device.
  916. */
  917. if (sctx->is_dev_replace) {
  918. success = 1;
  919. for (page_num = 0; page_num < sblock_bad->page_count;
  920. page_num++) {
  921. int sub_success;
  922. sub_success = 0;
  923. for (mirror_index = 0;
  924. mirror_index < BTRFS_MAX_MIRRORS &&
  925. sblocks_for_recheck[mirror_index].page_count > 0;
  926. mirror_index++) {
  927. struct scrub_block *sblock_other =
  928. sblocks_for_recheck + mirror_index;
  929. struct scrub_page *page_other =
  930. sblock_other->pagev[page_num];
  931. if (!page_other->io_error) {
  932. ret = scrub_write_page_to_dev_replace(
  933. sblock_other, page_num);
  934. if (ret == 0) {
  935. /* succeeded for this page */
  936. sub_success = 1;
  937. break;
  938. } else {
  939. btrfs_dev_replace_stats_inc(
  940. &sctx->dev_root->
  941. fs_info->dev_replace.
  942. num_write_errors);
  943. }
  944. }
  945. }
  946. if (!sub_success) {
  947. /*
  948. * did not find a mirror to fetch the page
  949. * from. scrub_write_page_to_dev_replace()
  950. * handles this case (page->io_error) by
  951. * filling the block with zeros before
  952. * submitting the write request
  953. */
  954. success = 0;
  955. ret = scrub_write_page_to_dev_replace(
  956. sblock_bad, page_num);
  957. if (ret)
  958. btrfs_dev_replace_stats_inc(
  959. &sctx->dev_root->fs_info->
  960. dev_replace.num_write_errors);
  961. }
  962. }
  963. goto out;
  964. }
  965. /*
  966. * for regular scrub, repair those pages that are errored.
  967. * In case of I/O errors in the area that is supposed to be
  968. * repaired, continue by picking good copies of those pages.
  969. * Select the good pages from mirrors to rewrite bad pages from
  970. * the area to fix. Afterwards verify the checksum of the block
  971. * that is supposed to be repaired. This verification step is
  972. * only done for the purpose of statistics counting and for the
  973. * final scrub report, i.e. whether errors remain.
  974. * A perfect algorithm could make use of the checksum and try
  975. * all possible combinations of pages from the different mirrors
  976. * until the checksum verification succeeds. For example, when
  977. * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
  978. * of mirror #2 is readable but the final checksum test fails,
  979. * then the 2nd page of mirror #3 could be tried, to see whether
  980. * the final checksum then succeeds. But this would be a rare
  981. * exception and is therefore not implemented. At least it is
  982. * ensured that the good copy is never overwritten.
  983. * A more useful improvement would be to pick the sectors
  984. * without I/O error based on sector sizes (512 bytes on legacy
  985. * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
  986. * mirror could be repaired by taking 512 bytes of a different
  987. * mirror, even if other 512-byte sectors in the same PAGE_SIZE
  988. * area are unreadable.
  989. */
  990. /* can only fix I/O errors from here on */
  991. if (sblock_bad->no_io_error_seen)
  992. goto did_not_correct_error;
  993. success = 1;
  994. for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
  995. struct scrub_page *page_bad = sblock_bad->pagev[page_num];
  996. if (!page_bad->io_error)
  997. continue;
  998. for (mirror_index = 0;
  999. mirror_index < BTRFS_MAX_MIRRORS &&
  1000. sblocks_for_recheck[mirror_index].page_count > 0;
  1001. mirror_index++) {
  1002. struct scrub_block *sblock_other = sblocks_for_recheck +
  1003. mirror_index;
  1004. struct scrub_page *page_other = sblock_other->pagev[
  1005. page_num];
  1006. if (!page_other->io_error) {
  1007. ret = scrub_repair_page_from_good_copy(
  1008. sblock_bad, sblock_other, page_num, 0);
  1009. if (0 == ret) {
  1010. page_bad->io_error = 0;
  1011. break; /* succeeded for this page */
  1012. }
  1013. }
  1014. }
  1015. if (page_bad->io_error) {
  1016. /* did not find a mirror to copy the page from */
  1017. success = 0;
  1018. }
  1019. }
  1020. if (success) {
  1021. if (is_metadata || have_csum) {
  1022. /*
  1023. * need to verify the checksum now that all
  1024. * sectors on disk are repaired (the write
  1025. * request for data to be repaired is on its way).
  1026. * Just be lazy and use scrub_recheck_block()
  1027. * which re-reads the data before the checksum
  1028. * is verified, but most likely the data comes out
  1029. * of the page cache.
  1030. */
  1031. scrub_recheck_block(fs_info, sblock_bad,
  1032. is_metadata, have_csum, csum,
  1033. generation, sctx->csum_size);
  1034. if (!sblock_bad->header_error &&
  1035. !sblock_bad->checksum_error &&
  1036. sblock_bad->no_io_error_seen)
  1037. goto corrected_error;
  1038. else
  1039. goto did_not_correct_error;
  1040. } else {
  1041. corrected_error:
  1042. spin_lock(&sctx->stat_lock);
  1043. sctx->stat.corrected_errors++;
  1044. spin_unlock(&sctx->stat_lock);
  1045. printk_ratelimited_in_rcu(KERN_ERR
  1046. "btrfs: fixed up error at logical %llu on dev %s\n",
  1047. logical, rcu_str_deref(dev->name));
  1048. }
  1049. } else {
  1050. did_not_correct_error:
  1051. spin_lock(&sctx->stat_lock);
  1052. sctx->stat.uncorrectable_errors++;
  1053. spin_unlock(&sctx->stat_lock);
  1054. printk_ratelimited_in_rcu(KERN_ERR
  1055. "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
  1056. logical, rcu_str_deref(dev->name));
  1057. }
  1058. out:
  1059. if (sblocks_for_recheck) {
  1060. for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
  1061. mirror_index++) {
  1062. struct scrub_block *sblock = sblocks_for_recheck +
  1063. mirror_index;
  1064. int page_index;
  1065. for (page_index = 0; page_index < sblock->page_count;
  1066. page_index++) {
  1067. sblock->pagev[page_index]->sblock = NULL;
  1068. scrub_page_put(sblock->pagev[page_index]);
  1069. }
  1070. }
  1071. kfree(sblocks_for_recheck);
  1072. }
  1073. return 0;
  1074. }
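/*
 * Summary of the repair ladder implemented above (sketch):
 *
 *   1. super block error               -> only count it; the next commit
 *                                         rewrites the super blocks anyway
 *   2. error gone on page-wise re-read -> count as unverified
 *   3. data without checksum           -> defer to the nodatasum fixup worker
 *   4. some mirror is completely good  -> rewrite the whole block (or write
 *                                         it to the dev_replace target)
 *   5. otherwise                       -> repair page by page from any mirror
 *                                         without an I/O error on that page,
 *                                         then re-verify the checksum
 */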
  1075. static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
  1076. struct btrfs_fs_info *fs_info,
  1077. struct scrub_block *original_sblock,
  1078. u64 length, u64 logical,
  1079. struct scrub_block *sblocks_for_recheck)
  1080. {
  1081. int page_index;
  1082. int mirror_index;
  1083. int ret;
  1084. /*
  1085. * note: the two members ref_count and outstanding_pages
  1086. * are not used (and not set) in the blocks that are used for
  1087. * the recheck procedure
  1088. */
  1089. page_index = 0;
  1090. while (length > 0) {
  1091. u64 sublen = min_t(u64, length, PAGE_SIZE);
  1092. u64 mapped_length = sublen;
  1093. struct btrfs_bio *bbio = NULL;
  1094. /*
  1095. * with a length of PAGE_SIZE, each returned stripe
  1096. * represents one mirror
  1097. */
  1098. ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
  1099. &mapped_length, &bbio, 0);
  1100. if (ret || !bbio || mapped_length < sublen) {
  1101. kfree(bbio);
  1102. return -EIO;
  1103. }
  1104. BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
  1105. for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
  1106. mirror_index++) {
  1107. struct scrub_block *sblock;
  1108. struct scrub_page *page;
  1109. if (mirror_index >= BTRFS_MAX_MIRRORS)
  1110. continue;
  1111. sblock = sblocks_for_recheck + mirror_index;
  1112. sblock->sctx = sctx;
  1113. page = kzalloc(sizeof(*page), GFP_NOFS);
  1114. if (!page) {
  1115. leave_nomem:
  1116. spin_lock(&sctx->stat_lock);
  1117. sctx->stat.malloc_errors++;
  1118. spin_unlock(&sctx->stat_lock);
  1119. kfree(bbio);
  1120. return -ENOMEM;
  1121. }
  1122. scrub_page_get(page);
  1123. sblock->pagev[page_index] = page;
  1124. page->logical = logical;
  1125. page->physical = bbio->stripes[mirror_index].physical;
  1126. BUG_ON(page_index >= original_sblock->page_count);
  1127. page->physical_for_dev_replace =
  1128. original_sblock->pagev[page_index]->
  1129. physical_for_dev_replace;
  1130. /* for missing devices, dev->bdev is NULL */
  1131. page->dev = bbio->stripes[mirror_index].dev;
  1132. page->mirror_num = mirror_index + 1;
  1133. sblock->page_count++;
  1134. page->page = alloc_page(GFP_NOFS);
  1135. if (!page->page)
  1136. goto leave_nomem;
  1137. }
  1138. kfree(bbio);
  1139. length -= sublen;
  1140. logical += sublen;
  1141. page_index++;
  1142. }
  1143. return 0;
  1144. }
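/*
 * Mapping sketch for the function above (illustrative): asking
 * btrfs_map_block() for REQ_GET_READ_MIRRORS with a PAGE_SIZE length returns
 * one stripe per mirror, so stripe i provides the device and physical offset
 * for the current page of mirror i:
 *
 *   sblocks_for_recheck[i].pagev[page_index]->physical =
 *           bbio->stripes[i].physical;
 *   sblocks_for_recheck[i].pagev[page_index]->dev = bbio->stripes[i].dev;
 */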
  1145. /*
  1146. * this function will check the on-disk data for checksum errors, header
  1147. * errors and read I/O errors. If any I/O errors happen, the exact pages
  1148. * which are errored are marked as being bad. The goal is to enable scrub
  1149. * to take those pages that are not errored from all the mirrors so that
  1150. * the pages that are errored in the just handled mirror can be repaired.
  1151. */
  1152. static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
  1153. struct scrub_block *sblock, int is_metadata,
  1154. int have_csum, u8 *csum, u64 generation,
  1155. u16 csum_size)
  1156. {
  1157. int page_num;
  1158. sblock->no_io_error_seen = 1;
  1159. sblock->header_error = 0;
  1160. sblock->checksum_error = 0;
  1161. for (page_num = 0; page_num < sblock->page_count; page_num++) {
  1162. struct bio *bio;
  1163. struct scrub_page *page = sblock->pagev[page_num];
  1164. DECLARE_COMPLETION_ONSTACK(complete);
  1165. if (page->dev->bdev == NULL) {
  1166. page->io_error = 1;
  1167. sblock->no_io_error_seen = 0;
  1168. continue;
  1169. }
  1170. WARN_ON(!page->page);
  1171. bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
  1172. if (!bio) {
  1173. page->io_error = 1;
  1174. sblock->no_io_error_seen = 0;
  1175. continue;
  1176. }
  1177. bio->bi_bdev = page->dev->bdev;
  1178. bio->bi_sector = page->physical >> 9;
  1179. bio->bi_end_io = scrub_complete_bio_end_io;
  1180. bio->bi_private = &complete;
  1181. bio_add_page(bio, page->page, PAGE_SIZE, 0);
  1182. btrfsic_submit_bio(READ, bio);
  1183. /* this will also unplug the queue */
  1184. wait_for_completion(&complete);
  1185. page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
  1186. if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
  1187. sblock->no_io_error_seen = 0;
  1188. bio_put(bio);
  1189. }
  1190. if (sblock->no_io_error_seen)
  1191. scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
  1192. have_csum, csum, generation,
  1193. csum_size);
  1194. return;
  1195. }
  1196. static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
  1197. struct scrub_block *sblock,
  1198. int is_metadata, int have_csum,
  1199. const u8 *csum, u64 generation,
  1200. u16 csum_size)
  1201. {
  1202. int page_num;
  1203. u8 calculated_csum[BTRFS_CSUM_SIZE];
  1204. u32 crc = ~(u32)0;
  1205. void *mapped_buffer;
  1206. WARN_ON(!sblock->pagev[0]->page);
  1207. if (is_metadata) {
  1208. struct btrfs_header *h;
  1209. mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
  1210. h = (struct btrfs_header *)mapped_buffer;
  1211. if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
  1212. memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
  1213. memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
  1214. BTRFS_UUID_SIZE)) {
  1215. sblock->header_error = 1;
  1216. } else if (generation != btrfs_stack_header_generation(h)) {
  1217. sblock->header_error = 1;
  1218. sblock->generation_error = 1;
  1219. }
  1220. csum = h->csum;
  1221. } else {
  1222. if (!have_csum)
  1223. return;
  1224. mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
  1225. }
  1226. for (page_num = 0;;) {
  1227. if (page_num == 0 && is_metadata)
  1228. crc = btrfs_csum_data(
  1229. ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
  1230. crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
  1231. else
  1232. crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
  1233. kunmap_atomic(mapped_buffer);
  1234. page_num++;
  1235. if (page_num >= sblock->page_count)
  1236. break;
  1237. WARN_ON(!sblock->pagev[page_num]->page);
  1238. mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
  1239. }
  1240. btrfs_csum_final(crc, calculated_csum);
  1241. if (memcmp(calculated_csum, csum, csum_size))
  1242. sblock->checksum_error = 1;
  1243. }
  1244. static void scrub_complete_bio_end_io(struct bio *bio, int err)
  1245. {
  1246. complete((struct completion *)bio->bi_private);
  1247. }
  1248. static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
  1249. struct scrub_block *sblock_good,
  1250. int force_write)
  1251. {
  1252. int page_num;
  1253. int ret = 0;
  1254. for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
  1255. int ret_sub;
  1256. ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
  1257. sblock_good,
  1258. page_num,
  1259. force_write);
  1260. if (ret_sub)
  1261. ret = ret_sub;
  1262. }
  1263. return ret;
  1264. }
  1265. static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
  1266. struct scrub_block *sblock_good,
  1267. int page_num, int force_write)
  1268. {
  1269. struct scrub_page *page_bad = sblock_bad->pagev[page_num];
  1270. struct scrub_page *page_good = sblock_good->pagev[page_num];
  1271. BUG_ON(page_bad->page == NULL);
  1272. BUG_ON(page_good->page == NULL);
  1273. if (force_write || sblock_bad->header_error ||
  1274. sblock_bad->checksum_error || page_bad->io_error) {
  1275. struct bio *bio;
  1276. int ret;
  1277. DECLARE_COMPLETION_ONSTACK(complete);
  1278. if (!page_bad->dev->bdev) {
  1279. printk_ratelimited(KERN_WARNING
  1280. "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
  1281. return -EIO;
  1282. }
  1283. bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
  1284. if (!bio)
  1285. return -EIO;
  1286. bio->bi_bdev = page_bad->dev->bdev;
  1287. bio->bi_sector = page_bad->physical >> 9;
  1288. bio->bi_end_io = scrub_complete_bio_end_io;
  1289. bio->bi_private = &complete;
  1290. ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
  1291. if (PAGE_SIZE != ret) {
  1292. bio_put(bio);
  1293. return -EIO;
  1294. }
  1295. btrfsic_submit_bio(WRITE, bio);
  1296. /* this will also unplug the queue */
  1297. wait_for_completion(&complete);
  1298. if (!bio_flagged(bio, BIO_UPTODATE)) {
  1299. btrfs_dev_stat_inc_and_print(page_bad->dev,
  1300. BTRFS_DEV_STAT_WRITE_ERRS);
  1301. btrfs_dev_replace_stats_inc(
  1302. &sblock_bad->sctx->dev_root->fs_info->
  1303. dev_replace.num_write_errors);
  1304. bio_put(bio);
  1305. return -EIO;
  1306. }
  1307. bio_put(bio);
  1308. }
  1309. return 0;
  1310. }
  1311. static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
  1312. {
  1313. int page_num;
  1314. for (page_num = 0; page_num < sblock->page_count; page_num++) {
  1315. int ret;
  1316. ret = scrub_write_page_to_dev_replace(sblock, page_num);
  1317. if (ret)
  1318. btrfs_dev_replace_stats_inc(
  1319. &sblock->sctx->dev_root->fs_info->dev_replace.
  1320. num_write_errors);
  1321. }
  1322. }
  1323. static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
  1324. int page_num)
  1325. {
  1326. struct scrub_page *spage = sblock->pagev[page_num];
  1327. BUG_ON(spage->page == NULL);
  1328. if (spage->io_error) {
  1329. void *mapped_buffer = kmap_atomic(spage->page);
  1330. memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
  1331. flush_dcache_page(spage->page);
  1332. kunmap_atomic(mapped_buffer);
  1333. }
  1334. return scrub_add_page_to_wr_bio(sblock->sctx, spage);
  1335. }
  1336. static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
  1337. struct scrub_page *spage)
  1338. {
  1339. struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
  1340. struct scrub_bio *sbio;
  1341. int ret;
  1342. mutex_lock(&wr_ctx->wr_lock);
  1343. again:
  1344. if (!wr_ctx->wr_curr_bio) {
  1345. wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
  1346. GFP_NOFS);
  1347. if (!wr_ctx->wr_curr_bio) {
  1348. mutex_unlock(&wr_ctx->wr_lock);
  1349. return -ENOMEM;
  1350. }
  1351. wr_ctx->wr_curr_bio->sctx = sctx;
  1352. wr_ctx->wr_curr_bio->page_count = 0;
  1353. }
  1354. sbio = wr_ctx->wr_curr_bio;
  1355. if (sbio->page_count == 0) {
  1356. struct bio *bio;
  1357. sbio->physical = spage->physical_for_dev_replace;
  1358. sbio->logical = spage->logical;
  1359. sbio->dev = wr_ctx->tgtdev;
  1360. bio = sbio->bio;
  1361. if (!bio) {
  1362. bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
  1363. if (!bio) {
  1364. mutex_unlock(&wr_ctx->wr_lock);
  1365. return -ENOMEM;
  1366. }
  1367. sbio->bio = bio;
  1368. }
  1369. bio->bi_private = sbio;
  1370. bio->bi_end_io = scrub_wr_bio_end_io;
  1371. bio->bi_bdev = sbio->dev->bdev;
  1372. bio->bi_sector = sbio->physical >> 9;
  1373. sbio->err = 0;
  1374. } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
  1375. spage->physical_for_dev_replace ||
  1376. sbio->logical + sbio->page_count * PAGE_SIZE !=
  1377. spage->logical) {
  1378. scrub_wr_submit(sctx);
  1379. goto again;
  1380. }
  1381. ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
  1382. if (ret != PAGE_SIZE) {
  1383. if (sbio->page_count < 1) {
  1384. bio_put(sbio->bio);
  1385. sbio->bio = NULL;
  1386. mutex_unlock(&wr_ctx->wr_lock);
  1387. return -EIO;
  1388. }
  1389. scrub_wr_submit(sctx);
  1390. goto again;
  1391. }
  1392. sbio->pagev[sbio->page_count] = spage;
  1393. scrub_page_get(spage);
  1394. sbio->page_count++;
  1395. if (sbio->page_count == wr_ctx->pages_per_wr_bio)
  1396. scrub_wr_submit(sctx);
  1397. mutex_unlock(&wr_ctx->wr_lock);
  1398. return 0;
  1399. }
  1400. static void scrub_wr_submit(struct scrub_ctx *sctx)
  1401. {
  1402. struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
  1403. struct scrub_bio *sbio;
  1404. if (!wr_ctx->wr_curr_bio)
  1405. return;
  1406. sbio = wr_ctx->wr_curr_bio;
  1407. wr_ctx->wr_curr_bio = NULL;
  1408. WARN_ON(!sbio->bio->bi_bdev);
  1409. scrub_pending_bio_inc(sctx);
  1410. /* process all writes in a single worker thread. Then the block layer
  1411. * orders the requests before sending them to the driver which
  1412. * doubled the write performance on spinning disks when measured
  1413. * with Linux 3.5 */
  1414. btrfsic_submit_bio(WRITE, sbio->bio);
  1415. }
  1416. static void scrub_wr_bio_end_io(struct bio *bio, int err)
  1417. {
  1418. struct scrub_bio *sbio = bio->bi_private;
  1419. struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
  1420. sbio->err = err;
  1421. sbio->bio = bio;
  1422. sbio->work.func = scrub_wr_bio_end_io_worker;
  1423. btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
  1424. }
  1425. static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
  1426. {
  1427. struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
  1428. struct scrub_ctx *sctx = sbio->sctx;
  1429. int i;
  1430. WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
  1431. if (sbio->err) {
  1432. struct btrfs_dev_replace *dev_replace =
  1433. &sbio->sctx->dev_root->fs_info->dev_replace;
  1434. for (i = 0; i < sbio->page_count; i++) {
  1435. struct scrub_page *spage = sbio->pagev[i];
  1436. spage->io_error = 1;
  1437. btrfs_dev_replace_stats_inc(&dev_replace->
  1438. num_write_errors);
  1439. }
  1440. }
  1441. for (i = 0; i < sbio->page_count; i++)
  1442. scrub_page_put(sbio->pagev[i]);
  1443. bio_put(sbio->bio);
  1444. kfree(sbio);
  1445. scrub_pending_bio_dec(sctx);
  1446. }
  1447. static int scrub_checksum(struct scrub_block *sblock)
  1448. {
  1449. u64 flags;
  1450. int ret;
  1451. WARN_ON(sblock->page_count < 1);
  1452. flags = sblock->pagev[0]->flags;
  1453. ret = 0;
  1454. if (flags & BTRFS_EXTENT_FLAG_DATA)
  1455. ret = scrub_checksum_data(sblock);
  1456. else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
  1457. ret = scrub_checksum_tree_block(sblock);
  1458. else if (flags & BTRFS_EXTENT_FLAG_SUPER)
  1459. (void)scrub_checksum_super(sblock);
  1460. else
  1461. WARN_ON(1);
  1462. if (ret)
  1463. scrub_handle_errored_block(sblock);
  1464. return ret;
  1465. }
  1466. static int scrub_checksum_data(struct scrub_block *sblock)
  1467. {
  1468. struct scrub_ctx *sctx = sblock->sctx;
  1469. u8 csum[BTRFS_CSUM_SIZE];
  1470. u8 *on_disk_csum;
  1471. struct page *page;
  1472. void *buffer;
  1473. u32 crc = ~(u32)0;
  1474. int fail = 0;
  1475. u64 len;
  1476. int index;
  1477. BUG_ON(sblock->page_count < 1);
  1478. if (!sblock->pagev[0]->have_csum)
  1479. return 0;
  1480. on_disk_csum = sblock->pagev[0]->csum;
  1481. page = sblock->pagev[0]->page;
  1482. buffer = kmap_atomic(page);
  1483. len = sctx->sectorsize;
  1484. index = 0;
  1485. for (;;) {
  1486. u64 l = min_t(u64, len, PAGE_SIZE);
  1487. crc = btrfs_csum_data(buffer, crc, l);
  1488. kunmap_atomic(buffer);
  1489. len -= l;
  1490. if (len == 0)
  1491. break;
  1492. index++;
  1493. BUG_ON(index >= sblock->page_count);
  1494. BUG_ON(!sblock->pagev[index]->page);
  1495. page = sblock->pagev[index]->page;
  1496. buffer = kmap_atomic(page);
  1497. }
  1498. btrfs_csum_final(crc, csum);
  1499. if (memcmp(csum, on_disk_csum, sctx->csum_size))
  1500. fail = 1;
  1501. return fail;
  1502. }
  1503. static int scrub_checksum_tree_block(struct scrub_block *sblock)
  1504. {
  1505. struct scrub_ctx *sctx = sblock->sctx;
  1506. struct btrfs_header *h;
  1507. struct btrfs_root *root = sctx->dev_root;
  1508. struct btrfs_fs_info *fs_info = root->fs_info;
  1509. u8 calculated_csum[BTRFS_CSUM_SIZE];
  1510. u8 on_disk_csum[BTRFS_CSUM_SIZE];
  1511. struct page *page;
  1512. void *mapped_buffer;
  1513. u64 mapped_size;
  1514. void *p;
  1515. u32 crc = ~(u32)0;
  1516. int fail = 0;
  1517. int crc_fail = 0;
  1518. u64 len;
  1519. int index;
  1520. BUG_ON(sblock->page_count < 1);
  1521. page = sblock->pagev[0]->page;
  1522. mapped_buffer = kmap_atomic(page);
  1523. h = (struct btrfs_header *)mapped_buffer;
  1524. memcpy(on_disk_csum, h->csum, sctx->csum_size);
  1525. /*
  1526. * we don't use the getter functions here, as we
  1527. * a) don't have an extent buffer and
  1528. * b) the page is already kmapped
  1529. */
  1530. if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
  1531. ++fail;
  1532. if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
  1533. ++fail;
  1534. if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
  1535. ++fail;
  1536. if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
  1537. BTRFS_UUID_SIZE))
  1538. ++fail;
  1539. WARN_ON(sctx->nodesize != sctx->leafsize);
  1540. len = sctx->nodesize - BTRFS_CSUM_SIZE;
  1541. mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
  1542. p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
  1543. index = 0;
  1544. for (;;) {
  1545. u64 l = min_t(u64, len, mapped_size);
  1546. crc = btrfs_csum_data(p, crc, l);
  1547. kunmap_atomic(mapped_buffer);
  1548. len -= l;
  1549. if (len == 0)
  1550. break;
  1551. index++;
  1552. BUG_ON(index >= sblock->page_count);
  1553. BUG_ON(!sblock->pagev[index]->page);
  1554. page = sblock->pagev[index]->page;
  1555. mapped_buffer = kmap_atomic(page);
  1556. mapped_size = PAGE_SIZE;
  1557. p = mapped_buffer;
  1558. }
  1559. btrfs_csum_final(crc, calculated_csum);
  1560. if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
  1561. ++crc_fail;
  1562. return fail || crc_fail;
  1563. }
  1564. static int scrub_checksum_super(struct scrub_block *sblock)
  1565. {
  1566. struct btrfs_super_block *s;
  1567. struct scrub_ctx *sctx = sblock->sctx;
  1568. struct btrfs_root *root = sctx->dev_root;
  1569. struct btrfs_fs_info *fs_info = root->fs_info;
  1570. u8 calculated_csum[BTRFS_CSUM_SIZE];
  1571. u8 on_disk_csum[BTRFS_CSUM_SIZE];
  1572. struct page *page;
  1573. void *mapped_buffer;
  1574. u64 mapped_size;
  1575. void *p;
  1576. u32 crc = ~(u32)0;
  1577. int fail_gen = 0;
  1578. int fail_cor = 0;
  1579. u64 len;
  1580. int index;
  1581. BUG_ON(sblock->page_count < 1);
  1582. page = sblock->pagev[0]->page;
  1583. mapped_buffer = kmap_atomic(page);
  1584. s = (struct btrfs_super_block *)mapped_buffer;
  1585. memcpy(on_disk_csum, s->csum, sctx->csum_size);
  1586. if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
  1587. ++fail_cor;
  1588. if (sblock->pagev[0]->generation != btrfs_super_generation(s))
  1589. ++fail_gen;
  1590. if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
  1591. ++fail_cor;
  1592. len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
  1593. mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
  1594. p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
  1595. index = 0;
  1596. for (;;) {
  1597. u64 l = min_t(u64, len, mapped_size);
  1598. crc = btrfs_csum_data(p, crc, l);
  1599. kunmap_atomic(mapped_buffer);
  1600. len -= l;
  1601. if (len == 0)
  1602. break;
  1603. index++;
  1604. BUG_ON(index >= sblock->page_count);
  1605. BUG_ON(!sblock->pagev[index]->page);
  1606. page = sblock->pagev[index]->page;
  1607. mapped_buffer = kmap_atomic(page);
  1608. mapped_size = PAGE_SIZE;
  1609. p = mapped_buffer;
  1610. }
  1611. btrfs_csum_final(crc, calculated_csum);
  1612. if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
  1613. ++fail_cor;
  1614. if (fail_cor + fail_gen) {
  1615. /*
  1616. * if we find an error in a super block, we just report it.
  1617. * They will get written with the next transaction commit
  1618. * anyway
  1619. */
  1620. spin_lock(&sctx->stat_lock);
  1621. ++sctx->stat.super_errors;
  1622. spin_unlock(&sctx->stat_lock);
  1623. if (fail_cor)
  1624. btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
  1625. BTRFS_DEV_STAT_CORRUPTION_ERRS);
  1626. else
  1627. btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
  1628. BTRFS_DEV_STAT_GENERATION_ERRS);
  1629. }
  1630. return fail_cor + fail_gen;
  1631. }
  1632. static void scrub_block_get(struct scrub_block *sblock)
  1633. {
  1634. atomic_inc(&sblock->ref_count);
  1635. }
  1636. static void scrub_block_put(struct scrub_block *sblock)
  1637. {
  1638. if (atomic_dec_and_test(&sblock->ref_count)) {
  1639. int i;
  1640. for (i = 0; i < sblock->page_count; i++)
  1641. scrub_page_put(sblock->pagev[i]);
  1642. kfree(sblock);
  1643. }
  1644. }
  1645. static void scrub_page_get(struct scrub_page *spage)
  1646. {
  1647. atomic_inc(&spage->ref_count);
  1648. }
  1649. static void scrub_page_put(struct scrub_page *spage)
  1650. {
  1651. if (atomic_dec_and_test(&spage->ref_count)) {
  1652. if (spage->page)
  1653. __free_page(spage->page);
  1654. kfree(spage);
  1655. }
  1656. }
  1657. static void scrub_submit(struct scrub_ctx *sctx)
  1658. {
  1659. struct scrub_bio *sbio;
  1660. if (sctx->curr == -1)
  1661. return;
  1662. sbio = sctx->bios[sctx->curr];
  1663. sctx->curr = -1;
  1664. scrub_pending_bio_inc(sctx);
  1665. if (!sbio->bio->bi_bdev) {
  1666. /*
  1667. * this case should not happen. If btrfs_map_block() is
  1668. * wrong, it could happen for dev-replace operations on
  1669. * missing devices when no mirrors are available, but in
  1670. * this case it should already fail the mount.
  1671. * This case is handled correctly (but _very_ slowly).
  1672. */
  1673. printk_ratelimited(KERN_WARNING
  1674. "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
  1675. bio_endio(sbio->bio, -EIO);
  1676. } else {
  1677. btrfsic_submit_bio(READ, sbio->bio);
  1678. }
  1679. }
  1680. static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
  1681. struct scrub_page *spage)
  1682. {
  1683. struct scrub_block *sblock = spage->sblock;
  1684. struct scrub_bio *sbio;
  1685. int ret;
  1686. again:
  1687. /*
  1688. * grab a fresh bio or wait for one to become available
  1689. */
  1690. while (sctx->curr == -1) {
  1691. spin_lock(&sctx->list_lock);
  1692. sctx->curr = sctx->first_free;
  1693. if (sctx->curr != -1) {
  1694. sctx->first_free = sctx->bios[sctx->curr]->next_free;
  1695. sctx->bios[sctx->curr]->next_free = -1;
  1696. sctx->bios[sctx->curr]->page_count = 0;
  1697. spin_unlock(&sctx->list_lock);
  1698. } else {
  1699. spin_unlock(&sctx->list_lock);
  1700. wait_event(sctx->list_wait, sctx->first_free != -1);
  1701. }
  1702. }
  1703. sbio = sctx->bios[sctx->curr];
  1704. if (sbio->page_count == 0) {
  1705. struct bio *bio;
  1706. sbio->physical = spage->physical;
  1707. sbio->logical = spage->logical;
  1708. sbio->dev = spage->dev;
  1709. bio = sbio->bio;
  1710. if (!bio) {
  1711. bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
  1712. if (!bio)
  1713. return -ENOMEM;
  1714. sbio->bio = bio;
  1715. }
  1716. bio->bi_private = sbio;
  1717. bio->bi_end_io = scrub_bio_end_io;
  1718. bio->bi_bdev = sbio->dev->bdev;
  1719. bio->bi_sector = sbio->physical >> 9;
  1720. sbio->err = 0;
  1721. } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
  1722. spage->physical ||
  1723. sbio->logical + sbio->page_count * PAGE_SIZE !=
  1724. spage->logical ||
  1725. sbio->dev != spage->dev) {
  1726. scrub_submit(sctx);
  1727. goto again;
  1728. }
  1729. sbio->pagev[sbio->page_count] = spage;
  1730. ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
  1731. if (ret != PAGE_SIZE) {
  1732. if (sbio->page_count < 1) {
  1733. bio_put(sbio->bio);
  1734. sbio->bio = NULL;
  1735. return -EIO;
  1736. }
  1737. scrub_submit(sctx);
  1738. goto again;
  1739. }
  1740. scrub_block_get(sblock); /* one for the page added to the bio */
  1741. atomic_inc(&sblock->outstanding_pages);
  1742. sbio->page_count++;
  1743. if (sbio->page_count == sctx->pages_per_rd_bio)
  1744. scrub_submit(sctx);
  1745. return 0;
  1746. }
  1747. static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
  1748. u64 physical, struct btrfs_device *dev, u64 flags,
  1749. u64 gen, int mirror_num, u8 *csum, int force,
  1750. u64 physical_for_dev_replace)
  1751. {
  1752. struct scrub_block *sblock;
  1753. int index;
  1754. sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
  1755. if (!sblock) {
  1756. spin_lock(&sctx->stat_lock);
  1757. sctx->stat.malloc_errors++;
  1758. spin_unlock(&sctx->stat_lock);
  1759. return -ENOMEM;
  1760. }
  1761. /* one ref inside this function, plus one for each page added to
  1762. * a bio later on */
  1763. atomic_set(&sblock->ref_count, 1);
  1764. sblock->sctx = sctx;
  1765. sblock->no_io_error_seen = 1;
  1766. for (index = 0; len > 0; index++) {
  1767. struct scrub_page *spage;
  1768. u64 l = min_t(u64, len, PAGE_SIZE);
  1769. spage = kzalloc(sizeof(*spage), GFP_NOFS);
  1770. if (!spage) {
  1771. leave_nomem:
  1772. spin_lock(&sctx->stat_lock);
  1773. sctx->stat.malloc_errors++;
  1774. spin_unlock(&sctx->stat_lock);
  1775. scrub_block_put(sblock);
  1776. return -ENOMEM;
  1777. }
  1778. BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
  1779. scrub_page_get(spage);
  1780. sblock->pagev[index] = spage;
  1781. spage->sblock = sblock;
  1782. spage->dev = dev;
  1783. spage->flags = flags;
  1784. spage->generation = gen;
  1785. spage->logical = logical;
  1786. spage->physical = physical;
  1787. spage->physical_for_dev_replace = physical_for_dev_replace;
  1788. spage->mirror_num = mirror_num;
  1789. if (csum) {
  1790. spage->have_csum = 1;
  1791. memcpy(spage->csum, csum, sctx->csum_size);
  1792. } else {
  1793. spage->have_csum = 0;
  1794. }
  1795. sblock->page_count++;
  1796. spage->page = alloc_page(GFP_NOFS);
  1797. if (!spage->page)
  1798. goto leave_nomem;
  1799. len -= l;
  1800. logical += l;
  1801. physical += l;
  1802. physical_for_dev_replace += l;
  1803. }
  1804. WARN_ON(sblock->page_count == 0);
  1805. for (index = 0; index < sblock->page_count; index++) {
  1806. struct scrub_page *spage = sblock->pagev[index];
  1807. int ret;
  1808. ret = scrub_add_page_to_rd_bio(sctx, spage);
  1809. if (ret) {
  1810. scrub_block_put(sblock);
  1811. return ret;
  1812. }
  1813. }
  1814. if (force)
  1815. scrub_submit(sctx);
  1816. /* last one frees, either here or in bio completion for last page */
  1817. scrub_block_put(sblock);
  1818. return 0;
  1819. }
  1820. static void scrub_bio_end_io(struct bio *bio, int err)
  1821. {
  1822. struct scrub_bio *sbio = bio->bi_private;
  1823. struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
  1824. sbio->err = err;
  1825. sbio->bio = bio;
  1826. btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
  1827. }
  1828. static void scrub_bio_end_io_worker(struct btrfs_work *work)
  1829. {
  1830. struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
  1831. struct scrub_ctx *sctx = sbio->sctx;
  1832. int i;
  1833. BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
  1834. if (sbio->err) {
  1835. for (i = 0; i < sbio->page_count; i++) {
  1836. struct scrub_page *spage = sbio->pagev[i];
  1837. spage->io_error = 1;
  1838. spage->sblock->no_io_error_seen = 0;
  1839. }
  1840. }
  1841. /* now complete the scrub_block items that have all pages completed */
  1842. for (i = 0; i < sbio->page_count; i++) {
  1843. struct scrub_page *spage = sbio->pagev[i];
  1844. struct scrub_block *sblock = spage->sblock;
  1845. if (atomic_dec_and_test(&sblock->outstanding_pages))
  1846. scrub_block_complete(sblock);
  1847. scrub_block_put(sblock);
  1848. }
  1849. bio_put(sbio->bio);
  1850. sbio->bio = NULL;
  1851. spin_lock(&sctx->list_lock);
  1852. sbio->next_free = sctx->first_free;
  1853. sctx->first_free = sbio->index;
  1854. spin_unlock(&sctx->list_lock);
  1855. if (sctx->is_dev_replace &&
  1856. atomic_read(&sctx->wr_ctx.flush_all_writes)) {
  1857. mutex_lock(&sctx->wr_ctx.wr_lock);
  1858. scrub_wr_submit(sctx);
  1859. mutex_unlock(&sctx->wr_ctx.wr_lock);
  1860. }
  1861. scrub_pending_bio_dec(sctx);
  1862. }
  1863. static void scrub_block_complete(struct scrub_block *sblock)
  1864. {
  1865. if (!sblock->no_io_error_seen) {
  1866. scrub_handle_errored_block(sblock);
  1867. } else {
  1868. /*
  1869. * if has checksum error, write via repair mechanism in
  1870. * dev replace case, otherwise write here in dev replace
  1871. * case.
  1872. */
  1873. if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
  1874. scrub_write_block_to_dev_replace(sblock);
  1875. }
  1876. }
  1877. static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
  1878. u8 *csum)
  1879. {
  1880. struct btrfs_ordered_sum *sum = NULL;
  1881. unsigned long index;
  1882. unsigned long num_sectors;
  1883. while (!list_empty(&sctx->csum_list)) {
  1884. sum = list_first_entry(&sctx->csum_list,
  1885. struct btrfs_ordered_sum, list);
  1886. if (sum->bytenr > logical)
  1887. return 0;
  1888. if (sum->bytenr + sum->len > logical)
  1889. break;
  1890. ++sctx->stat.csum_discards;
  1891. list_del(&sum->list);
  1892. kfree(sum);
  1893. sum = NULL;
  1894. }
  1895. if (!sum)
  1896. return 0;
  1897. index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
  1898. num_sectors = sum->len / sctx->sectorsize;
  1899. memcpy(csum, sum->sums + index, sctx->csum_size);
  1900. if (index == num_sectors - 1) {
  1901. list_del(&sum->list);
  1902. kfree(sum);
  1903. }
  1904. return 1;
  1905. }
  1906. /* scrub extent tries to collect up to 64 kB for each bio */
  1907. static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
  1908. u64 physical, struct btrfs_device *dev, u64 flags,
  1909. u64 gen, int mirror_num, u64 physical_for_dev_replace)
  1910. {
  1911. int ret;
  1912. u8 csum[BTRFS_CSUM_SIZE];
  1913. u32 blocksize;
  1914. if (flags & BTRFS_EXTENT_FLAG_DATA) {
  1915. blocksize = sctx->sectorsize;
  1916. spin_lock(&sctx->stat_lock);
  1917. sctx->stat.data_extents_scrubbed++;
  1918. sctx->stat.data_bytes_scrubbed += len;
  1919. spin_unlock(&sctx->stat_lock);
  1920. } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
  1921. WARN_ON(sctx->nodesize != sctx->leafsize);
  1922. blocksize = sctx->nodesize;
  1923. spin_lock(&sctx->stat_lock);
  1924. sctx->stat.tree_extents_scrubbed++;
  1925. sctx->stat.tree_bytes_scrubbed += len;
  1926. spin_unlock(&sctx->stat_lock);
  1927. } else {
  1928. blocksize = sctx->sectorsize;
  1929. WARN_ON(1);
  1930. }
  1931. while (len) {
  1932. u64 l = min_t(u64, len, blocksize);
  1933. int have_csum = 0;
  1934. if (flags & BTRFS_EXTENT_FLAG_DATA) {
  1935. /* push csums to sbio */
  1936. have_csum = scrub_find_csum(sctx, logical, l, csum);
  1937. if (have_csum == 0)
  1938. ++sctx->stat.no_csum;
  1939. if (sctx->is_dev_replace && !have_csum) {
  1940. ret = copy_nocow_pages(sctx, logical, l,
  1941. mirror_num,
  1942. physical_for_dev_replace);
  1943. goto behind_scrub_pages;
  1944. }
  1945. }
  1946. ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
  1947. mirror_num, have_csum ? csum : NULL, 0,
  1948. physical_for_dev_replace);
  1949. behind_scrub_pages:
  1950. if (ret)
  1951. return ret;
  1952. len -= l;
  1953. logical += l;
  1954. physical += l;
  1955. physical_for_dev_replace += l;
  1956. }
  1957. return 0;
  1958. }
  1959. static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
  1960. struct map_lookup *map,
  1961. struct btrfs_device *scrub_dev,
  1962. int num, u64 base, u64 length,
  1963. int is_dev_replace)
  1964. {
  1965. struct btrfs_path *path;
  1966. struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
  1967. struct btrfs_root *root = fs_info->extent_root;
  1968. struct btrfs_root *csum_root = fs_info->csum_root;
  1969. struct btrfs_extent_item *extent;
  1970. struct blk_plug plug;
  1971. u64 flags;
  1972. int ret;
  1973. int slot;
  1974. u64 nstripes;
  1975. struct extent_buffer *l;
  1976. struct btrfs_key key;
  1977. u64 physical;
  1978. u64 logical;
  1979. u64 logic_end;
  1980. u64 generation;
  1981. int mirror_num;
  1982. struct reada_control *reada1;
  1983. struct reada_control *reada2;
  1984. struct btrfs_key key_start;
  1985. struct btrfs_key key_end;
  1986. u64 increment = map->stripe_len;
  1987. u64 offset;
  1988. u64 extent_logical;
  1989. u64 extent_physical;
  1990. u64 extent_len;
  1991. struct btrfs_device *extent_dev;
  1992. int extent_mirror_num;
  1993. int stop_loop;
  1994. if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
  1995. BTRFS_BLOCK_GROUP_RAID6)) {
  1996. if (num >= nr_data_stripes(map)) {
  1997. return 0;
  1998. }
  1999. }
  2000. nstripes = length;
  2001. offset = 0;
  2002. do_div(nstripes, map->stripe_len);
  2003. if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
  2004. offset = map->stripe_len * num;
  2005. increment = map->stripe_len * map->num_stripes;
  2006. mirror_num = 1;
  2007. } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
  2008. int factor = map->num_stripes / map->sub_stripes;
  2009. offset = map->stripe_len * (num / map->sub_stripes);
  2010. increment = map->stripe_len * factor;
  2011. mirror_num = num % map->sub_stripes + 1;
  2012. } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
  2013. increment = map->stripe_len;
  2014. mirror_num = num % map->num_stripes + 1;
  2015. } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
  2016. increment = map->stripe_len;
  2017. mirror_num = num % map->num_stripes + 1;
  2018. } else {
  2019. increment = map->stripe_len;
  2020. mirror_num = 1;
  2021. }
  2022. path = btrfs_alloc_path();
  2023. if (!path)
  2024. return -ENOMEM;
  2025. /*
  2026. * work on commit root. The related disk blocks are static as
  2027. * long as COW is applied. This means, it is save to rewrite
  2028. * them to repair disk errors without any race conditions
  2029. */
  2030. path->search_commit_root = 1;
  2031. path->skip_locking = 1;
  2032. /*
  2033. * trigger the readahead for extent tree csum tree and wait for
  2034. * completion. During readahead, the scrub is officially paused
  2035. * to not hold off transaction commits
  2036. */
  2037. logical = base + offset;
  2038. wait_event(sctx->list_wait,
  2039. atomic_read(&sctx->bios_in_flight) == 0);
  2040. atomic_inc(&fs_info->scrubs_paused);
  2041. wake_up(&fs_info->scrub_pause_wait);
  2042. /* FIXME it might be better to start readahead at commit root */
  2043. key_start.objectid = logical;
  2044. key_start.type = BTRFS_EXTENT_ITEM_KEY;
  2045. key_start.offset = (u64)0;
  2046. key_end.objectid = base + offset + nstripes * increment;
  2047. key_end.type = BTRFS_METADATA_ITEM_KEY;
  2048. key_end.offset = (u64)-1;
  2049. reada1 = btrfs_reada_add(root, &key_start, &key_end);
  2050. key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
  2051. key_start.type = BTRFS_EXTENT_CSUM_KEY;
  2052. key_start.offset = logical;
  2053. key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
  2054. key_end.type = BTRFS_EXTENT_CSUM_KEY;
  2055. key_end.offset = base + offset + nstripes * increment;
  2056. reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
  2057. if (!IS_ERR(reada1))
  2058. btrfs_reada_wait(reada1);
  2059. if (!IS_ERR(reada2))
  2060. btrfs_reada_wait(reada2);
  2061. mutex_lock(&fs_info->scrub_lock);
  2062. while (atomic_read(&fs_info->scrub_pause_req)) {
  2063. mutex_unlock(&fs_info->scrub_lock);
  2064. wait_event(fs_info->scrub_pause_wait,
  2065. atomic_read(&fs_info->scrub_pause_req) == 0);
  2066. mutex_lock(&fs_info->scrub_lock);
  2067. }
  2068. atomic_dec(&fs_info->scrubs_paused);
  2069. mutex_unlock(&fs_info->scrub_lock);
  2070. wake_up(&fs_info->scrub_pause_wait);
  2071. /*
  2072. * collect all data csums for the stripe to avoid seeking during
  2073. * the scrub. This might currently (crc32) end up to be about 1MB
  2074. */
  2075. blk_start_plug(&plug);
  2076. /*
  2077. * now find all extents for each stripe and scrub them
  2078. */
  2079. logical = base + offset;
  2080. physical = map->stripes[num].physical;
  2081. logic_end = logical + increment * nstripes;
  2082. ret = 0;
  2083. while (logical < logic_end) {
  2084. /*
  2085. * canceled?
  2086. */
  2087. if (atomic_read(&fs_info->scrub_cancel_req) ||
  2088. atomic_read(&sctx->cancel_req)) {
  2089. ret = -ECANCELED;
  2090. goto out;
  2091. }
  2092. /*
  2093. * check to see if we have to pause
  2094. */
  2095. if (atomic_read(&fs_info->scrub_pause_req)) {
  2096. /* push queued extents */
  2097. atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
  2098. scrub_submit(sctx);
  2099. mutex_lock(&sctx->wr_ctx.wr_lock);
  2100. scrub_wr_submit(sctx);
  2101. mutex_unlock(&sctx->wr_ctx.wr_lock);
  2102. wait_event(sctx->list_wait,
  2103. atomic_read(&sctx->bios_in_flight) == 0);
  2104. atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
  2105. atomic_inc(&fs_info->scrubs_paused);
  2106. wake_up(&fs_info->scrub_pause_wait);
  2107. mutex_lock(&fs_info->scrub_lock);
  2108. while (atomic_read(&fs_info->scrub_pause_req)) {
  2109. mutex_unlock(&fs_info->scrub_lock);
  2110. wait_event(fs_info->scrub_pause_wait,
  2111. atomic_read(&fs_info->scrub_pause_req) == 0);
  2112. mutex_lock(&fs_info->scrub_lock);
  2113. }
  2114. atomic_dec(&fs_info->scrubs_paused);
  2115. mutex_unlock(&fs_info->scrub_lock);
  2116. wake_up(&fs_info->scrub_pause_wait);
  2117. }
  2118. key.objectid = logical;
  2119. key.type = BTRFS_EXTENT_ITEM_KEY;
  2120. key.offset = (u64)-1;
  2121. ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
  2122. if (ret < 0)
  2123. goto out;
  2124. if (ret > 0) {
  2125. ret = btrfs_previous_item(root, path, 0,
  2126. BTRFS_EXTENT_ITEM_KEY);
  2127. if (ret < 0)
  2128. goto out;
  2129. if (ret > 0) {
  2130. /* there's no smaller item, so stick with the
  2131. * larger one */
  2132. btrfs_release_path(path);
  2133. ret = btrfs_search_slot(NULL, root, &key,
  2134. path, 0, 0);
  2135. if (ret < 0)
  2136. goto out;
  2137. }
  2138. }
  2139. stop_loop = 0;
  2140. while (1) {
  2141. u64 bytes;
  2142. l = path->nodes[0];
  2143. slot = path->slots[0];
  2144. if (slot >= btrfs_header_nritems(l)) {
  2145. ret = btrfs_next_leaf(root, path);
  2146. if (ret == 0)
  2147. continue;
  2148. if (ret < 0)
  2149. goto out;
  2150. stop_loop = 1;
  2151. break;
  2152. }
  2153. btrfs_item_key_to_cpu(l, &key, slot);
  2154. if (key.type == BTRFS_METADATA_ITEM_KEY)
  2155. bytes = root->leafsize;
  2156. else
  2157. bytes = key.offset;
  2158. if (key.objectid + bytes <= logical)
  2159. goto next;
  2160. if (key.type != BTRFS_EXTENT_ITEM_KEY &&
  2161. key.type != BTRFS_METADATA_ITEM_KEY)
  2162. goto next;
  2163. if (key.objectid >= logical + map->stripe_len) {
  2164. /* out of this device extent */
  2165. if (key.objectid >= logic_end)
  2166. stop_loop = 1;
  2167. break;
  2168. }
  2169. extent = btrfs_item_ptr(l, slot,
  2170. struct btrfs_extent_item);
  2171. flags = btrfs_extent_flags(l, extent);
  2172. generation = btrfs_extent_generation(l, extent);
  2173. if (key.objectid < logical &&
  2174. (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
  2175. printk(KERN_ERR
  2176. "btrfs scrub: tree block %llu spanning "
  2177. "stripes, ignored. logical=%llu\n",
  2178. key.objectid, logical);
  2179. goto next;
  2180. }
  2181. again:
  2182. extent_logical = key.objectid;
  2183. extent_len = bytes;
  2184. /*
  2185. * trim extent to this stripe
  2186. */
  2187. if (extent_logical < logical) {
  2188. extent_len -= logical - extent_logical;
  2189. extent_logical = logical;
  2190. }
  2191. if (extent_logical + extent_len >
  2192. logical + map->stripe_len) {
  2193. extent_len = logical + map->stripe_len -
  2194. extent_logical;
  2195. }
  2196. extent_physical = extent_logical - logical + physical;
  2197. extent_dev = scrub_dev;
  2198. extent_mirror_num = mirror_num;
  2199. if (is_dev_replace)
  2200. scrub_remap_extent(fs_info, extent_logical,
  2201. extent_len, &extent_physical,
  2202. &extent_dev,
  2203. &extent_mirror_num);
  2204. ret = btrfs_lookup_csums_range(csum_root, logical,
  2205. logical + map->stripe_len - 1,
  2206. &sctx->csum_list, 1);
  2207. if (ret)
  2208. goto out;
  2209. ret = scrub_extent(sctx, extent_logical, extent_len,
  2210. extent_physical, extent_dev, flags,
  2211. generation, extent_mirror_num,
  2212. extent_logical - logical + physical);
  2213. if (ret)
  2214. goto out;
  2215. scrub_free_csums(sctx);
  2216. if (extent_logical + extent_len <
  2217. key.objectid + bytes) {
  2218. logical += increment;
  2219. physical += map->stripe_len;
  2220. if (logical < key.objectid + bytes) {
  2221. cond_resched();
  2222. goto again;
  2223. }
  2224. if (logical >= logic_end) {
  2225. stop_loop = 1;
  2226. break;
  2227. }
  2228. }
  2229. next:
  2230. path->slots[0]++;
  2231. }
  2232. btrfs_release_path(path);
  2233. logical += increment;
  2234. physical += map->stripe_len;
  2235. spin_lock(&sctx->stat_lock);
  2236. if (stop_loop)
  2237. sctx->stat.last_physical = map->stripes[num].physical +
  2238. length;
  2239. else
  2240. sctx->stat.last_physical = physical;
  2241. spin_unlock(&sctx->stat_lock);
  2242. if (stop_loop)
  2243. break;
  2244. }
  2245. out:
  2246. /* push queued extents */
  2247. scrub_submit(sctx);
  2248. mutex_lock(&sctx->wr_ctx.wr_lock);
  2249. scrub_wr_submit(sctx);
  2250. mutex_unlock(&sctx->wr_ctx.wr_lock);
  2251. blk_finish_plug(&plug);
  2252. btrfs_free_path(path);
  2253. return ret < 0 ? ret : 0;
  2254. }
  2255. static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
  2256. struct btrfs_device *scrub_dev,
  2257. u64 chunk_tree, u64 chunk_objectid,
  2258. u64 chunk_offset, u64 length,
  2259. u64 dev_offset, int is_dev_replace)
  2260. {
  2261. struct btrfs_mapping_tree *map_tree =
  2262. &sctx->dev_root->fs_info->mapping_tree;
  2263. struct map_lookup *map;
  2264. struct extent_map *em;
  2265. int i;
  2266. int ret = 0;
  2267. read_lock(&map_tree->map_tree.lock);
  2268. em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
  2269. read_unlock(&map_tree->map_tree.lock);
  2270. if (!em)
  2271. return -EINVAL;
  2272. map = (struct map_lookup *)em->bdev;
  2273. if (em->start != chunk_offset)
  2274. goto out;
  2275. if (em->len < length)
  2276. goto out;
  2277. for (i = 0; i < map->num_stripes; ++i) {
  2278. if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
  2279. map->stripes[i].physical == dev_offset) {
  2280. ret = scrub_stripe(sctx, map, scrub_dev, i,
  2281. chunk_offset, length,
  2282. is_dev_replace);
  2283. if (ret)
  2284. goto out;
  2285. }
  2286. }
  2287. out:
  2288. free_extent_map(em);
  2289. return ret;
  2290. }
  2291. static noinline_for_stack
  2292. int scrub_enumerate_chunks(struct scrub_ctx *sctx,
  2293. struct btrfs_device *scrub_dev, u64 start, u64 end,
  2294. int is_dev_replace)
  2295. {
  2296. struct btrfs_dev_extent *dev_extent = NULL;
  2297. struct btrfs_path *path;
  2298. struct btrfs_root *root = sctx->dev_root;
  2299. struct btrfs_fs_info *fs_info = root->fs_info;
  2300. u64 length;
  2301. u64 chunk_tree;
  2302. u64 chunk_objectid;
  2303. u64 chunk_offset;
  2304. int ret;
  2305. int slot;
  2306. struct extent_buffer *l;
  2307. struct btrfs_key key;
  2308. struct btrfs_key found_key;
  2309. struct btrfs_block_group_cache *cache;
  2310. struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
  2311. path = btrfs_alloc_path();
  2312. if (!path)
  2313. return -ENOMEM;
  2314. path->reada = 2;
  2315. path->search_commit_root = 1;
  2316. path->skip_locking = 1;
  2317. key.objectid = scrub_dev->devid;
  2318. key.offset = 0ull;
  2319. key.type = BTRFS_DEV_EXTENT_KEY;
  2320. while (1) {
  2321. ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
  2322. if (ret < 0)
  2323. break;
  2324. if (ret > 0) {
  2325. if (path->slots[0] >=
  2326. btrfs_header_nritems(path->nodes[0])) {
  2327. ret = btrfs_next_leaf(root, path);
  2328. if (ret)
  2329. break;
  2330. }
  2331. }
  2332. l = path->nodes[0];
  2333. slot = path->slots[0];
  2334. btrfs_item_key_to_cpu(l, &found_key, slot);
  2335. if (found_key.objectid != scrub_dev->devid)
  2336. break;
  2337. if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
  2338. break;
  2339. if (found_key.offset >= end)
  2340. break;
  2341. if (found_key.offset < key.offset)
  2342. break;
  2343. dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
  2344. length = btrfs_dev_extent_length(l, dev_extent);
  2345. if (found_key.offset + length <= start) {
  2346. key.offset = found_key.offset + length;
  2347. btrfs_release_path(path);
  2348. continue;
  2349. }
  2350. chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
  2351. chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
  2352. chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
  2353. /*
  2354. * get a reference on the corresponding block group to prevent
  2355. * the chunk from going away while we scrub it
  2356. */
  2357. cache = btrfs_lookup_block_group(fs_info, chunk_offset);
  2358. if (!cache) {
  2359. ret = -ENOENT;
  2360. break;
  2361. }
  2362. dev_replace->cursor_right = found_key.offset + length;
  2363. dev_replace->cursor_left = found_key.offset;
  2364. dev_replace->item_needs_writeback = 1;
  2365. ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
  2366. chunk_offset, length, found_key.offset,
  2367. is_dev_replace);
  2368. /*
  2369. * flush, submit all pending read and write bios, afterwards
  2370. * wait for them.
  2371. * Note that in the dev replace case, a read request causes
  2372. * write requests that are submitted in the read completion
  2373. * worker. Therefore in the current situation, it is required
  2374. * that all write requests are flushed, so that all read and
  2375. * write requests are really completed when bios_in_flight
  2376. * changes to 0.
  2377. */
  2378. atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
  2379. scrub_submit(sctx);
  2380. mutex_lock(&sctx->wr_ctx.wr_lock);
  2381. scrub_wr_submit(sctx);
  2382. mutex_unlock(&sctx->wr_ctx.wr_lock);
  2383. wait_event(sctx->list_wait,
  2384. atomic_read(&sctx->bios_in_flight) == 0);
  2385. atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
  2386. atomic_inc(&fs_info->scrubs_paused);
  2387. wake_up(&fs_info->scrub_pause_wait);
  2388. wait_event(sctx->list_wait,
  2389. atomic_read(&sctx->workers_pending) == 0);
  2390. mutex_lock(&fs_info->scrub_lock);
  2391. while (atomic_read(&fs_info->scrub_pause_req)) {
  2392. mutex_unlock(&fs_info->scrub_lock);
  2393. wait_event(fs_info->scrub_pause_wait,
  2394. atomic_read(&fs_info->scrub_pause_req) == 0);
  2395. mutex_lock(&fs_info->scrub_lock);
  2396. }
  2397. atomic_dec(&fs_info->scrubs_paused);
  2398. mutex_unlock(&fs_info->scrub_lock);
  2399. wake_up(&fs_info->scrub_pause_wait);
  2400. dev_replace->cursor_left = dev_replace->cursor_right;
  2401. dev_replace->item_needs_writeback = 1;
  2402. btrfs_put_block_group(cache);
  2403. if (ret)
  2404. break;
  2405. if (is_dev_replace &&
  2406. atomic64_read(&dev_replace->num_write_errors) > 0) {
  2407. ret = -EIO;
  2408. break;
  2409. }
  2410. if (sctx->stat.malloc_errors > 0) {
  2411. ret = -ENOMEM;
  2412. break;
  2413. }
  2414. key.offset = found_key.offset + length;
  2415. btrfs_release_path(path);
  2416. }
  2417. btrfs_free_path(path);
  2418. /*
  2419. * ret can still be 1 from search_slot or next_leaf,
  2420. * that's not an error
  2421. */
  2422. return ret < 0 ? ret : 0;
  2423. }
  2424. static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
  2425. struct btrfs_device *scrub_dev)
  2426. {
  2427. int i;
  2428. u64 bytenr;
  2429. u64 gen;
  2430. int ret;
  2431. struct btrfs_root *root = sctx->dev_root;
  2432. if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
  2433. return -EIO;
  2434. gen = root->fs_info->last_trans_committed;
  2435. for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
  2436. bytenr = btrfs_sb_offset(i);
  2437. if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
  2438. break;
  2439. ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
  2440. scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
  2441. NULL, 1, bytenr);
  2442. if (ret)
  2443. return ret;
  2444. }
  2445. wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
  2446. return 0;
  2447. }
  2448. /*
  2449. * get a reference count on fs_info->scrub_workers. start worker if necessary
  2450. */
  2451. static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
  2452. int is_dev_replace)
  2453. {
  2454. int ret = 0;
  2455. mutex_lock(&fs_info->scrub_lock);
  2456. if (fs_info->scrub_workers_refcnt == 0) {
  2457. if (is_dev_replace)
  2458. btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
  2459. &fs_info->generic_worker);
  2460. else
  2461. btrfs_init_workers(&fs_info->scrub_workers, "scrub",
  2462. fs_info->thread_pool_size,
  2463. &fs_info->generic_worker);
  2464. fs_info->scrub_workers.idle_thresh = 4;
  2465. ret = btrfs_start_workers(&fs_info->scrub_workers);
  2466. if (ret)
  2467. goto out;
  2468. btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
  2469. "scrubwrc",
  2470. fs_info->thread_pool_size,
  2471. &fs_info->generic_worker);
  2472. fs_info->scrub_wr_completion_workers.idle_thresh = 2;
  2473. ret = btrfs_start_workers(
  2474. &fs_info->scrub_wr_completion_workers);
  2475. if (ret)
  2476. goto out;
  2477. btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
  2478. &fs_info->generic_worker);
  2479. ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
  2480. if (ret)
  2481. goto out;
  2482. }
  2483. ++fs_info->scrub_workers_refcnt;
  2484. out:
  2485. mutex_unlock(&fs_info->scrub_lock);
  2486. return ret;
  2487. }
  2488. static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
  2489. {
  2490. mutex_lock(&fs_info->scrub_lock);
  2491. if (--fs_info->scrub_workers_refcnt == 0) {
  2492. btrfs_stop_workers(&fs_info->scrub_workers);
  2493. btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
  2494. btrfs_stop_workers(&fs_info->scrub_nocow_workers);
  2495. }
  2496. WARN_ON(fs_info->scrub_workers_refcnt < 0);
  2497. mutex_unlock(&fs_info->scrub_lock);
  2498. }
  2499. int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
  2500. u64 end, struct btrfs_scrub_progress *progress,
  2501. int readonly, int is_dev_replace)
  2502. {
  2503. struct scrub_ctx *sctx;
  2504. int ret;
  2505. struct btrfs_device *dev;
  2506. if (btrfs_fs_closing(fs_info))
  2507. return -EINVAL;
  2508. /*
  2509. * check some assumptions
  2510. */
  2511. if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
  2512. printk(KERN_ERR
  2513. "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
  2514. fs_info->chunk_root->nodesize,
  2515. fs_info->chunk_root->leafsize);
  2516. return -EINVAL;
  2517. }
  2518. if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
  2519. /*
  2520. * in this case scrub is unable to calculate the checksum
  2521. * the way scrub is implemented. Do not handle this
  2522. * situation at all because it won't ever happen.
  2523. */
  2524. printk(KERN_ERR
  2525. "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
  2526. fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
  2527. return -EINVAL;
  2528. }
  2529. if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
  2530. /* not supported for data w/o checksums */
  2531. printk(KERN_ERR
  2532. "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails\n",
  2533. fs_info->chunk_root->sectorsize, PAGE_SIZE);
  2534. return -EINVAL;
  2535. }
  2536. if (fs_info->chunk_root->nodesize >
  2537. PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
  2538. fs_info->chunk_root->sectorsize >
  2539. PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
  2540. /*
  2541. * would exhaust the array bounds of pagev member in
  2542. * struct scrub_block
  2543. */
  2544. pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
  2545. fs_info->chunk_root->nodesize,
  2546. SCRUB_MAX_PAGES_PER_BLOCK,
  2547. fs_info->chunk_root->sectorsize,
  2548. SCRUB_MAX_PAGES_PER_BLOCK);
  2549. return -EINVAL;
  2550. }
  2551. ret = scrub_workers_get(fs_info, is_dev_replace);
  2552. if (ret)
  2553. return ret;
  2554. mutex_lock(&fs_info->fs_devices->device_list_mutex);
  2555. dev = btrfs_find_device(fs_info, devid, NULL, NULL);
  2556. if (!dev || (dev->missing && !is_dev_replace)) {
  2557. mutex_unlock(&fs_info->fs_devices->device_list_mutex);
  2558. scrub_workers_put(fs_info);
  2559. return -ENODEV;
  2560. }
  2561. mutex_lock(&fs_info->scrub_lock);
  2562. if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
  2563. mutex_unlock(&fs_info->scrub_lock);
  2564. mutex_unlock(&fs_info->fs_devices->device_list_mutex);
  2565. scrub_workers_put(fs_info);
  2566. return -EIO;
  2567. }
  2568. btrfs_dev_replace_lock(&fs_info->dev_replace);
  2569. if (dev->scrub_device ||
  2570. (!is_dev_replace &&
  2571. btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
  2572. btrfs_dev_replace_unlock(&fs_info->dev_replace);
  2573. mutex_unlock(&fs_info->scrub_lock);
  2574. mutex_unlock(&fs_info->fs_devices->device_list_mutex);
  2575. scrub_workers_put(fs_info);
  2576. return -EINPROGRESS;
  2577. }
  2578. btrfs_dev_replace_unlock(&fs_info->dev_replace);
  2579. sctx = scrub_setup_ctx(dev, is_dev_replace);
  2580. if (IS_ERR(sctx)) {
  2581. mutex_unlock(&fs_info->scrub_lock);
  2582. mutex_unlock(&fs_info->fs_devices->device_list_mutex);
  2583. scrub_workers_put(fs_info);
  2584. return PTR_ERR(sctx);
  2585. }
  2586. sctx->readonly = readonly;
  2587. dev->scrub_device = sctx;
  2588. atomic_inc(&fs_info->scrubs_running);
  2589. mutex_unlock(&fs_info->scrub_lock);
  2590. mutex_unlock(&fs_info->fs_devices->device_list_mutex);
  2591. if (!is_dev_replace) {
  2592. down_read(&fs_info->scrub_super_lock);
  2593. ret = scrub_supers(sctx, dev);
  2594. up_read(&fs_info->scrub_super_lock);
  2595. }
  2596. if (!ret)
  2597. ret = scrub_enumerate_chunks(sctx, dev, start, end,
  2598. is_dev_replace);
  2599. wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
  2600. atomic_dec(&fs_info->scrubs_running);
  2601. wake_up(&fs_info->scrub_pause_wait);
  2602. wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
  2603. if (progress)
  2604. memcpy(progress, &sctx->stat, sizeof(*progress));
  2605. mutex_lock(&fs_info->scrub_lock);
  2606. dev->scrub_device = NULL;
  2607. mutex_unlock(&fs_info->scrub_lock);
  2608. scrub_free_ctx(sctx);
  2609. scrub_workers_put(fs_info);
  2610. return ret;
  2611. }
  2612. void btrfs_scrub_pause(struct btrfs_root *root)
  2613. {
  2614. struct btrfs_fs_info *fs_info = root->fs_info;
  2615. mutex_lock(&fs_info->scrub_lock);
  2616. atomic_inc(&fs_info->scrub_pause_req);
  2617. while (atomic_read(&fs_info->scrubs_paused) !=
  2618. atomic_read(&fs_info->scrubs_running)) {
  2619. mutex_unlock(&fs_info->scrub_lock);
  2620. wait_event(fs_info->scrub_pause_wait,
  2621. atomic_read(&fs_info->scrubs_paused) ==
  2622. atomic_read(&fs_info->scrubs_running));
  2623. mutex_lock(&fs_info->scrub_lock);
  2624. }
  2625. mutex_unlock(&fs_info->scrub_lock);
  2626. }
  2627. void btrfs_scrub_continue(struct btrfs_root *root)
  2628. {
  2629. struct btrfs_fs_info *fs_info = root->fs_info;
  2630. atomic_dec(&fs_info->scrub_pause_req);
  2631. wake_up(&fs_info->scrub_pause_wait);
  2632. }
  2633. void btrfs_scrub_pause_super(struct btrfs_root *root)
  2634. {
  2635. down_write(&root->fs_info->scrub_super_lock);
  2636. }
  2637. void btrfs_scrub_continue_super(struct btrfs_root *root)
  2638. {
  2639. up_write(&root->fs_info->scrub_super_lock);
  2640. }
  2641. int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
  2642. {
  2643. mutex_lock(&fs_info->scrub_lock);
  2644. if (!atomic_read(&fs_info->scrubs_running)) {
  2645. mutex_unlock(&fs_info->scrub_lock);
  2646. return -ENOTCONN;
  2647. }
  2648. atomic_inc(&fs_info->scrub_cancel_req);
  2649. while (atomic_read(&fs_info->scrubs_running)) {
  2650. mutex_unlock(&fs_info->scrub_lock);
  2651. wait_event(fs_info->scrub_pause_wait,
  2652. atomic_read(&fs_info->scrubs_running) == 0);
  2653. mutex_lock(&fs_info->scrub_lock);
  2654. }
  2655. atomic_dec(&fs_info->scrub_cancel_req);
  2656. mutex_unlock(&fs_info->scrub_lock);
  2657. return 0;
  2658. }
  2659. int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
  2660. struct btrfs_device *dev)
  2661. {
  2662. struct scrub_ctx *sctx;
  2663. mutex_lock(&fs_info->scrub_lock);
  2664. sctx = dev->scrub_device;
  2665. if (!sctx) {
  2666. mutex_unlock(&fs_info->scrub_lock);
  2667. return -ENOTCONN;
  2668. }
  2669. atomic_inc(&sctx->cancel_req);
  2670. while (dev->scrub_device) {
  2671. mutex_unlock(&fs_info->scrub_lock);
  2672. wait_event(fs_info->scrub_pause_wait,
  2673. dev->scrub_device == NULL);
  2674. mutex_lock(&fs_info->scrub_lock);
  2675. }
  2676. mutex_unlock(&fs_info->scrub_lock);
  2677. return 0;
  2678. }
  2679. int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
  2680. struct btrfs_scrub_progress *progress)
  2681. {
  2682. struct btrfs_device *dev;
  2683. struct scrub_ctx *sctx = NULL;
  2684. mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
  2685. dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
  2686. if (dev)
  2687. sctx = dev->scrub_device;
  2688. if (sctx)
  2689. memcpy(progress, &sctx->stat, sizeof(*progress));
  2690. mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
  2691. return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
  2692. }
  2693. static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
  2694. u64 extent_logical, u64 extent_len,
  2695. u64 *extent_physical,
  2696. struct btrfs_device **extent_dev,
  2697. int *extent_mirror_num)
  2698. {
  2699. u64 mapped_length;
  2700. struct btrfs_bio *bbio = NULL;
  2701. int ret;
  2702. mapped_length = extent_len;
  2703. ret = btrfs_map_block(fs_info, READ, extent_logical,
  2704. &mapped_length, &bbio, 0);
  2705. if (ret || !bbio || mapped_length < extent_len ||
  2706. !bbio->stripes[0].dev->bdev) {
  2707. kfree(bbio);
  2708. return;
  2709. }
  2710. *extent_physical = bbio->stripes[0].physical;
  2711. *extent_mirror_num = bbio->mirror_num;
  2712. *extent_dev = bbio->stripes[0].dev;
  2713. kfree(bbio);
  2714. }
  2715. static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
  2716. struct scrub_wr_ctx *wr_ctx,
  2717. struct btrfs_fs_info *fs_info,
  2718. struct btrfs_device *dev,
  2719. int is_dev_replace)
  2720. {
  2721. WARN_ON(wr_ctx->wr_curr_bio != NULL);
  2722. mutex_init(&wr_ctx->wr_lock);
  2723. wr_ctx->wr_curr_bio = NULL;
  2724. if (!is_dev_replace)
  2725. return 0;
  2726. WARN_ON(!dev->bdev);
  2727. wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
  2728. bio_get_nr_vecs(dev->bdev));
  2729. wr_ctx->tgtdev = dev;
  2730. atomic_set(&wr_ctx->flush_all_writes, 0);
  2731. return 0;
  2732. }
  2733. static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
  2734. {
  2735. mutex_lock(&wr_ctx->wr_lock);
  2736. kfree(wr_ctx->wr_curr_bio);
  2737. wr_ctx->wr_curr_bio = NULL;
  2738. mutex_unlock(&wr_ctx->wr_lock);
  2739. }
static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
                            int mirror_num, u64 physical_for_dev_replace)
{
        struct scrub_copy_nocow_ctx *nocow_ctx;
        struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

        nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
        if (!nocow_ctx) {
                spin_lock(&sctx->stat_lock);
                sctx->stat.malloc_errors++;
                spin_unlock(&sctx->stat_lock);
                return -ENOMEM;
        }

        scrub_pending_trans_workers_inc(sctx);

        nocow_ctx->sctx = sctx;
        nocow_ctx->logical = logical;
        nocow_ctx->len = len;
        nocow_ctx->mirror_num = mirror_num;
        nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
        nocow_ctx->work.func = copy_nocow_pages_worker;
        INIT_LIST_HEAD(&nocow_ctx->inodes);
        btrfs_queue_worker(&fs_info->scrub_nocow_workers,
                           &nocow_ctx->work);

        return 0;
}
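
/*
 * Callback for iterate_inodes_from_logical(): remember each (root, inode,
 * offset) triple that references the extent so the worker can copy the
 * pages of every referencing inode later.
 */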
static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
{
        struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
        struct scrub_nocow_inode *nocow_inode;

        nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
        if (!nocow_inode)
                return -ENOMEM;
        nocow_inode->inum = inum;
        nocow_inode->offset = offset;
        nocow_inode->root = root;
        list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
        return 0;
}

#define COPY_COMPLETE 1
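
/*
 * Worker: collect all inodes that reference the nocow extent, then copy
 * their cached pages to the new location one inode at a time. If nothing
 * could be written, the failure is accounted as an uncorrectable read
 * error in the dev-replace statistics.
 */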
static void copy_nocow_pages_worker(struct btrfs_work *work)
{
        struct scrub_copy_nocow_ctx *nocow_ctx =
                container_of(work, struct scrub_copy_nocow_ctx, work);
        struct scrub_ctx *sctx = nocow_ctx->sctx;
        u64 logical = nocow_ctx->logical;
        u64 len = nocow_ctx->len;
        int mirror_num = nocow_ctx->mirror_num;
        u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
        int ret;
        struct btrfs_trans_handle *trans = NULL;
        struct btrfs_fs_info *fs_info;
        struct btrfs_path *path;
        struct btrfs_root *root;
        int not_written = 0;

        fs_info = sctx->dev_root->fs_info;
        root = fs_info->extent_root;

        path = btrfs_alloc_path();
        if (!path) {
                spin_lock(&sctx->stat_lock);
                sctx->stat.malloc_errors++;
                spin_unlock(&sctx->stat_lock);
                not_written = 1;
                goto out;
        }

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                not_written = 1;
                goto out;
        }

        ret = iterate_inodes_from_logical(logical, fs_info, path,
                                          record_inode_for_nocow, nocow_ctx);
        if (ret != 0 && ret != -ENOENT) {
                pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d\n",
                        logical, physical_for_dev_replace, len, mirror_num,
                        ret);
                not_written = 1;
                goto out;
        }

        btrfs_end_transaction(trans, root);
        trans = NULL;
        while (!list_empty(&nocow_ctx->inodes)) {
                struct scrub_nocow_inode *entry;

                entry = list_first_entry(&nocow_ctx->inodes,
                                         struct scrub_nocow_inode,
                                         list);
                list_del_init(&entry->list);
                ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
                                                 entry->root, nocow_ctx);
                kfree(entry);
                if (ret == COPY_COMPLETE) {
                        ret = 0;
                        break;
                } else if (ret) {
                        break;
                }
        }
out:
        while (!list_empty(&nocow_ctx->inodes)) {
                struct scrub_nocow_inode *entry;

                entry = list_first_entry(&nocow_ctx->inodes,
                                         struct scrub_nocow_inode,
                                         list);
                list_del_init(&entry->list);
                kfree(entry);
        }
        if (trans && !IS_ERR(trans))
                btrfs_end_transaction(trans, root);
        if (not_written)
                btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
                                            num_uncorrectable_read_errors);

        btrfs_free_path(path);
        kfree(nocow_ctx);

        scrub_pending_trans_workers_dec(sctx);
}
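
/*
 * Copy the extent as seen through one inode: lock out truncate/dio/punch
 * hole, verify the extent still covers the requested logical range, read
 * each page (from the page cache if it is up to date, from disk otherwise)
 * and write it to the target device. Returning COPY_COMPLETE tells the
 * worker it can stop iterating over further inodes.
 */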
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
                                      struct scrub_copy_nocow_ctx *nocow_ctx)
{
        struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
        struct btrfs_key key;
        struct inode *inode;
        struct page *page;
        struct btrfs_root *local_root;
        struct btrfs_ordered_extent *ordered;
        struct extent_map *em;
        struct extent_state *cached_state = NULL;
        struct extent_io_tree *io_tree;
        u64 physical_for_dev_replace;
        u64 len = nocow_ctx->len;
        u64 lockstart = offset, lockend = offset + len - 1;
        unsigned long index;
        int srcu_index;
        int ret = 0;
        int err = 0;

        key.objectid = root;
        key.type = BTRFS_ROOT_ITEM_KEY;
        key.offset = (u64)-1;

        srcu_index = srcu_read_lock(&fs_info->subvol_srcu);

        local_root = btrfs_read_fs_root_no_name(fs_info, &key);
        if (IS_ERR(local_root)) {
                srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
                return PTR_ERR(local_root);
        }

        key.type = BTRFS_INODE_ITEM_KEY;
        key.objectid = inum;
        key.offset = 0;
        inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
        srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
        if (IS_ERR(inode))
                return PTR_ERR(inode);

        /* Avoid truncate/dio/punch hole.. */
        mutex_lock(&inode->i_mutex);
        inode_dio_wait(inode);

        physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
        io_tree = &BTRFS_I(inode)->io_tree;

        lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
        ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
        if (ordered) {
                btrfs_put_ordered_extent(ordered);
                goto out_unlock;
        }

        em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
        if (IS_ERR(em)) {
                ret = PTR_ERR(em);
                goto out_unlock;
        }
        /*
         * This extent does not actually cover the logical extent anymore,
         * so move on to the next inode.
         */
        if (em->block_start > nocow_ctx->logical ||
            em->block_start + em->block_len < nocow_ctx->logical + len) {
                free_extent_map(em);
                goto out_unlock;
        }
        free_extent_map(em);

        while (len >= PAGE_CACHE_SIZE) {
                index = offset >> PAGE_CACHE_SHIFT;
again:
                page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
                if (!page) {
                        pr_err("find_or_create_page() failed\n");
                        ret = -ENOMEM;
                        goto out;
                }

                if (PageUptodate(page)) {
                        if (PageDirty(page))
                                goto next_page;
                } else {
                        ClearPageError(page);
                        err = extent_read_full_page_nolock(io_tree, page,
                                                           btrfs_get_extent,
                                                           nocow_ctx->mirror_num);
                        if (err) {
                                ret = err;
                                goto next_page;
                        }

                        lock_page(page);
                        /*
                         * If the page has been removed from the page cache,
                         * the data on it is meaningless: it may be stale,
                         * and the new data may have been written into a new
                         * page in the page cache.
                         */
                        if (page->mapping != inode->i_mapping) {
                                unlock_page(page);
                                page_cache_release(page);
                                goto again;
                        }
                        if (!PageUptodate(page)) {
                                ret = -EIO;
                                goto next_page;
                        }
                }
                err = write_page_nocow(nocow_ctx->sctx,
                                       physical_for_dev_replace, page);
                if (err)
                        ret = err;
next_page:
                unlock_page(page);
                page_cache_release(page);

                if (ret)
                        break;

                offset += PAGE_CACHE_SIZE;
                physical_for_dev_replace += PAGE_CACHE_SIZE;
                len -= PAGE_CACHE_SIZE;
        }
        ret = COPY_COMPLETE;
out_unlock:
        unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
                             GFP_NOFS);
out:
        mutex_unlock(&inode->i_mutex);
        iput(inode);
        return ret;
}
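
/*
 * Synchronously write a single page to the given physical offset on the
 * dev-replace target device, bypassing the normal COW write path. Write
 * failures are recorded in the device's write error statistics.
 */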
static int write_page_nocow(struct scrub_ctx *sctx,
                            u64 physical_for_dev_replace, struct page *page)
{
        struct bio *bio;
        struct btrfs_device *dev;
        int ret;
        DECLARE_COMPLETION_ONSTACK(compl);

        dev = sctx->wr_ctx.tgtdev;
        if (!dev)
                return -EIO;
        if (!dev->bdev) {
                printk_ratelimited(KERN_WARNING
                        "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
                return -EIO;
        }
        bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
        if (!bio) {
                spin_lock(&sctx->stat_lock);
                sctx->stat.malloc_errors++;
                spin_unlock(&sctx->stat_lock);
                return -ENOMEM;
        }
        bio->bi_private = &compl;
        bio->bi_end_io = scrub_complete_bio_end_io;
        bio->bi_size = 0;
        bio->bi_sector = physical_for_dev_replace >> 9;
        bio->bi_bdev = dev->bdev;
        ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
        if (ret != PAGE_CACHE_SIZE) {
leave_with_eio:
                bio_put(bio);
                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
                return -EIO;
        }
        btrfsic_submit_bio(WRITE_SYNC, bio);
        wait_for_completion(&compl);

        if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
                goto leave_with_eio;

        bio_put(bio);
        return 0;
}