raid56.c

  1. /*
  2. * Copyright (C) 2012 Fusion-io All rights reserved.
  3. * Copyright (C) 2012 Intel Corp. All rights reserved.
  4. *
  5. * This program is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU General Public
  7. * License v2 as published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. * General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public
  15. * License along with this program; if not, write to the
  16. * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  17. * Boston, MA 021110-1307, USA.
  18. */
  19. #include <linux/sched.h>
  20. #include <linux/wait.h>
  21. #include <linux/bio.h>
  22. #include <linux/slab.h>
  23. #include <linux/buffer_head.h>
  24. #include <linux/blkdev.h>
  25. #include <linux/random.h>
  26. #include <linux/iocontext.h>
  27. #include <linux/capability.h>
  28. #include <linux/ratelimit.h>
  29. #include <linux/kthread.h>
  30. #include <linux/raid/pq.h>
  31. #include <linux/hash.h>
  32. #include <linux/list_sort.h>
  33. #include <linux/raid/xor.h>
  34. #include <asm/div64.h>
  35. #include "compat.h"
  36. #include "ctree.h"
  37. #include "extent_map.h"
  38. #include "disk-io.h"
  39. #include "transaction.h"
  40. #include "print-tree.h"
  41. #include "volumes.h"
  42. #include "raid56.h"
  43. #include "async-thread.h"
  44. #include "check-integrity.h"
  45. #include "rcu-string.h"
  46. /* set when additional merges to this rbio are not allowed */
  47. #define RBIO_RMW_LOCKED_BIT 1
  48. /*
  49. * set when this rbio is sitting in the hash, but it is just a cache
  50. * of past RMW
  51. */
  52. #define RBIO_CACHE_BIT 2
  53. /*
  54. * set when it is safe to trust the stripe_pages for caching
  55. */
  56. #define RBIO_CACHE_READY_BIT 3
  57. #define RBIO_CACHE_SIZE 1024
  58. struct btrfs_raid_bio {
  59. struct btrfs_fs_info *fs_info;
  60. struct btrfs_bio *bbio;
  61. /*
  62. * logical block numbers for the start of each stripe
  63. * The last one or two are p/q. These are sorted,
  64. * so raid_map[0] is the start of our full stripe
  65. */
  66. u64 *raid_map;
  67. /* while we're doing rmw on a stripe
  68. * we put it into a hash table so we can
  69. * lock the stripe and merge more rbios
  70. * into it.
  71. */
  72. struct list_head hash_list;
  73. /*
  74. * LRU list for the stripe cache
  75. */
  76. struct list_head stripe_cache;
  77. /*
  78. * for scheduling work in the helper threads
  79. */
  80. struct btrfs_work work;
  81. /*
  82. * bio list and bio_list_lock are used
  83. * to add more bios into the stripe
  84. * in hopes of avoiding the full rmw
  85. */
  86. struct bio_list bio_list;
  87. spinlock_t bio_list_lock;
  88. /* also protected by the bio_list_lock, the
  89. * plug list is used by the plugging code
  90. * to collect partial bios while plugged. The
  91. * stripe locking code also uses it to hand off
  92. * the stripe lock to the next pending IO
  93. */
  94. struct list_head plug_list;
  95. /*
  96. * flags that tell us if it is safe to
  97. * merge with this bio
  98. */
  99. unsigned long flags;
  100. /* size of each individual stripe on disk */
  101. int stripe_len;
  102. /* number of data stripes (no p/q) */
  103. int nr_data;
  104. /*
  105. * set if we're doing a parity rebuild
  106. * for a read from higher up, which is handled
  107. * differently from a parity rebuild as part of
  108. * rmw
  109. */
  110. int read_rebuild;
  111. /* first bad stripe */
  112. int faila;
  113. /* second bad stripe (for raid6 use) */
  114. int failb;
  115. /*
  116. * number of pages needed to represent the full
  117. * stripe
  118. */
  119. int nr_pages;
  120. /*
  121. * size of all the bios in the bio_list. This
  122. * helps us decide if the rbio maps to a full
  123. * stripe or not
  124. */
  125. int bio_list_bytes;
  126. atomic_t refs;
  127. /*
  128. * these are two arrays of pointers. We allocate the
  129. * rbio big enough to hold them both and setup their
  130. * locations when the rbio is allocated
  131. */
  132. /* pointers to pages that we allocated for
  133. * reading/writing stripes directly from the disk (including P/Q)
  134. */
  135. struct page **stripe_pages;
  136. /*
  137. * pointers to the pages in the bio_list. Stored
  138. * here for faster lookup
  139. */
  140. struct page **bio_pages;
  141. };
  142. static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
  143. static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
  144. static void rmw_work(struct btrfs_work *work);
  145. static void read_rebuild_work(struct btrfs_work *work);
  146. static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
  147. static void async_read_rebuild(struct btrfs_raid_bio *rbio);
  148. static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
  149. static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
  150. static void __free_raid_bio(struct btrfs_raid_bio *rbio);
  151. static void index_rbio_pages(struct btrfs_raid_bio *rbio);
  152. static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
  153. /*
  154. * the stripe hash table is used for locking, and to collect
  155. * bios in hopes of making a full stripe
  156. */
  157. int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
  158. {
  159. struct btrfs_stripe_hash_table *table;
  160. struct btrfs_stripe_hash_table *x;
  161. struct btrfs_stripe_hash *cur;
  162. struct btrfs_stripe_hash *h;
  163. int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
  164. int i;
  165. int table_size;
  166. if (info->stripe_hash_table)
  167. return 0;
  168. /*
  169. * The table is large, starting with order 4 and can go as high as
  170. * order 7 in case lock debugging is turned on.
  171. *
  172. * Try harder to allocate and fallback to vmalloc to lower the chance
  173. * of a failing mount.
  174. */
  175. table_size = sizeof(*table) + sizeof(*h) * num_entries;
  176. table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
  177. if (!table) {
  178. table = vzalloc(table_size);
  179. if (!table)
  180. return -ENOMEM;
  181. }
  182. spin_lock_init(&table->cache_lock);
  183. INIT_LIST_HEAD(&table->stripe_cache);
  184. h = table->table;
  185. for (i = 0; i < num_entries; i++) {
  186. cur = h + i;
  187. INIT_LIST_HEAD(&cur->hash_list);
  188. spin_lock_init(&cur->lock);
  189. init_waitqueue_head(&cur->wait);
  190. }
  191. x = cmpxchg(&info->stripe_hash_table, NULL, table);
  192. if (x) {
  193. if (is_vmalloc_addr(x))
  194. vfree(x);
  195. else
  196. kfree(x);
  197. }
  198. return 0;
  199. }
  200. /*
  201. * caching an rbio means to copy anything from the
  202. * bio_pages array into the stripe_pages array. We
  203. * use the page uptodate bit in the stripe cache array
  204. * to indicate if it has valid data
  205. *
  206. * once the caching is done, we set the cache ready
  207. * bit.
  208. */
  209. static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
  210. {
  211. int i;
  212. char *s;
  213. char *d;
  214. int ret;
  215. ret = alloc_rbio_pages(rbio);
  216. if (ret)
  217. return;
  218. for (i = 0; i < rbio->nr_pages; i++) {
  219. if (!rbio->bio_pages[i])
  220. continue;
  221. s = kmap(rbio->bio_pages[i]);
  222. d = kmap(rbio->stripe_pages[i]);
  223. memcpy(d, s, PAGE_CACHE_SIZE);
  224. kunmap(rbio->bio_pages[i]);
  225. kunmap(rbio->stripe_pages[i]);
  226. SetPageUptodate(rbio->stripe_pages[i]);
  227. }
  228. set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
  229. }
  230. /*
  231. * we hash on the first logical address of the stripe
  232. */
  233. static int rbio_bucket(struct btrfs_raid_bio *rbio)
  234. {
  235. u64 num = rbio->raid_map[0];
  236. /*
  237. * we shift down quite a bit. We're using byte
  238. * addressing, and most of the lower bits are zeros.
  239. * This tends to upset hash_64, and it consistently
  240. * returns just one or two different values.
  241. *
  242. * shifting off the lower bits fixes things.
  243. */
  244. return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
  245. }
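The shift by 16 above matters because full-stripe start addresses are large, heavily aligned byte offsets whose low bits are all zero. A small userspace sketch, not part of this file, for experimenting with the bucket spread; TABLE_BITS and the Fibonacci-style multiplier are illustrative stand-ins for BTRFS_STRIPE_HASH_TABLE_BITS and the kernel's hash_64():

#include <stdio.h>
#include <stdint.h>

#define TABLE_BITS 12    /* stand-in for BTRFS_STRIPE_HASH_TABLE_BITS */

/* Fibonacci-style 64-bit hash, standing in for the kernel's hash_64() */
static unsigned int bucket(uint64_t val, unsigned int bits)
{
    return (unsigned int)((val * 0x9E3779B97F4A7C15ULL) >> (64 - bits));
}

int main(void)
{
    uint64_t stripe_len = 3ULL * 64 * 1024;  /* hypothetical full-stripe size */
    uint64_t chunk_start = 1ULL << 30;       /* hypothetical chunk start */
    int i;

    for (i = 0; i < 8; i++) {
        uint64_t logical = chunk_start + i * stripe_len;
        printf("stripe %d: bucket(raw)=%u bucket(>>16)=%u\n", i,
               bucket(logical, TABLE_BITS),
               bucket(logical >> 16, TABLE_BITS));
    }
    return 0;
}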
  246. /*
  247. * stealing an rbio means taking all the uptodate pages from the stripe
  248. * array in the source rbio and putting them into the destination rbio
  249. */
  250. static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
  251. {
  252. int i;
  253. struct page *s;
  254. struct page *d;
  255. if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
  256. return;
  257. for (i = 0; i < dest->nr_pages; i++) {
  258. s = src->stripe_pages[i];
  259. if (!s || !PageUptodate(s)) {
  260. continue;
  261. }
  262. d = dest->stripe_pages[i];
  263. if (d)
  264. __free_page(d);
  265. dest->stripe_pages[i] = s;
  266. src->stripe_pages[i] = NULL;
  267. }
  268. }
  269. /*
  270. * merging means we take the bio_list from the victim and
  271. * splice it into the destination. The victim should
  272. * be discarded afterwards.
  273. *
  274. * must be called with dest->bio_list_lock held
  275. */
  276. static void merge_rbio(struct btrfs_raid_bio *dest,
  277. struct btrfs_raid_bio *victim)
  278. {
  279. bio_list_merge(&dest->bio_list, &victim->bio_list);
  280. dest->bio_list_bytes += victim->bio_list_bytes;
  281. bio_list_init(&victim->bio_list);
  282. }
  283. /*
  284. * used to prune items that are in the cache. The caller
  285. * must hold the hash table lock.
  286. */
  287. static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
  288. {
  289. int bucket = rbio_bucket(rbio);
  290. struct btrfs_stripe_hash_table *table;
  291. struct btrfs_stripe_hash *h;
  292. int freeit = 0;
  293. /*
  294. * check the bit again under the hash table lock.
  295. */
  296. if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
  297. return;
  298. table = rbio->fs_info->stripe_hash_table;
  299. h = table->table + bucket;
  300. /* hold the lock for the bucket because we may be
  301. * removing it from the hash table
  302. */
  303. spin_lock(&h->lock);
  304. /*
  305. * hold the lock for the bio list because we need
  306. * to make sure the bio list is empty
  307. */
  308. spin_lock(&rbio->bio_list_lock);
  309. if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
  310. list_del_init(&rbio->stripe_cache);
  311. table->cache_size -= 1;
  312. freeit = 1;
  313. /* if the bio list isn't empty, this rbio is
  314. * still involved in an IO. We take it out
  315. * of the cache list, and drop the ref that
  316. * was held for the list.
  317. *
  318. * If the bio_list was empty, we also remove
  319. * the rbio from the hash_table, and drop
  320. * the corresponding ref
  321. */
  322. if (bio_list_empty(&rbio->bio_list)) {
  323. if (!list_empty(&rbio->hash_list)) {
  324. list_del_init(&rbio->hash_list);
  325. atomic_dec(&rbio->refs);
  326. BUG_ON(!list_empty(&rbio->plug_list));
  327. }
  328. }
  329. }
  330. spin_unlock(&rbio->bio_list_lock);
  331. spin_unlock(&h->lock);
  332. if (freeit)
  333. __free_raid_bio(rbio);
  334. }
  335. /*
  336. * prune a given rbio from the cache
  337. */
  338. static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
  339. {
  340. struct btrfs_stripe_hash_table *table;
  341. unsigned long flags;
  342. if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
  343. return;
  344. table = rbio->fs_info->stripe_hash_table;
  345. spin_lock_irqsave(&table->cache_lock, flags);
  346. __remove_rbio_from_cache(rbio);
  347. spin_unlock_irqrestore(&table->cache_lock, flags);
  348. }
  349. /*
  350. * remove everything in the cache
  351. */
  352. void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
  353. {
  354. struct btrfs_stripe_hash_table *table;
  355. unsigned long flags;
  356. struct btrfs_raid_bio *rbio;
  357. table = info->stripe_hash_table;
  358. spin_lock_irqsave(&table->cache_lock, flags);
  359. while (!list_empty(&table->stripe_cache)) {
  360. rbio = list_entry(table->stripe_cache.next,
  361. struct btrfs_raid_bio,
  362. stripe_cache);
  363. __remove_rbio_from_cache(rbio);
  364. }
  365. spin_unlock_irqrestore(&table->cache_lock, flags);
  366. }
  367. /*
  368. * remove all cached entries and free the hash table
  369. * used by unmount
  370. */
  371. void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
  372. {
  373. if (!info->stripe_hash_table)
  374. return;
  375. btrfs_clear_rbio_cache(info);
  376. if (is_vmalloc_addr(info->stripe_hash_table))
  377. vfree(info->stripe_hash_table);
  378. else
  379. kfree(info->stripe_hash_table);
  380. info->stripe_hash_table = NULL;
  381. }
  382. /*
  383. * insert an rbio into the stripe cache. It
  384. * must have already been prepared by calling
  385. * cache_rbio_pages
  386. *
  387. * If this rbio was already cached, it gets
  388. * moved to the front of the lru.
  389. *
  390. * If the size of the rbio cache is too big, we
  391. * prune an item.
  392. */
  393. static void cache_rbio(struct btrfs_raid_bio *rbio)
  394. {
  395. struct btrfs_stripe_hash_table *table;
  396. unsigned long flags;
  397. if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
  398. return;
  399. table = rbio->fs_info->stripe_hash_table;
  400. spin_lock_irqsave(&table->cache_lock, flags);
  401. spin_lock(&rbio->bio_list_lock);
  402. /* bump our ref if we were not in the list before */
  403. if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
  404. atomic_inc(&rbio->refs);
  405. if (!list_empty(&rbio->stripe_cache)) {
  406. list_move(&rbio->stripe_cache, &table->stripe_cache);
  407. } else {
  408. list_add(&rbio->stripe_cache, &table->stripe_cache);
  409. table->cache_size += 1;
  410. }
  411. spin_unlock(&rbio->bio_list_lock);
  412. if (table->cache_size > RBIO_CACHE_SIZE) {
  413. struct btrfs_raid_bio *found;
  414. found = list_entry(table->stripe_cache.prev,
  415. struct btrfs_raid_bio,
  416. stripe_cache);
  417. if (found != rbio)
  418. __remove_rbio_from_cache(found);
  419. }
  420. spin_unlock_irqrestore(&table->cache_lock, flags);
  421. return;
  422. }
  423. /*
  424. * helper function to run the xor_blocks api. It is only
  425. * able to do MAX_XOR_BLOCKS at a time, so we need to
  426. * loop through.
  427. */
  428. static void run_xor(void **pages, int src_cnt, ssize_t len)
  429. {
  430. int src_off = 0;
  431. int xor_src_cnt = 0;
  432. void *dest = pages[src_cnt];
  433. while (src_cnt > 0) {
  434. xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
  435. xor_blocks(xor_src_cnt, len, dest, pages + src_off);
  436. src_cnt -= xor_src_cnt;
  437. src_off += xor_src_cnt;
  438. }
  439. }
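In other words, after run_xor() returns, the destination page holds its previous contents XORed with every source buffer; xor_blocks() is only a batching detail (at most MAX_XOR_BLOCKS sources per call). A byte-wise userspace equivalent, with illustrative names, would be:

#include <stddef.h>

/* dest ^= src[0] ^ src[1] ^ ... ^ src[src_cnt - 1], byte by byte */
static void xor_into(unsigned char *dest, unsigned char * const *srcs,
                     int src_cnt, size_t len)
{
    int i;
    size_t j;

    for (i = 0; i < src_cnt; i++)
        for (j = 0; j < len; j++)
            dest[j] ^= srcs[i][j];
}

finish_rmw() below seeds the destination with the first data page before calling run_xor(), so the end result is the raid5 parity of all the data pages.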
  440. /*
  441. * returns true if the bio list inside this rbio
  442. * covers an entire stripe (no rmw required).
  443. * Must be called with the bio list lock held, or
  444. * at a time when you know it is impossible to add
  445. * new bios into the list
  446. */
  447. static int __rbio_is_full(struct btrfs_raid_bio *rbio)
  448. {
  449. unsigned long size = rbio->bio_list_bytes;
  450. int ret = 1;
  451. if (size != rbio->nr_data * rbio->stripe_len)
  452. ret = 0;
  453. BUG_ON(size > rbio->nr_data * rbio->stripe_len);
  454. return ret;
  455. }
  456. static int rbio_is_full(struct btrfs_raid_bio *rbio)
  457. {
  458. unsigned long flags;
  459. int ret;
  460. spin_lock_irqsave(&rbio->bio_list_lock, flags);
  461. ret = __rbio_is_full(rbio);
  462. spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
  463. return ret;
  464. }
  465. /*
  466. * returns 1 if it is safe to merge two rbios together.
  467. * The merging is safe if the two rbios correspond to
  468. * the same stripe and if they are both going in the same
  469. * direction (read vs write), and if neither one is
  470. * locked for final IO
  471. *
  472. * The caller is responsible for locking such that
  473. * rmw_locked is safe to test
  474. */
  475. static int rbio_can_merge(struct btrfs_raid_bio *last,
  476. struct btrfs_raid_bio *cur)
  477. {
  478. if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
  479. test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
  480. return 0;
  481. /*
  482. * we can't merge with cached rbios, since the
  483. * idea is that when we merge the destination
  484. * rbio is going to run our IO for us. We can
  485. * steal from cached rbio's though, other functions
  486. * handle that.
  487. */
  488. if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
  489. test_bit(RBIO_CACHE_BIT, &cur->flags))
  490. return 0;
  491. if (last->raid_map[0] !=
  492. cur->raid_map[0])
  493. return 0;
  494. /* reads can't merge with writes */
  495. if (last->read_rebuild !=
  496. cur->read_rebuild) {
  497. return 0;
  498. }
  499. return 1;
  500. }
  501. /*
  502. * helper to index into the pstripe
  503. */
  504. static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
  505. {
  506. index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
  507. return rbio->stripe_pages[index];
  508. }
  509. /*
  510. * helper to index into the qstripe, returns null
  511. * if there is no qstripe
  512. */
  513. static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
  514. {
  515. if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
  516. return NULL;
  517. index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
  518. PAGE_CACHE_SHIFT;
  519. return rbio->stripe_pages[index];
  520. }
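Both helpers rely on stripe_pages being laid out as nr_data data stripes, then P, then (for raid6) Q, each stripe_len bytes worth of pages. A worked example with illustrative values (4 KiB pages, 64 KiB stripe_len, nr_data = 2):

#include <stdio.h>

int main(void)
{
    const unsigned int page_shift = 12;          /* 4 KiB pages, illustrative */
    const unsigned int stripe_len = 64 * 1024;   /* illustrative */
    const unsigned int nr_data = 2;              /* illustrative */
    const unsigned int pages_per_stripe = stripe_len >> page_shift;    /* 16 */

    /* page 3 of the P stripe and of the Q stripe, per the math above */
    unsigned int p_index = 3 + ((nr_data * stripe_len) >> page_shift);        /* 35 */
    unsigned int q_index = 3 + (((nr_data + 1) * stripe_len) >> page_shift);  /* 51 */

    printf("pages_per_stripe=%u p_index=%u q_index=%u\n",
           pages_per_stripe, p_index, q_index);
    return 0;
}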
  521. /*
  522. * The first stripe in the table for a logical address
  523. * has the lock. rbios are added in one of three ways:
  524. *
  525. * 1) Nobody has the stripe locked yet. The rbio is given
  526. * the lock and 0 is returned. The caller must start the IO
  527. * themselves.
  528. *
  529. * 2) Someone has the stripe locked, but we're able to merge
  530. * with the lock owner. The rbio is freed and the IO will
  531. * start automatically along with the existing rbio. 1 is returned.
  532. *
  533. * 3) Someone has the stripe locked, but we're not able to merge.
  534. * The rbio is added to the lock owner's plug list, or merged into
  535. * an rbio already on the plug list. When the lock owner unlocks,
  536. * the next rbio on the list is run and the IO is started automatically.
  537. * 1 is returned
  538. *
  539. * If we return 0, the caller still owns the rbio and must continue with
  540. * IO submission. If we return 1, the caller must assume the rbio has
  541. * already been freed.
  542. */
  543. static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
  544. {
  545. int bucket = rbio_bucket(rbio);
  546. struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
  547. struct btrfs_raid_bio *cur;
  548. struct btrfs_raid_bio *pending;
  549. unsigned long flags;
  550. DEFINE_WAIT(wait);
  551. struct btrfs_raid_bio *freeit = NULL;
  552. struct btrfs_raid_bio *cache_drop = NULL;
  553. int ret = 0;
  554. int walk = 0;
  555. spin_lock_irqsave(&h->lock, flags);
  556. list_for_each_entry(cur, &h->hash_list, hash_list) {
  557. walk++;
  558. if (cur->raid_map[0] == rbio->raid_map[0]) {
  559. spin_lock(&cur->bio_list_lock);
  560. /* can we steal this cached rbio's pages? */
  561. if (bio_list_empty(&cur->bio_list) &&
  562. list_empty(&cur->plug_list) &&
  563. test_bit(RBIO_CACHE_BIT, &cur->flags) &&
  564. !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
  565. list_del_init(&cur->hash_list);
  566. atomic_dec(&cur->refs);
  567. steal_rbio(cur, rbio);
  568. cache_drop = cur;
  569. spin_unlock(&cur->bio_list_lock);
  570. goto lockit;
  571. }
  572. /* can we merge into the lock owner? */
  573. if (rbio_can_merge(cur, rbio)) {
  574. merge_rbio(cur, rbio);
  575. spin_unlock(&cur->bio_list_lock);
  576. freeit = rbio;
  577. ret = 1;
  578. goto out;
  579. }
  580. /*
  581. * we couldn't merge with the running
  582. * rbio, see if we can merge with the
  583. * pending ones. We don't have to
  584. * check for rmw_locked because there
  585. * is no way they are inside finish_rmw
  586. * right now
  587. */
  588. list_for_each_entry(pending, &cur->plug_list,
  589. plug_list) {
  590. if (rbio_can_merge(pending, rbio)) {
  591. merge_rbio(pending, rbio);
  592. spin_unlock(&cur->bio_list_lock);
  593. freeit = rbio;
  594. ret = 1;
  595. goto out;
  596. }
  597. }
  598. /* no merging, put us on the tail of the plug list,
  599. * our rbio will be started when the currently
  600. * running rbio unlocks
  601. */
  602. list_add_tail(&rbio->plug_list, &cur->plug_list);
  603. spin_unlock(&cur->bio_list_lock);
  604. ret = 1;
  605. goto out;
  606. }
  607. }
  608. lockit:
  609. atomic_inc(&rbio->refs);
  610. list_add(&rbio->hash_list, &h->hash_list);
  611. out:
  612. spin_unlock_irqrestore(&h->lock, flags);
  613. if (cache_drop)
  614. remove_rbio_from_cache(cache_drop);
  615. if (freeit)
  616. __free_raid_bio(freeit);
  617. return ret;
  618. }
  619. /*
  620. * called as rmw or parity rebuild is completed. If the plug list has more
  621. * rbios waiting for this stripe, the next one on the list will be started
  622. */
  623. static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
  624. {
  625. int bucket;
  626. struct btrfs_stripe_hash *h;
  627. unsigned long flags;
  628. int keep_cache = 0;
  629. bucket = rbio_bucket(rbio);
  630. h = rbio->fs_info->stripe_hash_table->table + bucket;
  631. if (list_empty(&rbio->plug_list))
  632. cache_rbio(rbio);
  633. spin_lock_irqsave(&h->lock, flags);
  634. spin_lock(&rbio->bio_list_lock);
  635. if (!list_empty(&rbio->hash_list)) {
  636. /*
  637. * if we're still cached and there is no other IO
  638. * to perform, just leave this rbio here for others
  639. * to steal from later
  640. */
  641. if (list_empty(&rbio->plug_list) &&
  642. test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
  643. keep_cache = 1;
  644. clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
  645. BUG_ON(!bio_list_empty(&rbio->bio_list));
  646. goto done;
  647. }
  648. list_del_init(&rbio->hash_list);
  649. atomic_dec(&rbio->refs);
  650. /*
  651. * we use the plug list to hold all the rbios
  652. * waiting for the chance to lock this stripe.
  653. * hand the lock over to one of them.
  654. */
  655. if (!list_empty(&rbio->plug_list)) {
  656. struct btrfs_raid_bio *next;
  657. struct list_head *head = rbio->plug_list.next;
  658. next = list_entry(head, struct btrfs_raid_bio,
  659. plug_list);
  660. list_del_init(&rbio->plug_list);
  661. list_add(&next->hash_list, &h->hash_list);
  662. atomic_inc(&next->refs);
  663. spin_unlock(&rbio->bio_list_lock);
  664. spin_unlock_irqrestore(&h->lock, flags);
  665. if (next->read_rebuild)
  666. async_read_rebuild(next);
  667. else {
  668. steal_rbio(rbio, next);
  669. async_rmw_stripe(next);
  670. }
  671. goto done_nolock;
  672. } else if (waitqueue_active(&h->wait)) {
  673. spin_unlock(&rbio->bio_list_lock);
  674. spin_unlock_irqrestore(&h->lock, flags);
  675. wake_up(&h->wait);
  676. goto done_nolock;
  677. }
  678. }
  679. done:
  680. spin_unlock(&rbio->bio_list_lock);
  681. spin_unlock_irqrestore(&h->lock, flags);
  682. done_nolock:
  683. if (!keep_cache)
  684. remove_rbio_from_cache(rbio);
  685. }
  686. static void __free_raid_bio(struct btrfs_raid_bio *rbio)
  687. {
  688. int i;
  689. WARN_ON(atomic_read(&rbio->refs) < 0);
  690. if (!atomic_dec_and_test(&rbio->refs))
  691. return;
  692. WARN_ON(!list_empty(&rbio->stripe_cache));
  693. WARN_ON(!list_empty(&rbio->hash_list));
  694. WARN_ON(!bio_list_empty(&rbio->bio_list));
  695. for (i = 0; i < rbio->nr_pages; i++) {
  696. if (rbio->stripe_pages[i]) {
  697. __free_page(rbio->stripe_pages[i]);
  698. rbio->stripe_pages[i] = NULL;
  699. }
  700. }
  701. kfree(rbio->raid_map);
  702. kfree(rbio->bbio);
  703. kfree(rbio);
  704. }
  705. static void free_raid_bio(struct btrfs_raid_bio *rbio)
  706. {
  707. unlock_stripe(rbio);
  708. __free_raid_bio(rbio);
  709. }
  710. /*
  711. * this frees the rbio and runs through all the bios in the
  712. * bio_list and calls end_io on them
  713. */
  714. static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
  715. {
  716. struct bio *cur = bio_list_get(&rbio->bio_list);
  717. struct bio *next;
  718. free_raid_bio(rbio);
  719. while (cur) {
  720. next = cur->bi_next;
  721. cur->bi_next = NULL;
  722. if (uptodate)
  723. set_bit(BIO_UPTODATE, &cur->bi_flags);
  724. bio_endio(cur, err);
  725. cur = next;
  726. }
  727. }
  728. /*
  729. * end io function used by finish_rmw. When we finally
  730. * get here, we've written a full stripe
  731. */
  732. static void raid_write_end_io(struct bio *bio, int err)
  733. {
  734. struct btrfs_raid_bio *rbio = bio->bi_private;
  735. if (err)
  736. fail_bio_stripe(rbio, bio);
  737. bio_put(bio);
  738. if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
  739. return;
  740. err = 0;
  741. /* OK, we have written all the stripes we need to. */
  742. if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
  743. err = -EIO;
  744. rbio_orig_end_io(rbio, err, 0);
  745. return;
  746. }
  747. /*
  748. * the read/modify/write code wants to use the original bio for
  749. * any pages it included, and then use the rbio for everything
  750. * else. This function decides if a given index (stripe number)
  751. * and page number in that stripe fall inside the original bio
  752. * or the rbio.
  753. *
  754. * if you set bio_list_only, you'll get a NULL back for any ranges
  755. * that are outside the bio_list
  756. *
  757. * This doesn't take any refs on anything, you get a bare page pointer
  758. * and the caller must bump refs as required.
  759. *
  760. * You must call index_rbio_pages once before you can trust
  761. * the answers from this function.
  762. */
  763. static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
  764. int index, int pagenr, int bio_list_only)
  765. {
  766. int chunk_page;
  767. struct page *p = NULL;
  768. chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
  769. spin_lock_irq(&rbio->bio_list_lock);
  770. p = rbio->bio_pages[chunk_page];
  771. spin_unlock_irq(&rbio->bio_list_lock);
  772. if (p || bio_list_only)
  773. return p;
  774. return rbio->stripe_pages[chunk_page];
  775. }
  776. /*
  777. * number of pages we need for the entire stripe across all the
  778. * drives
  779. */
  780. static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
  781. {
  782. unsigned long nr = stripe_len * nr_stripes;
  783. return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
  784. }
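For example, with 4 KiB pages, a 64 KiB stripe_len and a 3-device raid5 layout (2 data + P), this rounds 3 * 64 KiB up to whole pages, 48 in total. A minimal check with those illustrative numbers:

#include <assert.h>

int main(void)
{
    unsigned long stripe_len = 64 * 1024;   /* illustrative */
    int nr_stripes = 3;                     /* 2 data + P, illustrative */
    unsigned long nr = stripe_len * nr_stripes;
    unsigned long pages = (nr + 4096 - 1) >> 12;  /* PAGE_CACHE_SIZE/SHIFT stand-ins */

    assert(pages == 48);
    return 0;
}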
  785. /*
  786. * allocation and initial setup for the btrfs_raid_bio. Note that
  787. * this does not allocate any pages for rbio->pages.
  788. */
  789. static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
  790. struct btrfs_bio *bbio, u64 *raid_map,
  791. u64 stripe_len)
  792. {
  793. struct btrfs_raid_bio *rbio;
  794. int nr_data = 0;
  795. int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
  796. void *p;
  797. rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
  798. GFP_NOFS);
  799. if (!rbio) {
  800. kfree(raid_map);
  801. kfree(bbio);
  802. return ERR_PTR(-ENOMEM);
  803. }
  804. bio_list_init(&rbio->bio_list);
  805. INIT_LIST_HEAD(&rbio->plug_list);
  806. spin_lock_init(&rbio->bio_list_lock);
  807. INIT_LIST_HEAD(&rbio->stripe_cache);
  808. INIT_LIST_HEAD(&rbio->hash_list);
  809. rbio->bbio = bbio;
  810. rbio->raid_map = raid_map;
  811. rbio->fs_info = root->fs_info;
  812. rbio->stripe_len = stripe_len;
  813. rbio->nr_pages = num_pages;
  814. rbio->faila = -1;
  815. rbio->failb = -1;
  816. atomic_set(&rbio->refs, 1);
  817. /*
  818. * the stripe_pages and bio_pages array point to the extra
  819. * memory we allocated past the end of the rbio
  820. */
  821. p = rbio + 1;
  822. rbio->stripe_pages = p;
  823. rbio->bio_pages = p + sizeof(struct page *) * num_pages;
  824. if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
  825. nr_data = bbio->num_stripes - 2;
  826. else
  827. nr_data = bbio->num_stripes - 1;
  828. rbio->nr_data = nr_data;
  829. return rbio;
  830. }
  831. /* allocate pages for all the stripes in the bio, including parity */
  832. static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
  833. {
  834. int i;
  835. struct page *page;
  836. for (i = 0; i < rbio->nr_pages; i++) {
  837. if (rbio->stripe_pages[i])
  838. continue;
  839. page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
  840. if (!page)
  841. return -ENOMEM;
  842. rbio->stripe_pages[i] = page;
  843. ClearPageUptodate(page);
  844. }
  845. return 0;
  846. }
  847. /* allocate pages for just the p/q stripes */
  848. static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
  849. {
  850. int i;
  851. struct page *page;
  852. i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
  853. for (; i < rbio->nr_pages; i++) {
  854. if (rbio->stripe_pages[i])
  855. continue;
  856. page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
  857. if (!page)
  858. return -ENOMEM;
  859. rbio->stripe_pages[i] = page;
  860. }
  861. return 0;
  862. }
  863. /*
  864. * add a single page from a specific stripe into our list of bios for IO
  865. * this will try to merge into existing bios if possible, and returns
  866. * zero if all went well.
  867. */
  868. int rbio_add_io_page(struct btrfs_raid_bio *rbio,
  869. struct bio_list *bio_list,
  870. struct page *page,
  871. int stripe_nr,
  872. unsigned long page_index,
  873. unsigned long bio_max_len)
  874. {
  875. struct bio *last = bio_list->tail;
  876. u64 last_end = 0;
  877. int ret;
  878. struct bio *bio;
  879. struct btrfs_bio_stripe *stripe;
  880. u64 disk_start;
  881. stripe = &rbio->bbio->stripes[stripe_nr];
  882. disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
  883. /* if the device is missing, just fail this stripe */
  884. if (!stripe->dev->bdev)
  885. return fail_rbio_index(rbio, stripe_nr);
  886. /* see if we can add this page onto our existing bio */
  887. if (last) {
  888. last_end = (u64)last->bi_sector << 9;
  889. last_end += last->bi_size;
  890. /*
  891. * we can't merge these if they are from different
  892. * devices or if they are not contiguous
  893. */
  894. if (last_end == disk_start && stripe->dev->bdev &&
  895. test_bit(BIO_UPTODATE, &last->bi_flags) &&
  896. last->bi_bdev == stripe->dev->bdev) {
  897. ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
  898. if (ret == PAGE_CACHE_SIZE)
  899. return 0;
  900. }
  901. }
  902. /* put a new bio on the list */
  903. bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
  904. if (!bio)
  905. return -ENOMEM;
  906. bio->bi_size = 0;
  907. bio->bi_bdev = stripe->dev->bdev;
  908. bio->bi_sector = disk_start >> 9;
  909. set_bit(BIO_UPTODATE, &bio->bi_flags);
  910. bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
  911. bio_list_add(bio_list, bio);
  912. return 0;
  913. }
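The merge test above reduces to: the page's disk byte offset must begin exactly where the previous bio ends, on the same block device. A hedged, userspace-style version of that predicate (struct and field names here are illustrative; the real code reads bi_sector, bi_size and bi_bdev):

#include <stdint.h>
#include <stdbool.h>

struct sketch_bio {
    uint64_t sector;    /* 512-byte sectors, like bi_sector */
    uint32_t bytes;     /* bytes already queued, like bi_size */
    const void *bdev;   /* device identity, like bi_bdev */
};

/* can a page at byte offset disk_start on bdev be appended to last? */
static bool can_append(const struct sketch_bio *last, const void *bdev,
                       uint64_t disk_start)
{
    uint64_t last_end = (last->sector << 9) + last->bytes;

    return last->bdev == bdev && last_end == disk_start;
}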
  914. /*
  915. * while we're doing the read/modify/write cycle, we could
  916. * have errors in reading pages off the disk. This checks
  917. * for errors and if we're not able to read the page it'll
  918. * trigger parity reconstruction. The rmw will be finished
  919. * after we've reconstructed the failed stripes
  920. */
  921. static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
  922. {
  923. if (rbio->faila >= 0 || rbio->failb >= 0) {
  924. BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
  925. __raid56_parity_recover(rbio);
  926. } else {
  927. finish_rmw(rbio);
  928. }
  929. }
  930. /*
  931. * these are just the pages from the rbio array, not from anything
  932. * the FS sent down to us
  933. */
  934. static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
  935. {
  936. int index;
  937. index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
  938. index += page;
  939. return rbio->stripe_pages[index];
  940. }
  941. /*
  942. * helper function to walk our bio list and populate the bio_pages array with
  943. * the result. This seems expensive, but it is faster than constantly
  944. * searching through the bio list as we setup the IO in finish_rmw or stripe
  945. * reconstruction.
  946. *
  947. * This must be called before you trust the answers from page_in_rbio
  948. */
  949. static void index_rbio_pages(struct btrfs_raid_bio *rbio)
  950. {
  951. struct bio *bio;
  952. u64 start;
  953. unsigned long stripe_offset;
  954. unsigned long page_index;
  955. struct page *p;
  956. int i;
  957. spin_lock_irq(&rbio->bio_list_lock);
  958. bio_list_for_each(bio, &rbio->bio_list) {
  959. start = (u64)bio->bi_sector << 9;
  960. stripe_offset = start - rbio->raid_map[0];
  961. page_index = stripe_offset >> PAGE_CACHE_SHIFT;
  962. for (i = 0; i < bio->bi_vcnt; i++) {
  963. p = bio->bi_io_vec[i].bv_page;
  964. rbio->bio_pages[page_index + i] = p;
  965. }
  966. }
  967. spin_unlock_irq(&rbio->bio_list_lock);
  968. }
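The page_index math works because raid_map[0] is the logical start of the full stripe, so a bio's pages land at consecutive bio_pages slots starting at (bio start - raid_map[0]) / PAGE_SIZE. A tiny worked example with illustrative numbers:

#include <stdio.h>

int main(void)
{
    unsigned long long raid_map0 = 1ULL << 30;            /* illustrative stripe start */
    unsigned long long bio_start = raid_map0 + 5 * 4096;  /* bio begins 5 pages in */
    unsigned long first = (unsigned long)((bio_start - raid_map0) >> 12);

    /* a 3-page bio would fill bio_pages[5], [6] and [7] */
    printf("first bio_pages index: %lu\n", first);
    return 0;
}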
  969. /*
  970. * this is called from one of two situations. We either
  971. * have a full stripe from the higher layers, or we've read all
  972. * the missing bits off disk.
  973. *
  974. * This will calculate the parity and then send down any
  975. * changed blocks.
  976. */
  977. static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
  978. {
  979. struct btrfs_bio *bbio = rbio->bbio;
  980. void *pointers[bbio->num_stripes];
  981. int stripe_len = rbio->stripe_len;
  982. int nr_data = rbio->nr_data;
  983. int stripe;
  984. int pagenr;
  985. int p_stripe = -1;
  986. int q_stripe = -1;
  987. struct bio_list bio_list;
  988. struct bio *bio;
  989. int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
  990. int ret;
  991. bio_list_init(&bio_list);
  992. if (bbio->num_stripes - rbio->nr_data == 1) {
  993. p_stripe = bbio->num_stripes - 1;
  994. } else if (bbio->num_stripes - rbio->nr_data == 2) {
  995. p_stripe = bbio->num_stripes - 2;
  996. q_stripe = bbio->num_stripes - 1;
  997. } else {
  998. BUG();
  999. }
  1000. /* at this point we either have a full stripe,
  1001. * or we've read the full stripe from the drive.
  1002. * recalculate the parity and write the new results.
  1003. *
  1004. * We're not allowed to add any new bios to the
  1005. * bio list here, anyone else that wants to
  1006. * change this stripe needs to do their own rmw.
  1007. */
  1008. spin_lock_irq(&rbio->bio_list_lock);
  1009. set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
  1010. spin_unlock_irq(&rbio->bio_list_lock);
  1011. atomic_set(&rbio->bbio->error, 0);
  1012. /*
  1013. * now that we've set rmw_locked, run through the
  1014. * bio list one last time and map the page pointers
  1015. *
  1016. * We don't cache full rbios because we're assuming
  1017. * the higher layers are unlikely to use this area of
  1018. * the disk again soon. If they do use it again,
  1019. * hopefully they will send another full bio.
  1020. */
  1021. index_rbio_pages(rbio);
  1022. if (!rbio_is_full(rbio))
  1023. cache_rbio_pages(rbio);
  1024. else
  1025. clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
  1026. for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
  1027. struct page *p;
  1028. /* first collect one page from each data stripe */
  1029. for (stripe = 0; stripe < nr_data; stripe++) {
  1030. p = page_in_rbio(rbio, stripe, pagenr, 0);
  1031. pointers[stripe] = kmap(p);
  1032. }
  1033. /* then add the parity stripe */
  1034. p = rbio_pstripe_page(rbio, pagenr);
  1035. SetPageUptodate(p);
  1036. pointers[stripe++] = kmap(p);
  1037. if (q_stripe != -1) {
  1038. /*
  1039. * raid6, add the qstripe and call the
  1040. * library function to fill in our p/q
  1041. */
  1042. p = rbio_qstripe_page(rbio, pagenr);
  1043. SetPageUptodate(p);
  1044. pointers[stripe++] = kmap(p);
  1045. raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
  1046. pointers);
  1047. } else {
  1048. /* raid5 */
  1049. memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
  1050. run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
  1051. }
  1052. for (stripe = 0; stripe < bbio->num_stripes; stripe++)
  1053. kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
  1054. }
  1055. /*
  1056. * time to start writing. Make bios for everything from the
  1057. * higher layers (the bio_list in our rbio) and our p/q. Ignore
  1058. * everything else.
  1059. */
  1060. for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
  1061. for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
  1062. struct page *page;
  1063. if (stripe < rbio->nr_data) {
  1064. page = page_in_rbio(rbio, stripe, pagenr, 1);
  1065. if (!page)
  1066. continue;
  1067. } else {
  1068. page = rbio_stripe_page(rbio, stripe, pagenr);
  1069. }
  1070. ret = rbio_add_io_page(rbio, &bio_list,
  1071. page, stripe, pagenr, rbio->stripe_len);
  1072. if (ret)
  1073. goto cleanup;
  1074. }
  1075. }
  1076. atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
  1077. BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
  1078. while (1) {
  1079. bio = bio_list_pop(&bio_list);
  1080. if (!bio)
  1081. break;
  1082. bio->bi_private = rbio;
  1083. bio->bi_end_io = raid_write_end_io;
  1084. BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
  1085. submit_bio(WRITE, bio);
  1086. }
  1087. return;
  1088. cleanup:
  1089. rbio_orig_end_io(rbio, -EIO, 0);
  1090. }
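To summarize the parity step in finish_rmw(): for raid5, P at each page offset is simply the XOR of the data pages (the memcpy plus run_xor above); for raid6, raid6_call.gen_syndrome() fills both P and Q, where Q is the Reed-Solomon syndrome over GF(2^8) with generator {02}. A byte-level sketch of that arithmetic, assuming the standard RAID6 field polynomial 0x11d (names are illustrative, not kernel API):

#include <stdint.h>

/* multiply by {02} in GF(2^8) with the RAID6 polynomial 0x11d */
static uint8_t gf_mul2(uint8_t b)
{
    return (uint8_t)((b << 1) ^ ((b & 0x80) ? 0x1d : 0x00));
}

/*
 * One byte column of the syndrome gen_syndrome() computes page-wide:
 *   P = D0 ^ D1 ^ ... ^ D(n-1)
 *   Q = D0 ^ {02}*D1 ^ {02}^2*D2 ^ ... ^ {02}^(n-1)*D(n-1)
 */
static void pq_byte(const uint8_t *d, int nr_data, uint8_t *p, uint8_t *q)
{
    uint8_t pp = d[nr_data - 1];
    uint8_t qq = d[nr_data - 1];
    int i;

    for (i = nr_data - 2; i >= 0; i--) {
        pp ^= d[i];
        qq = (uint8_t)(gf_mul2(qq) ^ d[i]);
    }
    *p = pp;
    *q = qq;
}

A single failed data stripe can then be repaired from P alone with plain XOR, which is why the raid6 recovery path later in this file falls through to the pstripe code when only one stripe is bad.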
  1091. /*
  1092. * helper to find the stripe number for a given bio. Used to figure out which
  1093. * stripe has failed. This expects the bio to correspond to a physical disk,
  1094. * so it looks up based on physical sector numbers.
  1095. */
  1096. static int find_bio_stripe(struct btrfs_raid_bio *rbio,
  1097. struct bio *bio)
  1098. {
  1099. u64 physical = bio->bi_sector;
  1100. u64 stripe_start;
  1101. int i;
  1102. struct btrfs_bio_stripe *stripe;
  1103. physical <<= 9;
  1104. for (i = 0; i < rbio->bbio->num_stripes; i++) {
  1105. stripe = &rbio->bbio->stripes[i];
  1106. stripe_start = stripe->physical;
  1107. if (physical >= stripe_start &&
  1108. physical < stripe_start + rbio->stripe_len) {
  1109. return i;
  1110. }
  1111. }
  1112. return -1;
  1113. }
  1114. /*
  1115. * helper to find the stripe number for a given
  1116. * bio (before mapping). Used to figure out which stripe has
  1117. * failed. This looks up based on logical block numbers.
  1118. */
  1119. static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
  1120. struct bio *bio)
  1121. {
  1122. u64 logical = bio->bi_sector;
  1123. u64 stripe_start;
  1124. int i;
  1125. logical <<= 9;
  1126. for (i = 0; i < rbio->nr_data; i++) {
  1127. stripe_start = rbio->raid_map[i];
  1128. if (logical >= stripe_start &&
  1129. logical < stripe_start + rbio->stripe_len) {
  1130. return i;
  1131. }
  1132. }
  1133. return -1;
  1134. }
  1135. /*
  1136. * returns -EIO if we had too many failures
  1137. */
  1138. static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
  1139. {
  1140. unsigned long flags;
  1141. int ret = 0;
  1142. spin_lock_irqsave(&rbio->bio_list_lock, flags);
  1143. /* we already know this stripe is bad, move on */
  1144. if (rbio->faila == failed || rbio->failb == failed)
  1145. goto out;
  1146. if (rbio->faila == -1) {
  1147. /* first failure on this rbio */
  1148. rbio->faila = failed;
  1149. atomic_inc(&rbio->bbio->error);
  1150. } else if (rbio->failb == -1) {
  1151. /* second failure on this rbio */
  1152. rbio->failb = failed;
  1153. atomic_inc(&rbio->bbio->error);
  1154. } else {
  1155. ret = -EIO;
  1156. }
  1157. out:
  1158. spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
  1159. return ret;
  1160. }
  1161. /*
  1162. * helper to fail a stripe based on a physical disk
  1163. * bio.
  1164. */
  1165. static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
  1166. struct bio *bio)
  1167. {
  1168. int failed = find_bio_stripe(rbio, bio);
  1169. if (failed < 0)
  1170. return -EIO;
  1171. return fail_rbio_index(rbio, failed);
  1172. }
  1173. /*
  1174. * this sets each page in the bio uptodate. It should only be used on private
  1175. * rbio pages, nothing that comes in from the higher layers
  1176. */
  1177. static void set_bio_pages_uptodate(struct bio *bio)
  1178. {
  1179. int i;
  1180. struct page *p;
  1181. for (i = 0; i < bio->bi_vcnt; i++) {
  1182. p = bio->bi_io_vec[i].bv_page;
  1183. SetPageUptodate(p);
  1184. }
  1185. }
  1186. /*
  1187. * end io for the read phase of the rmw cycle. All the bios here are physical
  1188. * stripe bios we've read from the disk so we can recalculate the parity of the
  1189. * stripe.
  1190. *
  1191. * This will usually kick off finish_rmw once all the bios are read in, but it
  1192. * may trigger parity reconstruction if we had any errors along the way
  1193. */
  1194. static void raid_rmw_end_io(struct bio *bio, int err)
  1195. {
  1196. struct btrfs_raid_bio *rbio = bio->bi_private;
  1197. if (err)
  1198. fail_bio_stripe(rbio, bio);
  1199. else
  1200. set_bio_pages_uptodate(bio);
  1201. bio_put(bio);
  1202. if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
  1203. return;
  1204. err = 0;
  1205. if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
  1206. goto cleanup;
  1207. /*
  1208. * this will normally call finish_rmw to start our write
  1209. * but if there are any failed stripes we'll reconstruct
  1210. * from parity first
  1211. */
  1212. validate_rbio_for_rmw(rbio);
  1213. return;
  1214. cleanup:
  1215. rbio_orig_end_io(rbio, -EIO, 0);
  1216. }
  1217. static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
  1218. {
  1219. rbio->work.flags = 0;
  1220. rbio->work.func = rmw_work;
  1221. btrfs_queue_worker(&rbio->fs_info->rmw_workers,
  1222. &rbio->work);
  1223. }
  1224. static void async_read_rebuild(struct btrfs_raid_bio *rbio)
  1225. {
  1226. rbio->work.flags = 0;
  1227. rbio->work.func = read_rebuild_work;
  1228. btrfs_queue_worker(&rbio->fs_info->rmw_workers,
  1229. &rbio->work);
  1230. }
  1231. /*
  1232. * the stripe must be locked by the caller. It will
  1233. * unlock after all the writes are done
  1234. */
  1235. static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
  1236. {
  1237. int bios_to_read = 0;
  1238. struct btrfs_bio *bbio = rbio->bbio;
  1239. struct bio_list bio_list;
  1240. int ret;
  1241. int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
  1242. int pagenr;
  1243. int stripe;
  1244. struct bio *bio;
  1245. bio_list_init(&bio_list);
  1246. ret = alloc_rbio_pages(rbio);
  1247. if (ret)
  1248. goto cleanup;
  1249. index_rbio_pages(rbio);
  1250. atomic_set(&rbio->bbio->error, 0);
  1251. /*
  1252. * build a list of bios to read all the missing parts of this
  1253. * stripe
  1254. */
  1255. for (stripe = 0; stripe < rbio->nr_data; stripe++) {
  1256. for (pagenr = 0; pagenr < nr_pages; pagenr++) {
  1257. struct page *page;
  1258. /*
  1259. * we want to find all the pages missing from
  1260. * the rbio and read them from the disk. If
  1261. * page_in_rbio finds a page in the bio list
  1262. * we don't need to read it off the stripe.
  1263. */
  1264. page = page_in_rbio(rbio, stripe, pagenr, 1);
  1265. if (page)
  1266. continue;
  1267. page = rbio_stripe_page(rbio, stripe, pagenr);
  1268. /*
  1269. * the bio cache may have handed us an uptodate
  1270. * page. If so, be happy and use it
  1271. */
  1272. if (PageUptodate(page))
  1273. continue;
  1274. ret = rbio_add_io_page(rbio, &bio_list, page,
  1275. stripe, pagenr, rbio->stripe_len);
  1276. if (ret)
  1277. goto cleanup;
  1278. }
  1279. }
  1280. bios_to_read = bio_list_size(&bio_list);
  1281. if (!bios_to_read) {
  1282. /*
  1283. * this can happen if others have merged with
  1284. * us, it means there is nothing left to read.
  1285. * But if there are missing devices it may not be
  1286. * safe to do the full stripe write yet.
  1287. */
  1288. goto finish;
  1289. }
  1290. /*
  1291. * the bbio may be freed once we submit the last bio. Make sure
  1292. * not to touch it after that
  1293. */
  1294. atomic_set(&bbio->stripes_pending, bios_to_read);
  1295. while (1) {
  1296. bio = bio_list_pop(&bio_list);
  1297. if (!bio)
  1298. break;
  1299. bio->bi_private = rbio;
  1300. bio->bi_end_io = raid_rmw_end_io;
  1301. btrfs_bio_wq_end_io(rbio->fs_info, bio,
  1302. BTRFS_WQ_ENDIO_RAID56);
  1303. BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
  1304. submit_bio(READ, bio);
  1305. }
  1306. /* the actual write will happen once the reads are done */
  1307. return 0;
  1308. cleanup:
  1309. rbio_orig_end_io(rbio, -EIO, 0);
  1310. return -EIO;
  1311. finish:
  1312. validate_rbio_for_rmw(rbio);
  1313. return 0;
  1314. }
  1315. /*
  1316. * if the upper layers pass in a full stripe, we thank them by only allocating
  1317. * enough pages to hold the parity, and sending it all down quickly.
  1318. */
  1319. static int full_stripe_write(struct btrfs_raid_bio *rbio)
  1320. {
  1321. int ret;
  1322. ret = alloc_rbio_parity_pages(rbio);
  1323. if (ret)
  1324. return ret;
  1325. ret = lock_stripe_add(rbio);
  1326. if (ret == 0)
  1327. finish_rmw(rbio);
  1328. return 0;
  1329. }
  1330. /*
  1331. * partial stripe writes get handed over to async helpers.
  1332. * We're really hoping to merge a few more writes into this
  1333. * rbio before calculating new parity
  1334. */
  1335. static int partial_stripe_write(struct btrfs_raid_bio *rbio)
  1336. {
  1337. int ret;
  1338. ret = lock_stripe_add(rbio);
  1339. if (ret == 0)
  1340. async_rmw_stripe(rbio);
  1341. return 0;
  1342. }
  1343. /*
  1344. * sometimes while we were reading from the drive to
  1345. * recalculate parity, enough new bios come in to create
  1346. * a full stripe. So we do a check here to see if we can
  1347. * go directly to finish_rmw
  1348. */
  1349. static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
  1350. {
  1351. /* head off into rmw land if we don't have a full stripe */
  1352. if (!rbio_is_full(rbio))
  1353. return partial_stripe_write(rbio);
  1354. return full_stripe_write(rbio);
  1355. }
  1356. /*
  1357. * We use plugging callbacks to collect full stripes.
  1358. * Any time we get a partial stripe write while plugged
  1359. * we collect it into a list. When the unplug comes down,
  1360. * we sort the list by logical block number and merge
  1361. * everything we can into the same rbios
  1362. */
  1363. struct btrfs_plug_cb {
  1364. struct blk_plug_cb cb;
  1365. struct btrfs_fs_info *info;
  1366. struct list_head rbio_list;
  1367. struct btrfs_work work;
  1368. };
  1369. /*
  1370. * rbios on the plug list are sorted for easier merging.
  1371. */
  1372. static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
  1373. {
  1374. struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
  1375. plug_list);
  1376. struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
  1377. plug_list);
  1378. u64 a_sector = ra->bio_list.head->bi_sector;
  1379. u64 b_sector = rb->bio_list.head->bi_sector;
  1380. if (a_sector < b_sector)
  1381. return -1;
  1382. if (a_sector > b_sector)
  1383. return 1;
  1384. return 0;
  1385. }
  1386. static void run_plug(struct btrfs_plug_cb *plug)
  1387. {
  1388. struct btrfs_raid_bio *cur;
  1389. struct btrfs_raid_bio *last = NULL;
  1390. /*
  1391. * sort our plug list then try to merge
  1392. * everything we can in hopes of creating full
  1393. * stripes.
  1394. */
  1395. list_sort(NULL, &plug->rbio_list, plug_cmp);
  1396. while (!list_empty(&plug->rbio_list)) {
  1397. cur = list_entry(plug->rbio_list.next,
  1398. struct btrfs_raid_bio, plug_list);
  1399. list_del_init(&cur->plug_list);
  1400. if (rbio_is_full(cur)) {
  1401. /* we have a full stripe, send it down */
  1402. full_stripe_write(cur);
  1403. continue;
  1404. }
  1405. if (last) {
  1406. if (rbio_can_merge(last, cur)) {
  1407. merge_rbio(last, cur);
  1408. __free_raid_bio(cur);
  1409. continue;
  1410. }
  1411. __raid56_parity_write(last);
  1412. }
  1413. last = cur;
  1414. }
  1415. if (last) {
  1416. __raid56_parity_write(last);
  1417. }
  1418. kfree(plug);
  1419. }
  1420. /*
  1421. * if the unplug comes from schedule, we have to push the
  1422. * work off to a helper thread
  1423. */
  1424. static void unplug_work(struct btrfs_work *work)
  1425. {
  1426. struct btrfs_plug_cb *plug;
  1427. plug = container_of(work, struct btrfs_plug_cb, work);
  1428. run_plug(plug);
  1429. }
  1430. static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
  1431. {
  1432. struct btrfs_plug_cb *plug;
  1433. plug = container_of(cb, struct btrfs_plug_cb, cb);
  1434. if (from_schedule) {
  1435. plug->work.flags = 0;
  1436. plug->work.func = unplug_work;
  1437. btrfs_queue_worker(&plug->info->rmw_workers,
  1438. &plug->work);
  1439. return;
  1440. }
  1441. run_plug(plug);
  1442. }

/*
 * our main entry point for writes from the rest of the FS.
 */
int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
                        struct btrfs_bio *bbio, u64 *raid_map,
                        u64 stripe_len)
{
        struct btrfs_raid_bio *rbio;
        struct btrfs_plug_cb *plug = NULL;
        struct blk_plug_cb *cb;

        rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
        if (IS_ERR(rbio)) {
                kfree(raid_map);
                kfree(bbio);
                return PTR_ERR(rbio);
        }
        bio_list_add(&rbio->bio_list, bio);
        rbio->bio_list_bytes = bio->bi_size;

        /*
         * don't plug on full rbios, just get them out the door
         * as quickly as we can
         */
        if (rbio_is_full(rbio))
                return full_stripe_write(rbio);

        cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
                               sizeof(*plug));
        if (cb) {
                plug = container_of(cb, struct btrfs_plug_cb, cb);
                if (!plug->info) {
                        plug->info = root->fs_info;
                        INIT_LIST_HEAD(&plug->rbio_list);
                }
                list_add_tail(&rbio->plug_list, &plug->rbio_list);
        } else {
                return __raid56_parity_write(rbio);
        }
        return 0;
}
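
/*
 * As far as this file shows, the expected caller is the bio mapping code in
 * volumes.c (btrfs_map_bio() and friends), which builds the bbio and
 * raid_map describing the RAID5/6 layout before handing the write here.
 * The exact call site lives outside this file, so treat this as a pointer
 * rather than a guarantee.
 */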

/*
 * all parity reconstruction happens here.  We've read in everything
 * we can find from the drives and this does the heavy lifting of
 * sorting the good from the bad.
 */
static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
        int pagenr, stripe;
        void **pointers;
        int faila = -1, failb = -1;
        int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        struct page *page;
        int err;
        int i;

        pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
                           GFP_NOFS);
        if (!pointers) {
                err = -ENOMEM;
                goto cleanup_io;
        }

        faila = rbio->faila;
        failb = rbio->failb;

        if (rbio->read_rebuild) {
                spin_lock_irq(&rbio->bio_list_lock);
                set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
                spin_unlock_irq(&rbio->bio_list_lock);
        }

        index_rbio_pages(rbio);

        for (pagenr = 0; pagenr < nr_pages; pagenr++) {
                /* setup our array of pointers with pages
                 * from each stripe
                 */
                for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
                        /*
                         * if we're rebuilding a read, we have to use
                         * pages from the bio list
                         */
                        if (rbio->read_rebuild &&
                            (stripe == faila || stripe == failb)) {
                                page = page_in_rbio(rbio, stripe, pagenr, 0);
                        } else {
                                page = rbio_stripe_page(rbio, stripe, pagenr);
                        }
                        pointers[stripe] = kmap(page);
                }

                /* all raid6 handling here */
                if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
                    RAID6_Q_STRIPE) {
                        /*
                         * single failure, rebuild from parity raid5
                         * style
                         */
                        if (failb < 0) {
                                if (faila == rbio->nr_data) {
                                        /*
                                         * Just the P stripe has failed, without
                                         * a bad data or Q stripe.
                                         * TODO, we should redo the xor here.
                                         */
                                        err = -EIO;
                                        goto cleanup;
                                }
                                /*
                                 * a single failure in raid6 is rebuilt
                                 * in the pstripe code below
                                 */
                                goto pstripe;
                        }

                        /* make sure our ps and qs are in order */
                        if (faila > failb) {
                                int tmp = failb;
                                failb = faila;
                                faila = tmp;
                        }

                        /* if the q stripe is failed, do a pstripe reconstruction
                         * from the xors.
                         * If both the q stripe and the P stripe are failed, we're
                         * here due to a crc mismatch and we can't give them the
                         * data they want
                         */
                        if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
                                if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
                                        err = -EIO;
                                        goto cleanup;
                                }
                                /*
                                 * otherwise we have one bad data stripe and
                                 * a good P stripe.  raid5!
                                 */
                                goto pstripe;
                        }

                        if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
                                raid6_datap_recov(rbio->bbio->num_stripes,
                                                  PAGE_SIZE, faila, pointers);
                        } else {
                                raid6_2data_recov(rbio->bbio->num_stripes,
                                                  PAGE_SIZE, faila, failb,
                                                  pointers);
                        }
                } else {
                        void *p;

                        /* rebuild from P stripe here (raid5 or raid6) */
                        BUG_ON(failb != -1);
pstripe:
                        /* Copy parity block into failed block to start with */
                        memcpy(pointers[faila],
                               pointers[rbio->nr_data],
                               PAGE_CACHE_SIZE);

                        /* rearrange the pointer array */
                        p = pointers[faila];
                        for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
                                pointers[stripe] = pointers[stripe + 1];
                        pointers[rbio->nr_data - 1] = p;

                        /* xor in the rest */
                        run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
                }
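
                /*
                 * Worked example for the pstripe branch above, assuming
                 * run_xor() xors the first 'src_cnt' pages into the page at
                 * pointers[src_cnt]: with four data stripes and faila == 1,
                 * the parity page P = d0 ^ d1 ^ d2 ^ d3 is copied into the
                 * failed page, the pointer array is rotated so that page
                 * sits at index nr_data - 1, and xoring the three surviving
                 * data pages into it yields P ^ d0 ^ d2 ^ d3 == d1, i.e. the
                 * missing data.
                 */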

                /* if we're doing this rebuild as part of an rmw, go through
                 * and set all of our private rbio pages in the
                 * failed stripes as uptodate.  This way finish_rmw will
                 * know they can be trusted.  If this was a read reconstruction,
                 * other endio functions will fiddle the uptodate bits
                 */
                if (!rbio->read_rebuild) {
                        for (i = 0; i < nr_pages; i++) {
                                if (faila != -1) {
                                        page = rbio_stripe_page(rbio, faila, i);
                                        SetPageUptodate(page);
                                }
                                if (failb != -1) {
                                        page = rbio_stripe_page(rbio, failb, i);
                                        SetPageUptodate(page);
                                }
                        }
                }
                for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
                        /*
                         * if we're rebuilding a read, we have to use
                         * pages from the bio list
                         */
                        if (rbio->read_rebuild &&
                            (stripe == faila || stripe == failb)) {
                                page = page_in_rbio(rbio, stripe, pagenr, 0);
                        } else {
                                page = rbio_stripe_page(rbio, stripe, pagenr);
                        }
                        kunmap(page);
                }
        }

        err = 0;
cleanup:
        kfree(pointers);

cleanup_io:
        if (rbio->read_rebuild) {
                if (err == 0)
                        cache_rbio_pages(rbio);
                else
                        clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

                rbio_orig_end_io(rbio, err, err == 0);
        } else if (err == 0) {
                rbio->faila = -1;
                rbio->failb = -1;
                finish_rmw(rbio);
        } else {
                rbio_orig_end_io(rbio, err, 0);
        }
}
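
/*
 * Summary of the exits above: a successful read rebuild caches the
 * reconstructed pages and completes the original read bios; a successful
 * rebuild done as part of an rmw clears faila/failb and carries on into
 * finish_rmw(); any error ends the original bios with that error and, for
 * read rebuilds, drops the cache-ready bit first.
 */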

/*
 * This is called only for stripes we've read from disk to
 * reconstruct the parity.
 */
static void raid_recover_end_io(struct bio *bio, int err)
{
        struct btrfs_raid_bio *rbio = bio->bi_private;

        /*
         * we only read stripe pages off the disk, set them
         * up to date if there were no errors
         */
        if (err)
                fail_bio_stripe(rbio, bio);
        else
                set_bio_pages_uptodate(bio);
        bio_put(bio);

        if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
                return;

        if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
                rbio_orig_end_io(rbio, -EIO, 0);
        else
                __raid_recover_end_io(rbio);
}
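
/*
 * Only the bio that drops stripes_pending to zero gets past the early
 * return, so reconstruction runs exactly once, after every read we issued
 * has either completed or been marked failed.  If more stripes failed than
 * max_errors allows there is nothing left to rebuild from, and the rbio is
 * ended with -EIO instead.
 */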

/*
 * reads everything we need off the disk to reconstruct
 * the parity.  endio handlers trigger final reconstruction
 * when the IO is done.
 *
 * This is used both for reads from the higher layers and for
 * parity construction required to finish an rmw cycle.
 */
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
{
        int bios_to_read = 0;
        struct btrfs_bio *bbio = rbio->bbio;
        struct bio_list bio_list;
        int ret;
        int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        int pagenr;
        int stripe;
        struct bio *bio;

        bio_list_init(&bio_list);

        ret = alloc_rbio_pages(rbio);
        if (ret)
                goto cleanup;

        atomic_set(&rbio->bbio->error, 0);

        /*
         * read everything that hasn't failed.  Thanks to the
         * stripe cache, it is possible that some or all of these
         * pages are going to be uptodate.
         */
        for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
                if (rbio->faila == stripe ||
                    rbio->failb == stripe)
                        continue;

                for (pagenr = 0; pagenr < nr_pages; pagenr++) {
                        struct page *p;

                        /*
                         * the rmw code may have already read this
                         * page in
                         */
                        p = rbio_stripe_page(rbio, stripe, pagenr);
                        if (PageUptodate(p))
                                continue;

                        ret = rbio_add_io_page(rbio, &bio_list,
                                       rbio_stripe_page(rbio, stripe, pagenr),
                                       stripe, pagenr, rbio->stripe_len);
                        if (ret < 0)
                                goto cleanup;
                }
        }

        bios_to_read = bio_list_size(&bio_list);
        if (!bios_to_read) {
                /*
                 * we might have no bios to read just because the pages
                 * were up to date, or we might have no bios to read because
                 * the devices were gone.
                 */
                if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
                        __raid_recover_end_io(rbio);
                        goto out;
                } else {
                        goto cleanup;
                }
        }

        /*
         * the bbio may be freed once we submit the last bio.  Make sure
         * not to touch it after that
         */
        atomic_set(&bbio->stripes_pending, bios_to_read);
        while (1) {
                bio = bio_list_pop(&bio_list);
                if (!bio)
                        break;

                bio->bi_private = rbio;
                bio->bi_end_io = raid_recover_end_io;

                btrfs_bio_wq_end_io(rbio->fs_info, bio,
                                    BTRFS_WQ_ENDIO_RAID56);

                BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
                submit_bio(READ, bio);
        }
out:
        return 0;

cleanup:
        if (rbio->read_rebuild)
                rbio_orig_end_io(rbio, -EIO, 0);
        return -EIO;
}
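
/*
 * Note on the !bios_to_read branch above: pages that the stripe cache (or
 * an earlier rmw) already marked uptodate are skipped, so it is entirely
 * possible that nothing needs to be read at all.  In that case the
 * reconstruction runs synchronously right here instead of from the read
 * endio handler.
 */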

/*
 * the main entry point for reads from the higher layers.  This
 * is really only called when the normal read path had a failure,
 * so we assume the bio they send down corresponds to a failed part
 * of the drive.
 */
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
                          struct btrfs_bio *bbio, u64 *raid_map,
                          u64 stripe_len, int mirror_num)
{
        struct btrfs_raid_bio *rbio;
        int ret;

        rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
        if (IS_ERR(rbio)) {
                return PTR_ERR(rbio);
        }

        rbio->read_rebuild = 1;
        bio_list_add(&rbio->bio_list, bio);
        rbio->bio_list_bytes = bio->bi_size;

        rbio->faila = find_logical_bio_stripe(rbio, bio);
        if (rbio->faila == -1) {
                BUG();
                kfree(rbio);
                return -EIO;
        }

        /*
         * reconstruct from the q stripe if they are
         * asking for mirror 3
         */
        if (mirror_num == 3)
                rbio->failb = bbio->num_stripes - 2;

        ret = lock_stripe_add(rbio);

        /*
         * __raid56_parity_recover will end the bio with
         * any errors it hits.  We don't want to return
         * its error value up the stack because our caller
         * will end up calling bio_endio with any nonzero
         * return
         */
        if (ret == 0)
                __raid56_parity_recover(rbio);

        /*
         * our rbio has been added to the list of
         * rbios that will be handled after the
         * current lock owner is done
         */
        return 0;
}
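
/*
 * A note on the mirror_num == 3 case above: for raid6 the Q stripe sits in
 * the last slot and P in the one before it, so marking num_stripes - 2 as
 * failb pretends the P stripe is bad and presumably steers
 * __raid_recover_end_io() into the raid6_datap_recov() path, i.e. the data
 * is rebuilt from Q rather than from P.
 */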

static void rmw_work(struct btrfs_work *work)
{
        struct btrfs_raid_bio *rbio;

        rbio = container_of(work, struct btrfs_raid_bio, work);
        raid56_rmw_stripe(rbio);
}

static void read_rebuild_work(struct btrfs_work *work)
{
        struct btrfs_raid_bio *rbio;

        rbio = container_of(work, struct btrfs_raid_bio, work);
        __raid56_parity_recover(rbio);
}
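
/*
 * These two btrfs_work handlers are the worker-thread entry points for an
 * rbio: rmw_work restarts the read/modify/write path and read_rebuild_work
 * restarts a read-time reconstruction.  They are presumably installed as
 * work.func and queued on the rmw_workers threads by the async helpers and
 * the stripe locking code earlier in this file.
 */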